about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2026-06-29 07:49:04 +0200
committer Martin Fischer <martin@push-f.com> 2026-06-29 09:10:42 +0200
commit d8da5e5ff843aa23b12b66f67eaa63209b21f0b4 (patch)
tree 0b71f946c754df16406e38c180f390c177569252
parent a02ee67ecc3a4970d771b88c03b69394c606aa49 (diff)
tweak(tente): put cgit tree for commit behind basic auth (HEAD, master)
-rw-r--r-- nixos/hosts/tente/bad-bots.txt 3
-rw-r--r-- nixos/hosts/tente/git-web.nix 31
2 files changed, 23 insertions, 11 deletions
diff --git a/nixos/hosts/tente/bad-bots.txt b/nixos/hosts/tente/bad-bots.txt
deleted file mode 100644
index 780c477..0000000
--- a/nixos/hosts/tente/bad-bots.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-# 7 million requests in 5 days
-User-agent: ClaudeBot
-Disallow: /
diff --git a/nixos/hosts/tente/git-web.nix b/nixos/hosts/tente/git-web.nix
index b648873..c2fc709 100644
--- a/nixos/hosts/tente/git-web.nix
+++ b/nixos/hosts/tente/git-web.nix
@@ -25,18 +25,33 @@ in
enableACME = true;
forceSSL = true;
extraConfig = helpers.mkNginxConfig cfg.domain;
+
+ # LLM companies like to aggressively scrape git web interfaces
+ # (which is stupid since they could just clone the repo).
+ # The good ones respect robots.txt; the bad ones ignore it and
+ # use fake browser user agents and hundreds of IP addresses.
+ # The number of possible URLs is O(commits x files), which can be a lot,
+ # so we guard the /<repo>/tree/<path>?id=<commit> endpoint with Basic Auth.
+ locations."/".extraConfig =
+ let
+ cgitHtpasswd = pkgs.writeText "cgit.htpasswd" "guest:{PLAIN}\n";
+ in
+ ''
+ set $auth off;
+ # Unfortunately mainstream browsers don't display the realm anymore.
+ set $realm "The username is guest, the password is blank.";
+
+ # We require Basic Auth for /tree/ with URL query parameters and for all /diff URLs.
+ if ($request_uri ~ ^/[^/]+/tree(/[^?]*)?/?\?.+$) { set $auth $realm; }
+ if ($request_uri ~ ^/[^/]+/diff(/|$|\?)) { set $auth $realm; }
+
+ auth_basic $auth;
+ auth_basic_user_file ${cgitHtpasswd};
+ '';
};
services.cgit.main = {
enable = true;
- package = pkgs.runCommand "cgit-with-extended-robots-txt" {} ''
- cp -r ${pkgs.cgit} $out
- robots_txt=$out/cgit/robots.txt
- chmod u+w $robots_txt
- echo >> $robots_txt
- cat ${./bad-bots.txt} >> $robots_txt
- '';
-
user = cfg.user;
group = cfg.group;
nginx.virtualHost = cfg.domain;