diff options
| author | Martin Fischer <martin@push-f.com> | 2026-06-29 07:49:04 +0200 |
|---|---|---|
| committer | Martin Fischer <martin@push-f.com> | 2026-06-29 09:10:42 +0200 |
| commit | d8da5e5ff843aa23b12b66f67eaa63209b21f0b4 (patch) | |
| tree | 0b71f946c754df16406e38c180f390c177569252 /nixos | |
| parent | a02ee67ecc3a4970d771b88c03b69394c606aa49 (diff) | |
Diffstat (limited to 'nixos')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | nixos/hosts/tente/bad-bots.txt | 3 |
| -rw-r--r-- | nixos/hosts/tente/git-web.nix | 31 |
2 files changed, 23 insertions, 11 deletions
```diff
diff --git a/nixos/hosts/tente/bad-bots.txt b/nixos/hosts/tente/bad-bots.txt
deleted file mode 100644
index 780c477..0000000
--- a/nixos/hosts/tente/bad-bots.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-# 7 million requests in 5 days
-User-agent: ClaudeBot
-Disallow: /
diff --git a/nixos/hosts/tente/git-web.nix b/nixos/hosts/tente/git-web.nix
index b648873..c2fc709 100644
--- a/nixos/hosts/tente/git-web.nix
+++ b/nixos/hosts/tente/git-web.nix
@@ -25,18 +25,33 @@ in
       enableACME = true;
       forceSSL = true;
       extraConfig = helpers.mkNginxConfig cfg.domain;
+
+      # LLM companies like to aggressively scrape git web interfaces
+      # (which is stupid since they could just clone the repo).
+      # The good ones respect robots.txt; the bad ones ignore it and
+      # use fake browser user agents and hundreds of IP addresses.
+      # The number of possible URLs is O(commits x files), which can be a lot,
+      # so we guard the /<repo>/tree/<path>?id=<commit> endpoint with Basic Auth.
+      locations."/".extraConfig =
+        let
+          cgitHtpasswd = pkgs.writeText "cgit.htpasswd" "guest:{PLAIN}\n";
+        in
+        ''
+          set $auth off;
+          # Unfortunately mainstream browsers don't display the realm anymore.
+          set $realm "The username is guest, the password is blank.";
+
+          # We disallow /tree/ with URL query parameters.
+          if ($request_uri ~ ^/[^/]+/tree(/[^?]*)?/?\?.+$) { set $auth $realm; }
+          if ($request_uri ~ ^/[^/]+/diff(/|$|\?)) { set $auth $realm; }
+
+          auth_basic $auth;
+          auth_basic_user_file ${cgitHtpasswd};
+        '';
     };
 
     services.cgit.main = {
       enable = true;
-      package = pkgs.runCommand "cgit-with-extended-robots-txt" {} ''
-        cp -r ${pkgs.cgit} $out
-        robots_txt=$out/cgit/robots.txt
-        chmod u+w $robots_txt
-        echo >> $robots_txt
-        cat ${./bad-bots.txt} >> $robots_txt
-      '';
-
       user = cfg.user;
       group = cfg.group;
       nginx.virtualHost = cfg.domain;
```
