From d8da5e5ff843aa23b12b66f67eaa63209b21f0b4 Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Mon, 29 Jun 2026 07:49:04 +0200 Subject: tweak(tente): put cgit tree for commit behind basic auth --- nixos/hosts/tente/bad-bots.txt | 3 --- nixos/hosts/tente/git-web.nix | 31 +++++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 11 deletions(-) delete mode 100644 nixos/hosts/tente/bad-bots.txt diff --git a/nixos/hosts/tente/bad-bots.txt b/nixos/hosts/tente/bad-bots.txt deleted file mode 100644 index 780c477..0000000 --- a/nixos/hosts/tente/bad-bots.txt +++ /dev/null @@ -1,3 +0,0 @@ -# 7 million requests in 5 days -User-agent: ClaudeBot -Disallow: / diff --git a/nixos/hosts/tente/git-web.nix b/nixos/hosts/tente/git-web.nix index b648873..c2fc709 100644 --- a/nixos/hosts/tente/git-web.nix +++ b/nixos/hosts/tente/git-web.nix @@ -25,18 +25,33 @@ in enableACME = true; forceSSL = true; extraConfig = helpers.mkNginxConfig cfg.domain; + + # LLM companies like to aggressively scrape git web interfaces + # (which is stupid since they could just clone the repo). + # The good ones respect robots.txt; the bad ones ignore it and + # use fake browser user agents and hundreds of IP addresses. + # The number of possible URLs is O(commits x files), which can be a lot, + # so we guard the /REPO/tree/PATH?id=COMMIT endpoint with Basic Auth. + locations."/".extraConfig = + let + cgitHtpasswd = pkgs.writeText "cgit.htpasswd" "guest:{PLAIN}\n"; + in + '' + set $auth off; + # Unfortunately mainstream browsers don't display the realm anymore. + set $realm "The username is guest, the password is blank."; + + # We disallow /tree/ with URL query parameters. 
+ if ($request_uri ~ ^/[^/]+/tree(/[^?]*)?/?\?.+$) { set $auth $realm; } + if ($request_uri ~ ^/[^/]+/diff(/|$|\?)) { set $auth $realm; } + + auth_basic $auth; + auth_basic_user_file ${cgitHtpasswd}; + ''; }; services.cgit.main = { enable = true; - package = pkgs.runCommand "cgit-with-extended-robots-txt" {} '' - cp -r ${pkgs.cgit} $out - robots_txt=$out/cgit/robots.txt - chmod u+w $robots_txt - echo >> $robots_txt - cat ${./bad-bots.txt} >> $robots_txt - ''; - user = cfg.user; group = cfg.group; nginx.virtualHost = cfg.domain; -- cgit v1.3.1