// Package de fetches the list of German federal laws from
// gesetze-im-internet.de.
package de

import (
	"fmt"
	"log/slog"
	"net/http"
	"net/url"
	"strings"

	"github.com/antchfx/htmlquery"
	"golang.org/x/text/encoding/charmap"
	"golang.org/x/text/transform"

	"push-f.com/lex-surf/internal/lex"
	"push-f.com/lex-surf/lex-fetch/progress"
)

// Fetcher fetches laws from gesetze-im-internet.de.
type Fetcher struct{}

// pages enumerates the per-letter/per-digit Teilliste_<page>.html index pages.
// NOTE(review): 'X' is absent from the letter range — confirm the site really
// has no Teilliste_X.html before assuming this is intentional.
var pages = []rune("ABCDEFGHIJKLMNOPQRSTUVWYZ123456789")

// Fetch downloads every Teilliste index page and returns all laws listed on
// them, reporting per-page progress via the given reporter.
//
// It returns an error as soon as any page fails to download or parse; the
// partially collected laws are discarded in that case.
func (s *Fetcher) Fetch(log *slog.Logger, client *http.Client, progress *progress.Reporter) ([]lex.Law, error) {
	progress.Total = len(pages)
	var laws []lex.Law
	for i, page := range pages {
		pageLaws, err := fetchPage(client, page)
		if err != nil {
			return nil, err
		}
		laws = append(laws, pageLaws...)
		progress.ReportProgress(i + 1)
	}
	return laws, nil
}

// fetchPage downloads and parses a single Teilliste_<page>.html index page.
// It is a separate function so that the deferred Body.Close fires once per
// page rather than accumulating until Fetch returns (and so the body is also
// closed on the error paths, which the previous inline version leaked).
func fetchPage(client *http.Client, page rune) ([]lex.Law, error) {
	resp, err := client.Get(fmt.Sprintf("https://www.gesetze-im-internet.de/Teilliste_%c.html", page))
	if err != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", err)
	}
	// BUG FIX: the response body was never closed, leaking one connection
	// per index page.
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
	}

	// The site serves ISO-8859-1; transcode to UTF-8 before HTML parsing.
	reader := transform.NewReader(resp.Body, charmap.ISO8859_1.NewDecoder())
	doc, err := htmlquery.Parse(reader)
	if err != nil {
		return nil, fmt.Errorf("failed to HTML parse: %w", err)
	}
	div := htmlquery.FindOne(doc, "//div[@id='paddingLR12']")
	if div == nil {
		return nil, fmt.Errorf("didn't find expected HTML div")
	}

	var laws []lex.Law
	for child := range div.ChildNodes() {
		// Skip text/whitespace nodes between the law entries.
		if child.FirstChild == nil {
			continue
		}
		// Each entry's first child is expected to be a link of the form
		// "./<abbr>/index.html".
		href := htmlquery.SelectAttr(child.FirstChild, "href")
		redir, ok := strings.CutPrefix(href, "./")
		if !ok {
			return nil, fmt.Errorf("expected href to start with ./ but found %s", href)
		}
		redir, ok = strings.CutSuffix(redir, "/index.html")
		if !ok {
			return nil, fmt.Errorf("expected href to end with /index.html but found %s", href)
		}
		// BUG FIX: the parse error was previously discarded with _.
		hrefURL, err := url.Parse(href)
		if err != nil {
			return nil, fmt.Errorf("failed to parse href %q: %w", href, err)
		}
		laws = append(laws, lex.Law{
			// Resolve the relative href against the final request URL
			// (after any redirects) to get an absolute law URL.
			URL: resp.Request.URL.ResolveReference(hrefURL).String(),
			// NOTE(review): assumes the title text node is always two
			// siblings after the link; a missing sibling would panic —
			// confirm against the page markup.
			Title: strings.TrimSpace(child.FirstChild.NextSibling.NextSibling.Data),
			Redir: redir,
			Abbr:  redir,
		})
	}
	return laws, nil
}