diff options
Diffstat (limited to 'lex-fetch')
-rw-r--r-- | lex-fetch/at/at.go | 181 | ||||
-rw-r--r-- | lex-fetch/de/de.go | 66 | ||||
-rw-r--r-- | lex-fetch/main.go | 133 | ||||
-rw-r--r-- | lex-fetch/progress/progress.go | 27 | ||||
-rw-r--r-- | lex-fetch/uk/uk.go | 90 |
5 files changed, 497 insertions, 0 deletions
diff --git a/lex-fetch/at/at.go b/lex-fetch/at/at.go new file mode 100644 index 0000000..44efb27 --- /dev/null +++ b/lex-fetch/at/at.go @@ -0,0 +1,181 @@ +package at + +import ( + "encoding/json" + "fmt" + "log/slog" + "maps" + "math" + "net/http" + "net/url" + "slices" + "strconv" + "strings" + "sync" + "time" + + "push-f.com/lex-surf/internal/lex" + "push-f.com/lex-surf/lex-fetch/progress" +) + +type Fetcher struct{} + +const concurrentRequests = 4 + +func (s *Fetcher) Fetch(log *slog.Logger, client *http.Client, progress *progress.Reporter) ([]lex.Law, error) { + // The API is documented in https://data.bka.gv.at/ris/ogd/v2.6/Documents/Dokumentation_OGD-RIS_API.pdf. + + // Consolidated laws can only be queried via the Bundesrecht endpoint which returns individual paragraphs. + // Since we'll get multiple results for each law, we're saving results into a hash map. + lawsMap := make(map[string]lex.Law) + + // TODO: also query laws from the past and future + date := time.Now().Format("2006-01-02") + + data, err := fetchPage(client, date, 1) + if err != nil { + return nil, fmt.Errorf("failed to fetch first page: %w", err) + } + totalResults, _ := strconv.Atoi(data.Hits.Text) + if totalResults == 0 { + return nil, fmt.Errorf("API returned 0 results") + } + totalPages := int(math.Ceil(float64(totalResults) / 100)) + progress.Total = totalPages + assign(lawsMap, data.OgdDocumentReference) + + semaphore := make(chan struct{}, concurrentRequests) + var wg sync.WaitGroup + var mu sync.Mutex + var retErr error + var retErrPage int + var errOnce sync.Once + + for page := 2; page <= totalPages; page++ { + if retErr != nil { + return nil, fmt.Errorf("failed to fetch page %d: %w", retErrPage, retErr) + } + wg.Add(1) + semaphore <- struct{}{} + + go func(p int) { + defer wg.Done() + defer func() { <-semaphore }() + + data, err := fetchPage(client, date, page) + progress.ReportProgress(page) + if err != nil { + errOnce.Do(func() { + retErr = err + retErrPage = p + }) + } + 
mu.Lock() + assign(lawsMap, data.OgdDocumentReference) + mu.Unlock() + }(page) + } + laws := slices.SortedFunc(maps.Values(lawsMap), func(a, b lex.Law) int { + if a.Title > b.Title { + return 1 + } else if a.Title < b.Title { + return -1 + } + return 0 + }) + return laws, nil +} + +func fetchPage(client *http.Client, date string, page int) (*ogdDocumentResults, error) { + req, err := http.NewRequest("GET", "https://data.bka.gv.at/ris/api/v2.6/Bundesrecht", nil) + if err != nil { + return nil, fmt.Errorf("build request: %w", err) + } + req.URL.RawQuery = url.Values{ + "Appl": {"BrKons"}, // Bundesrecht konsolidiert + "Typ": {"BG oder BVG"}, // Bundesgesetz or Bundesverfassungsgesetz + "DokumenteProSeite": {"OneHundred"}, + "Seitennummer": {strconv.Itoa(page)}, + "FassungVom": {date}, + }.Encode() + resp, err := client.Do(req) + if err != nil { + return nil, fmt.Errorf("send request: %w", err) + } + + if resp.StatusCode != 200 { + return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode) + } + + var data brKonsResult + + err = json.NewDecoder(resp.Body).Decode(&data) + if err != nil { + return nil, fmt.Errorf("JSON decode: %w", err) + } + + result := data.OgdSearchResult + + if result.Error != nil { + return nil, fmt.Errorf("error response: %s", result.Error) + } + + return result.OgdDocumentResults, nil +} + +func assign(laws map[string]lex.Law, paraDocs []document) { + for _, paraDoc := range paraDocs { + para := paraDoc.Data.Metadaten.Bundesrecht + law := lex.Law{ + Title: para.Kurztitel, + URL: para.BrKons.GesamteRechtsvorschriftUrl, + } + if para.BrKons.Abkuerzung != nil { + law.Abbr = *para.BrKons.Abkuerzung + redir := strings.ToLower(*para.BrKons.Abkuerzung) + redir = strings.ReplaceAll(redir, ")", "") + redir = strings.ReplaceAll(redir, "(", "") + redir = strings.ReplaceAll(redir, " – ", "-") + redir = strings.ReplaceAll(redir, " ", "-") + redir = strings.ReplaceAll(redir, "\u00A0", "-") + redir = strings.ReplaceAll(redir, "ä", "ae") + redir = 
strings.ReplaceAll(redir, "ü", "ue") + redir = strings.ReplaceAll(redir, "ö", "oe") + redir = strings.ReplaceAll(redir, "ß", "ss") + law.Redir = redir + } + laws[para.BrKons.Gesetzesnummer] = law + } +} + +type brKonsResult struct { + OgdSearchResult struct { + Error *struct { + Applikation string + Message string + } + OgdDocumentResults *ogdDocumentResults + } +} + +type ogdDocumentResults struct { + Hits struct { + Text string `json:"#text"` + } + OgdDocumentReference []document +} + +type document struct { + Data struct { + Metadaten struct { + Bundesrecht struct { + Kurztitel string + BrKons struct { + GesamteRechtsvorschriftUrl string + Abkuerzung *string + Gesetzesnummer string + } + } + } + } +} diff --git a/lex-fetch/de/de.go b/lex-fetch/de/de.go new file mode 100644 index 0000000..2c0a033 --- /dev/null +++ b/lex-fetch/de/de.go @@ -0,0 +1,66 @@ +package de + +import ( + "fmt" + "log/slog" + "net/http" + "net/url" + "strings" + + "github.com/antchfx/htmlquery" + "golang.org/x/text/encoding/charmap" + "golang.org/x/text/transform" + "push-f.com/lex-surf/internal/lex" + "push-f.com/lex-surf/lex-fetch/progress" +) + +type Fetcher struct{} + +var pages = []rune("ABCDEFGHIJKLMNOPQRSTUVWYZ123456789") + +func (s *Fetcher) Fetch(log *slog.Logger, client *http.Client, progress *progress.Reporter) ([]lex.Law, error) { + progress.Total = len(pages) + var laws []lex.Law + for i := range len(pages) { + resp, err := client.Get(fmt.Sprintf("https://www.gesetze-im-internet.de/Teilliste_%c.html", pages[i])) + if err != nil { + return nil, fmt.Errorf("failed to fetch page: %w", err) + } + if resp.StatusCode != 200 { + return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode) + } + reader := transform.NewReader(resp.Body, charmap.ISO8859_1.NewDecoder()) + doc, err := htmlquery.Parse(reader) + if err != nil { + return nil, fmt.Errorf("failed to HTML parse: %w", err) + } + div := htmlquery.FindOne(doc, "//div[@id='paddingLR12']") + if div == nil { + return nil, 
fmt.Errorf("didn't find expected HTML div") + } + for child := range div.ChildNodes() { + if child.FirstChild != nil { + href := htmlquery.SelectAttr(child.FirstChild, "href") + redir, ok := strings.CutPrefix(href, "./") + if !ok { + return nil, fmt.Errorf("expected href to start with ./ but found %s", href) + } + redir, ok = strings.CutSuffix(redir, "/index.html") + if !ok { + return nil, fmt.Errorf("expected href to end with /index.html but found %s", href) + } + + hrefUrl, _ := url.Parse(href) + + laws = append(laws, lex.Law{ + URL: resp.Request.URL.ResolveReference(hrefUrl).String(), + Title: strings.TrimSpace(child.FirstChild.NextSibling.NextSibling.Data), + Redir: redir, + Abbr: redir, + }) + } + } + progress.ReportProgress(i + 1) + } + return laws, nil +} diff --git a/lex-fetch/main.go b/lex-fetch/main.go new file mode 100644 index 0000000..57019b4 --- /dev/null +++ b/lex-fetch/main.go @@ -0,0 +1,133 @@ +package main + +import ( + "context" + "encoding/json" + "flag" + "fmt" + "io" + "log/slog" + "maps" + "net" + "net/http" + "os" + "slices" + + "golang.org/x/term" + "push-f.com/lex-surf/internal/lex" + "push-f.com/lex-surf/lex-fetch/at" + "push-f.com/lex-surf/lex-fetch/de" + "push-f.com/lex-surf/lex-fetch/progress" + "push-f.com/lex-surf/lex-fetch/uk" +) + +type Fetcher interface { + Fetch(log *slog.Logger, client *http.Client, reporter *progress.Reporter) ([]lex.Law, error) +} + +var fetchers = map[string]Fetcher{ + "at": &at.Fetcher{}, + "de": &de.Fetcher{}, + "uk": &uk.Fetcher{}, +} + +var logger *slog.Logger + +func printUsage() { + fmt.Printf("usage: %s [options] <country> <out>\n", os.Args[0]) + fmt.Printf("where <country> is one of %v\n", slices.Sorted(maps.Keys(fetchers))) + fmt.Println("options are:") + flag.PrintDefaults() +} + +func main() { + debug := flag.Bool("debug", false, "Enable debug logging") + flag.Usage = printUsage + flag.Parse() + + args := flag.Args() + if len(args) != 2 { + printUsage() + os.Exit(1) + } + + country := args[0] + out 
:= args[1] + + client := http.Client{ + Transport: &CustomTransport{}, + } + + fetcher, ok := fetchers[country] + if !ok { + printUsage() + os.Exit(1) + } + + logOptions := slog.HandlerOptions{} + if *debug { + logOptions.Level = slog.LevelDebug + } + logger = slog.New(slog.NewTextHandler(os.Stderr, &logOptions)) + + var progressReporter progress.Reporter + if term.IsTerminal(int(os.Stdout.Fd())) { + progressReporter = progress.NewReporter(os.Stdout) + } else { + progressReporter = progress.NewReporter(io.Discard) + } + + laws, err := fetcher.Fetch(logger, &client, &progressReporter) + if err != nil { + logger.Error("fetching failed", "error", err) + os.Exit(1) + } + + if len(laws) == 0 { + logger.Error("fetcher found 0 laws") + os.Exit(1) + } + + file, err := os.Create(out) + if err != nil { + logger.Error("failed to create file", "err", err, "path", out) + os.Exit(1) + } + defer file.Close() + + err = json.NewEncoder(file).Encode(laws) + if err != nil { + logger.Error("failed to encode laws as JSON", "err", err) + os.Exit(1) + } + + socketPath := os.Getenv("SOCKET_PATH") + if socketPath == "" { + logger.Info("not notifyng lex-serve because SOCKET_PATH isn't set") + } else { + client = http.Client{ + Transport: &http.Transport{ + DialContext: func(ctx context.Context, network string, addr string) (net.Conn, error) { + return net.Dial("unix", socketPath) + }, + }, + } + resp, err := client.Get("http://internal.invalid/update?country=" + country) + if err != nil { + logger.Error("failed to update lex-serve", "err", err) + os.Exit(1) + } + if resp.StatusCode != 200 { + logger.Error("unexpected status code from lex-serve", "statusCode", resp.StatusCode) + os.Exit(1) + } + } +} + +type CustomTransport struct{} + +func (t *CustomTransport) RoundTrip(req *http.Request) (*http.Response, error) { + logger.Debug("request", "method", req.Method, "url", req.URL) + req.Header["User-Agent"] = []string{"lex-surf"} + return http.DefaultTransport.RoundTrip(req) +} diff --git 
// Reporter writes "<num>/<Total>" progress lines to a writer. Lines are only
// written when the reported number is higher than every number reported
// before, so out-of-order reports from concurrent workers never make the
// output go backwards.
type Reporter struct {
	// Total is the denominator printed on every progress line. It is read
	// while holding the internal mutex but is assigned directly by callers,
	// so it should be set before ReportProgress is called concurrently.
	Total  int
	cur    int // highest number written so far
	mu     sync.Mutex
	writer io.Writer
}

// NewReporter returns a Reporter that writes its progress lines to writer.
func NewReporter(writer io.Writer) Reporter {
	return Reporter{writer: writer}
}

// ReportProgress writes "num/Total" followed by a newline if num is greater
// than any number reported so far; otherwise it does nothing. It is safe for
// concurrent use.
func (r *Reporter) ReportProgress(num int) {
	r.mu.Lock()
	// Deferred so the mutex is released even if the writer panics.
	defer r.mu.Unlock()

	if num <= r.cur {
		return
	}
	fmt.Fprintf(r.writer, "%d/%d\n", num, r.Total)
	r.cur = num
}
+ progress.ReportProgress(page) + } + return laws, nil +} + +func fetchPage(client *http.Client, page int) (*feed, error) { + resp, err := client.Get(fmt.Sprintf("https://www.legislation.gov.uk/ukpga/data.feed?page=%d", page)) + if err != nil { + return nil, fmt.Errorf("failed to fetch page: %w", err) + } + if resp.StatusCode != 200 { + return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode) + } + + doc, err := xmlquery.Parse(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to HTML parse: %w", err) + } + itemsPerPageEl := xmlquery.FindOne(doc, "//openSearch:itemsPerPage") + if itemsPerPageEl == nil { + return nil, fmt.Errorf("couldn't find openSearch:itemsPerPage") + } + itemsPerPage, _ := strconv.Atoi(itemsPerPageEl.InnerText()) + + facetTypesEl := xmlquery.FindOne(doc, "//leg:facetType") + if facetTypesEl == nil { + return nil, fmt.Errorf("couldn't find leg:facetType") + } + totalItems, _ := strconv.Atoi(facetTypesEl.SelectAttr("value")) + if totalItems == 0 { + return nil, fmt.Errorf("detected 0 total results") + } + + var laws []lex.Law + + for _, entry := range xmlquery.Find(doc, "//entry") { + title := xmlquery.FindOne(entry, "./title").InnerText() + repealed, _ := regexp.MatchString("(?i)\\(repealed( .+)?\\)$", title) + if repealed { + continue + } + laws = append(laws, lex.Law{ + Title: title, + URL: xmlquery.FindOne(entry, "./link[@rel='self']").SelectAttr("href"), + }) + } + + return &feed{ + totalItems: totalItems, + itemsPerPage: itemsPerPage, + entries: laws, + }, nil +} + +type feed struct { + itemsPerPage int + totalItems int + entries []lex.Law +} |