Diffstat (limited to 'lex-fetch')
-rw-r--r--  lex-fetch/at/at.go              181
-rw-r--r--  lex-fetch/de/de.go               66
-rw-r--r--  lex-fetch/main.go               133
-rw-r--r--  lex-fetch/progress/progress.go   27
-rw-r--r--  lex-fetch/uk/uk.go               90
5 files changed, 497 insertions, 0 deletions
diff --git a/lex-fetch/at/at.go b/lex-fetch/at/at.go
new file mode 100644
index 0000000..44efb27
--- /dev/null
+++ b/lex-fetch/at/at.go
@@ -0,0 +1,181 @@
+package at
+
+import (
+ "encoding/json"
+ "fmt"
+ "log/slog"
+ "maps"
+ "math"
+ "net/http"
+ "net/url"
+ "slices"
+ "strconv"
+ "strings"
+ "sync"
+ "time"
+
+ "push-f.com/lex-surf/internal/lex"
+ "push-f.com/lex-surf/lex-fetch/progress"
+)
+
+type Fetcher struct{}
+
+const concurrentRequests = 4
+
+func (s *Fetcher) Fetch(log *slog.Logger, client *http.Client, progress *progress.Reporter) ([]lex.Law, error) {
+ // The API is documented in https://data.bka.gv.at/ris/ogd/v2.6/Documents/Dokumentation_OGD-RIS_API.pdf.
+
+	// Consolidated laws can only be queried via the Bundesrecht endpoint,
+	// which returns one document per paragraph. Since each law therefore
+	// yields multiple results, we collect them in a map keyed by law number.
+ lawsMap := make(map[string]lex.Law)
+
+ // TODO: also query laws from the past and future
+ date := time.Now().Format("2006-01-02")
+
+ data, err := fetchPage(client, date, 1)
+ if err != nil {
+ return nil, fmt.Errorf("failed to fetch first page: %w", err)
+ }
+	totalResults, err := strconv.Atoi(data.Hits.Text)
+	if err != nil || totalResults == 0 {
+		return nil, fmt.Errorf("API reported no results (hits: %q)", data.Hits.Text)
+	}
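+	// The API serves at most 100 documents per page (DokumenteProSeite=OneHundred).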
+ totalPages := int(math.Ceil(float64(totalResults) / 100))
+ progress.Total = totalPages
+ assign(lawsMap, data.OgdDocumentReference)
+
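+	// A buffered channel acts as a counting semaphore, capping in-flight requests.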
+ semaphore := make(chan struct{}, concurrentRequests)
+ var wg sync.WaitGroup
+ var mu sync.Mutex
+ var retErr error
+ var retErrPage int
+ var errOnce sync.Once
+
+	for page := 2; page <= totalPages; page++ {
+		wg.Add(1)
+		semaphore <- struct{}{}
+
+		go func(p int) {
+			defer wg.Done()
+			defer func() { <-semaphore }()
+
+			data, err := fetchPage(client, date, p)
+			progress.ReportProgress(p)
+			if err != nil {
+				errOnce.Do(func() {
+					retErr = err
+					retErrPage = p
+				})
+				return
+			}
+			mu.Lock()
+			assign(lawsMap, data.OgdDocumentReference)
+			mu.Unlock()
+		}(page)
+	}
+	wg.Wait()
+	if retErr != nil {
+		return nil, fmt.Errorf("failed to fetch page %d: %w", retErrPage, retErr)
+	}
+	laws := slices.SortedFunc(maps.Values(lawsMap), func(a, b lex.Law) int {
+		return strings.Compare(a.Title, b.Title)
+	})
+ return laws, nil
+}
+
+func fetchPage(client *http.Client, date string, page int) (*ogdDocumentResults, error) {
+ req, err := http.NewRequest("GET", "https://data.bka.gv.at/ris/api/v2.6/Bundesrecht", nil)
+ if err != nil {
+ return nil, fmt.Errorf("build request: %w", err)
+ }
+ req.URL.RawQuery = url.Values{
+ "Appl": {"BrKons"}, // Bundesrecht konsolidiert
+ "Typ": {"BG oder BVG"}, // Bundesgesetz or Bundesverfassungsgesetz
+ "DokumenteProSeite": {"OneHundred"},
+ "Seitennummer": {strconv.Itoa(page)},
+ "FassungVom": {date},
+ }.Encode()
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("send request: %w", err)
+	}
+	defer resp.Body.Close()
+
+ if resp.StatusCode != 200 {
+ return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
+ }
+
+ var data brKonsResult
+
+ err = json.NewDecoder(resp.Body).Decode(&data)
+ if err != nil {
+ return nil, fmt.Errorf("JSON decode: %w", err)
+ }
+
+ result := data.OgdSearchResult
+
+ if result.Error != nil {
+ return nil, fmt.Errorf("error response: %s", result.Error)
+ }
+
+ return result.OgdDocumentResults, nil
+}
+
+func assign(laws map[string]lex.Law, paraDocs []document) {
+ for _, paraDoc := range paraDocs {
+ para := paraDoc.Data.Metadaten.Bundesrecht
+ law := lex.Law{
+ Title: para.Kurztitel,
+ URL: para.BrKons.GesamteRechtsvorschriftUrl,
+ }
+ if para.BrKons.Abkuerzung != nil {
+ law.Abbr = *para.BrKons.Abkuerzung
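+			// Derive the redirect slug: lowercase, strip parentheses,
+			// hyphenate separators, and transliterate umlauts and ß.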
+ redir := strings.ToLower(*para.BrKons.Abkuerzung)
+ redir = strings.ReplaceAll(redir, ")", "")
+ redir = strings.ReplaceAll(redir, "(", "")
+ redir = strings.ReplaceAll(redir, " – ", "-")
+ redir = strings.ReplaceAll(redir, " ", "-")
+ redir = strings.ReplaceAll(redir, "\u00A0", "-")
+ redir = strings.ReplaceAll(redir, "ä", "ae")
+ redir = strings.ReplaceAll(redir, "ü", "ue")
+ redir = strings.ReplaceAll(redir, "ö", "oe")
+ redir = strings.ReplaceAll(redir, "ß", "ss")
+ law.Redir = redir
+ }
+ laws[para.BrKons.Gesetzesnummer] = law
+ }
+}
+
+type brKonsResult struct {
+ OgdSearchResult struct {
+ Error *struct {
+ Applikation string
+ Message string
+ }
+ OgdDocumentResults *ogdDocumentResults
+ }
+}
+
+type ogdDocumentResults struct {
+ Hits struct {
+ Text string `json:"#text"`
+ }
+ OgdDocumentReference []document
+}
+
+type document struct {
+ Data struct {
+ Metadaten struct {
+ Bundesrecht struct {
+ Kurztitel string
+ BrKons struct {
+ GesamteRechtsvorschriftUrl string
+ Abkuerzung *string
+ Gesetzesnummer string
+ }
+ }
+ }
+ }
+}
diff --git a/lex-fetch/de/de.go b/lex-fetch/de/de.go
new file mode 100644
index 0000000..2c0a033
--- /dev/null
+++ b/lex-fetch/de/de.go
@@ -0,0 +1,66 @@
+package de
+
+import (
+ "fmt"
+ "log/slog"
+ "net/http"
+ "net/url"
+ "strings"
+
+ "github.com/antchfx/htmlquery"
+ "golang.org/x/text/encoding/charmap"
+ "golang.org/x/text/transform"
+ "push-f.com/lex-surf/internal/lex"
+ "push-f.com/lex-surf/lex-fetch/progress"
+)
+
+type Fetcher struct{}
+
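+// X is omitted; the site apparently serves no Teilliste_X.html.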
+var pages = []rune("ABCDEFGHIJKLMNOPQRSTUVWYZ123456789")
+
+func (s *Fetcher) Fetch(log *slog.Logger, client *http.Client, progress *progress.Reporter) ([]lex.Law, error) {
+ progress.Total = len(pages)
+ var laws []lex.Law
+	for i, page := range pages {
+		resp, err := client.Get(fmt.Sprintf("https://www.gesetze-im-internet.de/Teilliste_%c.html", page))
+		if err != nil {
+			return nil, fmt.Errorf("failed to fetch page: %w", err)
+		}
+		if resp.StatusCode != 200 {
+			resp.Body.Close()
+			return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
+		}
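+		// The site serves ISO-8859-1 (Latin-1), so transcode to UTF-8 before parsing.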
+ reader := transform.NewReader(resp.Body, charmap.ISO8859_1.NewDecoder())
+		doc, err := htmlquery.Parse(reader)
+		resp.Body.Close()
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse HTML: %w", err)
+		}
+ div := htmlquery.FindOne(doc, "//div[@id='paddingLR12']")
+ if div == nil {
+ return nil, fmt.Errorf("didn't find expected HTML div")
+ }
+ for child := range div.ChildNodes() {
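+			// Whitespace-only text nodes have no children and are skipped;
+			// each remaining entry starts with the link to the law.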
+ if child.FirstChild != nil {
+ href := htmlquery.SelectAttr(child.FirstChild, "href")
+ redir, ok := strings.CutPrefix(href, "./")
+ if !ok {
+ return nil, fmt.Errorf("expected href to start with ./ but found %s", href)
+ }
+ redir, ok = strings.CutSuffix(redir, "/index.html")
+ if !ok {
+ return nil, fmt.Errorf("expected href to end with /index.html but found %s", href)
+ }
+
+			hrefUrl, err := url.Parse(href)
+			if err != nil {
+				return nil, fmt.Errorf("failed to parse href %q: %w", href, err)
+			}
+
+ laws = append(laws, lex.Law{
+ URL: resp.Request.URL.ResolveReference(hrefUrl).String(),
+ Title: strings.TrimSpace(child.FirstChild.NextSibling.NextSibling.Data),
+ Redir: redir,
+ Abbr: redir,
+ })
+ }
+ }
+ progress.ReportProgress(i + 1)
+ }
+ return laws, nil
+}
diff --git a/lex-fetch/main.go b/lex-fetch/main.go
new file mode 100644
index 0000000..57019b4
--- /dev/null
+++ b/lex-fetch/main.go
@@ -0,0 +1,133 @@
+package main
+
+import (
+ "context"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "log/slog"
+ "maps"
+ "net"
+ "net/http"
+ "os"
+ "slices"
+
+ "golang.org/x/term"
+ "push-f.com/lex-surf/internal/lex"
+ "push-f.com/lex-surf/lex-fetch/at"
+ "push-f.com/lex-surf/lex-fetch/de"
+ "push-f.com/lex-surf/lex-fetch/progress"
+ "push-f.com/lex-surf/lex-fetch/uk"
+)
+
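+// Fetcher retrieves the list of laws for a single country.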
+type Fetcher interface {
+ Fetch(log *slog.Logger, client *http.Client, reporter *progress.Reporter) ([]lex.Law, error)
+}
+
+var fetchers = map[string]Fetcher{
+ "at": &at.Fetcher{},
+ "de": &de.Fetcher{},
+ "uk": &uk.Fetcher{},
+}
+
+var logger *slog.Logger
+
+func printUsage() {
+ fmt.Printf("usage: %s [options] <country> <out>\n", os.Args[0])
+ fmt.Printf("where <country> is one of %v\n", slices.Sorted(maps.Keys(fetchers)))
+ fmt.Println("options are:")
+ flag.PrintDefaults()
+}
+
+func main() {
+ debug := flag.Bool("debug", false, "Enable debug logging")
+ flag.Usage = printUsage
+ flag.Parse()
+
+ args := flag.Args()
+ if len(args) != 2 {
+ printUsage()
+ os.Exit(1)
+ }
+
+ country := args[0]
+ out := args[1]
+
+ client := http.Client{
+ Transport: &CustomTransport{},
+ }
+
+ fetcher, ok := fetchers[country]
+ if !ok {
+ printUsage()
+ os.Exit(1)
+ }
+
+ logOptions := slog.HandlerOptions{}
+ if *debug {
+ logOptions.Level = slog.LevelDebug
+ }
+ logger = slog.New(slog.NewTextHandler(os.Stderr, &logOptions))
+
+ var progressReporter progress.Reporter
+ if term.IsTerminal(int(os.Stdout.Fd())) {
+ progressReporter = progress.NewReporter(os.Stdout)
+ } else {
+ progressReporter = progress.NewReporter(io.Discard)
+ }
+
+ laws, err := fetcher.Fetch(logger, &client, &progressReporter)
+ if err != nil {
+ logger.Error("fetching failed", "error", err)
+ os.Exit(1)
+ }
+
+ if len(laws) == 0 {
+ logger.Error("fetcher found 0 laws")
+ os.Exit(1)
+ }
+
+ file, err := os.Create(out)
+ if err != nil {
+ logger.Error("failed to create file", "err", err, "path", out)
+ os.Exit(1)
+ }
+ defer file.Close()
+
+ err = json.NewEncoder(file).Encode(laws)
+ if err != nil {
+ logger.Error("failed to encode laws as JSON", "err", err)
+ os.Exit(1)
+ }
+
+ socketPath := os.Getenv("SOCKET_PATH")
+ if socketPath == "" {
+ logger.Info("not notifyng lex-serve because SOCKET_PATH isn't set")
+ } else {
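+		// lex-serve listens on a Unix socket; the host in the request URL
+		// is a placeholder that the custom dialer never resolves.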
+ client = http.Client{
+ Transport: &http.Transport{
+ DialContext: func(ctx context.Context, network string, addr string) (net.Conn, error) {
+ return net.Dial("unix", socketPath)
+ },
+ },
+ }
+		resp, err := client.Get("http://internal.invalid/update?country=" + country)
+		if err != nil {
+			logger.Error("failed to update lex-serve", "err", err)
+			os.Exit(1)
+		}
+		resp.Body.Close()
+ if resp.StatusCode != 200 {
+ logger.Error("unexpected status code from lex-serve", "statusCode", resp.StatusCode)
+ os.Exit(1)
+ }
+ }
+}
+
+type CustomTransport struct{}
+
+func (t *CustomTransport) RoundTrip(req *http.Request) (*http.Response, error) {
+	logger.Debug("request", "method", req.Method, "url", req.URL)
+	// RoundTrippers must not modify the passed request, so set the
+	// User-Agent on a clone.
+	req = req.Clone(req.Context())
+	req.Header.Set("User-Agent", "lex-surf")
+	return http.DefaultTransport.RoundTrip(req)
+}
diff --git a/lex-fetch/progress/progress.go b/lex-fetch/progress/progress.go
new file mode 100644
index 0000000..430d36e
--- /dev/null
+++ b/lex-fetch/progress/progress.go
@@ -0,0 +1,27 @@
+package progress
+
+import (
+ "fmt"
+ "io"
+ "sync"
+)
+
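+// Reporter prints progress as "current/Total" lines to writer.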
+type Reporter struct {
+ Total int
+ cur int
+ mu sync.Mutex
+ writer io.Writer
+}
+
+func NewReporter(writer io.Writer) Reporter {
+ return Reporter{writer: writer}
+}
+
+func (r *Reporter) ReportProgress(num int) {
+ r.mu.Lock()
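+	// Concurrent fetchers may report pages out of order; only print increases.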
+ if num > r.cur {
+ fmt.Fprintf(r.writer, "%d/%d\n", num, r.Total)
+ r.cur = num
+ }
+ r.mu.Unlock()
+}
diff --git a/lex-fetch/uk/uk.go b/lex-fetch/uk/uk.go
new file mode 100644
index 0000000..f8e1510
--- /dev/null
+++ b/lex-fetch/uk/uk.go
@@ -0,0 +1,90 @@
+package uk
+
+import (
+ "fmt"
+ "log/slog"
+ "net/http"
+ "regexp"
+ "strconv"
+
+ "github.com/antchfx/xmlquery"
+ "push-f.com/lex-surf/internal/lex"
+ "push-f.com/lex-surf/lex-fetch/progress"
+)
+
+type Fetcher struct{}
+
+// repealedRe matches titles ending in "(Repealed)" or "(repealed …)".
+var repealedRe = regexp.MustCompile(`(?i)\(repealed( .+)?\)$`)
+
+func (s *Fetcher) Fetch(log *slog.Logger, client *http.Client, progress *progress.Reporter) ([]lex.Law, error) {
+ firstPage, err := fetchPage(client, 1)
+ if err != nil {
+ return nil, fmt.Errorf("failed to fetch first page: %w", err)
+	}
+	if firstPage.itemsPerPage == 0 {
+		return nil, fmt.Errorf("feed reported 0 items per page")
+	}
+	// Round up so a trailing partial page is still fetched.
+	progress.Total = (firstPage.totalItems + firstPage.itemsPerPage - 1) / firstPage.itemsPerPage
+
+ laws := firstPage.entries
+
+ for page := 2; page <= progress.Total; page++ {
+ feed, err := fetchPage(client, page)
+ if err != nil {
+ return nil, fmt.Errorf("failed to fetch page: %w", err)
+ }
+ laws = append(laws, feed.entries...)
+ progress.ReportProgress(page)
+ }
+ return laws, nil
+}
+
+func fetchPage(client *http.Client, page int) (*feed, error) {
+ resp, err := client.Get(fmt.Sprintf("https://www.legislation.gov.uk/ukpga/data.feed?page=%d", page))
+ if err != nil {
+ return nil, fmt.Errorf("failed to fetch page: %w", err)
+ }
+	if resp.StatusCode != 200 {
+		resp.Body.Close()
+		return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
+	}
+
+	doc, err := xmlquery.Parse(resp.Body)
+	resp.Body.Close()
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse feed XML: %w", err)
+	}
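+	// Paging metadata comes from the feed's OpenSearch extension elements.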
+ itemsPerPageEl := xmlquery.FindOne(doc, "//openSearch:itemsPerPage")
+ if itemsPerPageEl == nil {
+ return nil, fmt.Errorf("couldn't find openSearch:itemsPerPage")
+ }
+ itemsPerPage, _ := strconv.Atoi(itemsPerPageEl.InnerText())
+
+ facetTypesEl := xmlquery.FindOne(doc, "//leg:facetType")
+ if facetTypesEl == nil {
+ return nil, fmt.Errorf("couldn't find leg:facetType")
+ }
+ totalItems, _ := strconv.Atoi(facetTypesEl.SelectAttr("value"))
+ if totalItems == 0 {
+ return nil, fmt.Errorf("detected 0 total results")
+ }
+
+ var laws []lex.Law
+
+	for _, entry := range xmlquery.Find(doc, "//entry") {
+		titleEl := xmlquery.FindOne(entry, "./title")
+		linkEl := xmlquery.FindOne(entry, "./link[@rel='self']")
+		if titleEl == nil || linkEl == nil {
+			return nil, fmt.Errorf("entry is missing title or self link")
+		}
+		title := titleEl.InnerText()
+		// Skip laws whose title marks them as repealed.
+		if repealedRe.MatchString(title) {
+			continue
+		}
+		laws = append(laws, lex.Law{
+			Title: title,
+			URL:   linkEl.SelectAttr("href"),
+		})
+	}
+
+ return &feed{
+ totalItems: totalItems,
+ itemsPerPage: itemsPerPage,
+ entries: laws,
+ }, nil
+}
+
+type feed struct {
+ itemsPerPage int
+ totalItems int
+ entries []lex.Law
+}