summaryrefslogtreecommitdiff
path: root/lex-fetch/uk/uk.go
blob: f8e1510e6fc36b8ac68d079e8bb62d462f353719 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
package uk

import (
	"fmt"
	"log/slog"
	"net/http"
	"regexp"
	"strconv"

	"github.com/antchfx/xmlquery"
	"push-f.com/lex-surf/internal/lex"
	"push-f.com/lex-surf/lex-fetch/progress"
)

// Fetcher fetches laws from the legislation.gov.uk ukpga data feed.
// It carries no state; all inputs are passed to Fetch.
type Fetcher struct{}

// Fetch downloads every page of the feed and returns the laws collected from
// all of them.
//
// The first page is fetched up front to learn the total number of items and
// the page size. The resulting page count is stored in progress.Total, after
// which pages 2..Total are fetched sequentially and progress.ReportProgress is
// called with the page number after each one. The first failure aborts the
// whole run and no partial result is returned.
//
// The log parameter is currently unused.
func (s *Fetcher) Fetch(log *slog.Logger, client *http.Client, progress *progress.Reporter) ([]lex.Law, error) {
	firstPage, err := fetchPage(client, 1)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch first page: %w", err)
	}
	// Guard the division below: a missing or unparsable page size would
	// otherwise cause a divide-by-zero panic.
	if firstPage.itemsPerPage <= 0 {
		return nil, fmt.Errorf("invalid items per page: %d", firstPage.itemsPerPage)
	}
	// Round up so that a trailing, partially filled page is still fetched.
	progress.Total = (firstPage.totalItems + firstPage.itemsPerPage - 1) / firstPage.itemsPerPage

	laws := firstPage.entries

	for page := 2; page <= progress.Total; page++ {
		next, err := fetchPage(client, page)
		if err != nil {
			return nil, fmt.Errorf("failed to fetch page %d: %w", page, err)
		}
		laws = append(laws, next.entries...)
		progress.ReportProgress(page)
	}
	return laws, nil
}

// repealedTitleRe matches titles that end in "(repealed)" or
// "(repealed <anything>)", case-insensitively. It is compiled once instead of
// once per feed entry.
var repealedTitleRe = regexp.MustCompile(`(?i)\(repealed( .+)?\)$`)

// fetchPage downloads and parses a single page of the legislation.gov.uk ukpga
// data feed.
//
// page is passed verbatim as the feed's "page" query parameter. The returned
// feed holds the page size (openSearch:itemsPerPage), the total item count
// (the "value" attribute of the first leg:facetType element) and the entries
// of this page, excluding those whose title marks them as repealed.
//
// An error is returned if the request fails, the status is not 200, the body
// is not parsable XML, a required element is missing or malformed, or the
// total item count is zero.
func fetchPage(client *http.Client, page int) (*feed, error) {
	resp, err := client.Get(fmt.Sprintf("https://www.legislation.gov.uk/ukpga/data.feed?page=%d", page))
	if err != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", err)
	}
	// Always release the connection, including on the early-return paths.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
	}

	doc, err := xmlquery.Parse(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to parse XML: %w", err)
	}

	itemsPerPageEl := xmlquery.FindOne(doc, "//openSearch:itemsPerPage")
	if itemsPerPageEl == nil {
		return nil, fmt.Errorf("couldn't find openSearch:itemsPerPage")
	}
	itemsPerPage, err := strconv.Atoi(itemsPerPageEl.InnerText())
	if err != nil {
		return nil, fmt.Errorf("invalid openSearch:itemsPerPage: %w", err)
	}
	if itemsPerPage <= 0 {
		return nil, fmt.Errorf("invalid openSearch:itemsPerPage: %d", itemsPerPage)
	}

	facetTypesEl := xmlquery.FindOne(doc, "//leg:facetType")
	if facetTypesEl == nil {
		return nil, fmt.Errorf("couldn't find leg:facetType")
	}
	totalItems, err := strconv.Atoi(facetTypesEl.SelectAttr("value"))
	if err != nil {
		return nil, fmt.Errorf("invalid leg:facetType value: %w", err)
	}
	if totalItems == 0 {
		return nil, fmt.Errorf("detected 0 total results")
	}

	var laws []lex.Law

	for _, entry := range xmlquery.Find(doc, "//entry") {
		// FindOne returns nil when nothing matches; check before dereferencing
		// so a malformed entry yields an error instead of a panic.
		titleEl := xmlquery.FindOne(entry, "./title")
		if titleEl == nil {
			return nil, fmt.Errorf("entry without title on page %d", page)
		}
		title := titleEl.InnerText()
		if repealedTitleRe.MatchString(title) {
			continue
		}

		linkEl := xmlquery.FindOne(entry, "./link[@rel='self']")
		if linkEl == nil {
			return nil, fmt.Errorf("entry %q has no self link", title)
		}
		laws = append(laws, lex.Law{
			Title: title,
			URL:   linkEl.SelectAttr("href"),
		})
	}

	return &feed{
		totalItems:   totalItems,
		itemsPerPage: itemsPerPage,
		entries:      laws,
	}, nil
}

// feed is the parsed result of a single feed page as produced by fetchPage.
type feed struct {
	// entries holds the laws listed on this page; fetchPage leaves out
	// entries whose title ends in a "(repealed ...)" marker.
	entries []lex.Law

	// totalItems is the integer read from the "value" attribute of the
	// page's leg:facetType element.
	totalItems int

	// itemsPerPage is the integer read from the page's
	// openSearch:itemsPerPage element.
	itemsPerPage int
}