package uk

import (
	"fmt"
	"log/slog"
	"net/http"
	"regexp"
	"strconv"

	"github.com/antchfx/xmlquery"

	"push-f.com/lex-surf/internal/lex"
	"push-f.com/lex-surf/lex-fetch/progress"
)
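
// Fetcher fetches UK Public General Acts (ukpga) from the
// legislation.gov.uk Atom feed.
//
// A minimal usage sketch (assumed wiring, not taken from this repository;
// it presumes that a zero-value Fetcher and progress.Reporter are usable):
//
//	var f uk.Fetcher
//	laws, err := f.Fetch(slog.Default(), http.DefaultClient, &progress.Reporter{})
//	if err != nil {
//		// handle the error
//	}
//	_ = laws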
type Fetcher struct{}

func (s *Fetcher) Fetch(log *slog.Logger, client *http.Client, progress *progress.Reporter) ([]lex.Law, error) {
	// The first page provides the pagination metadata needed to know how
	// many more pages to fetch.
	firstPage, err := fetchPage(client, 1)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch first page: %w", err)
	}
	// Round up so that a final, partially filled page is not skipped.
	progress.Total = (firstPage.totalItems + firstPage.itemsPerPage - 1) / firstPage.itemsPerPage
	laws := firstPage.entries
	for page := 2; page <= progress.Total; page++ {
		feed, err := fetchPage(client, page)
		if err != nil {
			return nil, fmt.Errorf("failed to fetch page %d: %w", page, err)
		}
		laws = append(laws, feed.entries...)
		progress.ReportProgress(page)
	}
	return laws, nil
}

// repealedRe matches the "(repealed ...)" suffix that legislation.gov.uk
// appends to the titles of repealed acts.
var repealedRe = regexp.MustCompile(`(?i)\(repealed( .+)?\)$`)

// fetchPage fetches a single page of the ukpga Atom feed and extracts the
// laws it lists along with the feed's pagination metadata.
func fetchPage(client *http.Client, page int) (*feed, error) {
	resp, err := client.Get(fmt.Sprintf("https://www.legislation.gov.uk/ukpga/data.feed?page=%d", page))
	if err != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
	}
	doc, err := xmlquery.Parse(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to parse XML: %w", err)
	}
	itemsPerPageEl := xmlquery.FindOne(doc, "//openSearch:itemsPerPage")
	if itemsPerPageEl == nil {
		return nil, fmt.Errorf("couldn't find openSearch:itemsPerPage")
	}
	itemsPerPage, err := strconv.Atoi(itemsPerPageEl.InnerText())
	if err != nil {
		return nil, fmt.Errorf("failed to parse openSearch:itemsPerPage: %w", err)
	}
	facetTypesEl := xmlquery.FindOne(doc, "//leg:facetType")
	if facetTypesEl == nil {
		return nil, fmt.Errorf("couldn't find leg:facetType")
	}
	totalItems, err := strconv.Atoi(facetTypesEl.SelectAttr("value"))
	if err != nil {
		return nil, fmt.Errorf("failed to parse leg:facetType value: %w", err)
	}
	if totalItems == 0 {
		return nil, fmt.Errorf("detected 0 total results")
	}
	var laws []lex.Law
	for _, entry := range xmlquery.Find(doc, "//entry") {
		titleEl := xmlquery.FindOne(entry, "./title")
		linkEl := xmlquery.FindOne(entry, "./link[@rel='self']")
		if titleEl == nil || linkEl == nil {
			return nil, fmt.Errorf("entry is missing a title or self link")
		}
		title := titleEl.InnerText()
		// Skip acts whose title marks them as repealed.
		if repealedRe.MatchString(title) {
			continue
		}
		laws = append(laws, lex.Law{
			Title: title,
			URL:   linkEl.SelectAttr("href"),
		})
	}
	return &feed{
		totalItems:   totalItems,
		itemsPerPage: itemsPerPage,
		entries:      laws,
	}, nil
}

// feed is the result of parsing one page of the paginated Atom feed.
type feed struct {
	itemsPerPage int
	totalItems   int
	entries      []lex.Law
}