1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
#!/usr/bin/env python3
import json
import math
import re
from multiprocessing.dummy import Pool as ThreadPool
import requests
import lxml.etree
# Prefix -> namespace URI map passed as ``namespaces=`` to the XPath lookups
# (``atom:...`` and ``leg:...``) made on the downloaded feed documents.
NAMESPACES = {
    'atom': 'http://www.w3.org/2005/Atom',
    'leg': 'http://www.legislation.gov.uk/namespaces/legislation',
}
# Single HTTP session shared by every feed request in this script.
# ``requests.Session()`` is the supported constructor; the lowercase
# ``requests.session()`` factory is only kept for backwards compatibility.
sess = requests.Session()
def fetch_page(pagenum):
    """Download one page of the legislation.gov.uk ``ukpga`` data feed.

    Args:
        pagenum: Page number, sent as the ``page`` query parameter.

    Returns:
        The root element of the response body parsed as XML.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        lxml.etree.XMLSyntaxError: If the response body is not valid XML.
    """
    print(pagenum)  # progress indicator
    res = sess.get(
        'https://www.legislation.gov.uk/ukpga/data.feed',
        params=dict(page=pagenum),
        # Without a timeout a stalled connection would block forever.
        timeout=60,
    )
    # Fail loudly on 4xx/5xx rather than feeding an error page to the parser.
    res.raise_for_status()
    return lxml.etree.fromstring(res.content)
# --- Download every page of the feed ---------------------------------------
# The first page is fetched on its own because the page count is derived
# from it.
first = fetch_page(1)
pages = [first]

# NOTE(review): the first ``leg:facetType`` element's ``value`` attribute is
# treated as the total number of entries and 20 as the feed's page size --
# confirm against the feed format.
total = int(first.find('.//leg:facetType', namespaces=NAMESPACES).get('value'))
page_count = math.ceil(total / 20)

# Fetch the remaining pages on 8 worker threads. ``map`` returns results in
# input order, and the ``with`` block shuts the pool down afterwards instead
# of leaking its worker threads.
with ThreadPool(8) as pool:
    pages.extend(pool.map(fetch_page, range(2, page_count + 1)))

# --- Collect title/URL pairs, skipping repealed acts -----------------------
entries = []
for page in pages:
    for entry in page.iterfind('.//atom:entry', namespaces=NAMESPACES):
        title = entry.find('.//atom:title', namespaces=NAMESPACES).text
        # Drop titles ending in "(repealed)" or "(repealed <anything>)",
        # matched case-insensitively.
        if re.search(r'\(repealed( .+)?\)$', title, re.I):
            continue
        # The Atom ``id`` text is stored as the entry's URL. Named
        # ``entry_url`` so the builtin ``id`` is not shadowed.
        entry_url = entry.find('.//atom:id', namespaces=NAMESPACES).text
        entries.append(dict(title=title, url=entry_url))

# --- Write the result, sorted by title -------------------------------------
# ``ensure_ascii=False`` emits non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 rather than left to the platform default.
with open('laws/uk.json', 'w', encoding='utf-8') as f:
    json.dump(
        sorted(entries, key=lambda item: item['title']),
        f,
        indent=2,
        ensure_ascii=False,
    )
|