diff options
Diffstat (limited to 'scrapers/uk.py')
-rwxr-xr-x | scrapers/uk.py | 41 |
1 files changed, 41 insertions, 0 deletions
#!/usr/bin/env python3
"""Build laws/uk.json from the legislation.gov.uk ``ukpga`` Atom feed.

The script downloads every page of the feed, skips entries whose title
ends in a "(repealed ...)" marker, and writes the remaining entries as a
JSON list of ``{"title": ..., "url": ...}`` objects sorted by title.
"""
import json
import math
import re
from multiprocessing.dummy import Pool as ThreadPool

import requests
import lxml.etree

NAMESPACES = dict(
    atom='http://www.w3.org/2005/Atom',
    leg='http://www.legislation.gov.uk/namespaces/legislation',
)

FEED_URL = 'https://www.legislation.gov.uk/ukpga/data.feed'

# The number of pages is derived by dividing the total reported by the feed
# by this value; presumably the feed serves 20 entries per page -- TODO confirm.
ENTRIES_PER_PAGE = 20

# Seconds to wait on the server before giving up, so that a single stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 60

# Matches titles ending in "(repealed)" or "(repealed <anything>)",
# case-insensitively. Compiled once because it is applied to every entry.
REPEALED_RE = re.compile(r'\(repealed( .+)?\)$', re.I)

sess = requests.session()


def fetch_page(pagenum):
    """Download one page of the feed and return it as a parsed XML element.

    The page number is printed as a simple progress indicator.

    Raises:
        requests.HTTPError: if the server answers with an error status.
            Failing here keeps an error page from being parsed as if it
            were feed data, which would silently drop that page's entries.
    """
    print(pagenum)
    res = sess.get(FEED_URL, params=dict(page=pagenum), timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    return lxml.etree.fromstring(res.content)


# The first page is fetched on its own because it carries the total that the
# page count is computed from.
pages = []
first = fetch_page(1)
pages.append(first)

# The `value` attribute of the first leg:facetType element is used as the
# total number of results.
total = int(first.find('.//leg:facetType', namespaces=NAMESPACES).get('value'))
page_count = math.ceil(total / ENTRIES_PER_PAGE)

# The remaining pages are fetched by 8 worker threads; map() returns them in
# page order. The context manager shuts the pool down once they are all in.
with ThreadPool(8) as pool:
    pages.extend(pool.map(fetch_page, range(2, page_count + 1)))

entries = []
for page in pages:
    for entry in page.iterfind('.//atom:entry', namespaces=NAMESPACES):
        title = entry.find('.//atom:title', namespaces=NAMESPACES).text
        if REPEALED_RE.search(title):
            continue
        # The Atom id of the entry is stored under the "url" key.
        url = entry.find('.//atom:id', namespaces=NAMESPACES).text
        entries.append(dict(title=title, url=url))

# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened explicitly as UTF-8 instead of relying on the locale's encoding.
with open('laws/uk.json', 'w', encoding='utf-8') as f:
    json.dump(
        sorted(entries, key=lambda law: law['title']),
        f,
        indent=2,
        ensure_ascii=False,
    )
\ No newline at end of file |