diff options
Diffstat (limited to 'scrapers/uk.py')
-rwxr-xr-x | scrapers/uk.py | 41 |
1 files changed, 0 insertions, 41 deletions
#!/usr/bin/env python3
"""Scrape the legislation.gov.uk ``ukpga`` data feed into ``laws/uk.json``.

Recovered from the deletion diff of ``scrapers/uk.py``.

The script pages through the Atom feed, skips every entry whose title ends in
a "(repealed ...)" marker, and writes the remaining ``title``/``url`` pairs,
sorted by title, as a JSON list.
"""
import json
import math
import re
from multiprocessing.dummy import Pool as ThreadPool

import requests
import lxml.etree

NAMESPACES = dict(
    atom='http://www.w3.org/2005/Atom',
    leg='http://www.legislation.gov.uk/namespaces/legislation',
)

FEED_URL = 'https://www.legislation.gov.uk/ukpga/data.feed'

# The page count is derived by dividing the reported total by this value.
# NOTE(review): assumes the feed serves 20 entries per page - confirm.
PAGE_SIZE = 20

# Seconds to wait on the server before giving up; without a timeout a stalled
# connection would block the scraper forever.
REQUEST_TIMEOUT = 60

# Matches titles ending in "(repealed)" or "(repealed <anything>)",
# case-insensitively.
REPEALED_RE = re.compile(r'\(repealed( .+)?\)$', re.I)

sess = requests.session()


def fetch_page(pagenum):
    """Download one page of the feed and return it as a parsed XML element.

    Prints ``pagenum`` as a progress indicator before issuing the request.

    Args:
        pagenum: 1-based page number, sent as the ``page`` query parameter.

    Returns:
        The root ``lxml.etree`` element of the response body.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never mistaken for an (empty) feed page.
        requests.Timeout: if the server does not respond in time.
    """
    print(pagenum)
    res = sess.get(FEED_URL, params=dict(page=pagenum), timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    return lxml.etree.fromstring(res.content)


# The first page is fetched on its own because it carries the total that
# determines how many further pages exist.
first = fetch_page(1)
pages = [first]

facet = first.find('.//leg:facetType', namespaces=NAMESPACES)
if facet is None:
    # Compare against None explicitly: lxml element truthiness reflects
    # whether the element has children, not whether it was found.
    raise RuntimeError('leg:facetType element not found in the first feed page')
page_count = math.ceil(int(facet.get('value')) / PAGE_SIZE)

# Fetch the remaining pages concurrently; the context manager releases the
# worker threads once map() has returned every result.
with ThreadPool(8) as pool:
    pages.extend(pool.map(fetch_page, range(2, page_count + 1)))

entries = []
for page in pages:
    for entry in page.iterfind('.//atom:entry', namespaces=NAMESPACES):
        title = entry.find('.//atom:title', namespaces=NAMESPACES).text
        if REPEALED_RE.search(title):
            continue
        url = entry.find('.//atom:id', namespaces=NAMESPACES).text
        entries.append(dict(title=title, url=url))

# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly rather than with the platform default encoding.
with open('laws/uk.json', 'w', encoding='utf-8') as f:
    json.dump(sorted(entries, key=lambda law: law['title']), f, indent=2, ensure_ascii=False)
\ No newline at end of file |