#!/usr/bin/env python3
"""Scrape the legislation.gov.uk ``ukpga`` Atom feed into ``laws/uk.json``.

Every page of the feed is downloaded, entries whose title ends in a
"(repealed)" / "(repealed ...)" marker are dropped, and the remaining
entries are written as a JSON list of ``{"title": ..., "url": ...}``
objects sorted by title.

The work only runs when the file is executed as a script; importing it
just defines the helpers below.
"""
import json
import math
import re
from multiprocessing.dummy import Pool as ThreadPool

import requests
import lxml.etree

NAMESPACES = dict(
    atom='http://www.w3.org/2005/Atom',
    leg='http://www.legislation.gov.uk/namespaces/legislation',
)

FEED_URL = 'https://www.legislation.gov.uk/ukpga/data.feed'
OUTPUT_PATH = 'laws/uk.json'

# The page count is derived by dividing the feed's reported total by this
# value -- presumably the feed's fixed page size; verify if the feed changes.
ENTRIES_PER_PAGE = 20

# Seconds to wait on the server before giving up, so that a single stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 60

# Number of feed pages fetched concurrently.
WORKER_COUNT = 8

# Matches titles ending in "(repealed)" or "(repealed <anything>)".
REPEALED_RE = re.compile(r'\(repealed( .+)?\)$', re.I)

sess = requests.session()


def fetch_page(pagenum):
    """Download one page of the feed and return its parsed XML root.

    Args:
        pagenum: Page number, sent as the ``page`` query parameter.

    Returns:
        The root ``lxml.etree`` element of the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
        lxml.etree.XMLSyntaxError: If the body is not well-formed XML.
    """
    print(pagenum)  # progress indicator
    res = sess.get(
        FEED_URL,
        params=dict(page=pagenum),
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on 4xx/5xx instead of feeding an error page to the parser.
    res.raise_for_status()
    return lxml.etree.fromstring(res.content)


def _count_pages(first_page):
    """Return how many pages the feed has, based on its first page.

    The ``value`` attribute of the first ``leg:facetType`` element is
    treated as the total number of entries (presumably the feed-wide
    total; confirm against the feed documentation).
    """
    facet = first_page.find('.//leg:facetType', namespaces=NAMESPACES)
    if facet is None:
        raise ValueError('feed page has no leg:facetType element')
    return math.ceil(int(facet.get('value')) / ENTRIES_PER_PAGE)


def _fetch_all_pages():
    """Fetch every feed page and return the parsed roots in page order."""
    first = fetch_page(1)
    pages = [first]
    page_count = _count_pages(first)
    # The context manager tears the worker threads down once map() returns.
    with ThreadPool(WORKER_COUNT) as pool:
        pages.extend(pool.map(fetch_page, range(2, page_count + 1)))
    return pages


def _collect_entries(pages):
    """Return ``{"title", "url"}`` dicts for all non-repealed feed entries."""
    entries = []
    for page in pages:
        for entry in page.iterfind('.//atom:entry', namespaces=NAMESPACES):
            title = entry.find('.//atom:title', namespaces=NAMESPACES).text
            if REPEALED_RE.search(title):
                continue
            url = entry.find('.//atom:id', namespaces=NAMESPACES).text
            entries.append(dict(title=title, url=url))
    return entries


def main():
    """Scrape the feed and write the title-sorted result to OUTPUT_PATH."""
    entries = _collect_entries(_fetch_all_pages())
    entries.sort(key=lambda law: law['title'])
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly rather than with the locale's default.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(entries, f, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    main()