From 4d43e952fff25b5b131e8699858da663a5ac2c42 Mon Sep 17 00:00:00 2001
From: Martin Fischer
Date: Sun, 28 Feb 2021 09:18:48 +0100
Subject: initial commit

---
 scrapers/at.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scrapers/de.py | 26 ++++++++++++++++++++++++++
 scrapers/uk.py | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 121 insertions(+)
 create mode 100755 scrapers/at.py
 create mode 100755 scrapers/de.py
 create mode 100755 scrapers/uk.py

diff --git a/scrapers/at.py b/scrapers/at.py
new file mode 100755
index 0000000..54b2402
--- /dev/null
+++ b/scrapers/at.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+import datetime
+import json
+import math
+from multiprocessing.dummy import Pool as ThreadPool
+
+import requests
+
+sess = requests.session()
+
+# API documentation:
+# https://data.bka.gv.at/ris/api/v2.5/applications/bundesnormen
+
+def fetch_page(page):
+    res = sess.get('https://data.bka.gv.at/ris/api/v2.5/bundesnormen', params=dict(
+        Seitennummer=page,
+        DokumenteProSeite='OneHundred',
+        FassungVom=datetime.datetime.today().strftime('%Y-%m-%d'),
+        Abschnitt_Von=1
+    ))
+    print(res.request.url)
+    data = res.json()['OgdSearchResult']
+
+    if 'Error' in data:
+        print(data)
+        return
+
+    return data['OgdDocumentResults']
+
+pages = []
+first = fetch_page(1)
+pages.append(first)
+page_count = math.ceil(int(first['Hits']['#text']) / 100)
+
+for page in ThreadPool(8).map(fetch_page, range(2, page_count+1)):
+    pages.append(page)
+
+normen = {}
+
+for page in pages:
+    for result in page['OgdDocumentReference']:
+        info = result['Data']['Metadaten']['Bundes-Landesnormen']
+        if info['Typ'] in ('K', 'K (Geltungsbereich)'):
+            continue
+        if info['Typ'].startswith('Vertrag -'):
+            continue
+        normen[info['Gesetzesnummer']] = dict(
+            title=info['Kurztitel'],
+            url=info['GesamteRechtsvorschriftUrl'],
+            abbr=info.get('Abkuerzung')
+        )
+
+with open('laws/at.json', 'w') as f:
+    json.dump(list(normen.values()), f, indent=2, ensure_ascii=False)
diff --git a/scrapers/de.py b/scrapers/de.py
new file mode 100755
index 0000000..7e0cd1f
--- /dev/null
+++ b/scrapers/de.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+import json
+import urllib.parse
+
+import lxml.html
+import requests
+
+laws = []
+
+LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWYZ123456789'
+
+for idx, l in enumerate(LETTERS, 1):
+    print(f'fetching {idx}/{len(LETTERS)}')
+    url = 'https://www.gesetze-im-internet.de/Teilliste_{}.html'.format(l)
+    req = requests.get(url)
+    root = lxml.html.fromstring(req.text)
+    for el in root.get_element_by_id('paddingLR12'):
+        target = el[0].get('href').replace('index.html', '')
+        abbr = target.strip('/.')
+        laws.append(dict(
+            title = el[1].tail.strip(),
+            url = urllib.parse.urljoin(url, target),
+            abbr = abbr,
+        ))
+with open('laws/de.json', 'w') as f:
+    json.dump(sorted(laws, key=lambda l: l['title']), f, indent=2, ensure_ascii=False)
diff --git a/scrapers/uk.py b/scrapers/uk.py
new file mode 100755
index 0000000..34e7c4b
--- /dev/null
+++ b/scrapers/uk.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+import json
+import math
+import re
+from multiprocessing.dummy import Pool as ThreadPool
+
+import requests
+import lxml.etree
+
+NAMESPACES = dict(
+    atom='http://www.w3.org/2005/Atom',
+    leg='http://www.legislation.gov.uk/namespaces/legislation',
+)
+
+sess = requests.session()
+
+def fetch_page(pagenum):
+    print(pagenum)
+    res = sess.get('https://www.legislation.gov.uk/ukpga/data.feed', params=dict(page=pagenum))
+    return lxml.etree.fromstring(res.content)
+
+pages = []
+first = fetch_page(1)
+pages.append(first)
+
+page_count = math.ceil(int(first.find('.//leg:facetType', namespaces=NAMESPACES).get('value')) / 20)
+
+for page in ThreadPool(8).map(fetch_page, range(2, page_count+1)):
+    pages.append(page)
+
+entries = []
+for page in pages:
+    for entry in page.iterfind('.//atom:entry', namespaces=NAMESPACES):
+        title = entry.find('.//atom:title', namespaces=NAMESPACES).text
+        if re.search(r'\(repealed( .+)?\)$', title, re.I):
+            continue
+        id = entry.find('.//atom:id', namespaces=NAMESPACES).text
+        entries.append(dict(title=title, url=id))
+
+with open('laws/uk.json', 'w') as f:
+    json.dump(sorted(entries, key=lambda l: l['title']), f, indent=2, ensure_ascii=False)
\ No newline at end of file
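
Both at.py and uk.py page through their sources the same way: fetch page 1 synchronously, derive the total page count from that first response, then pull the remaining pages concurrently with a ThreadPool and collect them in order. A minimal sketch of that shared pattern as a standalone helper (the name fetch_all_pages and the total_from_first/per_page parameters are illustrative, not part of the patch):

    import math
    from multiprocessing.dummy import Pool as ThreadPool

    def fetch_all_pages(fetch_page, total_from_first, per_page, threads=8):
        # Page 1 is fetched up front so the total hit count can be read
        # from its payload before the concurrent fan-out starts.
        first = fetch_page(1)
        page_count = math.ceil(total_from_first(first) / per_page)
        # ThreadPool.map preserves page order, matching the sequential
        # append loops in the scrapers.
        rest = ThreadPool(threads).map(fetch_page, range(2, page_count + 1))
        return [first, *rest]

at.py would pass total_from_first=lambda page: int(page['Hits']['#text']) with per_page=100, and uk.py the leg:facetType value with per_page=20.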
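
One caveat in at.py: fetch_page returns None when the RIS payload contains an 'Error' key, but the aggregation loop indexes page['OgdDocumentReference'] unconditionally, so a single failed page aborts the run with a TypeError instead of a readable error. If skipping failed pages is acceptable (an assumption; the patch as written simply crashes), a one-line guard before that loop would cover it:

    # Assumed policy, not in the patch: drop pages whose fetch failed
    # rather than let page['OgdDocumentReference'] raise on None.
    pages = [page for page in pages if page is not None]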
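
All three scripts emit the same shape into laws/<country>.json: a JSON array of objects with title and url keys, plus abbr in the Austrian and German lists (abbr can be null in at.json, since Abkuerzung is optional in the RIS metadata). A quick post-scrape sanity check of that invariant (the check itself is illustrative, not part of the patch):

    import json

    for country in ('at', 'de', 'uk'):
        with open(f'laws/{country}.json') as f:
            laws = json.load(f)
        # Every entry carries at least a title and a URL.
        assert all('title' in law and 'url' in law for law in laws)
        print(country, len(laws))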