author     Martin Fischer <martin@push-f.com>    2025-04-08 19:25:36 +0200
committer  Martin Fischer <martin@push-f.com>    2025-04-14 07:04:45 +0200
commit     e29d27533725819ec3f6d05a27048d3d2627b53e (patch)
tree       5afba50408b25179edb4ea6445acfe1d3e051488 /scrapers
parent     96236c9d80cea2d6ba83591a7d08a8cc096fd8d3 (diff)
refactor: port fetchers to Go
* Austria: upgraded to RIS API v2.6 because v2.5 has been turned off
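
The diffstat below is limited to 'scrapers', so the Go replacements themselves are not visible in this commit. As a rough sketch of what the ported Austrian fetcher might look like, assuming the v2.6 endpoint keeps the same path shape and query parameters as the v2.5 call in the deleted scrapers/at.py (only the version segment of the URL swapped):

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"time"
)

// fetchPage queries one result page of the RIS Bundesnormen API.
// The v2.6 URL is an assumption extrapolated from the v2.5 call in the
// deleted scrapers/at.py; the parameter names are taken from that call.
func fetchPage(page int) (map[string]any, error) {
	params := url.Values{
		"Seitennummer":      {fmt.Sprint(page)},
		"DokumenteProSeite": {"OneHundred"},
		"FassungVom":        {time.Now().Format("2006-01-02")},
		"Abschnitt_Von":     {"1"},
	}
	res, err := http.Get("https://data.bka.gv.at/ris/api/v2.6/bundesnormen?" + params.Encode())
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	var body struct {
		OgdSearchResult map[string]json.RawMessage
	}
	if err := json.NewDecoder(res.Body).Decode(&body); err != nil {
		return nil, err
	}
	// The API signals failures inside the payload, like the Python
	// scraper's 'Error' in data check did.
	if raw, ok := body.OgdSearchResult["Error"]; ok {
		return nil, fmt.Errorf("RIS error: %s", raw)
	}
	var results map[string]any
	if err := json.Unmarshal(body.OgdSearchResult["OgdDocumentResults"], &results); err != nil {
		return nil, err
	}
	return results, nil
}

func main() {
	first, err := fetchPage(1)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(first), "top-level keys on page 1")
}

Returning the decoded map keeps the sketch close to the dynamic dict the Python version passed around; a real port would more likely declare typed structs for the result metadata.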
Diffstat (limited to 'scrapers')
-rwxr-xr-x  scrapers/at.py  66
-rwxr-xr-x  scrapers/de.py  27
-rwxr-xr-x  scrapers/uk.py  41
3 files changed, 0 insertions, 134 deletions
diff --git a/scrapers/at.py b/scrapers/at.py
deleted file mode 100755
index 1af9894..0000000
--- a/scrapers/at.py
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/env python3
-import datetime
-import json
-import math
-from multiprocessing.dummy import Pool as ThreadPool
-
-import requests
-
-sess = requests.session()
-
-# API documentation:
-# https://data.bka.gv.at/ris/api/v2.5/applications/bundesnormen
-
-def fetch_page(page):
-    res = sess.get('https://data.bka.gv.at/ris/api/v2.5/bundesnormen', params=dict(
-        Seitennummer=page,
-        DokumenteProSeite='OneHundred',
-        FassungVom=datetime.datetime.today().strftime('%Y-%m-%d'),
-        Abschnitt_Von=1
-    ))
-    print(res.request.url)
-    data = res.json()['OgdSearchResult']
-
-    if 'Error' in data:
-        print(data)
-        return
-
-    return data['OgdDocumentResults']
-
-pages = []
-first = fetch_page(1)
-pages.append(first)
-page_count = math.ceil(int(first['Hits']['#text']) / 100)
-
-for page in ThreadPool(8).map(fetch_page, range(2, page_count+1)):
-    pages.append(page)
-
-normen = {}
-
-for page in pages:
-    for result in page['OgdDocumentReference']:
-        info = result['Data']['Metadaten']['Bundes-Landesnormen']
-        if info['Typ'] in ('K', 'K (Geltungsbereich)'):
-            continue
-        if info['Typ'].startswith('Vertrag -'):
-            continue
-        data = dict(
-            title=info['Kurztitel'].strip(),
-            url=info['GesamteRechtsvorschriftUrl'],
-        )
-        if 'Abkuerzung' in info:
-            data['abbr'] = info['Abkuerzung'].strip()
-            data['redir'] = data['abbr'].lower()\
-                .replace(')', '')\
-                .replace('(', '')\
-                .replace(' – ', '-')\
-                .replace(' ', '-')\
-                .replace('\xa0', '-')\
-                .replace('ä', 'ae')\
-                .replace('ü', 'ue')\
-                .replace('ö', 'oe')\
-                .replace('ß', 'ss')
-        normen[info['Gesetzesnummer']] = data
-
-with open('laws/at.json', 'w') as f:
-    json.dump(list(normen.values()), f, indent=2, ensure_ascii=False)
diff --git a/scrapers/de.py b/scrapers/de.py
deleted file mode 100755
index 9450143..0000000
--- a/scrapers/de.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env python3
-import json
-import urllib.parse
-
-import lxml.html
-import requests
-
-laws = []
-
-LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWYZ123456789'
-
-for idx, l in enumerate(LETTERS, 1):
-    print(f'fetching {idx}/{len(LETTERS)}')
-    url = 'https://www.gesetze-im-internet.de/Teilliste_{}.html'.format(l)
-    req = requests.get(url)
-    root = lxml.html.fromstring(req.text)
-    for el in root.get_element_by_id('paddingLR12'):
-        target = el[0].get('href').replace('index.html', '')
-        abbr = target.strip('/.')
-        laws.append(dict(
-            title = el[1].tail.strip(),
-            url = urllib.parse.urljoin(url, target),
-            abbr = abbr,
-            redir = abbr,
-        ))
-with open('laws/de.json', 'w') as f:
-    json.dump(sorted(laws, key=lambda l: l['title']), f, indent=2, ensure_ascii=False)
diff --git a/scrapers/uk.py b/scrapers/uk.py
deleted file mode 100755
index 34e7c4b..0000000
--- a/scrapers/uk.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-import json
-import math
-import re
-from multiprocessing.dummy import Pool as ThreadPool
-
-import requests
-import lxml.etree
-
-NAMESPACES = dict(
-    atom='http://www.w3.org/2005/Atom',
-    leg='http://www.legislation.gov.uk/namespaces/legislation',
-)
-
-sess = requests.session()
-
-def fetch_page(pagenum):
-    print(pagenum)
-    res = sess.get('https://www.legislation.gov.uk/ukpga/data.feed', params=dict(page=pagenum))
-    return lxml.etree.fromstring(res.content)
-
-pages = []
-first = fetch_page(1)
-pages.append(first)
-
-page_count = math.ceil(int(first.find('.//leg:facetType', namespaces=NAMESPACES).get('value')) / 20)
-
-for page in ThreadPool(8).map(fetch_page, range(2, page_count+1)):
-    pages.append(page)
-
-entries = []
-for page in pages:
-    for entry in page.iterfind('.//atom:entry', namespaces=NAMESPACES):
-        title = entry.find('.//atom:title', namespaces=NAMESPACES).text
-        if re.search(r'\(repealed( .+)?\)$', title, re.I):
-            continue
-        id = entry.find('.//atom:id', namespaces=NAMESPACES).text
-        entries.append(dict(title=title, url=id))
-
-with open('laws/uk.json', 'w') as f:
-    json.dump(sorted(entries, key=lambda l: l['title']), f, indent=2, ensure_ascii=False)
\ No newline at end of file
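
For context on what the port replaces: all three deleted scripts fanned page requests out over ThreadPool(8) after reading the page count from the first response. The idiomatic Go equivalent is a bounded pool of goroutines; a minimal sketch of that pattern (fetchAllPages and its fetch parameter are illustrative names, not taken from the actual Go code):

package main

import (
	"fmt"
	"sync"
)

// fetchAllPages fetches pages 2..pageCount with at most 8 concurrent
// workers, mirroring the ThreadPool(8).map calls in the deleted scrapers.
// fetch stands in for whatever per-page fetcher the scraper uses.
func fetchAllPages(pageCount int, fetch func(int) ([]byte, error)) [][]byte {
	results := make([][]byte, pageCount+1)
	sem := make(chan struct{}, 8) // bounds concurrency at 8, like ThreadPool(8)
	var wg sync.WaitGroup

	for page := 2; page <= pageCount; page++ {
		wg.Add(1)
		go func(p int) {
			defer wg.Done()
			sem <- struct{}{}        // acquire a worker slot
			defer func() { <-sem }() // release it when done
			body, err := fetch(p)
			if err != nil {
				fmt.Println("page", p, "failed:", err)
				return
			}
			results[p] = body // each goroutine writes a distinct index
		}(page)
	}
	wg.Wait()
	return results[2:]
}

func main() {
	pages := fetchAllPages(5, func(p int) ([]byte, error) {
		return []byte(fmt.Sprintf("page %d", p)), nil // stub fetcher for demonstration
	})
	fmt.Println(len(pages), "pages fetched")
}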