author     Martin Fischer <martin@push-f.com>    2025-04-08 19:25:36 +0200
committer  Martin Fischer <martin@push-f.com>    2025-04-14 07:04:45 +0200
commit     e29d27533725819ec3f6d05a27048d3d2627b53e (patch)
tree       5afba50408b25179edb4ea6445acfe1d3e051488 /scrapers
parent     96236c9d80cea2d6ba83591a7d08a8cc096fd8d3 (diff)
refactor: port fetchers to Go
* Austria: upgraded to RIS API v2.6 because v2.5 has been turned off
Diffstat (limited to 'scrapers')
-rwxr-xr-x  scrapers/at.py  66
-rwxr-xr-x  scrapers/de.py  27
-rwxr-xr-x  scrapers/uk.py  41
3 files changed, 0 insertions, 134 deletions
diff --git a/scrapers/at.py b/scrapers/at.py
deleted file mode 100755
index 1af9894..0000000
--- a/scrapers/at.py
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/env python3
-import datetime
-import json
-import math
-from multiprocessing.dummy import Pool as ThreadPool
-
-import requests
-
-sess = requests.session()
-
-# API documentation:
-# https://data.bka.gv.at/ris/api/v2.5/applications/bundesnormen
-
-def fetch_page(page):
-    res = sess.get('https://data.bka.gv.at/ris/api/v2.5/bundesnormen', params=dict(
-        Seitennummer=page,
-        DokumenteProSeite='OneHundred',
-        FassungVom=datetime.datetime.today().strftime('%Y-%m-%d'),
-        Abschnitt_Von=1
-    ))
-    print(res.request.url)
-    data = res.json()['OgdSearchResult']
-
-    if 'Error' in data:
-        print(data)
-        return
-
-    return data['OgdDocumentResults']
-
-pages = []
-first = fetch_page(1)
-pages.append(first)
-page_count = math.ceil(int(first['Hits']['#text']) / 100)
-
-for page in ThreadPool(8).map(fetch_page, range(2, page_count+1)):
-    pages.append(page)
-
-normen = {}
-
-for page in pages:
-    for result in page['OgdDocumentReference']:
-        info = result['Data']['Metadaten']['Bundes-Landesnormen']
-        if info['Typ'] in ('K', 'K (Geltungsbereich)'):
-            continue
-        if info['Typ'].startswith('Vertrag -'):
-            continue
-        data = dict(
-            title=info['Kurztitel'].strip(),
-            url=info['GesamteRechtsvorschriftUrl'],
-        )
-        if 'Abkuerzung' in info:
-            data['abbr'] = info['Abkuerzung'].strip()
-            data['redir'] = data['abbr'].lower()\
-                .replace(')', '')\
-                .replace('(', '')\
-                .replace(' – ', '-')\
-                .replace(' ', '-')\
-                .replace('\xa0', '-')\
-                .replace('ä', 'ae')\
-                .replace('ü', 'ue')\
-                .replace('ö', 'oe')\
-                .replace('ß', 'ss')
-        normen[info['Gesetzesnummer']] = data
-
-with open('laws/at.json', 'w') as f:
-    json.dump(list(normen.values()), f, indent=2, ensure_ascii=False)
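
[The Go replacements themselves are not part of this diff, which is limited to scrapers/. Purely as an illustration, a minimal Go fetcher for one page of Austrian norms might look like the sketch below; the v2.6 endpoint path and query parameter names are assumptions carried over from the v2.5 calls in the deleted script, and the paging, filtering, and JSON output logic are omitted.]

package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
	"time"
)

// fetchPage requests one result page from the RIS API.
// Assumption: the v2.6 endpoint keeps the v2.5 parameter
// names (Seitennummer, DokumenteProSeite, FassungVom,
// Abschnitt_Von) used by the deleted Python script.
func fetchPage(page int) ([]byte, error) {
	q := url.Values{}
	q.Set("Seitennummer", fmt.Sprint(page))
	q.Set("DokumenteProSeite", "OneHundred")
	q.Set("FassungVom", time.Now().Format("2006-01-02"))
	q.Set("Abschnitt_Von", "1")

	res, err := http.Get("https://data.bka.gv.at/ris/api/v2.6/bundesnormen?" + q.Encode())
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	return io.ReadAll(res.Body)
}

func main() {
	body, err := fetchPage(1)
	if err != nil {
		panic(err)
	}
	fmt.Printf("fetched %d bytes\n", len(body))
}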
diff --git a/scrapers/de.py b/scrapers/de.py
deleted file mode 100755
index 9450143..0000000
--- a/scrapers/de.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env python3
-import json
-import urllib.parse
-
-import lxml.html
-import requests
-
-laws = []
-
-LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWYZ123456789'
-
-for idx, l in enumerate(LETTERS, 1):
-    print(f'fetching {idx}/{len(LETTERS)}')
-    url = 'https://www.gesetze-im-internet.de/Teilliste_{}.html'.format(l)
-    req = requests.get(url)
-    root = lxml.html.fromstring(req.text)
-    for el in root.get_element_by_id('paddingLR12'):
-        target = el[0].get('href').replace('index.html', '')
-        abbr = target.strip('/.')
-        laws.append(dict(
-            title = el[1].tail.strip(),
-            url = urllib.parse.urljoin(url, target),
-            abbr = abbr,
-            redir = abbr,
-        ))
-with open('laws/de.json', 'w') as f:
-    json.dump(sorted(laws, key=lambda l: l['title']), f, indent=2, ensure_ascii=False)
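
[Again, the actual Go port is outside this diff. A hypothetical sketch of the German fetch loop, using golang.org/x/net/html in place of lxml, could look like this; it only counts links per page rather than scoping to the #paddingLR12 container and extracting titles as the Python version did.]

package main

import (
	"fmt"
	"net/http"

	"golang.org/x/net/html"
)

// Same letter list as the deleted script.
const letters = "ABCDEFGHIJKLMNOPQRSTUVWYZ123456789"

// hrefs collects all <a href=...> values under n. A faithful
// port would additionally restrict the walk to the
// #paddingLR12 container, as the Python version did.
func hrefs(n *html.Node, out *[]string) {
	if n.Type == html.ElementNode && n.Data == "a" {
		for _, a := range n.Attr {
			if a.Key == "href" {
				*out = append(*out, a.Val)
			}
		}
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		hrefs(c, out)
	}
}

func main() {
	for _, l := range letters {
		pageURL := fmt.Sprintf("https://www.gesetze-im-internet.de/Teilliste_%c.html", l)
		res, err := http.Get(pageURL)
		if err != nil {
			panic(err)
		}
		root, err := html.Parse(res.Body)
		res.Body.Close()
		if err != nil {
			panic(err)
		}
		var links []string
		hrefs(root, &links)
		fmt.Println(pageURL, len(links), "links")
	}
}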
diff --git a/scrapers/uk.py b/scrapers/uk.py
deleted file mode 100755
index 34e7c4b..0000000
--- a/scrapers/uk.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-import json
-import math
-import re
-from multiprocessing.dummy import Pool as ThreadPool
-
-import requests
-import lxml.etree
-
-NAMESPACES = dict(
-    atom='http://www.w3.org/2005/Atom',
-    leg='http://www.legislation.gov.uk/namespaces/legislation',
-)
-
-sess = requests.session()
-
-def fetch_page(pagenum):
-    print(pagenum)
-    res = sess.get('https://www.legislation.gov.uk/ukpga/data.feed', params=dict(page=pagenum))
-    return lxml.etree.fromstring(res.content)
-
-pages = []
-first = fetch_page(1)
-pages.append(first)
-
-page_count = math.ceil(int(first.find('.//leg:facetType', namespaces=NAMESPACES).get('value')) / 20)
-
-for page in ThreadPool(8).map(fetch_page, range(2, page_count+1)):
-    pages.append(page)
-
-entries = []
-for page in pages:
-    for entry in page.iterfind('.//atom:entry', namespaces=NAMESPACES):
-        title = entry.find('.//atom:title', namespaces=NAMESPACES).text
-        if re.search(r'\(repealed( .+)?\)$', title, re.I):
-            continue
-        id = entry.find('.//atom:id', namespaces=NAMESPACES).text
-        entries.append(dict(title=title, url=id))
-
-with open('laws/uk.json', 'w') as f:
-    json.dump(sorted(entries, key=lambda l: l['title']), f, indent=2, ensure_ascii=False)
\ No newline at end of file
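
[For the UK feed, a hypothetical Go sketch using the standard encoding/xml decoder could pull the Atom entries like this; the repealed-title filter and the leg:facetType-based page count from the deleted script are omitted. encoding/xml matches elements by local name when the struct tags carry no namespace, so the atom prefix handling from the Python version is not needed.]

package main

import (
	"encoding/xml"
	"fmt"
	"net/http"
)

// feed models only the parts of the Atom feed the scraper used.
type feed struct {
	Entries []entry `xml:"entry"`
}

type entry struct {
	Title string `xml:"title"`
	ID    string `xml:"id"`
}

// fetchPage downloads and decodes one page of the ukpga feed.
func fetchPage(page int) (*feed, error) {
	url := fmt.Sprintf("https://www.legislation.gov.uk/ukpga/data.feed?page=%d", page)
	res, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	var f feed
	if err := xml.NewDecoder(res.Body).Decode(&f); err != nil {
		return nil, err
	}
	return &f, nil
}

func main() {
	f, err := fetchPage(1)
	if err != nil {
		panic(err)
	}
	for _, e := range f.Entries {
		fmt.Println(e.Title, "->", e.ID)
	}
}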