author     Martin Fischer <martin@push-f.com>    2025-04-08 19:25:36 +0200
committer  Martin Fischer <martin@push-f.com>    2025-04-14 07:04:45 +0200
commit     e29d27533725819ec3f6d05a27048d3d2627b53e (patch)
tree       5afba50408b25179edb4ea6445acfe1d3e051488 /scrapers
parent     96236c9d80cea2d6ba83591a7d08a8cc096fd8d3 (diff)
refactor: port fetchers to Go
* Austria: upgraded to RIS API v2.6 because v2.5 has been turned off
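
The diffstat below is limited to 'scrapers', so the Go replacements themselves are not visible in this commit. As a rough sketch of what the ported Austrian fetcher might look like, assuming the v2.6 endpoint keeps the same path shape and query parameters as the v2.5 call in the deleted scrapers/at.py (only the version segment of the URL swapped):

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"time"
)

// fetchPage queries one result page of the RIS Bundesnormen API.
// The v2.6 URL is an assumption extrapolated from the v2.5 call in the
// deleted scrapers/at.py; the parameter names are taken from that call.
func fetchPage(page int) (map[string]any, error) {
	params := url.Values{
		"Seitennummer":      {fmt.Sprint(page)},
		"DokumenteProSeite": {"OneHundred"},
		"FassungVom":        {time.Now().Format("2006-01-02")},
		"Abschnitt_Von":     {"1"},
	}
	res, err := http.Get("https://data.bka.gv.at/ris/api/v2.6/bundesnormen?" + params.Encode())
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	var body struct {
		OgdSearchResult map[string]json.RawMessage
	}
	if err := json.NewDecoder(res.Body).Decode(&body); err != nil {
		return nil, err
	}
	// The API signals failures inside the payload, like the Python
	// scraper's 'Error' in data check did.
	if raw, ok := body.OgdSearchResult["Error"]; ok {
		return nil, fmt.Errorf("RIS error: %s", raw)
	}
	var results map[string]any
	if err := json.Unmarshal(body.OgdSearchResult["OgdDocumentResults"], &results); err != nil {
		return nil, err
	}
	return results, nil
}

func main() {
	first, err := fetchPage(1)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(first), "top-level keys on page 1")
}

Returning the decoded map keeps the sketch close to the dynamic dict the Python version passed around; a real port would more likely declare typed structs for the result metadata.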
Diffstat (limited to 'scrapers')
-rwxr-xr-x  scrapers/at.py  66
-rwxr-xr-x  scrapers/de.py  27
-rwxr-xr-x  scrapers/uk.py  41
3 files changed, 0 insertions, 134 deletions
diff --git a/scrapers/at.py b/scrapers/at.py
deleted file mode 100755
index 1af9894..0000000
--- a/scrapers/at.py
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/env python3
-import datetime
-import json
-import math
-from multiprocessing.dummy import Pool as ThreadPool
-
-import requests
-
-sess = requests.session()
-
-# API documentation:
-# https://data.bka.gv.at/ris/api/v2.5/applications/bundesnormen
-
-def fetch_page(page):
-    res = sess.get('https://data.bka.gv.at/ris/api/v2.5/bundesnormen', params=dict(
-        Seitennummer=page,
-        DokumenteProSeite='OneHundred',
-        FassungVom=datetime.datetime.today().strftime('%Y-%m-%d'),
-        Abschnitt_Von=1
-    ))
-    print(res.request.url)
-    data = res.json()['OgdSearchResult']
-
-    if 'Error' in data:
-        print(data)
-        return
-
-    return data['OgdDocumentResults']
-
-pages = []
-first = fetch_page(1)
-pages.append(first)
-page_count = math.ceil(int(first['Hits']['#text']) / 100)
-
-for page in ThreadPool(8).map(fetch_page, range(2, page_count+1)):
-    pages.append(page)
-
-normen = {}
-
-for page in pages:
-    for result in page['OgdDocumentReference']:
-        info = result['Data']['Metadaten']['Bundes-Landesnormen']
-        if info['Typ'] in ('K', 'K (Geltungsbereich)'):
-            continue
-        if info['Typ'].startswith('Vertrag -'):
-            continue
-        data = dict(
-            title=info['Kurztitel'].strip(),
-            url=info['GesamteRechtsvorschriftUrl'],
-        )
-        if 'Abkuerzung' in info:
-            data['abbr'] = info['Abkuerzung'].strip()
-            data['redir'] = data['abbr'].lower()\
-                .replace(')', '')\
-                .replace('(', '')\
-                .replace(' – ', '-')\
-                .replace(' ', '-')\
-                .replace('\xa0', '-')\
-                .replace('ä', 'ae')\
-                .replace('ü', 'ue')\
-                .replace('ö', 'oe')\
-                .replace('ß', 'ss')
-        normen[info['Gesetzesnummer']] = data
-
-with open('laws/at.json', 'w') as f:
-    json.dump(list(normen.values()), f, indent=2, ensure_ascii=False)
diff --git a/scrapers/de.py b/scrapers/de.py
deleted file mode 100755
index 9450143..0000000
--- a/scrapers/de.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env python3
-import json
-import urllib.parse
-
-import lxml.html
-import requests
-
-laws = []
-
-LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWYZ123456789'
-
-for idx, l in enumerate(LETTERS, 1):
-    print(f'fetching {idx}/{len(LETTERS)}')
-    url = 'https://www.gesetze-im-internet.de/Teilliste_{}.html'.format(l)
-    req = requests.get(url)
-    root = lxml.html.fromstring(req.text)
-    for el in root.get_element_by_id('paddingLR12'):
-        target = el[0].get('href').replace('index.html', '')
-        abbr = target.strip('/.')
-        laws.append(dict(
-            title = el[1].tail.strip(),
-            url = urllib.parse.urljoin(url, target),
-            abbr = abbr,
-            redir = abbr,
-        ))
-with open('laws/de.json', 'w') as f:
-    json.dump(sorted(laws, key=lambda l: l['title']), f, indent=2, ensure_ascii=False)
diff --git a/scrapers/uk.py b/scrapers/uk.py
deleted file mode 100755
index 34e7c4b..0000000
--- a/scrapers/uk.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-import json
-import math
-import re
-from multiprocessing.dummy import Pool as ThreadPool
-
-import requests
-import lxml.etree
-
-NAMESPACES = dict(
-    atom='http://www.w3.org/2005/Atom',
-    leg='http://www.legislation.gov.uk/namespaces/legislation',
-)
-
-sess = requests.session()
-
-def fetch_page(pagenum):
-    print(pagenum)
-    res = sess.get('https://www.legislation.gov.uk/ukpga/data.feed', params=dict(page=pagenum))
-    return lxml.etree.fromstring(res.content)
-
-pages = []
-first = fetch_page(1)
-pages.append(first)
-
-page_count = math.ceil(int(first.find('.//leg:facetType', namespaces=NAMESPACES).get('value')) / 20)
-
-for page in ThreadPool(8).map(fetch_page, range(2, page_count+1)):
-    pages.append(page)
-
-entries = []
-for page in pages:
-    for entry in page.iterfind('.//atom:entry', namespaces=NAMESPACES):
-        title = entry.find('.//atom:title', namespaces=NAMESPACES).text
-        if re.search(r'\(repealed( .+)?\)$', title, re.I):
-            continue
-        id = entry.find('.//atom:id', namespaces=NAMESPACES).text
-        entries.append(dict(title=title, url=id))
-
-with open('laws/uk.json', 'w') as f:
-    json.dump(sorted(entries, key=lambda l: l['title']), f, indent=2, ensure_ascii=False)
\ No newline at end of file
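
For context on what the port replaces: all three deleted scripts fanned page requests out over ThreadPool(8) after reading the page count from the first response. The idiomatic Go equivalent is a bounded pool of goroutines; a minimal sketch of that pattern (fetchAllPages and its fetch parameter are illustrative names, not taken from the actual Go code):

package main

import (
	"fmt"
	"sync"
)

// fetchAllPages fetches pages 2..pageCount with at most 8 concurrent
// workers, mirroring the ThreadPool(8).map calls in the deleted scrapers.
// fetch stands in for whatever per-page fetcher the scraper uses.
func fetchAllPages(pageCount int, fetch func(int) ([]byte, error)) [][]byte {
	results := make([][]byte, pageCount+1)
	sem := make(chan struct{}, 8) // bounds concurrency at 8, like ThreadPool(8)
	var wg sync.WaitGroup

	for page := 2; page <= pageCount; page++ {
		wg.Add(1)
		go func(p int) {
			defer wg.Done()
			sem <- struct{}{}        // acquire a worker slot
			defer func() { <-sem }() // release it when done
			body, err := fetch(p)
			if err != nil {
				fmt.Println("page", p, "failed:", err)
				return
			}
			results[p] = body // each goroutine writes a distinct index
		}(page)
	}
	wg.Wait()
	return results[2:]
}

func main() {
	pages := fetchAllPages(5, func(p int) ([]byte, error) {
		return []byte(fmt.Sprintf("page %d", p)), nil // stub fetcher for demonstration
	})
	fmt.Println(len(pages), "pages fetched")
}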