#!/usr/bin/env python3
"""Scrape the law index pages of gesetze-im-internet.de into laws/de.json.

For every suffix in ``LETTERS`` the page ``Teilliste_<suffix>.html`` is
fetched, and each child of the element with id ``paddingLR12`` is turned into
a record with the keys ``title``, ``url`` and ``abbr``.  The records are
written, sorted by title, to ``laws/de.json``.

Provenance: scrapers/de.py as added in commit 4d43e952 ("initial commit",
Martin Fischer, 2021-02-28).
"""
import json
import urllib.parse

import lxml.html
import requests

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 30

laws = []

# Suffixes of the Teilliste_<suffix>.html index pages to fetch.
# NOTE(review): 'X' is absent from this list -- presumably the site has no
# such page; confirm before adding it.
LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWYZ123456789'

for idx, letter in enumerate(LETTERS, 1):
    print(f'fetching {idx}/{len(LETTERS)}')
    url = 'https://www.gesetze-im-internet.de/Teilliste_{}.html'.format(letter)
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    root = lxml.html.fromstring(response.text)
    for el in root.get_element_by_id('paddingLR12'):
        # Assumes the first child of each entry is the link to the law and
        # the title is the text following the second child -- verify against
        # the live markup if the page layout changes.
        target = el[0].get('href').replace('index.html', '')
        abbr = target.strip('/.')
        laws.append({
            'title': el[1].tail.strip(),
            # The href is relative to the index page, so resolve it.
            'url': urllib.parse.urljoin(url, target),
            'abbr': abbr,
        })

# ensure_ascii=False emits non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly rather than in the platform's locale encoding.
with open('laws/de.json', 'w', encoding='utf-8') as f:
    json.dump(
        sorted(laws, key=lambda law: law['title']),
        f,
        indent=2,
        ensure_ascii=False,
    )