summary refs log tree commit diff
path: root/scrapers/de.py
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2021-02-28 09:18:48 +0100
committer Martin Fischer <martin@push-f.com> 2021-03-03 12:52:46 +0100
commit 4d43e952fff25b5b131e8699858da663a5ac2c42 (patch)
tree acff62119061480a1cd7580f25c16c539aabc2ae /scrapers/de.py
initial commit
Diffstat (limited to 'scrapers/de.py')
-rwxr-xr-x scrapers/de.py 26
1 files changed, 26 insertions, 0 deletions
diff --git a/scrapers/de.py b/scrapers/de.py
new file mode 100755
index 0000000..7e0cd1f
--- /dev/null
+++ b/scrapers/de.py
@@ -0,0 +1,26 @@
#!/usr/bin/env python3
"""Scrape the index of laws published on gesetze-im-internet.de.

One ``Teilliste_<X>.html`` page is fetched per character in ``LETTERS``.
For every entry listed on a page the title, absolute URL and abbreviation
are collected; the combined list is written, sorted by title, to
``laws/de.json``.

The script runs on import/execution and raises ``requests.HTTPError`` on a
non-success HTTP status and ``requests.Timeout`` when the server stalls.
"""
import json
import operator
import urllib.parse

import lxml.html
import requests

laws = []

# Suffixes of the partial-list pages to fetch, one request per character.
# NOTE(review): 'X' and '0' are absent -- presumably the site offers no such
# pages; confirm before adding them.
LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWYZ123456789'

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the scraper forever.
REQUEST_TIMEOUT = 30

# A session reuses the underlying connection across the per-letter requests.
with requests.Session() as session:
    for idx, letter in enumerate(LETTERS, 1):
        print(f'fetching {idx}/{len(LETTERS)}')
        url = f'https://www.gesetze-im-internet.de/Teilliste_{letter}.html'
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        # Fail loudly instead of parsing an error page as a law list.
        response.raise_for_status()
        root = lxml.html.fromstring(response.text)
        # Each child of the container is treated as one law entry.
        for el in root.get_element_by_id('paddingLR12'):
            # The first child of an entry carries the link; dropping
            # 'index.html' leaves the law's directory path.
            target = el[0].get('href').replace('index.html', '')
            # Remove surrounding '/' and '.' characters so only the bare
            # directory name remains (e.g. './bgb/' -> 'bgb'; href shape
            # assumed -- verify against the live markup).
            abbr = target.strip('/.')
            laws.append({
                # The title is the text that follows the entry's second child.
                'title': el[1].tail.strip(),
                'url': urllib.parse.urljoin(url, target),
                'abbr': abbr,
            })

# Titles contain non-ASCII characters and are written unescaped
# (ensure_ascii=False), so the file encoding is pinned to UTF-8 rather than
# left to the platform's locale default.
with open('laws/de.json', 'w', encoding='utf-8') as f:
    json.dump(
        sorted(laws, key=operator.itemgetter('title')),
        f,
        indent=2,
        ensure_ascii=False,
    )