initial commit

author: Martin Fischer <martin@push-f.com> 2021-02-28 09:18:48 +0100
committer: Martin Fischer <martin@push-f.com> 2021-03-03 12:52:46 +0100
commit: 4d43e952fff25b5b131e8699858da663a5ac2c42 (patch)
tree: acff62119061480a1cd7580f25c16c539aabc2ae /scrapers/de.py
1 file changed, 26 insertions, 0 deletions
diff --git a/scrapers/de.py b/scrapers/de.py
new file mode 100755
index 0000000..7e0cd1f
--- /dev/null
+++ b/scrapers/de.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+import json
+import urllib.parse
+
+import lxml.html
+import requests
+
+# Collect every law listed on the gesetze-im-internet.de index pages into laws/de.json.
+laws = []
+LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWYZ123456789'  # one Teilliste_<char>.html page per character
+
+for idx, letter in enumerate(LETTERS, 1):
+    print(f'fetching {idx}/{len(LETTERS)}')
+    url = 'https://www.gesetze-im-internet.de/Teilliste_{}.html'.format(letter)
+    resp = requests.get(url, timeout=30)  # a stalled connection must not hang the scraper
+    resp.raise_for_status()  # fail loudly rather than parse an HTTP error page
+    root = lxml.html.fromstring(resp.text)
+    for el in root.get_element_by_id('paddingLR12'):
+        target = el[0].get('href').replace('index.html', '')  # link to the law's directory
+        laws.append(dict(
+            title=el[1].tail.strip(),  # text that follows the entry's second child element
+            url=urllib.parse.urljoin(url, target),
+            abbr=target.strip('/.'),  # directory name without the surrounding './' and '/'
+        ))
+with open('laws/de.json', 'w', encoding='utf-8') as f:  # ensure_ascii=False emits raw umlauts
+    json.dump(sorted(laws, key=lambda law: law['title']), f, indent=2, ensure_ascii=False)
author	Martin Fischer <martin@push-f.com>	2021-02-28 09:18:48 +0100
committer	Martin Fischer <martin@push-f.com>	2021-03-03 12:52:46 +0100
commit	4d43e952fff25b5b131e8699858da663a5ac2c42 (patch)
tree	acff62119061480a1cd7580f25c16c539aabc2ae /scrapers/de.py