From 4d43e952fff25b5b131e8699858da663a5ac2c42 Mon Sep 17 00:00:00 2001
From: Martin Fischer <martin@push-f.com>
Date: Sun, 28 Feb 2021 09:18:48 +0100
Subject: initial commit

---
 scrapers/de.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100755 scrapers/de.py

(limited to 'scrapers/de.py')

diff --git a/scrapers/de.py b/scrapers/de.py
new file mode 100755
index 0000000..7e0cd1f
--- /dev/null
+++ b/scrapers/de.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+import json
+import urllib.parse
+
+import lxml.html
+import requests
+
+# Scrape each alphabetical index page into `laws`, then dump them sorted by title.
+laws = []
+# One character per Teilliste_<char>.html index page requested below.
+LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWYZ123456789'
+
+for idx, letter in enumerate(LETTERS, 1):
+    print(f'fetching {idx}/{len(LETTERS)}')
+    url = f'https://www.gesetze-im-internet.de/Teilliste_{letter}.html'
+    req = requests.get(url, timeout=30)  # requests never times out by default
+    req.raise_for_status()  # stop on HTTP errors instead of parsing an error page
+    root = lxml.html.fromstring(req.text)
+    for el in root.get_element_by_id('paddingLR12'):
+        # First child carries the link; the text after the second child is the title.
+        target = el[0].get('href').replace('index.html', '')
+        entry_url = urllib.parse.urljoin(url, target)
+        laws.append(dict(title=el[1].tail.strip(), url=entry_url, abbr=target.strip('/.')))
+# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file encoding.
+with open('laws/de.json', 'w', encoding='utf-8') as f:
+    json.dump(sorted(laws, key=lambda law: law['title']), f, indent=2, ensure_ascii=False)
-- 
cgit v1.2.3