diff options
author | Martin Fischer <martin@push-f.com> | 2021-02-28 09:18:48 +0100 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2021-03-03 12:52:46 +0100 |
commit | 4d43e952fff25b5b131e8699858da663a5ac2c42 (patch) | |
tree | acff62119061480a1cd7580f25c16c539aabc2ae /scrapers/de.py |
initial commit
Diffstat (limited to 'scrapers/de.py')
-rwxr-xr-x | scrapers/de.py | 26 |
1 files changed, 26 insertions, 0 deletions
#!/usr/bin/env python3
"""Scrape the law index of gesetze-im-internet.de into ``laws/de.json``.

For every suffix in ``LETTERS`` the page ``Teilliste_<suffix>.html`` is
downloaded and each child of the element with id ``paddingLR12`` is turned
into a record with the keys ``title``, ``url`` and ``abbr``.  The records
are written, sorted by title, as UTF-8 encoded JSON.
"""
import json
import urllib.parse

import lxml.html
import requests

laws = []

# Suffixes of the per-letter index pages to fetch.
# NOTE(review): 'X' is absent -- presumably the site has no such page; confirm.
LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWYZ123456789'

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the scraper indefinitely.
REQUEST_TIMEOUT = 30

for idx, letter in enumerate(LETTERS, 1):
    print(f'fetching {idx}/{len(LETTERS)}')
    url = f'https://www.gesetze-im-internet.de/Teilliste_{letter}.html'
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    root = lxml.html.fromstring(response.text)
    for el in root.get_element_by_id('paddingLR12'):
        # The first child's href points at '<dir>/index.html'; keep only the
        # directory part so the URL refers to the law's folder.
        target = el[0].get('href').replace('index.html', '')
        laws.append({
            # The title is the text following the entry's second child.
            'title': el[1].tail.strip(),
            'url': urllib.parse.urljoin(url, target),
            # Strip the leading './' and trailing '/' of the relative link.
            'abbr': target.strip('/.'),
        })

# ensure_ascii=False emits non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 rather than left to the platform's locale default.
with open('laws/de.json', 'w', encoding='utf-8') as f:
    json.dump(
        sorted(laws, key=lambda law: law['title']),
        f,
        indent=2,
        ensure_ascii=False,
    )