scrapers/de.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27

#!/usr/bin/env python3
import json
import urllib.parse

import lxml.html
import requests

# Accumulates one dict per law found on the index pages.
laws = []

# One index page ("Teilliste") is requested per character in this string.
# NOTE(review): 'X' is not listed — presumably the site publishes no
# Teilliste_X page; confirm before adding it.
LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWYZ123456789'

# Seconds to wait for the server; without a timeout requests blocks forever
# on a stalled connection.
REQUEST_TIMEOUT = 30

for idx, letter in enumerate(LETTERS, 1):
    print(f'fetching {idx}/{len(LETTERS)}')
    url = 'https://www.gesetze-im-internet.de/Teilliste_{}.html'.format(letter)
    req = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    req.raise_for_status()
    root = lxml.html.fromstring(req.text)
    for el in root.get_element_by_id('paddingLR12'):
        # The first child's href points at the law's index page; removing
        # 'index.html' leaves the path of the law itself.
        target = el[0].get('href').replace('index.html', '')
        # Stripping the surrounding '.' and '/' characters yields the
        # abbreviation used as both `abbr` and `redir`.
        abbr = target.strip('/.')
        laws.append(dict(
            # The title is the text trailing the second child element.
            title=el[1].tail.strip(),
            url=urllib.parse.urljoin(url, target),
            abbr=abbr,
            redir=abbr,
        ))

# ensure_ascii=False emits non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 rather than left to the platform's locale default.
with open('laws/de.json', 'w', encoding='utf-8') as f:
    json.dump(sorted(laws, key=lambda law: law['title']), f, indent=2, ensure_ascii=False)