#!/usr/bin/env python3
"""Build ``laws/de.json`` from the law index on gesetze-im-internet.de.

Every ``Teilliste_<char>.html`` index page is downloaded, one record
(``title``, ``url``, ``abbr``) is extracted per listed entry, and all
records are written to ``laws/de.json`` sorted by title.
"""
import json
import urllib.parse

import lxml.html
import requests

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 30

laws = []

# One index page is requested per character of this string.
# NOTE(review): 'X' is absent -- presumably the site has no Teilliste_X
# page; confirm before adding it, since HTTP errors now abort the run.
LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWYZ123456789'

# A single session reuses the underlying connection for all index pages.
with requests.Session() as session:
    for idx, letter in enumerate(LETTERS, 1):
        print(f'fetching {idx}/{len(LETTERS)}')
        url = f'https://www.gesetze-im-internet.de/Teilliste_{letter}.html'
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        root = lxml.html.fromstring(response.text)

        # Each child of the #paddingLR12 container is treated as one entry:
        # the first sub-element carries the link, the text following the
        # second sub-element is the title.
        for el in root.get_element_by_id('paddingLR12'):
            # Drop the trailing 'index.html' so the URL points at the
            # entry's directory.
            target = el[0].get('href').replace('index.html', '')
            # The abbreviation is the relative link with leading/trailing
            # '/' and '.' characters removed.
            abbr = target.strip('/.')
            laws.append(dict(
                title=el[1].tail.strip(),
                url=urllib.parse.urljoin(url, target),
                abbr=abbr,
            ))

# ensure_ascii=False emits raw non-ASCII characters, so the file encoding is
# pinned to UTF-8 rather than left to the platform default.
with open('laws/de.json', 'w', encoding='utf-8') as f:
    json.dump(
        sorted(laws, key=lambda law: law['title']),
        f,
        indent=2,
        ensure_ascii=False,
    )