From 30099b5d9b616b820341b70582f26685597cebe5 Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Wed, 3 Mar 2021 16:38:37 +0100 Subject: differentiate between abbreviations and redirects --- scrapers/at.py | 10 ++++++++++ scrapers/de.py | 1 + 2 files changed, 11 insertions(+) (limited to 'scrapers') diff --git a/scrapers/at.py b/scrapers/at.py index bc4d874..1af9894 100755 --- a/scrapers/at.py +++ b/scrapers/at.py @@ -50,6 +50,16 @@ for page in pages: ) if 'Abkuerzung' in info: data['abbr'] = info['Abkuerzung'].strip() + data['redir'] = data['abbr'].lower()\ + .replace(')', '')\ + .replace('(', '')\ + .replace(' – ', '-')\ + .replace(' ', '-')\ + .replace('\xa0', '-')\ + .replace('ä', 'ae')\ + .replace('ü', 'ue')\ + .replace('ö', 'oe')\ + .replace('ß', 'ss') normen[info['Gesetzesnummer']] = data with open('laws/at.json', 'w') as f: diff --git a/scrapers/de.py b/scrapers/de.py index 7e0cd1f..9450143 100755 --- a/scrapers/de.py +++ b/scrapers/de.py @@ -21,6 +21,7 @@ for idx, l in enumerate(LETTERS, 1): title = el[1].tail.strip(), url = urllib.parse.urljoin(url, target), abbr = abbr, + redir = abbr, )) with open('laws/de.json', 'w') as f: json.dump(sorted(laws, key=lambda l: l['title']), f, indent=2, ensure_ascii=False) -- cgit v1.2.3