diff options
author | Martin Fischer <martin@push-f.com> | 2021-03-04 08:01:53 +0100 |
---|---|---|
committer | Martin Fischer <martin@push-f.com> | 2021-03-04 08:04:05 +0100 |
commit | d5ae42fa1d63749de5ac332c83fd62c51eaaa5e2 (patch) | |
tree | 0427ad1657dfc8a71764c8d226b44cb078145c66 /scrapers/mx.py | |
parent | b0c30d5b3cc05c27270c9c6fdfb7576397bfccb0 (diff) |
add scraper for Mexico
Diffstat (limited to 'scrapers/mx.py')
-rwxr-xr-x | scrapers/mx.py | 28 |
1 files changed, 28 insertions, 0 deletions
#!/usr/bin/env python3
"""Scrape the index of Mexican federal laws from diputados.gob.mx.

The script downloads the index page at ``URL``, locates the table that
follows the "LEYES FEDERALES VIGENTES" heading, and writes one entry per
linked law to ``laws/mx.json``, sorted by title.  Each entry has:

* ``title`` - the link text with whitespace normalized,
* ``url``   - the absolute URL of the law's page,
* ``redir`` - the ``<slug>`` taken from the relative ``ref/<slug>.htm`` link.
"""
import json
import re
import urllib.parse

URL = 'http://www.diputados.gob.mx/LeyesBiblio/index.htm'
OUTPUT_PATH = 'laws/mx.json'
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests may block forever

# The first table after the table that contains the section heading.
TABLE_XPATH = (
    ".//*[contains(text(), 'LEYES FEDERALES VIGENTES')]"
    "/ancestor::table[1]/following::table[1]"
)
LINK_PATH = './/tr//td[2]//a'

# Raw strings so that "\." and "\s" reach the regex engine unchanged.
_REDIR_RE = re.compile(r'ref/(.*)\.htm')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_title(text):
    """Return *text* trimmed, with every whitespace run collapsed to one space."""
    return _WHITESPACE_RE.sub(' ', text.strip())


def redir_slug(href):
    """Return the ``<slug>`` part of a ``ref/<slug>.htm`` link.

    Args:
        href: The raw ``href`` attribute of a law link (may be ``None``).

    Raises:
        ValueError: If *href* is missing or does not start with
            ``ref/<slug>.htm``.  Failing loudly here makes a changed page
            layout visible instead of crashing on ``None.group``.
    """
    match = _REDIR_RE.match(href) if href else None
    if match is None:
        raise ValueError('unexpected law link: %r' % (href,))
    return match.group(1)


def law_entry(title, href, base_url=URL):
    """Build the JSON record for one law link.

    Args:
        title: Text content of the link, possibly containing line breaks.
        href: The link's ``href`` attribute, relative to *base_url*.
        base_url: URL the relative *href* is resolved against.

    Returns:
        A dict with the keys ``title``, ``url`` and ``redir`` (in that order).

    Raises:
        ValueError: If *href* is not of the form ``ref/<slug>.htm``.
    """
    # Validate first so a bad href never gets joined into a bogus URL.
    redir = redir_slug(href)
    return {
        'title': clean_title(title),
        'url': urllib.parse.urljoin(base_url, href),
        'redir': redir,
    }


def fetch_laws():
    """Download the index page and return the list of law records.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the law table cannot be located on the page.
        ValueError: If a link in the table has an unexpected ``href``.
    """
    # Imported here so the pure helpers above stay importable (and testable)
    # on machines that lack the scraping dependencies.
    import lxml.html
    import requests

    response = requests.get(URL, timeout=REQUEST_TIMEOUT)
    # Do not try to parse an error page as if it were the index.
    response.raise_for_status()

    root = lxml.html.fromstring(response.text)
    tables = root.xpath(TABLE_XPATH)
    if not tables:
        raise RuntimeError('law table not found on %s' % URL)

    return [
        law_entry(
            lxml.html.tostring(link, method='text', encoding='unicode'),
            link.get('href'),
        )
        for link in tables[0].iterfind(LINK_PATH)
    ]


def write_laws(laws, path=OUTPUT_PATH):
    """Write *laws* to *path* as indented JSON, sorted by title.

    The file is always written as UTF-8: ``ensure_ascii=False`` emits
    accented characters verbatim, so relying on the locale's default
    encoding could corrupt the output or raise ``UnicodeEncodeError``.
    """
    ordered = sorted(laws, key=lambda law: law['title'])
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(ordered, f, indent=2, ensure_ascii=False)


def main():
    """Scrape the law index and store it in ``laws/mx.json``."""
    write_laws(fetch_laws())


if __name__ == '__main__':
    main()
\ No newline at end of file |