# scrapers/mx.py

#!/usr/bin/env python3
import json
import re
import urllib.parse

import lxml.html
import requests

# Index page that is scraped; relative law links are resolved against it.
URL = 'http://www.diputados.gob.mx/LeyesBiblio/index.htm'

# Seconds to wait on the server before giving up; without a timeout
# requests.get() may block indefinitely.
REQUEST_TIMEOUT = 30

# Law links are expected to look like "ref/<name>.htm"; the captured <name>
# is stored under the "redir" key. Written as a raw string so that "\."
# reaches the regex engine instead of being an invalid string escape.
REDIR_RE = re.compile(r'ref/(.*)\.htm')

req = requests.get(URL, timeout=REQUEST_TIMEOUT)
# Fail here on an HTTP error instead of trying to parse an error page below.
req.raise_for_status()
root = lxml.html.fromstring(req.text)
# The listing is taken to be the first table following the table that
# contains the "LEYES FEDERALES VIGENTES" heading.
table = root.xpath(".//*[contains(text(), 'LEYES FEDERALES VIGENTES')]/ancestor::table[1]/following::table[1]")[0]

laws = []

# Every anchor found inside the second cell of a table row is treated as a law.
for link in table.iterfind('.//tr//td[2]//a'):
    href = link.get('href')
    if href is None:
        # An <a> without href (e.g. a named anchor) has no URL to record.
        continue
    match = REDIR_RE.match(href)
    if match is None:
        # Surface a changed page layout explicitly rather than failing with
        # an opaque AttributeError on None.
        raise ValueError(
            'unexpected law link %r: expected "ref/<name>.htm"' % href
        )
    title = lxml.html.tostring(link, method='text', encoding='unicode')
    laws.append({
        # Collapse runs of whitespace (including line breaks) to one space.
        'title': re.sub(r'\s+', ' ', title.strip()),
        'url': urllib.parse.urljoin(URL, href),
        'redir': match.group(1),
    })

# ensure_ascii=False emits accented characters verbatim, so the file encoding
# is pinned to UTF-8 instead of depending on the platform locale.
with open('laws/mx.json', 'w', encoding='utf-8') as f:
    json.dump(sorted(laws, key=lambda law: law['title']), f, indent=2, ensure_ascii=False)