#!/usr/bin/env python3
"""Scrape the list of laws from the LeyesBiblio index into ``laws/mx.json``.

The script downloads ``URL``, finds the table that follows the one containing
the 'LEYES FEDERALES VIGENTES' heading and, for every link in the second
column of that table, records:

* ``title`` -- the link text with runs of whitespace collapsed,
* ``url``   -- the link target resolved against ``URL``,
* ``redir`` -- the ``<id>`` part of a ``ref/<id>.htm`` link target.

The entries are written to ``laws/mx.json`` as a JSON array sorted by title.
Links without a usable ``ref/<id>.htm`` target are reported on stderr and
skipped; if nothing usable is found the output file is left untouched.
"""

import json
import re
import sys
import urllib.parse

import lxml.html
import requests

URL = 'http://www.diputados.gob.mx/LeyesBiblio/index.htm'
OUTPUT_PATH = 'laws/mx.json'
# Without a timeout requests waits forever on an unresponsive server.
REQUEST_TIMEOUT = 30  # seconds
# Raw string so that ``\.`` is a regex escape, not an invalid string escape.
REDIR_RE = re.compile(r'ref/(.*)\.htm')

req = requests.get(URL, timeout=REQUEST_TIMEOUT)
# Fail here on an HTTP error instead of trying to parse an error page.
req.raise_for_status()
root = lxml.html.fromstring(req.text)

# The heading sits inside a table of its own; the links to the laws are in
# the first table that follows it in document order.
tables = root.xpath(
    ".//*[contains(text(), 'LEYES FEDERALES VIGENTES')]"
    "/ancestor::table[1]/following::table[1]"
)
if not tables:
    sys.exit("could not find the 'LEYES FEDERALES VIGENTES' table at %s" % URL)
table = tables[0]

laws = []
for link in table.iterfind('.//tr//td[2]//a'):
    href = link.get('href')
    if href is None:
        # An <a> without href has no target to record.
        continue
    match = REDIR_RE.match(href)
    if match is None:
        print('skipping link with unexpected href: %r' % href, file=sys.stderr)
        continue
    title = lxml.html.tostring(link, method='text', encoding='unicode')
    laws.append({
        'title': re.sub(r'\s+', ' ', title.strip()),
        'url': urllib.parse.urljoin(URL, href),
        'redir': match.group(1),
    })

if not laws:
    # Do not overwrite a previously generated file with an empty list.
    sys.exit('no laws found at %s; leaving %s untouched' % (URL, OUTPUT_PATH))

# ensure_ascii=False emits accented characters verbatim, so the file encoding
# must be pinned rather than left to the platform's locale default.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(
        sorted(laws, key=lambda law: law['title']),
        f,
        indent=2,
        ensure_ascii=False,
    )