From f1b0f9b6c9404cca0a5cd230279dd82254b18378 Mon Sep 17 00:00:00 2001
From: Martin Fischer
Date: Sun, 13 Apr 2025 09:44:11 +0200
Subject: remove: scraper for Mexico

I cannot reach www.diputados.gob.mx anymore.
---
 scrapers/mx.py | 28 ----------------------------
 1 file changed, 28 deletions(-)
 delete mode 100755 scrapers/mx.py

(limited to 'scrapers/mx.py')

diff --git a/scrapers/mx.py b/scrapers/mx.py
deleted file mode 100755
index ec6a274..0000000
--- a/scrapers/mx.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env python3
-import json
-import re
-import urllib.parse
-
-import lxml.html
-import requests
-
-URL = 'http://www.diputados.gob.mx/LeyesBiblio/index.htm'
-
-req = requests.get(URL)
-root = lxml.html.fromstring(req.text)
-# /following-sibling::span/table
-table = root.xpath(".//*[contains(text(), 'LEYES FEDERALES VIGENTES')]/ancestor::table[1]/following::table[1]")[0]
-
-laws = []
-
-for link in table.iterfind('.//tr//td[2]//a'):
-    title = lxml.html.tostring(link, method='text', encoding='unicode')
-    href = link.get('href')
-    laws.append(dict(
-        title = re.sub(r'\s+', ' ', title.strip()),
-        url = urllib.parse.urljoin(URL, link.get('href')),
-        redir = re.match('ref/(.*)\.htm', href).group(1)
-    ))
-
-with open('laws/mx.json', 'w') as f:
-    json.dump(sorted(laws, key=lambda l: l['title']), f, indent=2, ensure_ascii=False)
\ No newline at end of file
-- 
cgit v1.2.3