From d5ae42fa1d63749de5ac332c83fd62c51eaaa5e2 Mon Sep 17 00:00:00 2001
From: Martin Fischer <martin@push-f.com>
Date: Thu, 4 Mar 2021 08:01:53 +0100
Subject: add scraper for Mexico

---
 scrapers/mx.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100755 scrapers/mx.py

(limited to 'scrapers')

diff --git a/scrapers/mx.py b/scrapers/mx.py
new file mode 100755
index 0000000..ec6a274
--- /dev/null
+++ b/scrapers/mx.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""Scrape the index of Mexican federal laws into laws/mx.json."""
+import json
+import re
+import urllib.parse
+
+import lxml.html
+import requests
+
+URL = 'http://www.diputados.gob.mx/LeyesBiblio/index.htm'
+# Law pages are linked as "ref/<id>.htm"; <id> is stored as the "redir" key.
+REDIR_RE = re.compile(r'ref/(.*)\.htm')
+
+# Bound the wait and fail on HTTP errors rather than parsing an error page.
+req = requests.get(URL, timeout=60)
+req.raise_for_status()
+root = lxml.html.fromstring(req.text)
+
+# The law list is the first table after the table containing the heading.
+tables = root.xpath(".//*[contains(text(), 'LEYES FEDERALES VIGENTES')]/ancestor::table[1]/following::table[1]")
+if not tables:
+    raise RuntimeError('table of federal laws not found in ' + URL)
+table = tables[0]
+
+laws = []
+
+# Each law is linked from the second cell of its row.
+for link in table.iterfind('.//tr//td[2]//a'):
+    href = link.get('href')
+    match = REDIR_RE.match(href or '')
+    if match is None:
+        # Surface unexpected markup with the offending value instead of an
+        # opaque AttributeError/TypeError from the regex call.
+        raise ValueError('unexpected law link target: {!r}'.format(href))
+    title = lxml.html.tostring(link, method='text', encoding='unicode')
+    laws.append(dict(
+        # Collapse the line breaks and runs of spaces of the HTML source.
+        title=re.sub(r'\s+', ' ', title.strip()),
+        url=urllib.parse.urljoin(URL, href),
+        redir=match.group(1),
+    ))
+
+# ensure_ascii=False writes accented titles verbatim, so the file encoding
+# must be pinned instead of depending on the locale.
+with open('laws/mx.json', 'w', encoding='utf-8') as f:
+    json.dump(sorted(laws, key=lambda law: law['title']), f, indent=2, ensure_ascii=False)
-- 
cgit v1.2.3