add scraper for Mexico

author: Martin Fischer <martin@push-f.com> 2021-03-04 08:01:53 +0100
committer: Martin Fischer <martin@push-f.com> 2021-03-04 08:04:05 +0100
commit: d5ae42fa1d63749de5ac332c83fd62c51eaaa5e2 (patch)
tree: 0427ad1657dfc8a71764c8d226b44cb078145c66 /scrapers/mx.py
parent: b0c30d5b3cc05c27270c9c6fdfb7576397bfccb0 (diff)
1 files changed, 28 insertions, 0 deletions
diff --git a/scrapers/mx.py b/scrapers/mx.py
new file mode 100755
index 0000000..ec6a274
--- /dev/null
+++ b/scrapers/mx.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+"""Scrape the Mexican federal law index and write it, sorted by title, to laws/mx.json."""
+import json
+import re
+import urllib.parse
+
+import lxml.html
+import requests
+
+URL = 'http://www.diputados.gob.mx/LeyesBiblio/index.htm'
+
+req = requests.get(URL, timeout=60)  # without a timeout a stalled server hangs the script
+root = lxml.html.fromstring(req.text)
+# Take the first table that follows the table containing the 'LEYES FEDERALES VIGENTES' heading.
+table = root.xpath(".//*[contains(text(), 'LEYES FEDERALES VIGENTES')]/ancestor::table[1]/following::table[1]")[0]
+
+laws = []
+for link in table.iterfind('.//tr//td[2]//a'):
+    title = lxml.html.tostring(link, method='text', encoding='unicode')
+    href = link.get('href')
+    laws.append(dict(
+        title=re.sub(r'\s+', ' ', title.strip()),  # collapse runs of whitespace in the link text
+        url=urllib.parse.urljoin(URL, href),  # resolve the relative href against the index URL
+        redir=re.match(r'ref/(.*)\.htm', href).group(1),  # raises unless href looks like ref/<name>.htm
+    ))
+
+with open('laws/mx.json', 'w', encoding='utf-8') as f:  # explicit UTF-8: ensure_ascii=False writes raw non-ASCII
+    json.dump(sorted(laws, key=lambda l: l['title']), f, indent=2, ensure_ascii=False)
+\ No newline at end of file
author	Martin Fischer <martin@push-f.com>	2021-03-04 08:01:53 +0100
committer	Martin Fischer <martin@push-f.com>	2021-03-04 08:04:05 +0100
commit	d5ae42fa1d63749de5ac332c83fd62c51eaaa5e2 (patch)
tree	0427ad1657dfc8a71764c8d226b44cb078145c66 /scrapers/mx.py
parent	b0c30d5b3cc05c27270c9c6fdfb7576397bfccb0 (diff)