summary refs log tree commit diff
path: root/scrapers/mx.py
diff options
context:
space:
mode:
author Martin Fischer <martin@push-f.com> 2021-03-04 08:01:53 +0100
committer Martin Fischer <martin@push-f.com> 2021-03-04 08:04:05 +0100
commit d5ae42fa1d63749de5ac332c83fd62c51eaaa5e2 (patch)
tree 0427ad1657dfc8a71764c8d226b44cb078145c66 /scrapers/mx.py
parent b0c30d5b3cc05c27270c9c6fdfb7576397bfccb0 (diff)
add scraper for Mexico
Diffstat (limited to 'scrapers/mx.py')
-rwxr-xr-x scrapers/mx.py 28
1 files changed, 28 insertions, 0 deletions
diff --git a/scrapers/mx.py b/scrapers/mx.py
new file mode 100755
index 0000000..ec6a274
--- /dev/null
+++ b/scrapers/mx.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+import json
+import re
+import urllib.parse
+
+import lxml.html
+import requests
+
+# Scrape the law links listed after the 'LEYES FEDERALES VIGENTES' heading into laws/mx.json.
+URL = 'http://www.diputados.gob.mx/LeyesBiblio/index.htm'
+
+req = requests.get(URL, timeout=60)  # requests waits indefinitely unless a timeout is given
+req.raise_for_status()  # stop here rather than scrape an HTTP error page
+root = lxml.html.fromstring(req.text)
+table = root.xpath(".//*[contains(text(), 'LEYES FEDERALES VIGENTES')]/ancestor::table[1]/following::table[1]")[0]
+
+laws = []
+for link in table.iterfind('.//tr//td[2]//a'):
+    href = link.get('href')
+    title = lxml.html.tostring(link, method='text', encoding='unicode')
+    laws.append(dict(
+        title=re.sub(r'\s+', ' ', title.strip()),  # collapse line breaks and runs of spaces
+        url=urllib.parse.urljoin(URL, href),
+        redir=re.match(r'ref/(.*)\.htm', href).group(1),  # raises if href is not ref/<name>.htm
+    ))
+
+with open('laws/mx.json', 'w', encoding='utf-8') as f:  # titles are written unescaped, so pin UTF-8
+    json.dump(sorted(laws, key=lambda law: law['title']), f, indent=2, ensure_ascii=False)
\ No newline at end of file