Index: sitescripts/crawler/bin/import_sites.py |
=================================================================== |
new file mode 100644 |
--- /dev/null |
+++ b/sitescripts/crawler/bin/import_sites.py |
@@ -0,0 +1,52 @@ |
+# coding: utf-8 |
+ |
+# This Source Code is subject to the terms of the Mozilla Public License |
+# version 2.0 (the "License"). You can obtain a copy of the License at |
+# http://mozilla.org/MPL/2.0/. |
+ |
+import MySQLdb, os, re, subprocess |
+from sitescripts.utils import cached, get_config |
+ |
@cached(600)
def _get_db():
    """Return a MySQL connection configured from the [crawler] config section.

    The result is cached for 600 seconds, so repeated calls share one
    connection.
    """
    config = get_config()
    params = {
        "user": config.get("crawler", "dbuser"),
        "passwd": config.get("crawler", "dbpassword"),
        "db": config.get("crawler", "database"),
        "use_unicode": True,
        "charset": "utf8",
    }
    # On Windows the server is reached through a named pipe instead of a socket.
    if os.name == "nt":
        params["named_pipe"] = True
    return MySQLdb.connect(**params)
+ |
def _get_cursor():
    """Return a DictCursor (rows as dicts) on the shared crawler connection."""
    connection = _get_db()
    return connection.cursor(MySQLdb.cursors.DictCursor)
+ |
def _hg(args):
    """Spawn a Mercurial command and return the Popen object with stdout piped.

    args: list of command-line arguments appended after "hg".
    """
    command = ["hg"] + args
    return subprocess.Popen(command, stdout=subprocess.PIPE)
+ |
def _extract_sites(easylist_dir):
    """Return the set of URLs mentioned in commit messages of an EasyList repo.

    Runs "hg log" over the Mercurial repository at easylist_dir and extracts
    the first http(s) URL found in each changeset description.

    NOTE(review): changes the process-wide working directory to easylist_dir
    as a side effect — kept for compatibility with the original behavior.
    """
    os.chdir(easylist_dir)
    process = _hg(["log", "--template", "{desc}\n"])

    # Hoist the compiled pattern out of the loop. \S* cannot cross whitespace,
    # so a URL never spans more than one line of output.
    url_regex = re.compile(r"\b(https?://\S*)")
    urls = set()
    for line in process.stdout:
        match = url_regex.search(line)
        if match:
            urls.add(match.group(1).strip())

    # Reap the child process so it does not linger as a zombie; the original
    # code never waited on it.
    process.stdout.close()
    process.wait()
    return urls
+ |
def _insert_sites(site_urls):
    """Insert the given URLs into the crawler_sites table and commit.

    site_urls: iterable of URL strings.
    """
    cursor = _get_cursor()
    # DB-API 2.0 expects the parameters as a sequence, hence the one-element
    # tuples; executemany batches all rows through a single call.
    cursor.executemany("INSERT INTO crawler_sites (url) VALUES (%s)",
                       [(url,) for url in site_urls])
    # MySQLdb disables autocommit per DB-API 2.0, so without an explicit
    # commit the inserted rows would be silently discarded on disconnect.
    _get_db().commit()
+ |
if __name__ == "__main__":
    # Import every URL referenced in EasyList commit messages into the
    # crawler_sites table.
    repository = get_config().get("crawler", "easylist_repository")
    urls = _extract_sites(repository)
    _insert_sites(urls)
+ |