
Unified Diff: sitescripts/crawler/web/crawler.py

Issue 8327353: Crawler backend (Closed)
Patch Set: Created Sept. 27, 2012, 2:15 p.m.
Index: sitescripts/crawler/web/crawler.py
===================================================================
--- a/sitescripts/crawler/web/crawler.py
+++ b/sitescripts/crawler/web/crawler.py
@@ -1,29 +1,107 @@
-import MySQLdb, os
+import MySQLdb, os, re, simplejson, sys
 from sitescripts.utils import cached, get_config
-from sitescripts.web import url_handler
-
-@url_handler('/crawlableUrls')
-def listUrls(environ, start_response):
-  urls = fetch_crawlable_urls()
-  start_response('200 OK', [('Content-Type', 'text/plain')])
-  return '\n'.join(urls)
-
-def fetch_crawlable_urls():
-  cursor = get_db().cursor(MySQLdb.cursors.DictCursor)
-  executeQuery(cursor, 'SELECT url from crawler_urls')
-  results = cursor.fetchall()
-  urls = [result['url'] for result in results]
-  return urls
+from sitescripts.web import url_handler, basic_auth
 
 @cached(600)
-def get_db():
-  database = get_config().get('crawler', 'database')
-  dbuser = get_config().get('crawler', 'dbuser')
-  dbpasswd = get_config().get('crawler', 'dbpassword')
-  if os.name == 'nt':
-    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset='utf8', named_pipe=True)
+def _get_db():
+  database = get_config().get("crawler", "database")
+  dbuser = get_config().get("crawler", "dbuser")
+  dbpasswd = get_config().get("crawler", "dbpassword")
+  if os.name == "nt":
+    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
+                           use_unicode=True, charset="utf8", named_pipe=True)
   else:
-    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset='utf8')
+    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
+                           use_unicode=True, charset="utf8")
-def executeQuery(cursor, query, args=None):
-  cursor.execute(query, args)
+def get_cursor():
+  return _get_db().cursor(MySQLdb.cursors.DictCursor)
+
+def _fetch_crawlable_sites():
+  cursor = get_cursor()
+  cursor.execute("SELECT url FROM crawler_sites")
+  results = cursor.fetchall()
+  sites = [result["url"] for result in results]
+  return sites
+
+@url_handler("/crawlableSites")
+@basic_auth("crawler")
+def crawlable_sites(environ, start_response):
+  urls = _fetch_crawlable_sites()
+  start_response("200 OK", [("Content-Type", "text/plain")])
+  return "\n".join(urls)
+
+def _find_site_id(site_url):
+  cursor = get_cursor()
+  cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", (site_url,))
+  result = cursor.fetchone()
+  return result["id"] if result else None
+
+def _read_multipart_lines(environ, line_callback):
+  data_file = environ["wsgi.input"]
+  content_type = environ.get("CONTENT_TYPE")
+  if not content_type:
+    raise ValueError("Content-Type header is missing")
+
+  match = re.search(r"boundary=(.*)", content_type)
+  if not match:
+    raise ValueError("Multipart form data or boundary declaration missing")
+
+  boundary = match.group(1)
+  boundary_passed = False
+  header_passed = False
+
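+  # Minimal body format this parser accepts (an assumption inferred from the
+  # loop below): the opening boundary, an empty line ending the part headers,
+  # one JSON record per line, then the closing boundary, e.g.:
+  #   --BOUNDARY
+  #
+  #   ["http://example.com/ad.js", "http://example.com/", true]
+  #   --BOUNDARY--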
+  for line in data_file:
+    line = line.strip()
+
+    if not boundary_passed:
+      # Skip preamble lines until the opening boundary
+      if line == "--" + boundary:
+        boundary_passed = True
+      continue
+
+    if not header_passed:
+      # Skip the part headers, which end with an empty line
+      if not line:
+        header_passed = True
+      continue
+
+    if line == "--" + boundary + "--":
+      # Closing boundary reached, stop reading
+      break
+
+    if line:
+      line_callback(line)
+
+def _create_run():
+  cursor = get_cursor()
+  cursor.execute("INSERT INTO crawler_runs () VALUES ()")
+  return cursor.lastrowid
+
+def _insert_data(run_id, site, url, filtered):
+  site_id = _find_site_id(site)
+  if site_id is None:
+    print >>sys.stderr, "Unable to find site '%s' in the database" % site
+    return
+
+  cursor = get_cursor()
+  cursor.execute("""
+INSERT INTO crawler_data (run, site, url, filtered)
+VALUES (%s, %s, %s, %s)""",
+                 (run_id, site_id, url, filtered))
+
+@url_handler("/crawlerData")
+@basic_auth("crawler")
+def crawler_data(environ, start_response):
+  def line_callback(line):
+    try:
+      url, site, filtered = simplejson.loads(line)
+      _insert_data(run_id, site, url, filtered)
+    except simplejson.JSONDecodeError:
+      print >>sys.stderr, "Unable to parse JSON from '%s'" % line
+
+  run_id = _create_run()
+  try:
+    _read_multipart_lines(environ, line_callback)
+    start_response("200 OK", [("Content-Type", "text/plain")])
+    return ""
+  except ValueError as e:
+    start_response("400 Bad Request", [("Content-Type", "text/plain")])
+    return str(e)

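For reference, a client for the two handlers above might look like the sketch below. It is not part of this patch; the base URL and credentials are placeholders, and the request body is kept to the minimal multipart form that _read_multipart_lines() accepts (opening boundary, an empty line ending the part headers, one JSON record per line, closing boundary).

import urllib2, simplejson

def post_crawler_data(entries, base_url, user, password):
  # entries: list of (url, site, filtered) tuples, in the order that
  # the crawler_data handler unpacks from each JSON line
  boundary = "----crawlerDataBoundary"
  lines = ["--" + boundary, ""]
  for url, site, filtered in entries:
    lines.append(simplejson.dumps([url, site, filtered]))
  lines.append("--" + boundary + "--")

  # HTTP Basic credentials, matching the basic_auth("crawler") decorator
  password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
  password_manager.add_password(None, base_url, user, password)
  opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(password_manager))

  request = urllib2.Request(base_url + "/crawlerData", "\r\n".join(lines))
  request.add_header("Content-Type",
                     "multipart/form-data; boundary=" + boundary)
  return opener.open(request).read()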