Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: sitescripts/crawler/web/crawler.py

Issue 8327353: Crawler backend (Closed)
Patch Set: README fix Created Sept. 14, 2012, 2:42 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: sitescripts/crawler/web/crawler.py
===================================================================
--- a/sitescripts/crawler/web/crawler.py
+++ b/sitescripts/crawler/web/crawler.py
@@ -1,10 +1,9 @@
-import MySQLdb, os
+import MySQLdb, os, simplejson
from sitescripts.utils import cached, get_config
-from sitescripts.web import url_handler
-from urlparse import parse_qs
+from sitescripts.web import url_handler, basic_auth
@cached(600)
-def get_db():
+def _get_db():
database = get_config().get("crawler", "database")
dbuser = get_config().get("crawler", "dbuser")
dbpasswd = get_config().get("crawler", "dbpassword")
@@ -14,44 +13,68 @@
return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8")
def get_cursor():
- return get_db().cursor(MySQLdb.cursors.DictCursor)
+ return _get_db().cursor(MySQLdb.cursors.DictCursor)
-def fetch_crawlable_urls():
+def _fetch_crawlable_sites():
cursor = get_cursor()
cursor.execute("SELECT url from crawler_sites")
results = cursor.fetchall()
- urls = [result["url"] for result in results]
- return urls
+ sites = [result["url"] for result in results]
+ return sites
-@url_handler("/crawlableUrls")
-def crawlable_urls(environ, start_response):
- urls = fetch_crawlable_urls()
+@url_handler("/crawlableSites")
+@basic_auth
+def crawlable_sites(environ, start_response):
+ urls = _fetch_crawlable_sites()
start_response("200 OK", [("Content-Type", "text/plain")])
return "\n".join(urls)
-@url_handler("/crawlerRun")
-def crawler_run(environ, start_response):
- cursor = get_cursor()
- cursor.execute("INSERT INTO crawler_runs () VALUES ()")
- start_response("200 OK", [("Content-Type", "text/plain")])
- return str(cursor.lastrowid)
-
-def find_site_id(site_url):
+def _find_site_id(site_url):
cursor = get_cursor()
cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
return cursor.fetchall()[0]["id"]
Wladimir Palant 2012/09/14 17:24:18 a) Why use fetchall() if we are only interested in the first result?
Felix Dahlke 2012/09/26 15:20:30 Done.
-@url_handler("/crawlerData")
-def crawler_data(environ, start_response):
- params = parse_qs(environ["QUERY_STRING"])
- run_id = params["run"][0]
- site_id = find_site_id(params["site"][0])
- request_url = params["request_url"][0]
- document_url = params["document_url"][0]
+def _read_multipart_lines(environ, line_callback):
+ data_file = environ["wsgi.input"]
+ current_line = 0
+
+ while True:
+ line = data_file.readline().strip()
Wladimir Palant 2012/09/14 17:24:18 Same as earlier, please use: for line in environ["wsgi.input"]:
Felix Dahlke 2012/09/26 15:20:30 Done.
+ current_line += 1
+
+ if current_line == 1:
+ boundary = line
+ continue
+
+ if current_line < 5 or not line:
Wladimir Palant 2012/09/14 17:24:18 No, that's not how you parse multipart/form-data.
Felix Dahlke 2012/09/26 15:20:30 Done. Wow, that was one mean hack I did there :)
Wladimir Palant 2012/09/27 07:34:17 Much better now :)
+ continue
+
+ if line.startswith(boundary):
+ break
+
+ line_callback(line)
+
+def _create_run():
+ cursor = get_cursor()
+ cursor.execute("INSERT INTO crawler_runs () VALUES ()")
+ return cursor.lastrowid
+
+def _insert_data(run_id, site, url, filtered):
+ site_id = _find_site_id(site)
cursor = get_cursor()
cursor.execute("""
-INSERT INTO crawler_data (run, site, request_url, document_url)
+INSERT INTO crawler_data (run, site, url, filtered)
VALUES (%s, %s, %s, %s)""",
- (run_id, site_id, request_url, document_url))
+ (run_id, site_id, url, filtered))
+
+@url_handler("/crawlerData")
+@basic_auth
+def crawler_data(environ, start_response):
+ def line_callback(line):
+ url, site, filtered = simplejson.loads(line)
+ _insert_data(run_id, site, url, filtered)
+
+ run_id = _create_run()
+ _read_multipart_lines(environ, line_callback)
start_response("200 OK", [("Content-Type", "text/plain")])
return ""

Powered by Google App Engine
This is Rietveld