Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: sitescripts/crawler/web/crawler.py

Issue 8492019: sitescripts: Collect unmatched filters (Closed)
Patch Set: Created Oct. 2, 2012, 5:02 a.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « sitescripts/crawler/schema.sql ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show/Hide Comments ('s')
Index: sitescripts/crawler/web/crawler.py
===================================================================
--- a/sitescripts/crawler/web/crawler.py
+++ b/sitescripts/crawler/web/crawler.py
@@ -37,6 +37,16 @@
result = cursor.fetchone()
return result["id"] if result else None
def _find_filter_id(filter_str):
  """Return the database id of the given filter string, or None.

  The lookup goes through crawler_filters.filter_hash (SHA-1 of the
  filter text computed by MySQL) rather than the filter text itself.
  A None filter_str short-circuits to None without touching the DB.
  """
  if filter_str is None:
    return None

  cursor = _get_cursor()
  query = "SELECT id FROM crawler_filters WHERE filter_hash = sha1(%s)"
  # DB-API 2.0 requires a parameter *sequence*; passing the bare string
  # only happened to work with MySQLdb and is not portable.
  cursor.execute(query, (filter_str,))
  result = cursor.fetchone()
  return result["id"] if result else None
+
def _read_multipart_lines(environ, line_callback):
data_file = environ["wsgi.input"]
content_type = environ.get("CONTENT_TYPE")
@@ -75,25 +85,37 @@
cursor.execute("INSERT INTO crawler_runs () VALUES ()")
return cursor.lastrowid
def _insert_data(run_id, site, url, filtered, filter_str):
  """Record one crawled request, optionally linking the matching filter.

  run_id     -- id of the current crawler run (crawler_runs row)
  site       -- site name, resolved to its id via _find_site_id()
  url        -- requested URL
  filtered   -- whether the request was blocked/filtered
  filter_str -- filter text that matched, or None if nothing matched

  Logs to stderr and returns without inserting when the site is unknown.
  """
  site_id = _find_site_id(site)
  if site_id is None:
    print >>sys.stderr, "Unable to find site '%s' in the database" % site
    return

  cursor = _get_cursor()
  insert = """
INSERT INTO crawler_requests (run, site, url, filtered)
VALUES (%s, %s, %s, %s)"""
  cursor.execute(insert, (run_id, site_id, url, filtered))

  # The filter is attached in a second statement so the insert above
  # succeeds even when the filter cannot be found.
  filter_id = _find_filter_id(filter_str)
  if filter_id is not None:
    update = "UPDATE crawler_requests SET filter = %s WHERE id = %s"
    cursor.execute(update, (filter_id, cursor.lastrowid))
+
+@url_handler("/crawlerRequests")
@basic_auth("crawler")
-def crawler_data(environ, start_response):
+def crawler_requests(environ, start_response):
def line_callback(line):
  """Parse one JSON-encoded request line and store it via _insert_data.

  Expected payload: [url, site, filtered] with an optional fourth
  element carrying the matched filter text. Malformed lines are
  reported to stderr and skipped so one bad line cannot abort the
  whole upload.
  """
  try:
    data = simplejson.loads(line)
    # Guard against valid JSON that is not a list (e.g. a number or
    # object) -- len()/indexing on it would raise TypeError, which the
    # JSONDecodeError handler below would not catch.
    if not isinstance(data, list) or len(data) < 3:
      print >>sys.stderr, "Not enough elements in line '%s'" % line
      return
    url = data[0]
    site = data[1]
    filtered = data[2]
    filter_str = data[3] if len(data) >= 4 else None
    _insert_data(run_id, site, url, filtered, filter_str)
  except simplejson.JSONDecodeError:
    print >>sys.stderr, "Unable to parse JSON from '%s'" % line
@@ -104,4 +126,5 @@
return ""
except ValueError as e:
start_response("401 Bad Request", [("Content-Type", "text/plain")])
+ print >>sys.stderr, "Unable to read multipart data: %s" % e
return e
« no previous file with comments | « sitescripts/crawler/schema.sql ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld