Index: sitescripts/crawler/web/crawler.py |
=================================================================== |
--- a/sitescripts/crawler/web/crawler.py |
+++ b/sitescripts/crawler/web/crawler.py |
@@ -37,6 +37,16 @@ |
result = cursor.fetchone() |
return result["id"] if result else None |
+def _find_filter_id(filter_str): |
+ if filter_str is None: |
+ return None |
+ |
+ cursor = _get_cursor() |
+ query = "SELECT id FROM crawler_filters WHERE filter_hash = sha1(%s)" |
+  cursor.execute(query, (filter_str,)) |
+ result = cursor.fetchone() |
+ return result["id"] if result else None |
+ |
def _read_multipart_lines(environ, line_callback): |
data_file = environ["wsgi.input"] |
content_type = environ.get("CONTENT_TYPE") |
@@ -75,25 +85,37 @@ |
cursor.execute("INSERT INTO crawler_runs () VALUES ()") |
return cursor.lastrowid |
-def _insert_data(run_id, site, url, filtered): |
+def _insert_data(run_id, site, url, filtered, filter_str): |
site_id = _find_site_id(site) |
if site_id is None: |
print >>sys.stderr, "Unable to find site '%s' in the database" % site |
return |
cursor = _get_cursor() |
- cursor.execute(""" |
-INSERT INTO crawler_data (run, site, url, filtered) |
-VALUES (%s, %s, %s, %s)""", |
- (run_id, site_id, url, filtered)) |
+ insert = """ |
+INSERT INTO crawler_requests (run, site, url, filtered) |
+VALUES (%s, %s, %s, %s)""" |
+ cursor.execute(insert, (run_id, site_id, url, filtered)) |
-@url_handler("/crawlerData") |
+ filter_id = _find_filter_id(filter_str) |
+ if filter_id is not None: |
+    update = "UPDATE crawler_requests SET filter = %s WHERE id = %s" |
+ cursor.execute(update, (filter_id, cursor.lastrowid)) |
+ |
+@url_handler("/crawlerRequests") |
@basic_auth("crawler") |
-def crawler_data(environ, start_response): |
+def crawler_requests(environ, start_response): |
def line_callback(line): |
try: |
- url, site, filtered = simplejson.loads(line) |
- _insert_data(run_id, site, url, filtered) |
+ data = simplejson.loads(line) |
+ if len(data) < 3: |
+ print >>sys.stderr, "Not enough elements in line '%s'" % line |
+ return |
+ url = data[0] |
+ site = data[1] |
+ filtered = data[2] |
+ filter_str = data[3] if len(data) >= 4 else None |
+ _insert_data(run_id, site, url, filtered, filter_str) |
except simplejson.JSONDecodeError: |
print >>sys.stderr, "Unable to parse JSON from '%s'" % line |
@@ -104,4 +126,5 @@ |
return "" |
except ValueError as e: |
start_response("401 Bad Request", [("Content-Type", "text/plain")]) |
+ print >>sys.stderr, "Unable to read multipart data: %s" % e |
return e |