| Index: sitescripts/crawler/web/crawler.py | 
| =================================================================== | 
| --- a/sitescripts/crawler/web/crawler.py | 
| +++ b/sitescripts/crawler/web/crawler.py | 
| @@ -37,6 +37,16 @@ | 
| result = cursor.fetchone() | 
| return result["id"] if result else None | 
|  | 
| +def _find_filter_id(filter_str): | 
| +  if filter_str is None: | 
| +    return None | 
| + | 
| +  cursor = _get_cursor() | 
| +  query = "SELECT id FROM crawler_filters WHERE filter_hash = sha1(%s)" | 
| +  cursor.execute(query, (filter_str,)) | 
| +  result = cursor.fetchone() | 
| +  return result["id"] if result else None | 
| + | 
| def _read_multipart_lines(environ, line_callback): | 
| data_file = environ["wsgi.input"] | 
| content_type = environ.get("CONTENT_TYPE") | 
| @@ -75,25 +85,37 @@ | 
| cursor.execute("INSERT INTO crawler_runs () VALUES ()") | 
| return cursor.lastrowid | 
|  | 
| -def _insert_data(run_id, site, url, filtered): | 
| +def _insert_data(run_id, site, url, filtered, filter_str): | 
| site_id = _find_site_id(site) | 
| if site_id is None: | 
| print >>sys.stderr, "Unable to find site '%s' in the database" % site | 
| return | 
|  | 
| cursor = _get_cursor() | 
| -  cursor.execute(""" | 
| -INSERT INTO crawler_data (run, site, url, filtered) | 
| -VALUES (%s, %s, %s, %s)""", | 
| -                 (run_id, site_id, url, filtered)) | 
| +  insert = """ | 
| +INSERT INTO crawler_requests (run, site, url, filtered) | 
| +VALUES (%s, %s, %s, %s)""" | 
| +  cursor.execute(insert, (run_id, site_id, url, filtered)) | 
|  | 
| -@url_handler("/crawlerData") | 
| +  filter_id = _find_filter_id(filter_str) | 
| +  if filter_id is not None: | 
| +    update = "UPDATE crawler_requests SET filter = %s WHERE id = %s" | 
| +    cursor.execute(update, (filter_id, cursor.lastrowid)) | 
| + | 
| +@url_handler("/crawlerRequests") | 
| @basic_auth("crawler") | 
| -def crawler_data(environ, start_response): | 
| +def crawler_requests(environ, start_response): | 
| def line_callback(line): | 
| try: | 
| -      url, site, filtered = simplejson.loads(line) | 
| -      _insert_data(run_id, site, url, filtered) | 
| +      data = simplejson.loads(line) | 
| +      if len(data) < 3: | 
| +        print >>sys.stderr, "Not enough elements in line '%s'" % line | 
| +        return | 
| +      url = data[0] | 
| +      site = data[1] | 
| +      filtered = data[2] | 
| +      filter_str = data[3] if len(data) >= 4 else None | 
| +      _insert_data(run_id, site, url, filtered, filter_str) | 
| except simplejson.JSONDecodeError: | 
| print >>sys.stderr, "Unable to parse JSON from '%s'" % line | 
|  | 
| @@ -104,4 +126,5 @@ | 
| return "" | 
| except ValueError as e: | 
| start_response("401 Bad Request", [("Content-Type", "text/plain")]) | 
| +    print >>sys.stderr, "Unable to read multipart data: %s" % e | 
| return e | 
|  |