| Index: sitescripts/crawler/web/crawler.py | 
| =================================================================== | 
| --- a/sitescripts/crawler/web/crawler.py | 
| +++ b/sitescripts/crawler/web/crawler.py | 
| @@ -1,10 +1,9 @@ | 
| -import MySQLdb, os | 
| +import MySQLdb, os, simplejson | 
| from sitescripts.utils import cached, get_config | 
| -from sitescripts.web import url_handler | 
| -from urlparse import parse_qs | 
| +from sitescripts.web import url_handler, basic_auth | 
|  | 
| @cached(600) | 
| -def get_db(): | 
| +def _get_db(): | 
| database = get_config().get("crawler", "database") | 
| dbuser = get_config().get("crawler", "dbuser") | 
| dbpasswd = get_config().get("crawler", "dbpassword") | 
| @@ -14,44 +13,68 @@ | 
| return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8") | 
|  | 
| def get_cursor(): | 
| -  return get_db().cursor(MySQLdb.cursors.DictCursor) | 
| +  return _get_db().cursor(MySQLdb.cursors.DictCursor) | 
|  | 
| -def fetch_crawlable_urls(): | 
| +def _fetch_crawlable_sites(): | 
| cursor = get_cursor() | 
| cursor.execute("SELECT url from crawler_sites") | 
| results = cursor.fetchall() | 
| -  urls = [result["url"] for result in results] | 
| -  return urls | 
| +  sites = [result["url"] for result in results] | 
| +  return sites | 
|  | 
| -@url_handler("/crawlableUrls") | 
| -def crawlable_urls(environ, start_response): | 
| -  urls = fetch_crawlable_urls() | 
| +@url_handler("/crawlableSites") | 
| +@basic_auth | 
| +def crawlable_sites(environ, start_response): | 
| +  urls = _fetch_crawlable_sites() | 
| start_response("200 OK", [("Content-Type", "text/plain")]) | 
| return "\n".join(urls) | 
|  | 
| -@url_handler("/crawlerRun") | 
| -def crawler_run(environ, start_response): | 
| -  cursor = get_cursor() | 
| -  cursor.execute("INSERT INTO crawler_runs () VALUES ()") | 
| -  start_response("200 OK", [("Content-Type", "text/plain")]) | 
| -  return str(cursor.lastrowid) | 
| - | 
| -def find_site_id(site_url): | 
| +def _find_site_id(site_url): | 
| cursor = get_cursor() | 
| cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) | 
| return cursor.fetchall()[0]["id"] | 
|  | 
| -@url_handler("/crawlerData") | 
| -def crawler_data(environ, start_response): | 
| -  params = parse_qs(environ["QUERY_STRING"]) | 
| -  run_id = params["run"][0] | 
| -  site_id = find_site_id(params["site"][0]) | 
| -  request_url = params["request_url"][0] | 
| -  document_url = params["document_url"][0] | 
| +def _read_multipart_lines(environ, line_callback): | 
| +  data_file = environ["wsgi.input"] | 
| +  current_line = 0 | 
| + | 
| +  while True: | 
| +    line = data_file.readline().strip() | 
| +    current_line += 1 | 
| + | 
| +    if current_line == 1: | 
| +      boundary = line | 
| +      continue | 
| + | 
| +    if current_line < 5 or not line: | 
| +      continue | 
| + | 
| +    if line.startswith(boundary): | 
| +      break | 
| + | 
| +    line_callback(line) | 
| + | 
| +def _create_run(): | 
| +  cursor = get_cursor() | 
| +  cursor.execute("INSERT INTO crawler_runs () VALUES ()") | 
| +  return cursor.lastrowid | 
| + | 
| +def _insert_data(run_id, site, url, filtered): | 
| +  site_id = _find_site_id(site) | 
| cursor = get_cursor() | 
| cursor.execute(""" | 
| -INSERT INTO crawler_data (run, site, request_url, document_url) | 
| +INSERT INTO crawler_data (run, site, url, filtered) | 
| VALUES (%s, %s, %s, %s)""", | 
| -                (run_id, site_id, request_url, document_url)) | 
| +                 (run_id, site_id, url, filtered)) | 
| + | 
| +@url_handler("/crawlerData") | 
| +@basic_auth | 
| +def crawler_data(environ, start_response): | 
| +  def line_callback(line): | 
| +    url, site, filtered = simplejson.loads(line) | 
| +    _insert_data(run_id, site, url, filtered) | 
| + | 
| +  run_id = _create_run() | 
| +  _read_multipart_lines(environ, line_callback) | 
| start_response("200 OK", [("Content-Type", "text/plain")]) | 
| return "" | 
|  |