| Index: sitescripts/crawler/web/crawler.py |
| =================================================================== |
| --- a/sitescripts/crawler/web/crawler.py |
| +++ b/sitescripts/crawler/web/crawler.py |
| @@ -1,10 +1,9 @@ |
| -import MySQLdb, os |
| +import MySQLdb, os, simplejson |
| from sitescripts.utils import cached, get_config |
| -from sitescripts.web import url_handler |
| -from urlparse import parse_qs |
| +from sitescripts.web import url_handler, basic_auth |
| @cached(600) |
| -def get_db(): |
| +def _get_db(): |
| database = get_config().get("crawler", "database") |
| dbuser = get_config().get("crawler", "dbuser") |
| dbpasswd = get_config().get("crawler", "dbpassword") |
| @@ -14,44 +13,68 @@ |
| return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8") |
| def get_cursor(): |
| - return get_db().cursor(MySQLdb.cursors.DictCursor) |
| + return _get_db().cursor(MySQLdb.cursors.DictCursor) |
| -def fetch_crawlable_urls(): |
| +def _fetch_crawlable_sites(): |
| cursor = get_cursor() |
| cursor.execute("SELECT url from crawler_sites") |
| results = cursor.fetchall() |
| - urls = [result["url"] for result in results] |
| - return urls |
| + sites = [result["url"] for result in results] |
| + return sites |
| -@url_handler("/crawlableUrls") |
| -def crawlable_urls(environ, start_response): |
| - urls = fetch_crawlable_urls() |
| +@url_handler("/crawlableSites") |
| +@basic_auth |
| +def crawlable_sites(environ, start_response): |
| + urls = _fetch_crawlable_sites() |
| start_response("200 OK", [("Content-Type", "text/plain")]) |
| return "\n".join(urls) |
| -@url_handler("/crawlerRun") |
| -def crawler_run(environ, start_response): |
| - cursor = get_cursor() |
| - cursor.execute("INSERT INTO crawler_runs () VALUES ()") |
| - start_response("200 OK", [("Content-Type", "text/plain")]) |
| - return str(cursor.lastrowid) |
| - |
| -def find_site_id(site_url): |
| +def _find_site_id(site_url): |
| cursor = get_cursor() |
| cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) |
| return cursor.fetchall()[0]["id"] |
|
Wladimir Palant
2012/09/14 17:24:18
a) Why use fetchall() if we are only interested in the first result? fetchone() would do. [comment truncated in extraction]
Felix Dahlke
2012/09/26 15:20:30
Done.
|
| -@url_handler("/crawlerData") |
| -def crawler_data(environ, start_response): |
| - params = parse_qs(environ["QUERY_STRING"]) |
| - run_id = params["run"][0] |
| - site_id = find_site_id(params["site"][0]) |
| - request_url = params["request_url"][0] |
| - document_url = params["document_url"][0] |
| +def _read_multipart_lines(environ, line_callback): |
| + data_file = environ["wsgi.input"] |
| + current_line = 0 |
| + |
| + while True: |
| + line = data_file.readline().strip() |
|
Wladimir Palant
2012/09/14 17:24:18
Same as earlier, please use:
for line in environ["wsgi.input"]: [comment truncated in extraction]
Felix Dahlke
2012/09/26 15:20:30
Done.
|
| + current_line += 1 |
| + |
| + if current_line == 1: |
| + boundary = line |
| + continue |
| + |
| + if current_line < 5 or not line: |
|
Wladimir Palant
2012/09/14 17:24:18
No, that's not how you parse multipart/form-data (the boundary comes from the Content-Type header and part headers must be parsed, not skipped by counting lines). [comment truncated in extraction]
Felix Dahlke
2012/09/26 15:20:30
Done. Wow, that was one mean hack I did there :)
Wladimir Palant
2012/09/27 07:34:17
Much better now :)
|
| + continue |
| + |
| + if line.startswith(boundary): |
| + break |
| + |
| + line_callback(line) |
| + |
| +def _create_run(): |
| + cursor = get_cursor() |
| + cursor.execute("INSERT INTO crawler_runs () VALUES ()") |
| + return cursor.lastrowid |
| + |
| +def _insert_data(run_id, site, url, filtered): |
| + site_id = _find_site_id(site) |
| cursor = get_cursor() |
| cursor.execute(""" |
| -INSERT INTO crawler_data (run, site, request_url, document_url) |
| +INSERT INTO crawler_data (run, site, url, filtered) |
| VALUES (%s, %s, %s, %s)""", |
| - (run_id, site_id, request_url, document_url)) |
| + (run_id, site_id, url, filtered)) |
| + |
| +@url_handler("/crawlerData") |
| +@basic_auth |
| +def crawler_data(environ, start_response): |
| + def line_callback(line): |
| + url, site, filtered = simplejson.loads(line) |
| + _insert_data(run_id, site, url, filtered) |
| + |
| + run_id = _create_run() |
| + _read_multipart_lines(environ, line_callback) |
| start_response("200 OK", [("Content-Type", "text/plain")]) |
| return "" |