Index: sitescripts/crawler/web/crawler.py
===================================================================
--- a/sitescripts/crawler/web/crawler.py
+++ b/sitescripts/crawler/web/crawler.py
@@ -1,10 +1,9 @@
-import MySQLdb, os
+import MySQLdb, os, simplejson
 from sitescripts.utils import cached, get_config
-from sitescripts.web import url_handler
-from urlparse import parse_qs
+from sitescripts.web import url_handler, basic_auth
 
 @cached(600)
-def get_db():
+def _get_db():
   database = get_config().get("crawler", "database")
   dbuser = get_config().get("crawler", "dbuser")
   dbpasswd = get_config().get("crawler", "dbpassword")
@@ -14,44 +13,68 @@
     return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8")
 
 def get_cursor():
-  return get_db().cursor(MySQLdb.cursors.DictCursor)
+  return _get_db().cursor(MySQLdb.cursors.DictCursor)
 
-def fetch_crawlable_urls():
+def _fetch_crawlable_sites():
   cursor = get_cursor()
   cursor.execute("SELECT url from crawler_sites")
   results = cursor.fetchall()
-  urls = [result["url"] for result in results]
-  return urls
+  sites = [result["url"] for result in results]
+  return sites
 
-@url_handler("/crawlableUrls")
-def crawlable_urls(environ, start_response):
-  urls = fetch_crawlable_urls()
+@url_handler("/crawlableSites")
+@basic_auth
+def crawlable_sites(environ, start_response):
+  urls = _fetch_crawlable_sites()
   start_response("200 OK", [("Content-Type", "text/plain")])
   return "\n".join(urls)
 
-@url_handler("/crawlerRun")
-def crawler_run(environ, start_response):
-  cursor = get_cursor()
-  cursor.execute("INSERT INTO crawler_runs () VALUES ()")
-  start_response("200 OK", [("Content-Type", "text/plain")])
-  return str(cursor.lastrowid)
-
-def find_site_id(site_url):
+def _find_site_id(site_url):
   cursor = get_cursor()
   cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
   return cursor.fetchall()[0]["id"]
Wladimir Palant, 2012/09/14 17:24:18:
a) Why use fetchall() if we are only interested in…

Felix Dahlke, 2012/09/26 15:20:30:
Done.
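The reviewer's comment is cut off above; it apparently questions using fetchall() when only a single row is needed. A minimal sketch of what a revised _find_site_id might look like with fetchone() (the None handling for unknown URLs is an assumption, not something visible in this excerpt):

def _find_site_id(site_url):
  cursor = get_cursor()
  cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
  result = cursor.fetchone()
  # fetchone() returns None when no crawler_sites row matches the URL
  return result["id"] if result else None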
 
-@url_handler("/crawlerData")
-def crawler_data(environ, start_response):
-  params = parse_qs(environ["QUERY_STRING"])
-  run_id = params["run"][0]
-  site_id = find_site_id(params["site"][0])
-  request_url = params["request_url"][0]
-  document_url = params["document_url"][0]
+def _read_multipart_lines(environ, line_callback):
+  data_file = environ["wsgi.input"]
+  current_line = 0
+
+  while True:
+    line = data_file.readline().strip()
Wladimir Palant, 2012/09/14 17:24:18:
Same as earlier, please use:
for line in enviro…

Felix Dahlke, 2012/09/26 15:20:30:
Done.
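The suggested replacement is truncated above; it appears to ask for iterating over the input stream directly instead of the readline()/while loop. A rough sketch of that shape, keeping the patch's counting logic unchanged (not the committed version; the multipart handling itself is revisited in the next comment):

def _read_multipart_lines(environ, line_callback):
  current_line = 0
  # Iterate over the WSGI input stream rather than calling readline() manually
  for line in environ["wsgi.input"]:
    line = line.strip()
    current_line += 1
    if current_line == 1:
      boundary = line
      continue
    if current_line < 5 or not line:
      continue
    if line.startswith(boundary):
      break
    line_callback(line)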
+    current_line += 1
+
+    if current_line == 1:
+      boundary = line
+      continue
+
+    if current_line < 5 or not line:
Wladimir Palant, 2012/09/14 17:24:18:
No, that's not how you parse multipart/form-data (…

Felix Dahlke, 2012/09/26 15:20:30:
Done. Wow, that was one mean hack I did there :)

Wladimir Palant, 2012/09/27 07:34:17:
Much better now :)
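The fix the exchange above refers to is not part of this excerpt. One way to parse the body properly is to take the boundary from the Content-Type header and skip the part headers; the sketch below assumes a single part whose payload is the newline-separated data (re would need to be imported at the top of the module):

import re

def _read_multipart_lines(environ, line_callback):
  # The boundary is declared in the Content-Type request header,
  # e.g. "multipart/form-data; boundary=xyz"
  match = re.search(r"boundary=(.*)", environ.get("CONTENT_TYPE", ""))
  if not match:
    raise ValueError("multipart/form-data boundary missing")
  boundary = "--" + match.group(1)

  boundary_passed = False
  headers_passed = False
  for line in environ["wsgi.input"]:
    line = line.strip()
    if not boundary_passed:
      # Skip everything up to the opening boundary
      boundary_passed = (line == boundary)
      continue
    if not headers_passed:
      # The part's headers end with an empty line
      headers_passed = (line == "")
      continue
    if line.startswith(boundary):
      # The closing boundary ("--boundary--") ends the payload
      break
    if line:
      line_callback(line)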
+      continue
+
+    if line.startswith(boundary):
+      break
+
+    line_callback(line)
+
+def _create_run():
+  cursor = get_cursor()
+  cursor.execute("INSERT INTO crawler_runs () VALUES ()")
+  return cursor.lastrowid
+
+def _insert_data(run_id, site, url, filtered):
+  site_id = _find_site_id(site)
   cursor = get_cursor()
   cursor.execute("""
-INSERT INTO crawler_data (run, site, request_url, document_url)
+INSERT INTO crawler_data (run, site, url, filtered)
 VALUES (%s, %s, %s, %s)""",
-    (run_id, site_id, request_url, document_url))
+    (run_id, site_id, url, filtered))
+
+@url_handler("/crawlerData")
+@basic_auth
+def crawler_data(environ, start_response):
+  def line_callback(line):
+    url, site, filtered = simplejson.loads(line)
+    _insert_data(run_id, site, url, filtered)
+
+  run_id = _create_run()
+  _read_multipart_lines(environ, line_callback)
   start_response("200 OK", [("Content-Type", "text/plain")])
   return ""