OLD | NEW |
1 import MySQLdb, os | 1 import MySQLdb, os, simplejson |
2 from sitescripts.utils import cached, get_config | 2 from sitescripts.utils import cached, get_config |
3 from sitescripts.web import url_handler | 3 from sitescripts.web import url_handler, basic_auth |
4 from urlparse import parse_qs | |
5 | 4 |
6 @cached(600) | 5 @cached(600) |
7 def get_db(): | 6 def _get_db(): |
8 database = get_config().get("crawler", "database") | 7 database = get_config().get("crawler", "database") |
9 dbuser = get_config().get("crawler", "dbuser") | 8 dbuser = get_config().get("crawler", "dbuser") |
10 dbpasswd = get_config().get("crawler", "dbpassword") | 9 dbpasswd = get_config().get("crawler", "dbpassword") |
11 if os.name == "nt": | 10 if os.name == "nt": |
12 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8", named_pipe=True) | 11 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8", named_pipe=True) |
13 else: | 12 else: |
14 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8") | 13 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8") |
15 | 14 |
16 def get_cursor(): | 15 def get_cursor(): |
17 return get_db().cursor(MySQLdb.cursors.DictCursor) | 16 return _get_db().cursor(MySQLdb.cursors.DictCursor) |
18 | 17 |
19 def fetch_crawlable_urls(): | 18 def _fetch_crawlable_sites(): |
20 cursor = get_cursor() | 19 cursor = get_cursor() |
21 cursor.execute("SELECT url from crawler_sites") | 20 cursor.execute("SELECT url from crawler_sites") |
22 results = cursor.fetchall() | 21 results = cursor.fetchall() |
23 urls = [result["url"] for result in results] | 22 sites = [result["url"] for result in results] |
24 return urls | 23 return sites |
25 | 24 |
26 @url_handler("/crawlableUrls") | 25 @url_handler("/crawlableSites") |
27 def crawlable_urls(environ, start_response): | 26 @basic_auth |
28 urls = fetch_crawlable_urls() | 27 def crawlable_sites(environ, start_response): |
| 28 urls = _fetch_crawlable_sites() |
29 start_response("200 OK", [("Content-Type", "text/plain")]) | 29 start_response("200 OK", [("Content-Type", "text/plain")]) |
30 return "\n".join(urls) | 30 return "\n".join(urls) |
31 | 31 |
32 @url_handler("/crawlerRun") | 32 def _find_site_id(site_url): |
33 def crawler_run(environ, start_response): | |
34 cursor = get_cursor() | |
35 cursor.execute("INSERT INTO crawler_runs () VALUES ()") | |
36 start_response("200 OK", [("Content-Type", "text/plain")]) | |
37 return str(cursor.lastrowid) | |
38 | |
39 def find_site_id(site_url): | |
40 cursor = get_cursor() | 33 cursor = get_cursor() |
41 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) | 34 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) |
42 return cursor.fetchall()[0]["id"] | 35 return cursor.fetchall()[0]["id"] |
43 | 36 |
44 @url_handler("/crawlerData") | 37 def _read_multipart_lines(environ, line_callback): |
45 def crawler_data(environ, start_response): | 38 data_file = environ["wsgi.input"] |
46 params = parse_qs(environ["QUERY_STRING"]) | 39 current_line = 0 |
47 run_id = params["run"][0] | 40 |
48 site_id = find_site_id(params["site"][0]) | 41 while True: |
49 request_url = params["request_url"][0] | 42 line = data_file.readline().strip() |
50 document_url = params["document_url"][0] | 43 current_line += 1 |
| 44 |
| 45 if current_line == 1: |
| 46 boundary = line |
| 47 continue |
| 48 |
| 49 if current_line < 5 or not line: |
| 50 continue |
| 51 |
| 52 if line.startswith(boundary): |
| 53 break |
| 54 |
| 55 line_callback(line) |
| 56 |
| 57 def _create_run(): |
| 58 cursor = get_cursor() |
| 59 cursor.execute("INSERT INTO crawler_runs () VALUES ()") |
| 60 return cursor.lastrowid |
| 61 |
| 62 def _insert_data(run_id, site, url, filtered): |
| 63 site_id = _find_site_id(site) |
51 cursor = get_cursor() | 64 cursor = get_cursor() |
52 cursor.execute(""" | 65 cursor.execute(""" |
53 INSERT INTO crawler_data (run, site, request_url, document_url) | 66 INSERT INTO crawler_data (run, site, url, filtered) |
54 VALUES (%s, %s, %s, %s)""", | 67 VALUES (%s, %s, %s, %s)""", |
55 (run_id, site_id, request_url, document_url)) | 68 (run_id, site_id, url, filtered)) |
| 69 |
| 70 @url_handler("/crawlerData") |
| 71 @basic_auth |
| 72 def crawler_data(environ, start_response): |
| 73 def line_callback(line): |
| 74 url, site, filtered = simplejson.loads(line) |
| 75 _insert_data(run_id, site, url, filtered) |
| 76 |
| 77 run_id = _create_run() |
| 78 _read_multipart_lines(environ, line_callback) |
56 start_response("200 OK", [("Content-Type", "text/plain")]) | 79 start_response("200 OK", [("Content-Type", "text/plain")]) |
57 return "" | 80 return "" |
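Both handlers are now wrapped in @basic_auth, so clients have to send HTTP Basic credentials. As a minimal client-side sketch, not part of the change itself (the endpoint address and credentials below are placeholders), fetching the site list from the renamed /crawlableSites endpoint could look like this; the handler returns one site URL per line:

import base64, urllib2

def fetch_crawlable_sites(endpoint, user, password):
  # crawlable_sites() above answers with "\n".join(urls), one URL per line.
  request = urllib2.Request(endpoint)
  request.add_header("Authorization", "Basic " + base64.b64encode("%s:%s" % (user, password)))
  return urllib2.urlopen(request).read().splitlines()

sites = fetch_crawlable_sites("http://localhost/crawlableSites", "crawler", "secret")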
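The new /crawlerData handler parses the raw request body itself: _read_multipart_lines() takes the first line as the multipart boundary, skips the part headers and blank lines, and hands every remaining line to line_callback(), which decodes it as a JSON [url, site, filtered] array and stores it under a freshly created run. A hedged sketch of an upload that matches that format follows; the endpoint, credentials and form field name are assumptions for illustration only:

import base64, urllib2, simplejson

def post_crawler_data(entries, endpoint, user, password):
  # entries is a list of [url, site, filtered] triples; the handler calls
  # simplejson.loads() on each body line, so every entry becomes one line.
  boundary = "----crawlerDataBoundary"
  payload = "\r\n".join(simplejson.dumps(entry) for entry in entries)
  body = ("--%s\r\n"
          'Content-Disposition: form-data; name="data"; filename="data.txt"\r\n'
          "Content-Type: text/plain\r\n"
          "\r\n"
          "%s\r\n"
          "--%s--\r\n") % (boundary, payload, boundary)
  request = urllib2.Request(endpoint, body)
  request.add_header("Content-Type", "multipart/form-data; boundary=" + boundary)
  # @basic_auth on the handler requires HTTP Basic credentials here as well.
  request.add_header("Authorization", "Basic " + base64.b64encode("%s:%s" % (user, password)))
  return urllib2.urlopen(request).read()

post_crawler_data([["http://ads.example.com/ad.js", "http://news.example.org/", True]],
                  "http://localhost/crawlerData", "crawler", "secret")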