OLD | NEW |
---|---|
1 import MySQLdb, os | 1 import MySQLdb, os, simplejson |
2 from sitescripts.utils import cached, get_config | 2 from sitescripts.utils import cached, get_config |
3 from sitescripts.web import url_handler | 3 from sitescripts.web import url_handler, basic_auth |
4 from urlparse import parse_qs | |
5 | 4 |
6 @cached(600) | 5 @cached(600) |
7 def get_db(): | 6 def _get_db(): |
8 database = get_config().get("crawler", "database") | 7 database = get_config().get("crawler", "database") |
9 dbuser = get_config().get("crawler", "dbuser") | 8 dbuser = get_config().get("crawler", "dbuser") |
10 dbpasswd = get_config().get("crawler", "dbpassword") | 9 dbpasswd = get_config().get("crawler", "dbpassword") |
11 if os.name == "nt": | 10 if os.name == "nt": |
12 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8", named_pipe=True) | 11 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8", named_pipe=True) |
13 else: | 12 else: |
14 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8") | 13 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8") |
15 | 14 |
16 def get_cursor(): | 15 def get_cursor(): |
17 return get_db().cursor(MySQLdb.cursors.DictCursor) | 16 return _get_db().cursor(MySQLdb.cursors.DictCursor) |
18 | 17 |
19 def fetch_crawlable_urls(): | 18 def _fetch_crawlable_sites(): |
20 cursor = get_cursor() | 19 cursor = get_cursor() |
21 cursor.execute("SELECT url from crawler_sites") | 20 cursor.execute("SELECT url from crawler_sites") |
22 results = cursor.fetchall() | 21 results = cursor.fetchall() |
23 urls = [result["url"] for result in results] | 22 sites = [result["url"] for result in results] |
24 return urls | 23 return sites |
25 | 24 |
26 @url_handler("/crawlableUrls") | 25 @url_handler("/crawlableSites") |
27 def crawlable_urls(environ, start_response): | 26 @basic_auth |
28 urls = fetch_crawlable_urls() | 27 def crawlable_sites(environ, start_response): |
28 urls = _fetch_crawlable_sites() | |
29 start_response("200 OK", [("Content-Type", "text/plain")]) | 29 start_response("200 OK", [("Content-Type", "text/plain")]) |
30 return "\n".join(urls) | 30 return "\n".join(urls) |
31 | 31 |
32 @url_handler("/crawlerRun") | 32 def _find_site_id(site_url): |
33 def crawler_run(environ, start_response): | |
34 cursor = get_cursor() | |
35 cursor.execute("INSERT INTO crawler_runs () VALUES ()") | |
36 start_response("200 OK", [("Content-Type", "text/plain")]) | |
37 return str(cursor.lastrowid) | |
38 | |
39 def find_site_id(site_url): | |
40 cursor = get_cursor() | 33 cursor = get_cursor() |
41 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) | 34 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) |
42 return cursor.fetchall()[0]["id"] | 35 return cursor.fetchall()[0]["id"] |
Wladimir Palant, 2012/09/14 17:24:18:
a) Why use fetchall() if we are only interested in
Felix Dahlke, 2012/09/26 15:20:30:
Done.
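The reviewer's point above seems to be that only one row is ever needed from this query. A minimal sketch of _find_site_id using fetchone() instead of fetchall(), assuming the site URL matches at most one row:

    def _find_site_id(site_url):
        cursor = get_cursor()
        cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
        # fetchone() returns a single row as a dict (or None), so there is no need
        # to materialize the whole result set just to read its first row
        result = cursor.fetchone()
        return result["id"] if result else None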
43 | 36 |
44 @url_handler("/crawlerData") | 37 def _read_multipart_lines(environ, line_callback): |
45 def crawler_data(environ, start_response): | 38 data_file = environ["wsgi.input"] |
46 params = parse_qs(environ["QUERY_STRING"]) | 39 current_line = 0 |
47 run_id = params["run"][0] | 40 |
48 site_id = find_site_id(params["site"][0]) | 41 while True: |
49 request_url = params["request_url"][0] | 42 line = data_file.readline().strip() |
Wladimir Palant, 2012/09/14 17:24:18:
Same as earlier, please use:
for line in enviro
Felix Dahlke, 2012/09/26 15:20:30:
Done.
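The reviewer's suggestion is cut off above, but it appears to be to iterate over the input stream directly instead of calling readline() in a while loop. A rough sketch of the same boundary-skipping logic in that style, assuming environ["wsgi.input"] is iterable line by line as the WSGI spec requires:

    def _read_multipart_lines(environ, line_callback):
        boundary = None
        for current_line, raw_line in enumerate(environ["wsgi.input"], 1):
            line = raw_line.strip()
            if current_line == 1:
                boundary = line              # first body line is the part boundary
            elif current_line < 5 or not line:
                continue                     # skip the part headers and blank lines
            elif line.startswith(boundary):
                break                        # closing boundary ends the payload
            else:
                line_callback(line)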
50 document_url = params["document_url"][0] | 43 current_line += 1 |
44 | |
45 if current_line == 1: | |
46 boundary = line | |
47 continue | |
48 | |
49 if current_line < 5 or not line: | |
Wladimir Palant, 2012/09/14 17:24:18:
No, that's not how you parse multipart/form-data (
Felix Dahlke, 2012/09/26 15:20:30:
Done. Wow, that was one mean hack I did there :)
Wladimir Palant, 2012/09/27 07:34:17:
Much better now :)
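As the reviewer notes, the parser in the new code leans on a fixed layout: one part, with exactly three lines between the boundary and the payload. A more conventional alternative would be to let the standard library handle the multipart framing; a rough sketch using cgi.FieldStorage, where the field name "data" is only an assumption and not taken from the patch:

    import cgi

    def _read_multipart_lines(environ, line_callback):
        # cgi.FieldStorage parses the boundary and part headers for us
        form = cgi.FieldStorage(fp=environ["wsgi.input"], environ=environ,
                                keep_blank_values=True)
        part = form["data"]                  # hypothetical field name
        for raw_line in part.file:
            line = raw_line.strip()
            if line:
                line_callback(line)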
50 continue | |
51 | |
52 if line.startswith(boundary): | |
53 break | |
54 | |
55 line_callback(line) | |
56 | |
57 def _create_run(): | |
58 cursor = get_cursor() | |
59 cursor.execute("INSERT INTO crawler_runs () VALUES ()") | |
60 return cursor.lastrowid | |
61 | |
62 def _insert_data(run_id, site, url, filtered): | |
63 site_id = _find_site_id(site) | |
51 cursor = get_cursor() | 64 cursor = get_cursor() |
52 cursor.execute(""" | 65 cursor.execute(""" |
53 INSERT INTO crawler_data (run, site, request_url, document_url) | 66 INSERT INTO crawler_data (run, site, url, filtered) |
54 VALUES (%s, %s, %s, %s)""", | 67 VALUES (%s, %s, %s, %s)""", |
55 (run_id, site_id, request_url, document_url)) | 68 (run_id, site_id, url, filtered)) |
69 | |
70 @url_handler("/crawlerData") | |
71 @basic_auth | |
72 def crawler_data(environ, start_response): | |
73 def line_callback(line): | |
74 url, site, filtered = simplejson.loads(line) | |
75 _insert_data(run_id, site, url, filtered) | |
76 | |
77 run_id = _create_run() | |
78 _read_multipart_lines(environ, line_callback) | |
56 start_response("200 OK", [("Content-Type", "text/plain")]) | 79 start_response("200 OK", [("Content-Type", "text/plain")]) |
57 return "" | 80 return "" |