LEFT | RIGHT |
---|---|
1 import MySQLdb, os, re, simplejson, sys | 1 import MySQLdb, os, re, simplejson, sys |
2 from sitescripts.utils import cached, get_config | 2 from sitescripts.utils import cached, get_config |
3 from sitescripts.web import url_handler, basic_auth | 3 from sitescripts.web import url_handler, basic_auth |
4 | 4 |
5 @cached(600) | 5 @cached(600) |
6 def _get_db(): | 6 def _get_db(): |
7 database = get_config().get("crawler", "database") | 7 database = get_config().get("crawler", "database") |
8 dbuser = get_config().get("crawler", "dbuser") | 8 dbuser = get_config().get("crawler", "dbuser") |
9 dbpasswd = get_config().get("crawler", "dbpassword") | 9 dbpasswd = get_config().get("crawler", "dbpassword") |
10 if os.name == "nt": | 10 if os.name == "nt": |
11 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, | 11 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
12 use_unicode=True, charset="utf8", named_pipe=True) | 12 use_unicode=True, charset="utf8", named_pipe=True) |
13 else: | 13 else: |
14 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, | 14 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
15 use_unicode=True, charset="utf8") | 15 use_unicode=True, charset="utf8") |
16 | 16 |
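The `@cached(600)` decorator above keeps the connection around for 600 seconds, so a fresh MySQL connection is not opened on every request. A minimal sketch of what such a time-based memoizer could look like, assuming positional arguments only (hypothetical; the actual sitescripts.utils implementation may differ):

```python
import time

def cached(timeout):
    # Hypothetical sketch of sitescripts.utils.cached; the real one may
    # differ. Results are kept per argument tuple and recomputed once
    # they are older than `timeout` seconds.
    def decorator(func):
        results = {}
        def wrapper(*args):
            now = time.time()
            if args not in results or now - results[args][1] > timeout:
                results[args] = (func(*args), now)
            return results[args][0]
        return wrapper
    return decorator
```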
17 def get_cursor(): | 17 def get_cursor(): |
18 return _get_db().cursor(MySQLdb.cursors.DictCursor) | 18 return _get_db().cursor(MySQLdb.cursors.DictCursor) |
19 | 19 |
20 def _fetch_crawlable_sites(): | 20 def _fetch_crawlable_sites(): |
21 cursor = get_cursor() | 21 cursor = get_cursor() |
22 cursor.execute("SELECT url from crawler_sites") | 22 cursor.execute("SELECT url from crawler_sites") |
23 results = cursor.fetchall() | 23 results = cursor.fetchall() |
24 sites = [result["url"] for result in results] | 24 sites = [result["url"] for result in results] |
25 return sites | 25 return sites |
26 | 26 |
27 @url_handler("/crawlableSites") | 27 @url_handler("/crawlableSites") |
28 @basic_auth | 28 @basic_auth("crawler") |
Wladimir Palant
2012/09/27 13:44:51
Shouldn't this be @basic_auth(config_section="crawler")?
Felix Dahlke
2012/09/27 14:15:33
Done.
| |
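A decorator that takes the config section as a parameter could be structured roughly as below, reusing the get_config() import from the top of the module. This is a hedged sketch, not the actual sitescripts.web implementation; the option names basic_auth_username and basic_auth_password are assumptions:

```python
import base64

def basic_auth(config_section):
    # Hypothetical sketch of a section-aware Basic Auth decorator; the
    # real sitescripts.web version may differ. Credentials come from the
    # given config section, e.g. [crawler].
    def decorator(handler):
        def wrapper(environ, start_response):
            expected = "%s:%s" % (
                get_config().get(config_section, "basic_auth_username"),
                get_config().get(config_section, "basic_auth_password"))
            auth = environ.get("HTTP_AUTHORIZATION", "")
            if auth.startswith("Basic ") and \
               base64.b64decode(auth[len("Basic "):]) == expected:
                return handler(environ, start_response)
            start_response("401 Unauthorized",
                           [("WWW-Authenticate", 'Basic realm="%s"' % config_section)])
            return ""
        return wrapper
    return decorator
```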
29 def crawlable_sites(environ, start_response): | 29 def crawlable_sites(environ, start_response): |
30 urls = _fetch_crawlable_sites() | 30 urls = _fetch_crawlable_sites() |
31 start_response("200 OK", [("Content-Type", "text/plain")]) | 31 start_response("200 OK", [("Content-Type", "text/plain")]) |
32 return "\n".join(urls) | 32 return "\n".join(urls) |
33 | 33 |
34 def _find_site_id(site_url): | 34 def _find_site_id(site_url): |
35 cursor = get_cursor() | 35 cursor = get_cursor() |
36 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) | 36 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) |
37 result = cursor.fetchone() | 37 result = cursor.fetchone() |
38 return result["id"] if result else None | 38 return result["id"] if result else None |
39 | 39 |
40 def _read_multipart_lines(environ, line_callback): | 40 def _read_multipart_lines(environ, line_callback): |
41 data_file = environ["wsgi.input"] | 41 data_file = environ["wsgi.input"] |
42 content_type = environ["CONTENT_TYPE"] | 42 content_type = environ.get("CONTENT_TYPE") |
43 if not content_type: | 43 if not content_type: |
44 raise ValueError("Content-Type missing from header") | 44 raise ValueError("Content-Type missing from header") |
Wladimir Palant
2012/09/27 13:44:51
Python doesn't work like that - accessing a non-existent key raises a KeyError, so this check would never be reached. Use environ.get() instead.
Felix Dahlke
2012/09/27 14:15:33
Done.
| |
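In short: subscripting a dict with a missing key raises KeyError before the `if not content_type` guard is ever reached, while `.get()` returns None so the guard can fire. A quick illustration:

```python
environ = {}  # request without a Content-Type header

try:
    content_type = environ["CONTENT_TYPE"]  # raises KeyError immediately
except KeyError:
    pass

content_type = environ.get("CONTENT_TYPE")  # returns None instead
assert content_type is None
```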
45 | 45 |
46 match = re.search(r"boundary=(.*)", content_type) | 46 match = re.search(r"boundary=(.*)", content_type) |
47 if not match: | 47 if not match: |
48 raise ValueError("Multipart form data or boundary declaration missing") | 48 raise ValueError("Multipart form data or boundary declaration missing") |
49 | 49 |
50 boundary = match.group(1) | 50 boundary = match.group(1) |
51 boundary_passed = False | 51 boundary_passed = False |
52 header_passed = False | 52 header_passed = False |
53 | 53 |
54 for line in data_file: | 54 for line in data_file: |
55 line = line.strip() | 55 line = line.strip() |
56 | 56 |
57 if not boundary_passed: | 57 if not boundary_passed: |
58 if line == "--" + boundary: | 58 if line == "--" + boundary: |
59 boundary_passed = True | 59 boundary_passed = True |
60 continue | 60 continue |
61 | 61 |
62 if not header_passed: | 62 if not header_passed: |
63 if not line: | 63 if not line: |
64 header_passed = True | 64 header_passed = True |
65 continue | 65 continue |
66 | 66 |
67 if line == "--" + boundary + "--": | 67 if line == "--" + boundary + "--": |
68 break | 68 break |
69 | 69 |
70 if line: | 70 if line: |
71 line_callback(line) | 71 line_callback(line) |
72 | 72 |
73 return True | |
Wladimir Palant
2012/09/27 13:44:51
That return value is unused, left-over from an earlier version?
Felix Dahlke
2012/09/27 14:15:33
Done.
| |
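For reference, the body format `_read_multipart_lines` expects looks like the following; the boundary and payload here are made up. Driving the parser with a fake environ shows that only the payload lines reach the callback:

```python
from StringIO import StringIO

body = "\r\n".join([
    "--XBOUNDARY",    # opening boundary, matched as "--" + boundary
    'Content-Disposition: form-data; name="file"; filename="data"',
    "",               # blank line ends the part headers
    '["http://example.com/ad.js", "http://example.com/", true]',
    "--XBOUNDARY--",  # closing boundary stops the loop
    ""])
environ = {
    "wsgi.input": StringIO(body),
    "CONTENT_TYPE": "multipart/form-data; boundary=XBOUNDARY",
}
lines = []
_read_multipart_lines(environ, lines.append)
print lines  # the JSON payload line, boundaries and headers stripped
```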
74 | |
75 def _create_run(): | 73 def _create_run(): |
76 cursor = get_cursor() | 74 cursor = get_cursor() |
77 cursor.execute("INSERT INTO crawler_runs () VALUES ()") | 75 cursor.execute("INSERT INTO crawler_runs () VALUES ()") |
78 return cursor.lastrowid | 76 return cursor.lastrowid |
79 | 77 |
80 def _insert_data(run_id, site, url, filtered): | 78 def _insert_data(run_id, site, url, filtered): |
81 site_id = _find_site_id(site) | 79 site_id = _find_site_id(site) |
82 if site_id is None: | 80 if site_id is None: |
83 print >>sys.stderr, "Unable to find site '%s' in the database" % site | 81 print >>sys.stderr, "Unable to find site '%s' in the database" % site |
84 return | 82 return |
85 | 83 |
86 cursor = get_cursor() | 84 cursor = get_cursor() |
87 cursor.execute(""" | 85 cursor.execute(""" |
88 INSERT INTO crawler_data (run, site, url, filtered) | 86 INSERT INTO crawler_data (run, site, url, filtered) |
89 VALUES (%s, %s, %s, %s)""", | 87 VALUES (%s, %s, %s, %s)""", |
90 (run_id, site_id, url, filtered)) | 88 (run_id, site_id, url, filtered)) |
91 | 89 |
92 @url_handler("/crawlerData") | 90 @url_handler("/crawlerData") |
93 @basic_auth | 91 @basic_auth("crawler") |
Wladimir Palant
2012/09/27 13:44:51
Shouldn't this be @basic_auth(config_section="crawler")?
Felix Dahlke
2012/09/27 14:15:33
Done.
| |
94 def crawler_data(environ, start_response): | 92 def crawler_data(environ, start_response): |
95 def line_callback(line): | 93 def line_callback(line): |
96 try: | 94 try: |
97 url, site, filtered = simplejson.loads(line) | 95 url, site, filtered = simplejson.loads(line) |
98 _insert_data(run_id, site, url, filtered) | 96 _insert_data(run_id, site, url, filtered) |
99 except simplejson.JSONDecodeError: | 97 except simplejson.JSONDecodeError: |
100 print >>sys.stderr, "Unable to parse JSON from '%s'" % line | 98 print >>sys.stderr, "Unable to parse JSON from '%s'" % line |
101 | 99 |
102 run_id = _create_run() | 100 run_id = _create_run() |
103 try: | 101 try: |
104 _read_multipart_lines(environ, line_callback) | 102 _read_multipart_lines(environ, line_callback) |
105 start_response("200 OK", [("Content-Type", "text/plain")]) | 103 start_response("200 OK", [("Content-Type", "text/plain")]) |
106 return "" | 104 return "" |
107 except ValueError as e: | 105 except ValueError as e: |
108 start_response("401 Bad Request", [("Content-Type", "text/plain")]) | 106 start_response("401 Bad Request", [("Content-Type", "text/plain")]) |
109 return e | 107 return e |
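Each payload line passed to line_callback is expected to be a JSON array of [url, site, filtered]; the values below are illustrative:

```python
import simplejson

line = simplejson.dumps(["http://example.com/ad.js", "http://example.com/", True])
url, site, filtered = simplejson.loads(line)  # what line_callback unpacks
```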