| Left: | Right: |
| LEFT | RIGHT |
|---|---|
| 1 import MySQLdb, os, simplejson | 1 import MySQLdb, os, re, simplejson, sys |
| 2 from sitescripts.utils import cached, get_config | 2 from sitescripts.utils import cached, get_config |
| 3 from sitescripts.web import url_handler, basic_auth | 3 from sitescripts.web import url_handler, basic_auth |
| 4 | 4 |
| 5 @cached(600) | 5 @cached(600) |
| 6 def _get_db(): | 6 def _get_db(): |
| 7 database = get_config().get("crawler", "database") | 7 database = get_config().get("crawler", "database") |
| 8 dbuser = get_config().get("crawler", "dbuser") | 8 dbuser = get_config().get("crawler", "dbuser") |
| 9 dbpasswd = get_config().get("crawler", "dbpassword") | 9 dbpasswd = get_config().get("crawler", "dbpassword") |
| 10 if os.name == "nt": | 10 if os.name == "nt": |
| 11 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8", named_pipe=True) | 11 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
| 12 use_unicode=True, charset="utf8", named_pipe=True) | |
| 12 else: | 13 else: |
| 13 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8") | 14 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
| 15 use_unicode=True, charset="utf8") | |
| 14 | 16 |
| 15 def get_cursor(): | 17 def get_cursor(): |
| 16 return _get_db().cursor(MySQLdb.cursors.DictCursor) | 18 return _get_db().cursor(MySQLdb.cursors.DictCursor) |
| 17 | 19 |
| 18 def _fetch_crawlable_sites(): | 20 def _fetch_crawlable_sites(): |
| 19 cursor = get_cursor() | 21 cursor = get_cursor() |
| 20 cursor.execute("SELECT url from crawler_sites") | 22 cursor.execute("SELECT url from crawler_sites") |
| 21 results = cursor.fetchall() | 23 results = cursor.fetchall() |
| 22 sites = [result["url"] for result in results] | 24 sites = [result["url"] for result in results] |
| 23 return sites | 25 return sites |
| 24 | 26 |
| 25 @url_handler("/crawlableSites") | 27 @url_handler("/crawlableSites") |
| 26 @basic_auth | 28 @basic_auth("crawler") |
| 27 def crawlable_sites(environ, start_response): | 29 def crawlable_sites(environ, start_response): |
| 28 urls = _fetch_crawlable_sites() | 30 urls = _fetch_crawlable_sites() |
| 29 start_response("200 OK", [("Content-Type", "text/plain")]) | 31 start_response("200 OK", [("Content-Type", "text/plain")]) |
| 30 return "\n".join(urls) | 32 return "\n".join(urls) |
| 31 | 33 |
| 32 def _find_site_id(site_url): | 34 def _find_site_id(site_url): |
| 33 cursor = get_cursor() | 35 cursor = get_cursor() |
| 34 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) | 36 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) |
| 35 return cursor.fetchall()[0]["id"] | 37 result = cursor.fetchone() |
|
Wladimir Palant
2012/09/14 17:24:18
a) Why use fetchall() if we are only interested in the first result?
Felix Dahlke
2012/09/26 15:20:30
Done.
| |
| 38 return result["id"] if result else None | |
| 36 | 39 |
| 37 def _read_multipart_lines(environ, line_callback): | 40 def _read_multipart_lines(environ, line_callback): |
| 38 data_file = environ["wsgi.input"] | 41 data_file = environ["wsgi.input"] |
| 39 current_line = 0 | 42 content_type = environ.get("CONTENT_TYPE") |
| 43 if not content_type: | |
| 44 raise ValueError("Content-Type missing from header") | |
| 40 | 45 |
| 41 while True: | 46 match = re.search(r"boundary=(.*)", content_type) |
| 42 line = data_file.readline().strip() | 47 if not match: |
|
Wladimir Palant
2012/09/14 17:24:18
Same as earlier, please use:
for line in environ["wsgi.input"]:
Felix Dahlke
2012/09/26 15:20:30
Done.
| |
| 43 current_line += 1 | 48 raise ValueError("Multipart form data or boundary declaration missing") |
| 44 | 49 |
| 45 if current_line == 1: | 50 boundary = match.group(1) |
| 46 boundary = line | 51 boundary_passed = False |
| 52 header_passed = False | |
| 53 | |
| 54 for line in data_file: | |
| 55 line = line.strip() | |
| 56 | |
| 57 if not boundary_passed: | |
| 58 if line == "--" + boundary: | |
| 59 boundary_passed = True | |
| 47 continue | 60 continue |
| 48 | 61 |
| 49 if current_line < 5 or not line: | 62 if not header_passed: |
|
Wladimir Palant
2012/09/14 17:24:18
No, that's not how you parse multipart/form-data (
Felix Dahlke
2012/09/26 15:20:30
Done. Wow, that was one mean hack I did there :)
Wladimir Palant
2012/09/27 07:34:17
Much better now :)
| |
| 63 if not line: | |
| 64 header_passed = True | |
| 50 continue | 65 continue |
| 51 | 66 |
| 52 if line.startswith(boundary): | 67 if line == "--" + boundary + "--": |
| 53 break | 68 break |
| 54 | 69 |
| 55 line_callback(line) | 70 if line: |
| 71 line_callback(line) | |
| 56 | 72 |
| 57 def _create_run(): | 73 def _create_run(): |
| 58 cursor = get_cursor() | 74 cursor = get_cursor() |
| 59 cursor.execute("INSERT INTO crawler_runs () VALUES ()") | 75 cursor.execute("INSERT INTO crawler_runs () VALUES ()") |
| 60 return cursor.lastrowid | 76 return cursor.lastrowid |
| 61 | 77 |
| 62 def _insert_data(run_id, site, url, filtered): | 78 def _insert_data(run_id, site, url, filtered): |
| 63 site_id = _find_site_id(site) | 79 site_id = _find_site_id(site) |
| 80 if site_id is None: | |
| 81 print >>sys.stderr, "Unable to find site '%s' in the database" % site | |
| 82 return | |
| 83 | |
| 64 cursor = get_cursor() | 84 cursor = get_cursor() |
| 65 cursor.execute(""" | 85 cursor.execute(""" |
| 66 INSERT INTO crawler_data (run, site, url, filtered) | 86 INSERT INTO crawler_data (run, site, url, filtered) |
| 67 VALUES (%s, %s, %s, %s)""", | 87 VALUES (%s, %s, %s, %s)""", |
| 68 (run_id, site_id, url, filtered)) | 88 (run_id, site_id, url, filtered)) |
| 69 | 89 |
| 70 @url_handler("/crawlerData") | 90 @url_handler("/crawlerData") |
| 71 @basic_auth | 91 @basic_auth("crawler") |
| 72 def crawler_data(environ, start_response): | 92 def crawler_data(environ, start_response): |
| 73 def line_callback(line): | 93 def line_callback(line): |
| 74 url, site, filtered = simplejson.loads(line) | 94 try: |
| 75 _insert_data(run_id, site, url, filtered) | 95 url, site, filtered = simplejson.loads(line) |
| 96 _insert_data(run_id, site, url, filtered) | |
| 97 except simplejson.JSONDecodeError: | |
| 98 print >>sys.stderr, "Unable to parse JSON from '%s'" % line | |
| 76 | 99 |
| 77 run_id = _create_run() | 100 run_id = _create_run() |
| 78 _read_multipart_lines(environ, line_callback) | 101 try: |
| 79 start_response("200 OK", [("Content-Type", "text/plain")]) | 102 _read_multipart_lines(environ, line_callback) |
| 80 return "" | 103 start_response("200 OK", [("Content-Type", "text/plain")]) |
| 104 return "" | |
| 105 except ValueError as e: | |
| 106 start_response("400 Bad Request", [("Content-Type", "text/plain")]) | |
| 107 return e | |
| LEFT | RIGHT |