| OLD | NEW |
| 1 import MySQLdb, os, re, simplejson, sys | 1 import MySQLdb, os, re, simplejson, sys |
| 2 from sitescripts.utils import cached, get_config | 2 from sitescripts.utils import cached, get_config |
| 3 from sitescripts.web import url_handler, basic_auth | 3 from sitescripts.web import url_handler, basic_auth |
| 4 | 4 |
| 5 @cached(600) | 5 @cached(600) |
| 6 def _get_db(): | 6 def _get_db(): |
| 7 database = get_config().get("crawler", "database") | 7 database = get_config().get("crawler", "database") |
| 8 dbuser = get_config().get("crawler", "dbuser") | 8 dbuser = get_config().get("crawler", "dbuser") |
| 9 dbpasswd = get_config().get("crawler", "dbpassword") | 9 dbpasswd = get_config().get("crawler", "dbpassword") |
| 10 if os.name == "nt": | 10 if os.name == "nt": |
| (...skipping 19 matching lines...) |
| 30 urls = _fetch_crawlable_sites() | 30 urls = _fetch_crawlable_sites() |
| 31 start_response("200 OK", [("Content-Type", "text/plain")]) | 31 start_response("200 OK", [("Content-Type", "text/plain")]) |
| 32 return "\n".join(urls) | 32 return "\n".join(urls) |
| 33 | 33 |
| 34 def _find_site_id(site_url): | 34 def _find_site_id(site_url): |
| 35 cursor = _get_cursor() | 35 cursor = _get_cursor() |
| 36 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) | 36 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) |
| 37 result = cursor.fetchone() | 37 result = cursor.fetchone() |
| 38 return result["id"] if result else None | 38 return result["id"] if result else None |
| 39 | 39 |
| 40 def _find_filter_id(filter_str): |
| 41 if filter_str is None: |
| 42 return None |
| 43 |
| 44 cursor = _get_cursor() |
| 45 query = "SELECT id FROM crawler_filters WHERE filter_hash = sha1(%s)" |
| 46 cursor.execute(query, filter_str) |
| 47 result = cursor.fetchone() |
| 48 return result["id"] if result else None |
| 49 |
| 40 def _read_multipart_lines(environ, line_callback): | 50 def _read_multipart_lines(environ, line_callback): |
| 41 data_file = environ["wsgi.input"] | 51 data_file = environ["wsgi.input"] |
| 42 content_type = environ.get("CONTENT_TYPE") | 52 content_type = environ.get("CONTENT_TYPE") |
| 43 if not content_type: | 53 if not content_type: |
| 44 raise ValueError("Content-Type missing from header") | 54 raise ValueError("Content-Type missing from header") |
| 45 | 55 |
| 46 match = re.search(r"boundary=(.*)", content_type) | 56 match = re.search(r"boundary=(.*)", content_type) |
| 47 if not match: | 57 if not match: |
| 48 raise ValueError("Multipart form data or boundary declaration missing") | 58 raise ValueError("Multipart form data or boundary declaration missing") |
| 49 | 59 |
| (...skipping 18 matching lines...) |
| 68 break | 78 break |
| 69 | 79 |
| 70 if line: | 80 if line: |
| 71 line_callback(line) | 81 line_callback(line) |
| 72 | 82 |
| 73 def _create_run(): | 83 def _create_run(): |
| 74 cursor = _get_cursor() | 84 cursor = _get_cursor() |
| 75 cursor.execute("INSERT INTO crawler_runs () VALUES ()") | 85 cursor.execute("INSERT INTO crawler_runs () VALUES ()") |
| 76 return cursor.lastrowid | 86 return cursor.lastrowid |
| 77 | 87 |
| 78 def _insert_data(run_id, site, url, filtered): | 88 def _insert_data(run_id, site, url, filtered, filter_str): |
| 79 site_id = _find_site_id(site) | 89 site_id = _find_site_id(site) |
| 80 if site_id is None: | 90 if site_id is None: |
| 81 print >>sys.stderr, "Unable to find site '%s' in the database" % site | 91 print >>sys.stderr, "Unable to find site '%s' in the database" % site |
| 82 return | 92 return |
| 83 | 93 |
| 84 cursor = _get_cursor() | 94 cursor = _get_cursor() |
| 85 cursor.execute(""" | 95 insert = """ |
| 86 INSERT INTO crawler_data (run, site, url, filtered) | 96 INSERT INTO crawler_requests (run, site, url, filtered) |
| 87 VALUES (%s, %s, %s, %s)""", | 97 VALUES (%s, %s, %s, %s)""" |
| 88 (run_id, site_id, url, filtered)) | 98 cursor.execute(insert, (run_id, site_id, url, filtered)) |
| 89 | 99 |
| 90 @url_handler("/crawlerData") | 100 filter_id = _find_filter_id(filter_str) |
| 101 if filter_id is not None: |
| 102 update = "UPDATE crawler_requests SET filter = %s WHERE id = %s" |
| 103 cursor.execute(update, (filter_id, cursor.lastrowid)) |
| 104 |
| 105 @url_handler("/crawlerRequests") |
| 91 @basic_auth("crawler") | 106 @basic_auth("crawler") |
| 92 def crawler_data(environ, start_response): | 107 def crawler_requests(environ, start_response): |
| 93 def line_callback(line): | 108 def line_callback(line): |
| 94 try: | 109 try: |
| 95 url, site, filtered = simplejson.loads(line) | 110 data = simplejson.loads(line) |
| 96 _insert_data(run_id, site, url, filtered) | 111 if len(data) < 3: |
| 112 print >>sys.stderr, "Not enough elements in line '%s'" % line |
| 113 return |
| 114 url = data[0] |
| 115 site = data[1] |
| 116 filtered = data[2] |
| 117 filter_str = data[3] if len(data) >= 4 else None |
| 118 _insert_data(run_id, site, url, filtered, filter_str) |
| 97 except simplejson.JSONDecodeError: | 119 except simplejson.JSONDecodeError: |
| 98 print >>sys.stderr, "Unable to parse JSON from '%s'" % line | 120 print >>sys.stderr, "Unable to parse JSON from '%s'" % line |
| 99 | 121 |
| 100 run_id = _create_run() | 122 run_id = _create_run() |
| 101 try: | 123 try: |
| 102 _read_multipart_lines(environ, line_callback) | 124 _read_multipart_lines(environ, line_callback) |
| 103 start_response("200 OK", [("Content-Type", "text/plain")]) | 125 start_response("200 OK", [("Content-Type", "text/plain")]) |
| 104 return "" | 126 return "" |
| 105 except ValueError as e: | 127 except ValueError as e: |
| 106 start_response("401 Bad Request", [("Content-Type", "text/plain")]) | 128 start_response("400 Bad Request", [("Content-Type", "text/plain")]) |
| 129 print >>sys.stderr, "Unable to read multipart data: %s" % e |
| 107 return e | 130 return str(e) |
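For context, a hypothetical client for the new /crawlerRequests endpoint, as a minimal sketch rather than part of the change itself. It follows what crawler_requests() above expects: a multipart body in which each uploaded line is a JSON array [url, site, filtered, filter], where the fourth (filter) element is optional. The server host, credentials, part name, and boundary below are made-up illustration values.

```python
# Hypothetical client sketch for the /crawlerRequests handler above.
# Assumptions: server URL, user/password, part name and boundary are
# placeholders; only the endpoint name and line format come from the patch.
import base64
import urllib2

import simplejson

BOUNDARY = "----crawler-boundary"

# One JSON array per line: [url, site, filtered, filter];
# the filter entry may be omitted, as in the second line.
lines = [
    simplejson.dumps(["http://site.example/ad.js", "site.example", True,
                      "||site.example/ad.js"]),
    simplejson.dumps(["http://site.example/", "site.example", False]),
]

# Build a simple multipart/form-data body with the lines as the part payload.
body = "\r\n".join(
    ["--" + BOUNDARY,
     'Content-Disposition: form-data; name="file"; filename="requests"',
     "Content-Type: text/plain",
     ""]
    + lines
    + ["--" + BOUNDARY + "--", ""])

request = urllib2.Request("http://server.example/crawlerRequests", body)
request.add_header("Content-Type",
                   "multipart/form-data; boundary=" + BOUNDARY)
# The handler is wrapped in @basic_auth("crawler"), so send credentials.
request.add_header("Authorization",
                   "Basic " + base64.b64encode("user:password"))
print urllib2.urlopen(request).read()
```

On the server side, each line that parses as JSON with at least three elements becomes one row in crawler_requests via _insert_data(); shorter lines and lines that fail to parse are skipped with a warning on stderr.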