OLD | NEW |
1 import MySQLdb, os, re, simplejson, sys | 1 import MySQLdb, os, re, simplejson, sys |
2 from sitescripts.utils import cached, get_config | 2 from sitescripts.utils import cached, get_config |
3 from sitescripts.web import url_handler, basic_auth | 3 from sitescripts.web import url_handler, basic_auth |
4 | 4 |
5 @cached(600) | 5 @cached(600) |
6 def _get_db(): | 6 def _get_db(): |
7 database = get_config().get("crawler", "database") | 7 database = get_config().get("crawler", "database") |
8 dbuser = get_config().get("crawler", "dbuser") | 8 dbuser = get_config().get("crawler", "dbuser") |
9 dbpasswd = get_config().get("crawler", "dbpassword") | 9 dbpasswd = get_config().get("crawler", "dbpassword") |
10 if os.name == "nt": | 10 if os.name == "nt": |
(...skipping 19 matching lines...)
30 urls = _fetch_crawlable_sites() | 30 urls = _fetch_crawlable_sites() |
31 start_response("200 OK", [("Content-Type", "text/plain")]) | 31 start_response("200 OK", [("Content-Type", "text/plain")]) |
32 return "\n".join(urls) | 32 return "\n".join(urls) |
33 | 33 |
34 def _find_site_id(site_url): | 34 def _find_site_id(site_url): |
35 cursor = _get_cursor() | 35 cursor = _get_cursor() |
36 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) | 36 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) |
37 result = cursor.fetchone() | 37 result = cursor.fetchone() |
38 return result["id"] if result else None | 38 return result["id"] if result else None |
39 | 39 |
| 40 def _find_filter_id(filter_str): |
| 41 if filter_str is None: |
| 42 return None |
| 43 |
| 44 cursor = _get_cursor() |
| 45 query = "SELECT id FROM crawler_filters WHERE filter_hash = sha1(%s)" |
| 46 cursor.execute(query, filter_str) |
| 47 result = cursor.fetchone() |
| 48 return result["id"] if result else None |
| 49 |
40 def _read_multipart_lines(environ, line_callback): | 50 def _read_multipart_lines(environ, line_callback): |
41 data_file = environ["wsgi.input"] | 51 data_file = environ["wsgi.input"] |
42 content_type = environ.get("CONTENT_TYPE") | 52 content_type = environ.get("CONTENT_TYPE") |
43 if not content_type: | 53 if not content_type: |
44 raise ValueError("Content-Type missing from header") | 54 raise ValueError("Content-Type missing from header") |
45 | 55 |
46 match = re.search(r"boundary=(.*)", content_type) | 56 match = re.search(r"boundary=(.*)", content_type) |
47 if not match: | 57 if not match: |
48 raise ValueError("Multipart form data or boundary declaration missing") | 58 raise ValueError("Multipart form data or boundary declaration missing") |
49 | 59 |
(...skipping 18 matching lines...)
68 break | 78 break |
69 | 79 |
70 if line: | 80 if line: |
71 line_callback(line) | 81 line_callback(line) |
72 | 82 |
73 def _create_run(): | 83 def _create_run(): |
74 cursor = _get_cursor() | 84 cursor = _get_cursor() |
75 cursor.execute("INSERT INTO crawler_runs () VALUES ()") | 85 cursor.execute("INSERT INTO crawler_runs () VALUES ()") |
76 return cursor.lastrowid | 86 return cursor.lastrowid |
77 | 87 |
78 def _insert_data(run_id, site, url, filtered): | 88 def _insert_data(run_id, site, url, filtered, filter_str): |
79 site_id = _find_site_id(site) | 89 site_id = _find_site_id(site) |
80 if site_id is None: | 90 if site_id is None: |
81 print >>sys.stderr, "Unable to find site '%s' in the database" % site | 91 print >>sys.stderr, "Unable to find site '%s' in the database" % site |
82 return | 92 return |
83 | 93 |
84 cursor = _get_cursor() | 94 cursor = _get_cursor() |
85 cursor.execute(""" | 95 insert = """ |
86 INSERT INTO crawler_data (run, site, url, filtered) | 96 INSERT INTO crawler_requests (run, site, url, filtered) |
87 VALUES (%s, %s, %s, %s)""", | 97 VALUES (%s, %s, %s, %s)""" |
88 (run_id, site_id, url, filtered)) | 98 cursor.execute(insert, (run_id, site_id, url, filtered)) |
89 | 99 |
90 @url_handler("/crawlerData") | 100 filter_id = _find_filter_id(filter_str) |
| 101 if filter_id is not None: |
| 102 update = "UPDATE crawler_requests SET filter = %s WHERE id = %s" |
| 103 cursor.execute(update, (filter_id, cursor.lastrowid)) |
| 104 |
| 105 @url_handler("/crawlerRequests") |
91 @basic_auth("crawler") | 106 @basic_auth("crawler") |
92 def crawler_data(environ, start_response): | 107 def crawler_requests(environ, start_response): |
93 def line_callback(line): | 108 def line_callback(line): |
94 try: | 109 try: |
95 url, site, filtered = simplejson.loads(line) | 110 data = simplejson.loads(line) |
96 _insert_data(run_id, site, url, filtered) | 111 if len(data) < 3: |
| 112 print >>sys.stderr, "Not enough elements in line '%s'" % line |
| 113 return |
| 114 url = data[0] |
| 115 site = data[1] |
| 116 filtered = data[2] |
| 117 filter_str = data[3] if len(data) >= 4 else None |
| 118 _insert_data(run_id, site, url, filtered, filter_str) |
97 except simplejson.JSONDecodeError: | 119 except simplejson.JSONDecodeError: |
98 print >>sys.stderr, "Unable to parse JSON from '%s'" % line | 120 print >>sys.stderr, "Unable to parse JSON from '%s'" % line |
99 | 121 |
100 run_id = _create_run() | 122 run_id = _create_run() |
101 try: | 123 try: |
102 _read_multipart_lines(environ, line_callback) | 124 _read_multipart_lines(environ, line_callback) |
103 start_response("200 OK", [("Content-Type", "text/plain")]) | 125 start_response("200 OK", [("Content-Type", "text/plain")]) |
104 return "" | 126 return "" |
105 except ValueError as e: | 127 except ValueError as e: |
106 start_response("401 Bad Request", [("Content-Type", "text/plain")]) | 128 start_response("401 Bad Request", [("Content-Type", "text/plain")]) |
| 129 print >>sys.stderr, "Unable to read multipart data: %s" % e |
107 return e | 130 return str(e) |
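
A note on the wire format, for readers of this patch: each line of the uploaded multipart body is a JSON array, [url, site, filtered], with an optional fourth element carrying the text of the filter that matched. Below is a minimal client sketch in Python 2 (matching the codebase); the host, credentials, helper name, and sample records are placeholders for illustration, not part of the patch:

import base64, simplejson, urllib2

def post_crawler_requests(base_url, user, password, records):
  # One JSON array per line; the handler parses each line in
  # line_callback() and hands it to _insert_data().
  lines = [simplejson.dumps(record) for record in records]
  boundary = "----crawlerRequestsBoundary"
  body = "\r\n".join(
    ["--" + boundary,
     'Content-Disposition: form-data; name="data"; filename="data.txt"',
     "Content-Type: text/plain",
     ""]
    + lines
    + ["--" + boundary + "--", ""])
  request = urllib2.Request(base_url + "/crawlerRequests", body)
  request.add_header("Content-Type",
                     "multipart/form-data; boundary=" + boundary)
  request.add_header("Authorization",
                     "Basic " + base64.b64encode("%s:%s" % (user, password)))
  return urllib2.urlopen(request).read()

post_crawler_requests("http://localhost:5000", "crawler", "secret", [
  ["http://ads.example.com/banner.js", "http://example.com/", True,
   "||ads.example.com^"],
  ["http://example.com/style.css", "http://example.com/", False],
])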
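The new _find_filter_id() keys crawler_filters on filter_hash = sha1(filter text). A hedged sketch of seeding a matching row (only crawler_filters and filter_hash appear in the patch; the filter column and the add_filter() helper are assumptions for illustration):

import MySQLdb

def add_filter(db, filter_str):
  # Store the filter text alongside its SHA-1 so that
  # _find_filter_id() can resolve it via sha1(%s).
  cursor = db.cursor()
  cursor.execute(
    "INSERT INTO crawler_filters (filter, filter_hash) VALUES (%s, sha1(%s))",
    (filter_str, filter_str))
  db.commit()
  return cursor.lastrowid

With rows seeded this way, the UPDATE in _insert_data() links each logged request to the filter that blocked it.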