OLD | NEW |
1 import MySQLdb | 1 import MySQLdb |
2 import os | 2 import os |
3 import re | 3 import re |
4 import json | 4 import json |
5 import sys | 5 import sys |
6 from sitescripts.utils import cached, get_config | 6 from sitescripts.utils import cached, get_config |
7 from sitescripts.web import url_handler, basic_auth | 7 from sitescripts.web import url_handler, basic_auth |
8 | 8 |
9 | 9 |
10 @cached(600) | 10 @cached(600) |
11 def _get_db(): | 11 def _get_db(): |
12     database = get_config().get("crawler", "database") | 12     database = get_config().get('crawler', 'database') |
13     dbuser = get_config().get("crawler", "dbuser") | 13     dbuser = get_config().get('crawler', 'dbuser') |
14     dbpasswd = get_config().get("crawler", "dbpassword") | 14     dbpasswd = get_config().get('crawler', 'dbpassword') |
15     if os.name == "nt": | 15     if os.name == 'nt': |
16         return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, | 16         return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
17                                use_unicode=True, charset="utf8", named_pipe=True) | 17                                use_unicode=True, charset='utf8', named_pipe=True) |
18     else: | 18     else: |
19         return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, | 19         return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
20                                use_unicode=True, charset="utf8") | 20                                use_unicode=True, charset='utf8') |
21 | 21 |
22 | 22 |
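For context, _get_db reads its connection settings from the [crawler] section of the sitescripts configuration. A minimal sketch of that section (the key names come from the get_config() calls above; the values are placeholders):

    [crawler]
    database=crawler
    dbuser=crawler
    dbpassword=changeme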
23 def _get_cursor(): | 23 def _get_cursor(): |
24     return _get_db().cursor(MySQLdb.cursors.DictCursor) | 24     return _get_db().cursor(MySQLdb.cursors.DictCursor) |
25 | 25 |
26 | 26 |
27 def _fetch_crawlable_sites(): | 27 def _fetch_crawlable_sites(): |
28     cursor = _get_cursor() | 28     cursor = _get_cursor() |
29     cursor.execute("SELECT url from crawler_sites") | 29     cursor.execute('SELECT url from crawler_sites') |
30     results = cursor.fetchall() | 30     results = cursor.fetchall() |
31     sites = [result["url"] for result in results] | 31     sites = [result['url'] for result in results] |
32     return sites | 32     return sites |
33 | 33 |
34 | 34 |
35 @url_handler("/crawlableSites") | 35 @url_handler('/crawlableSites') |
36 @basic_auth("crawler") | 36 @basic_auth('crawler') |
37 def crawlable_sites(environ, start_response): | 37 def crawlable_sites(environ, start_response): |
38     urls = _fetch_crawlable_sites() | 38     urls = _fetch_crawlable_sites() |
39     start_response("200 OK", [("Content-Type", "text/plain")]) | 39     start_response('200 OK', [('Content-Type', 'text/plain')]) |
40     return "\n".join(urls) | 40     return '\n'.join(urls) |
41 | 41 |
42 | 42 |
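As a usage sketch, /crawlableSites answers an authenticated GET with the contents of crawler_sites, one URL per line. Host, credentials, and the example output below are placeholders, not real data:

    $ curl -u crawler:secret https://example.com/crawlableSites
    http://site-one.example/
    http://site-two.example/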
43 def _find_site_id(site_url): | 43 def _find_site_id(site_url): |
44     cursor = _get_cursor() | 44     cursor = _get_cursor() |
45     cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) | 45     cursor.execute('SELECT id FROM crawler_sites WHERE url = %s', site_url) |
46     result = cursor.fetchone() | 46     result = cursor.fetchone() |
47     return result["id"] if result else None | 47     return result['id'] if result else None |
48 | 48 |
49 | 49 |
50 def _read_multipart_lines(environ, line_callback): | 50 def _read_multipart_lines(environ, line_callback): |
51     data_file = environ["wsgi.input"] | 51     data_file = environ['wsgi.input'] |
52     content_type = environ.get("CONTENT_TYPE") | 52     content_type = environ.get('CONTENT_TYPE') |
53     if not content_type: | 53     if not content_type: |
54         raise ValueError("Content-Type missing from header") | 54         raise ValueError('Content-Type missing from header') |
55 | 55 |
56     match = re.search(r"boundary=(.*)", content_type) | 56     match = re.search(r'boundary=(.*)', content_type) |
57     if not match: | 57     if not match: |
58         raise ValueError("Multipart form data or boundary declaration missing") | 58         raise ValueError('Multipart form data or boundary declaration missing') |
59 | 59 |
60     boundary = match.group(1) | 60     boundary = match.group(1) |
61     boundary_passed = False | 61     boundary_passed = False |
62     header_passed = False | 62     header_passed = False |
63 | 63 |
64     for line in data_file: | 64     for line in data_file: |
65         line = line.strip() | 65         line = line.strip() |
66 | 66 |
67         if not boundary_passed: | 67         if not boundary_passed: |
68             if line == "--" + boundary: | 68             if line == '--' + boundary: |
69                 boundary_passed = True | 69                 boundary_passed = True |
70             continue | 70             continue |
71 | 71 |
72         if not header_passed: | 72         if not header_passed: |
73             if not line: | 73             if not line: |
74                 header_passed = True | 74                 header_passed = True |
75             continue | 75             continue |
76 | 76 |
77         if line == "--" + boundary + "--": | 77         if line == '--' + boundary + '--': |
78             break | 78             break |
79 | 79 |
80         if line: | 80         if line: |
81             line_callback(line) | 81             line_callback(line) |
82 | 82 |
83 | 83 |
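_read_multipart_lines is a deliberately minimal multipart parser: it takes the boundary from the Content-Type header, skips everything up to the first blank line after the opening boundary (the part headers), then feeds each non-empty line to line_callback until the closing boundary. A sketch of a request body it accepts, assuming the boundary "BOUNDARY" (part headers and filename are placeholders; the JSON line format is the one expected by crawler_requests below):

    --BOUNDARY
    Content-Disposition: form-data; name="file"; filename="requests.txt"

    ["http://example.com/script.js", "http://example.com/", true]
    --BOUNDARY--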
84 def _create_run(): | 84 def _create_run(): |
85     cursor = _get_cursor() | 85     cursor = _get_cursor() |
86     cursor.execute("INSERT INTO crawler_runs () VALUES ()") | 86     cursor.execute('INSERT INTO crawler_runs () VALUES ()') |
87     return cursor.lastrowid | 87     return cursor.lastrowid |
88 | 88 |
89 | 89 |
90 def _insert_data(run_id, site, url, filtered): | 90 def _insert_data(run_id, site, url, filtered): |
91     site_id = _find_site_id(site) | 91     site_id = _find_site_id(site) |
92     if site_id is None: | 92     if site_id is None: |
93         print >>sys.stderr, "Unable to find site '%s' in the database" % site | 93         print >>sys.stderr, "Unable to find site '%s' in the database" % site |
94         return | 94         return |
95 | 95 |
96     cursor = _get_cursor() | 96     cursor = _get_cursor() |
97     cursor.execute(""" | 97     cursor.execute(''' |
98         INSERT INTO crawler_requests (run, site, url, filtered) | 98         INSERT INTO crawler_requests (run, site, url, filtered) |
99         VALUES (%s, %s, %s, %s)""", | 99         VALUES (%s, %s, %s, %s)''', |
100                    (run_id, site_id, url, filtered)) | 100                    (run_id, site_id, url, filtered)) |
101 | 101 |
102 | 102 |
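For orientation, the SQL in this file implies roughly the following schema. Table and column names are taken from the queries above; the column types and key constraints are assumptions:

    CREATE TABLE crawler_sites (
      id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
      url VARCHAR(512) NOT NULL
    );

    CREATE TABLE crawler_runs (
      id INT NOT NULL AUTO_INCREMENT PRIMARY KEY
    );

    CREATE TABLE crawler_requests (
      run INT NOT NULL,
      site INT NOT NULL,
      url VARCHAR(512) NOT NULL,
      filtered BOOL NOT NULL
    );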
103 @url_handler("/crawlerRequests") | 103 @url_handler('/crawlerRequests') |
104 @basic_auth("crawler") | 104 @basic_auth('crawler') |
105 def crawler_requests(environ, start_response): | 105 def crawler_requests(environ, start_response): |
106     def line_callback(line): | 106     def line_callback(line): |
107         try: | 107         try: |
108             data = json.loads(line) | 108             data = json.loads(line) |
109             if len(data) < 3: | 109             if len(data) < 3: |
110                 print >>sys.stderr, "Not enough elements in line '%s'" % line | 110                 print >>sys.stderr, "Not enough elements in line '%s'" % line |
111                 return | 111                 return |
112             url = data[0] | 112             url = data[0] |
113             site = data[1] | 113             site = data[1] |
114             filtered = data[2] | 114             filtered = data[2] |
115             _insert_data(run_id, site, url, filtered) | 115             _insert_data(run_id, site, url, filtered) |
116         except ValueError: | 116         except ValueError: |
117             print >>sys.stderr, "Unable to parse JSON from '%s'" % line | 117             print >>sys.stderr, "Unable to parse JSON from '%s'" % line |
118 | 118 |
119     run_id = _create_run() | 119     run_id = _create_run() |
120     try: | 120     try: |
121         _read_multipart_lines(environ, line_callback) | 121         _read_multipart_lines(environ, line_callback) |
122         start_response("200 OK", [("Content-Type", "text/plain")]) | 122         start_response('200 OK', [('Content-Type', 'text/plain')]) |
123         return "" | 123         return '' |
124     except ValueError as e: | 124     except ValueError as e: |
125         start_response("400 Bad Request", [("Content-Type", "text/plain")]) | 125         start_response('400 Bad Request', [('Content-Type', 'text/plain')]) |
126         print >>sys.stderr, "Unable to read multipart data: %s" % e | 126         print >>sys.stderr, 'Unable to read multipart data: %s' % e |
127         return e | 127         return e |
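End to end, a crawler client would POST its results to /crawlerRequests as multipart form data, one JSON array per line. A hedged sketch (host, credentials, and file name are placeholders):

    $ curl -u crawler:secret -F "file=@requests.txt" https://example.com/crawlerRequests

where each line of requests.txt is a JSON array of the form [url, site, filtered], as parsed by line_callback above.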