OLD | NEW |
---|---|
1 import MySQLdb, os | 1 import MySQLdb, os, re, simplejson, sys |
2 from sitescripts.utils import cached, get_config | 2 from sitescripts.utils import cached, get_config |
3 from sitescripts.web import url_handler | 3 from sitescripts.web import url_handler, basic_auth |
4 | |
5 @url_handler('/crawlableUrls') | |
6 def listUrls(environ, start_response): | |
7 urls = fetch_crawlable_urls() | |
8 start_response('200 OK', [('Content-Type', 'text/plain')]) | |
9 return '\n'.join(urls) | |
10 | |
11 def fetch_crawlable_urls(): | |
12 cursor = get_db().cursor(MySQLdb.cursors.DictCursor) | |
13 executeQuery(cursor, 'SELECT url from crawler_urls') | |
14 results = cursor.fetchall() | |
15 urls = [result['url'] for result in results] | |
16 return urls | |
17 | 4 |
18 @cached(600) | 5 @cached(600) |
19 def get_db(): | 6 def _get_db(): |
20 database = get_config().get('crawler', 'database') | 7 database = get_config().get("crawler", "database") |
21 dbuser = get_config().get('crawler', 'dbuser') | 8 dbuser = get_config().get("crawler", "dbuser") |
22 dbpasswd = get_config().get('crawler', 'dbpassword') | 9 dbpasswd = get_config().get("crawler", "dbpassword") |
23 if os.name == 'nt': | 10 if os.name == "nt": |
24 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset='utf8', named_pipe=True) | 11 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
12 use_unicode=True, charset="utf8", named_pipe=True) | |
25 else: | 13 else: |
26 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset='utf8') | 14 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
15 use_unicode=True, charset="utf8") | |
27 | 16 |
28 def executeQuery(cursor, query, args=None): | 17 def get_cursor(): |
29 cursor.execute(query, args) | 18 return _get_db().cursor(MySQLdb.cursors.DictCursor) |
19 | |
20 def _fetch_crawlable_sites(): | |
21 cursor = get_cursor() | |
22 cursor.execute("SELECT url from crawler_sites") | |
23 results = cursor.fetchall() | |
24 sites = [result["url"] for result in results] | |
25 return sites | |
26 | |
27 @url_handler("/crawlableSites") | |
28 @basic_auth | |
Wladimir Palant
2012/09/27 13:44:51
Shouldn't this be @basic_auth(config_section="crawler")?
Felix Dahlke
2012/09/27 14:15:33
Done.
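For context, the suggested change presumably looks like the sketch below; it assumes basic_auth accepts a config_section keyword naming the config section that holds the credentials (the same "crawler" section the database settings come from):

```python
# Hypothetical sketch of the reviewer's suggestion. Assumes that
# sitescripts.web.basic_auth takes a config_section keyword argument.
@url_handler("/crawlableSites")
@basic_auth(config_section="crawler")
def crawlable_sites(environ, start_response):
    urls = _fetch_crawlable_sites()
    start_response("200 OK", [("Content-Type", "text/plain")])
    return "\n".join(urls)
```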
29 def crawlable_sites(environ, start_response): | |
30 urls = _fetch_crawlable_sites() | |
31 start_response("200 OK", [("Content-Type", "text/plain")]) | |
32 return "\n".join(urls) | |
33 | |
34 def _find_site_id(site_url): | |
35 cursor = get_cursor() | |
36 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) | |
37 result = cursor.fetchone() | |
38 return result["id"] if result else None | |
39 | |
40 def _read_multipart_lines(environ, line_callback): | |
41 data_file = environ["wsgi.input"] | |
42 content_type = environ["CONTENT_TYPE"] | |
43 if not content_type: | |
44 raise ValueError("Content-Type missing from header") | |
Wladimir Palant
2012/09/27 13:44:51
Python doesn't work like that - accessing a non-existent key will raise a KeyError, so this check can never be reached.
Felix Dahlke
2012/09/27 14:15:33
Done.
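To illustrate the point: indexing the WSGI environ with a missing key raises a KeyError before the falsy check can run, so the check is dead code. A minimal sketch of the usual fix, using dict.get:

```python
# environ["CONTENT_TYPE"] raises KeyError when the header is absent,
# which would bypass the check below. dict.get returns None instead,
# so the ValueError path actually triggers:
content_type = environ.get("CONTENT_TYPE")
if not content_type:
    raise ValueError("Content-Type missing from header")
```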
45 | |
46 match = re.search(r"boundary=(.*)", content_type) | |
47 if not match: | |
48 raise ValueError("Multipart form data or boundary declaration missing") | |
49 | |
50 boundary = match.group(1) | |
51 boundary_passed = False | |
52 header_passed = False | |
53 | |
54 for line in data_file: | |
55 line = line.strip() | |
56 | |
57 if not boundary_passed: | |
58 if line == "--" + boundary: | |
59 boundary_passed = True | |
60 continue | |
61 | |
62 if not header_passed: | |
63 if not line: | |
64 header_passed = True | |
65 continue | |
66 | |
67 if line == "--" + boundary + "--": | |
68 break | |
69 | |
70 if line: | |
71 line_callback(line) | |
72 | |
73 return True | |
Wladimir Palant
2012/09/27 13:44:51
That return value is unused, left-over from an earlier version?
Felix Dahlke
2012/09/27 14:15:33
Done.
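Since the side-by-side layout flattens indentation, here is a sketch of how the parsing loop presumably nests once the review comments are addressed (environ.get instead of indexing, the unused return dropped); the nesting is inferred from the control flow, not shown in the diff:

```python
def _read_multipart_lines(environ, line_callback):
    data_file = environ["wsgi.input"]
    content_type = environ.get("CONTENT_TYPE")
    if not content_type:
        raise ValueError("Content-Type missing from header")

    match = re.search(r"boundary=(.*)", content_type)
    if not match:
        raise ValueError("Multipart form data or boundary declaration missing")

    boundary = match.group(1)
    boundary_passed = False
    header_passed = False

    for line in data_file:
        line = line.strip()
        # Skip everything up to and including the opening boundary marker.
        if not boundary_passed:
            if line == "--" + boundary:
                boundary_passed = True
            continue
        # Skip the part headers; a blank line separates them from the body.
        if not header_passed:
            if not line:
                header_passed = True
            continue
        # The closing boundary marker ends the payload.
        if line == "--" + boundary + "--":
            break
        if line:
            line_callback(line)
```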
74 | |
75 def _create_run(): | |
76 cursor = get_cursor() | |
77 cursor.execute("INSERT INTO crawler_runs () VALUES ()") | |
78 return cursor.lastrowid | |
79 | |
80 def _insert_data(run_id, site, url, filtered): | |
81 site_id = _find_site_id(site) | |
82 if site_id is None: | |
83 print >>sys.stderr, "Unable to find site '%s' in the database" % site | |
84 return | |
85 | |
86 cursor = get_cursor() | |
87 cursor.execute(""" | |
88 INSERT INTO crawler_data (run, site, url, filtered) | |
89 VALUES (%s, %s, %s, %s)""", | |
90 (run_id, site_id, url, filtered)) | |
91 | |
92 @url_handler("/crawlerData") | |
93 @basic_auth | |
Wladimir Palant
2012/09/27 13:44:51
Shouldn't this be @basic_auth(config_section="crawler")?
Felix Dahlke
2012/09/27 14:15:33
Done.
94 def crawler_data(environ, start_response): | |
95 def line_callback(line): | |
96 try: | |
97 url, site, filtered = simplejson.loads(line) | |
98 _insert_data(run_id, site, url, filtered) | |
99 except simplejson.JSONDecodeError: | |
100 print >>sys.stderr, "Unable to parse JSON from '%s'" % line | |
101 | |
102 run_id = _create_run() | |
103 try: | |
104 _read_multipart_lines(environ, line_callback) | |
105 start_response("200 OK", [("Content-Type", "text/plain")]) | |
106 return "" | |
107 except ValueError as e: | |
108 start_response("401 Bad Request", [("Content-Type", "text/plain")]) | |
109 return e | |
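For reference, a hypothetical client-side sketch of the payload this handler parses: a multipart upload whose non-empty body lines are JSON arrays of [url, site, filtered]. Host, boundary and data below are made up:

```python
# Hypothetical Python 2 client for the /crawlerData endpoint.
import urllib2
import simplejson

boundary = "----crawlerDataBoundary"
lines = [
    simplejson.dumps(["http://example.com/ad.js", "example.com", True]),
    simplejson.dumps(["http://example.com/", "example.com", False]),
]
# Opening boundary, a blank line ending the (empty) part headers,
# one JSON record per line, then the closing boundary.
body = "--%s\r\n\r\n%s\r\n--%s--\r\n" % (boundary, "\r\n".join(lines), boundary)

request = urllib2.Request("http://server.example/crawlerData", body)
request.add_header("Content-Type", "multipart/form-data; boundary=" + boundary)
# The basic_auth decorator means a suitable Authorization header is needed too.
urllib2.urlopen(request)
```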