Rietveld Code Review Tool

Delta Between Two Patch Sets: sitescripts/crawler/web/crawler.py

Issue 8327353: Crawler backend (Closed)
Left Patch Set: Created Sept. 27, 2012, 9:26 a.m.
Right Patch Set: Created Sept. 27, 2012, 2:15 p.m.

import MySQLdb, os, re, simplejson, sys
from sitescripts.utils import cached, get_config
from sitescripts.web import url_handler, basic_auth

@cached(600)
def _get_db():
  database = get_config().get("crawler", "database")
  dbuser = get_config().get("crawler", "dbuser")
  dbpasswd = get_config().get("crawler", "dbpassword")
  if os.name == "nt":
    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
                           use_unicode=True, charset="utf8", named_pipe=True)
  else:
    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
                           use_unicode=True, charset="utf8")

def get_cursor():
  return _get_db().cursor(MySQLdb.cursors.DictCursor)

def _fetch_crawlable_sites():
  cursor = get_cursor()
  cursor.execute("SELECT url from crawler_sites")
  results = cursor.fetchall()
  sites = [result["url"] for result in results]
  return sites

@url_handler("/crawlableSites")
-@basic_auth
+@basic_auth("crawler")
Wladimir Palant 2012/09/27 13:44:51: Shouldn't this be @basic_auth(config_section="crawler")?
Felix Dahlke 2012/09/27 14:15:33: Done.
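For context on the exchange above: a decorator that takes an argument, like @basic_auth("crawler"), is a factory that captures the argument and returns the decorator proper. The sketch below illustrates only that shape; it is not the actual sitescripts.web implementation, and the config option names in it are assumptions.

import base64
from sitescripts.utils import get_config

def basic_auth(config_section):
  # The outer function captures the config section; the inner function is
  # the real decorator. Sketch only -- not the actual sitescripts.web code.
  def decorator(handler):
    def wrapper(environ, start_response):
      auth = environ.get("HTTP_AUTHORIZATION", "")
      # Assumed option names; the real config keys may differ.
      expected = "%s:%s" % (get_config().get(config_section, "basic_auth_username"),
                            get_config().get(config_section, "basic_auth_password"))
      if auth.startswith("Basic ") and base64.b64decode(auth[6:]) == expected:
        return handler(environ, start_response)
      start_response("401 Unauthorized",
                     [("WWW-Authenticate", "Basic realm=\"%s\"" % config_section)])
      return ""
    return wrapper
  return decorator
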
def crawlable_sites(environ, start_response):
  urls = _fetch_crawlable_sites()
  start_response("200 OK", [("Content-Type", "text/plain")])
  return "\n".join(urls)

def _find_site_id(site_url):
  cursor = get_cursor()
  cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
  result = cursor.fetchone()
  return result["id"] if result else None

def _read_multipart_lines(environ, line_callback):
  data_file = environ["wsgi.input"]
-  content_type = environ["CONTENT_TYPE"]
+  content_type = environ.get("CONTENT_TYPE")
  if not content_type:
    raise ValueError("Content-Type missing from header")
Wladimir Palant 2012/09/27 13:44:51: Python doesn't work like that - accessing a non-existent key raises a KeyError.
Felix Dahlke 2012/09/27 14:15:33: Done.
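The point of the fix in isolation: bracket indexing on a dict raises KeyError for a missing key, so the "if not content_type" guard below it would never run; .get() returns None instead, which the guard catches. A quick standalone check:

environ = {"wsgi.input": None}  # no CONTENT_TYPE key, as in a header-less request

print(environ.get("CONTENT_TYPE"))  # prints None; the truthiness guard works

try:
  environ["CONTENT_TYPE"]
except KeyError:
  print("KeyError: the ValueError below would never have been raised")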

  match = re.search(r"boundary=(.*)", content_type)
  if not match:
    raise ValueError("Multipart form data or boundary declaration missing")

  boundary = match.group(1)
  boundary_passed = False
  header_passed = False

  for line in data_file:
    line = line.strip()

    if not boundary_passed:
      if line == "--" + boundary:
        boundary_passed = True
      continue

    if not header_passed:
      if not line:
        header_passed = True
      continue

    if line == "--" + boundary + "--":
      break

    if line:
      line_callback(line)

-  return True
-
Wladimir Palant 2012/09/27 13:44:51: That return value is unused, left-over from earlier code.
Felix Dahlke 2012/09/27 14:15:33: Done.
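To make the framing _read_multipart_lines expects concrete, here is a minimal driver that could be appended to this module (Python 2, like the module itself); the boundary string and the JSON payload line are invented for illustration.

from StringIO import StringIO

# Opening boundary, one part header, a blank separator, payload lines,
# then the closing boundary -- the shape the parser's state machine expects.
body = "\r\n".join([
  "--BOUNDARY",
  "Content-Disposition: form-data; name=\"data\"",
  "",
  "[\"http://example.com/ad.js\", \"http://example.com/\", true]",
  "--BOUNDARY--",
  "",
])
environ = {
  "wsgi.input": StringIO(body),
  "CONTENT_TYPE": "multipart/form-data; boundary=BOUNDARY",
}
_read_multipart_lines(environ, lambda line: sys.stdout.write(line + "\n"))
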
def _create_run():
  cursor = get_cursor()
  cursor.execute("INSERT INTO crawler_runs () VALUES ()")
  return cursor.lastrowid

def _insert_data(run_id, site, url, filtered):
  site_id = _find_site_id(site)
  if site_id is None:
    print >>sys.stderr, "Unable to find site '%s' in the database" % site
    return

  cursor = get_cursor()
  cursor.execute("""
    INSERT INTO crawler_data (run, site, url, filtered)
    VALUES (%s, %s, %s, %s)""",
    (run_id, site_id, url, filtered))

@url_handler("/crawlerData")
-@basic_auth
+@basic_auth("crawler")
Wladimir Palant 2012/09/27 13:44:51: Shouldn't this be @basic_auth(config_section="crawler")?
Felix Dahlke 2012/09/27 14:15:33: Done.
def crawler_data(environ, start_response):
  def line_callback(line):
    try:
      url, site, filtered = simplejson.loads(line)
      _insert_data(run_id, site, url, filtered)
    except simplejson.JSONDecodeError:
      print >>sys.stderr, "Unable to parse JSON from '%s'" % line

  run_id = _create_run()
  try:
    _read_multipart_lines(environ, line_callback)
    start_response("200 OK", [("Content-Type", "text/plain")])
    return ""
  except ValueError as e:
    start_response("400 Bad Request", [("Content-Type", "text/plain")])
    return str(e)
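
Finally, a sketch of what a client of /crawlerData might send: one [url, site, filtered] JSON array per line inside a single multipart part, matching line_callback above. The host, credentials and payload are placeholders.

import base64, urllib2

lines = [
  "[\"http://example.com/ad.js\", \"http://example.com/\", true]",
  "[\"http://example.com/tracker.js\", \"http://example.com/\", false]",
]
boundary = "----crawlerDataBoundary"
body = "\r\n".join(
  ["--" + boundary, "Content-Disposition: form-data; name=\"data\"", ""]
  + lines
  + ["--" + boundary + "--", ""])

request = urllib2.Request("http://example.com/crawlerData", body, {
  "Content-Type": "multipart/form-data; boundary=" + boundary,
  "Authorization": "Basic " + base64.b64encode("user:password"),
})
print urllib2.urlopen(request).read()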
