Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: sitescripts/crawler/web/crawler.py

Issue 8327353: Crawler backend (Closed)
Left Patch Set: README fix Created Sept. 14, 2012, 2:42 p.m.
Right Patch Set: Created Sept. 27, 2012, 2:15 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
LEFT | RIGHT
1 import MySQLdb, os, simplejson 1 import MySQLdb, os, re, simplejson, sys
2 from sitescripts.utils import cached, get_config 2 from sitescripts.utils import cached, get_config
3 from sitescripts.web import url_handler, basic_auth 3 from sitescripts.web import url_handler, basic_auth
4 4
5 @cached(600) 5 @cached(600)
6 def _get_db(): 6 def _get_db():
7 database = get_config().get("crawler", "database") 7 database = get_config().get("crawler", "database")
8 dbuser = get_config().get("crawler", "dbuser") 8 dbuser = get_config().get("crawler", "dbuser")
9 dbpasswd = get_config().get("crawler", "dbpassword") 9 dbpasswd = get_config().get("crawler", "dbpassword")
10 if os.name == "nt": 10 if os.name == "nt":
11 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8", named_pipe=True) 11 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
12 use_unicode=True, charset="utf8", named_pipe=True)
12 else: 13 else:
13 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8") 14 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
15 use_unicode=True, charset="utf8")
14 16
15 def get_cursor(): 17 def get_cursor():
16 return _get_db().cursor(MySQLdb.cursors.DictCursor) 18 return _get_db().cursor(MySQLdb.cursors.DictCursor)
17 19
18 def _fetch_crawlable_sites(): 20 def _fetch_crawlable_sites():
19 cursor = get_cursor() 21 cursor = get_cursor()
20 cursor.execute("SELECT url from crawler_sites") 22 cursor.execute("SELECT url from crawler_sites")
21 results = cursor.fetchall() 23 results = cursor.fetchall()
22 sites = [result["url"] for result in results] 24 sites = [result["url"] for result in results]
23 return sites 25 return sites
24 26
25 @url_handler("/crawlableSites") 27 @url_handler("/crawlableSites")
26 @basic_auth 28 @basic_auth("crawler")
27 def crawlable_sites(environ, start_response): 29 def crawlable_sites(environ, start_response):
28 urls = _fetch_crawlable_sites() 30 urls = _fetch_crawlable_sites()
29 start_response("200 OK", [("Content-Type", "text/plain")]) 31 start_response("200 OK", [("Content-Type", "text/plain")])
30 return "\n".join(urls) 32 return "\n".join(urls)
31 33
32 def _find_site_id(site_url): 34 def _find_site_id(site_url):
33 cursor = get_cursor() 35 cursor = get_cursor()
34 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) 36 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
35 return cursor.fetchall()[0]["id"] 37 result = cursor.fetchone()
Wladimir Palant 2012/09/14 17:24:18 a) Why use fetchall() if we are only interested in
Felix Dahlke 2012/09/26 15:20:30 Done.
38 return result["id"] if result else None
36 39
37 def _read_multipart_lines(environ, line_callback): 40 def _read_multipart_lines(environ, line_callback):
38 data_file = environ["wsgi.input"] 41 data_file = environ["wsgi.input"]
39 current_line = 0 42 content_type = environ.get("CONTENT_TYPE")
43 if not content_type:
44 raise ValueError("Content-Type missing from header")
40 45
41 while True: 46 match = re.search(r"boundary=(.*)", content_type)
42 line = data_file.readline().strip() 47 if not match:
Wladimir Palant 2012/09/14 17:24:18 Same as earlier, please use: for line in enviro
Felix Dahlke 2012/09/26 15:20:30 Done.
43 current_line += 1 48 raise ValueError("Multipart form data or boundary declaration missing")
44 49
45 if current_line == 1: 50 boundary = match.group(1)
46 boundary = line 51 boundary_passed = False
52 header_passed = False
53
54 for line in data_file:
55 line = line.strip()
56
57 if not boundary_passed:
58 if line == "--" + boundary:
59 boundary_passed = True
47 continue 60 continue
48 61
49 if current_line < 5 or not line: 62 if not header_passed:
Wladimir Palant 2012/09/14 17:24:18 No, that's not how you parse multipart/form-data (
Felix Dahlke 2012/09/26 15:20:30 Done. Wow, that was one mean hack I did there :)
Wladimir Palant 2012/09/27 07:34:17 Much better now :)
63 if not line:
64 header_passed = True
50 continue 65 continue
51 66
52 if line.startswith(boundary): 67 if line == "--" + boundary + "--":
53 break 68 break
54 69
55 line_callback(line) 70 if line:
71 line_callback(line)
56 72
57 def _create_run(): 73 def _create_run():
58 cursor = get_cursor() 74 cursor = get_cursor()
59 cursor.execute("INSERT INTO crawler_runs () VALUES ()") 75 cursor.execute("INSERT INTO crawler_runs () VALUES ()")
60 return cursor.lastrowid 76 return cursor.lastrowid
61 77
62 def _insert_data(run_id, site, url, filtered): 78 def _insert_data(run_id, site, url, filtered):
63 site_id = _find_site_id(site) 79 site_id = _find_site_id(site)
80 if site_id is None:
81 print >>sys.stderr, "Unable to find site '%s' in the database" % site
82 return
83
64 cursor = get_cursor() 84 cursor = get_cursor()
65 cursor.execute(""" 85 cursor.execute("""
66 INSERT INTO crawler_data (run, site, url, filtered) 86 INSERT INTO crawler_data (run, site, url, filtered)
67 VALUES (%s, %s, %s, %s)""", 87 VALUES (%s, %s, %s, %s)""",
68 (run_id, site_id, url, filtered)) 88 (run_id, site_id, url, filtered))
69 89
70 @url_handler("/crawlerData") 90 @url_handler("/crawlerData")
71 @basic_auth 91 @basic_auth("crawler")
72 def crawler_data(environ, start_response): 92 def crawler_data(environ, start_response):
73 def line_callback(line): 93 def line_callback(line):
74 url, site, filtered = simplejson.loads(line) 94 try:
75 _insert_data(run_id, site, url, filtered) 95 url, site, filtered = simplejson.loads(line)
96 _insert_data(run_id, site, url, filtered)
97 except simplejson.JSONDecodeError:
98 print >>sys.stderr, "Unable to parse JSON from '%s'" % line
76 99
77 run_id = _create_run() 100 run_id = _create_run()
78 _read_multipart_lines(environ, line_callback) 101 try:
79 start_response("200 OK", [("Content-Type", "text/plain")]) 102 _read_multipart_lines(environ, line_callback)
80 return "" 103 start_response("200 OK", [("Content-Type", "text/plain")])
104 return ""
105 except ValueError as e:
106 start_response("401 Bad Request", [("Content-Type", "text/plain")])
107 return e
LEFT | RIGHT

Powered by Google App Engine
This is Rietveld