Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: sitescripts/crawler/web/crawler.py

Issue 8327353: Crawler backend (Closed)
Left Patch Set: Created Sept. 27, 2012, 6:22 a.m.
Right Patch Set: Created Sept. 27, 2012, 2:15 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
LEFTRIGHT
1 import MySQLdb, os, re, simplejson, sys 1 import MySQLdb, os, re, simplejson, sys
2 from sitescripts.utils import cached, get_config 2 from sitescripts.utils import cached, get_config
3 from sitescripts.web import url_handler, basic_auth 3 from sitescripts.web import url_handler, basic_auth
4 4
@cached(600)
def _get_db():
  """Return a cached connection to the crawler database.

  Connection parameters are read from the [crawler] section of the site
  configuration.  The @cached(600) decorator reuses the same connection
  for ten minutes.  On Windows ("nt") MySQL is reached through a named
  pipe instead of a socket; all other parameters are identical, so they
  are built once instead of being duplicated per platform.
  """
  config = get_config()
  params = {
    "user": config.get("crawler", "dbuser"),
    "passwd": config.get("crawler", "dbpassword"),
    "db": config.get("crawler", "database"),
    "use_unicode": True,
    "charset": "utf8",
  }
  if os.name == "nt":
    params["named_pipe"] = True
  return MySQLdb.connect(**params)
16 16
def get_cursor():
  """Return a DictCursor (rows as dicts) on the shared connection."""
  connection = _get_db()
  return connection.cursor(MySQLdb.cursors.DictCursor)
19 19
def _fetch_crawlable_sites():
  """Return the list of all site URLs from the crawler_sites table."""
  cursor = get_cursor()
  cursor.execute("SELECT url from crawler_sites")
  return [row["url"] for row in cursor.fetchall()]
26 26
@url_handler("/crawlableSites")
@basic_auth("crawler")
def crawlable_sites(environ, start_response):
  """WSGI handler: list all crawlable site URLs, one per line."""
  # Fetch before starting the response so a database error does not
  # surface after headers have been sent.
  site_urls = _fetch_crawlable_sites()
  start_response("200 OK", [("Content-Type", "text/plain")])
  return "\n".join(site_urls)
33 33
def _find_site_id(site_url):
  """Look up the primary key of a site by URL.

  Returns the id column of the matching crawler_sites row, or None when
  the URL is unknown.
  """
  cursor = get_cursor()
  cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
  row = cursor.fetchone()
  if row:
    return row["id"]
  return None
39 39
40 def _read_multipart_lines(environ, line_callback): 40 def _read_multipart_lines(environ, line_callback):
41 data_file = environ["wsgi.input"] 41 data_file = environ["wsgi.input"]
42 boundary = re.search(r"boundary=(.*)", environ["CONTENT_TYPE"]).group(1) 42 content_type = environ.get("CONTENT_TYPE")
Wladimir Palant 2012/09/27 07:34:17 Assumptions here: 1) There is a CONTENT_TYPE head
Felix Dahlke 2012/09/27 09:26:24 Done. You're finding a lot of those because my min
43 if not content_type:
44 raise ValueError("Content-Type missing from header")
45
46 match = re.search(r"boundary=(.*)", content_type)
47 if not match:
48 raise ValueError("Multipart form data or boundary declaration missing")
49
50 boundary = match.group(1)
43 boundary_passed = False 51 boundary_passed = False
44 header_passed = False 52 header_passed = False
45 53
46 for line in data_file: 54 for line in data_file:
47 line = line.strip() 55 line = line.strip()
48 56
49 if not boundary_passed: 57 if not boundary_passed:
50 if line == "--" + boundary: 58 if line == "--" + boundary:
51 boundary_passed = True 59 boundary_passed = True
52 continue 60 continue
(...skipping 12 matching lines...) Expand all
def _create_run():
  """Start a new crawler run.

  Inserts an all-defaults row into crawler_runs and returns its
  auto-generated id.
  """
  db_cursor = get_cursor()
  db_cursor.execute("INSERT INTO crawler_runs () VALUES ()")
  return db_cursor.lastrowid
69 77
def _insert_data(run_id, site, url, filtered):
  """Record one crawled URL for the given run.

  The site URL is resolved to its crawler_sites id first; unknown sites
  are reported on stderr and skipped so a single bad entry does not
  abort the whole upload.
  """
  site_id = _find_site_id(site)
  if site_id is None:
    print >>sys.stderr, "Unable to find site '%s' in the database" % site
    return

  get_cursor().execute("""
    INSERT INTO crawler_data (run, site, url, filtered)
    VALUES (%s, %s, %s, %s)""",
    (run_id, site_id, url, filtered))
81 89
@url_handler("/crawlerData")
@basic_auth("crawler")
def crawler_data(environ, start_response):
  """WSGI handler accepting crawler results as a multipart upload.

  Each uploaded line is a JSON array [url, site, filtered]; all lines
  are stored under a freshly created run id.  Lines that fail to parse
  are logged to stderr and skipped.  A malformed request (missing or
  broken multipart headers, signalled by ValueError from
  _read_multipart_lines) yields a 400 response carrying the error text.
  """
  def line_callback(line):
    try:
      url, site, filtered = simplejson.loads(line)
      _insert_data(run_id, site, url, filtered)
    except simplejson.JSONDecodeError:
      print >>sys.stderr, "Unable to parse JSON from '%s'" % line

  run_id = _create_run()
  try:
    _read_multipart_lines(environ, line_callback)
    start_response("200 OK", [("Content-Type", "text/plain")])
    return ""
  except ValueError as e:
    # Bug fix: the status code for a malformed request is 400, not 401
    # (401 means Unauthorized, which contradicted the "Bad Request" text).
    start_response("400 Bad Request", [("Content-Type", "text/plain")])
    # Bug fix: WSGI bodies must be strings; returning the exception
    # object itself would break iteration over the response.
    return str(e)
LEFTRIGHT

Powered by Google App Engine
This is Rietveld