
Unified Diff: sitescripts/crawler/web/crawler.py

Issue 8327353: Crawler backend (Closed)
Patch Set: README fix (created Sept. 14, 2012, 2:42 p.m.)
-import MySQLdb, os
+import MySQLdb, os, simplejson
 from sitescripts.utils import cached, get_config
-from sitescripts.web import url_handler
-from urlparse import parse_qs
+from sitescripts.web import url_handler, basic_auth

 @cached(600)
-def get_db():
+def _get_db():
   database = get_config().get("crawler", "database")
   dbuser = get_config().get("crawler", "dbuser")
   dbpasswd = get_config().get("crawler", "dbpassword")
   if os.name == "nt":
     return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8", named_pipe=True)
   else:
     return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8")

 def get_cursor():
-  return get_db().cursor(MySQLdb.cursors.DictCursor)
+  return _get_db().cursor(MySQLdb.cursors.DictCursor)

-def fetch_crawlable_urls():
+def _fetch_crawlable_sites():
   cursor = get_cursor()
   cursor.execute("SELECT url from crawler_sites")
   results = cursor.fetchall()
-  urls = [result["url"] for result in results]
-  return urls
+  sites = [result["url"] for result in results]
+  return sites

-@url_handler("/crawlableUrls")
-def crawlable_urls(environ, start_response):
-  urls = fetch_crawlable_urls()
+@url_handler("/crawlableSites")
+@basic_auth
+def crawlable_sites(environ, start_response):
+  urls = _fetch_crawlable_sites()
   start_response("200 OK", [("Content-Type", "text/plain")])
   return "\n".join(urls)

-@url_handler("/crawlerRun")
-def crawler_run(environ, start_response):
-  cursor = get_cursor()
-  cursor.execute("INSERT INTO crawler_runs () VALUES ()")
-  start_response("200 OK", [("Content-Type", "text/plain")])
-  return str(cursor.lastrowid)
-
-def find_site_id(site_url):
+def _find_site_id(site_url):
   cursor = get_cursor()
   cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
   return cursor.fetchall()[0]["id"]
Wladimir Palant 2012/09/14 17:24:18 a) Why use fetchall() if we are only interested in
Felix Dahlke 2012/09/26 15:20:30 Done.
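The change the first comment seems to be asking for is small; a sketch of _find_site_id using the DB-API's fetchone(), which returns a single row instead of materializing the whole result set:

def _find_site_id(site_url):
  cursor = get_cursor()
  cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
  # fetchone() returns one dict row from the DictCursor, or None if the site is unknown
  result = cursor.fetchone()
  return result["id"] if result else None

Unlike fetchall()[0], this returns None rather than raising IndexError when the URL is not in crawler_sites.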

-@url_handler("/crawlerData")
-def crawler_data(environ, start_response):
-  params = parse_qs(environ["QUERY_STRING"])
-  run_id = params["run"][0]
-  site_id = find_site_id(params["site"][0])
-  request_url = params["request_url"][0]
+def _read_multipart_lines(environ, line_callback):
+  data_file = environ["wsgi.input"]
+  current_line = 0
+
+  while True:
+    line = data_file.readline().strip()
Wladimir Palant 2012/09/14 17:24:18 Same as earlier, please use: for line in enviro
Felix Dahlke 2012/09/26 15:20:30 Done.
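The suggestion above is cut off in the review snippet, but it appears to be about iterating over environ["wsgi.input"] directly rather than counting readline() calls (PEP 333 requires the input stream to be iterable). A rough sketch of that style, keeping the boundary handling from the patch but using the blank line after the part headers to mark where the payload starts:

def _read_multipart_lines(environ, line_callback):
  boundary = None
  in_headers = True
  for raw_line in environ["wsgi.input"]:
    line = raw_line.strip()
    if boundary is None:
      boundary = line          # first body line is the multipart boundary marker
      continue
    if in_headers:
      if not line:             # blank line separates part headers from the payload
        in_headers = False
      continue
    if line.startswith(boundary):
      break                    # closing boundary ends the payload
    if line:
      line_callback(line)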
-  document_url = params["document_url"][0]
+    current_line += 1
+
+    if current_line == 1:
+      boundary = line
+      continue
+
+    if current_line < 5 or not line:
Wladimir Palant 2012/09/14 17:24:18 No, that's not how you parse multipart/form-data (
Felix Dahlke 2012/09/26 15:20:30 Done. Wow, that was one mean hack I did there :)
Wladimir Palant 2012/09/27 07:34:17 Much better now :)
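For comparison with the hand-rolled reader above, Python 2's cgi module can parse multipart/form-data straight from a WSGI environ, at the cost of buffering the whole upload in memory. A hypothetical sketch (the field name "data" is an assumption, not something the patch defines):

import cgi

def _read_upload_lines(environ):
  # Let the stdlib handle boundaries and part headers.
  form = cgi.FieldStorage(fp=environ["wsgi.input"], environ=environ, keep_blank_values=True)
  # "data" is a hypothetical form field carrying the line-oriented payload.
  return form.getvalue("data", "").splitlines()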
+      continue
+
+    if line.startswith(boundary):
+      break
+
+    line_callback(line)
+
+def _create_run():
+  cursor = get_cursor()
+  cursor.execute("INSERT INTO crawler_runs () VALUES ()")
+  return cursor.lastrowid
+
+def _insert_data(run_id, site, url, filtered):
+  site_id = _find_site_id(site)
   cursor = get_cursor()
   cursor.execute("""
-    INSERT INTO crawler_data (run, site, request_url, document_url)
+    INSERT INTO crawler_data (run, site, url, filtered)
     VALUES (%s, %s, %s, %s)""",
-    (run_id, site_id, request_url, document_url))
+    (run_id, site_id, url, filtered))
+
+@url_handler("/crawlerData")
+@basic_auth
+def crawler_data(environ, start_response):
+  def line_callback(line):
+    url, site, filtered = simplejson.loads(line)
+    _insert_data(run_id, site, url, filtered)
+
+  run_id = _create_run()
+  _read_multipart_lines(environ, line_callback)
   start_response("200 OK", [("Content-Type", "text/plain")])
   return ""
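For reference, the wire format the new /crawlerData handler ends up expecting: a basic-auth POST whose body starts with a boundary line, is followed by a few part-header lines, and then carries one JSON array [url, site, filtered] per line. A hypothetical client sketch (host, credentials, and boundary are invented, not part of the patch):

import base64, urllib2, simplejson

def post_crawler_data(base_url, user, password, entries):
  boundary = "----crawlerdataboundary"
  lines = ["--" + boundary,
           'Content-Disposition: form-data; name="data"',
           "Content-Type: text/plain",
           ""]
  # One JSON-encoded [url, site, filtered] triple per line, as crawler_data parses them.
  lines += [simplejson.dumps(list(entry)) for entry in entries]
  lines.append("--" + boundary + "--")
  body = "\r\n".join(lines)

  request = urllib2.Request(base_url + "/crawlerData", body)
  request.add_header("Content-Type", "multipart/form-data; boundary=" + boundary)
  request.add_header("Authorization", "Basic " + base64.b64encode("%s:%s" % (user, password)))
  return urllib2.urlopen(request).read()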