Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: sitescripts/crawler/web/crawler.py

Issue 8327353: Crawler backend (Closed)
Patch Set: Created Sept. 27, 2012, 9:26 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « sitescripts/crawler/schema.sql ('k') | sitescripts/extensions/bin/createNightlies.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 import MySQLdb, os 1 import MySQLdb, os, re, simplejson, sys
2 from sitescripts.utils import cached, get_config 2 from sitescripts.utils import cached, get_config
3 from sitescripts.web import url_handler 3 from sitescripts.web import url_handler, basic_auth
4
5 @url_handler('/crawlableUrls')
6 def listUrls(environ, start_response):
7 urls = fetch_crawlable_urls()
8 start_response('200 OK', [('Content-Type', 'text/plain')])
9 return '\n'.join(urls)
10
11 def fetch_crawlable_urls():
12 cursor = get_db().cursor(MySQLdb.cursors.DictCursor)
13 executeQuery(cursor, 'SELECT url from crawler_urls')
14 results = cursor.fetchall()
15 urls = [result['url'] for result in results]
16 return urls
17 4
@cached(600)
def _get_db():
  """Return a memoized MySQL connection to the crawler database.

  Connection parameters are read from the [crawler] section of the site
  configuration.  On Windows ("nt") the server is reached through a named
  pipe instead of a TCP socket.  The @cached decorator keeps one connection
  alive for 600 seconds.
  """
  config = get_config()
  connect_args = dict(
    user=config.get("crawler", "dbuser"),
    passwd=config.get("crawler", "dbpassword"),
    db=config.get("crawler", "database"),
    use_unicode=True,
    charset="utf8",
  )
  if os.name == "nt":
    connect_args["named_pipe"] = True
  return MySQLdb.connect(**connect_args)
27 16
def get_cursor():
  """Return a DictCursor (rows as dicts) on the cached crawler DB connection."""
  connection = _get_db()
  return connection.cursor(MySQLdb.cursors.DictCursor)
19
def _fetch_crawlable_sites():
  """Return the list of site URLs registered in the crawler_sites table."""
  cursor = get_cursor()
  cursor.execute("SELECT url from crawler_sites")
  return [row["url"] for row in cursor.fetchall()]
26
@url_handler("/crawlableSites")
@basic_auth(config_section="crawler")
def crawlable_sites(environ, start_response):
  """WSGI handler: list all crawlable site URLs, one per line, as text/plain.

  Requires HTTP basic authentication; the credentials are checked against
  the [crawler] configuration section (bare @basic_auth would fall back to
  the default section -- fix confirmed in review).
  """
  urls = _fetch_crawlable_sites()
  start_response("200 OK", [("Content-Type", "text/plain")])
  return "\n".join(urls)
33
def _find_site_id(site_url):
  """Look up the primary key of a site by its URL.

  Returns the id from crawler_sites, or None when the URL is unknown.
  """
  cursor = get_cursor()
  cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
  row = cursor.fetchone()
  if not row:
    return None
  return row["id"]
39
40 def _read_multipart_lines(environ, line_callback):
41 data_file = environ["wsgi.input"]
42 content_type = environ["CONTENT_TYPE"]
43 if not content_type:
44 raise ValueError("Content-Type missing from header")
Wladimir Palant 2012/09/27 13:44:51 Python doesn't work like that - accessing a non-ex
Felix Dahlke 2012/09/27 14:15:33 Done.
45
46 match = re.search(r"boundary=(.*)", content_type)
47 if not match:
48 raise ValueError("Multipart form data or boundary declaration missing")
49
50 boundary = match.group(1)
51 boundary_passed = False
52 header_passed = False
53
54 for line in data_file:
55 line = line.strip()
56
57 if not boundary_passed:
58 if line == "--" + boundary:
59 boundary_passed = True
60 continue
61
62 if not header_passed:
63 if not line:
64 header_passed = True
65 continue
66
67 if line == "--" + boundary + "--":
68 break
69
70 if line:
71 line_callback(line)
72
73 return True
Wladimir Palant 2012/09/27 13:44:51 That return value is unused, left-over from earlie
Felix Dahlke 2012/09/27 14:15:33 Done.
74
def _create_run():
  """Start a new crawler run.

  Inserts an empty row into crawler_runs and returns the auto-increment
  id assigned to it.
  """
  db_cursor = get_cursor()
  db_cursor.execute("INSERT INTO crawler_runs () VALUES ()")
  return db_cursor.lastrowid
79
def _insert_data(run_id, site, url, filtered):
  """Record one crawled URL (and its filter verdict) for the given run.

  The site URL is resolved to its crawler_sites id first; unknown sites
  are reported on stderr and the data point is dropped.
  """
  site_id = _find_site_id(site)
  if site_id is None:
    sys.stderr.write("Unable to find site '%s' in the database\n" % site)
    return

  cursor = get_cursor()
  cursor.execute("""
    INSERT INTO crawler_data (run, site, url, filtered)
    VALUES (%s, %s, %s, %s)""",
                 (run_id, site_id, url, filtered))
91
@url_handler("/crawlerData")
@basic_auth(config_section="crawler")
def crawler_data(environ, start_response):
  """WSGI handler: receive crawler results as multipart form data.

  Each payload line is a JSON array [url, site, filtered]; every valid
  line is stored under a freshly created run id.  Malformed JSON lines
  are reported on stderr and skipped.  Credentials are checked against
  the [crawler] configuration section.
  """
  def line_callback(line):
    try:
      url, site, filtered = simplejson.loads(line)
      _insert_data(run_id, site, url, filtered)
    except simplejson.JSONDecodeError:
      sys.stderr.write("Unable to parse JSON from '%s'\n" % line)

  run_id = _create_run()
  try:
    _read_multipart_lines(environ, line_callback)
    start_response("200 OK", [("Content-Type", "text/plain")])
    return ""
  except ValueError as e:
    # A malformed request is 400 Bad Request (401 means Unauthorized),
    # and the WSGI body must be a string, not the exception object.
    start_response("400 Bad Request", [("Content-Type", "text/plain")])
    return str(e)
OLDNEW
« no previous file with comments | « sitescripts/crawler/schema.sql ('k') | sitescripts/extensions/bin/createNightlies.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld