Rietveld Code Review Tool

Unified Diff: sitescripts/crawler/web/crawler.py

Issue 8492019: sitescripts: Collect unmatched filters (Closed)
Patch Set: Created Oct. 2, 2012, 5:02 a.m.
 import MySQLdb, os, re, simplejson, sys
 from sitescripts.utils import cached, get_config
 from sitescripts.web import url_handler, basic_auth
 
 @cached(600)
 def _get_db():
   database = get_config().get("crawler", "database")
   dbuser = get_config().get("crawler", "dbuser")
   dbpasswd = get_config().get("crawler", "dbpassword")
   if os.name == "nt":

(...skipping 19 matching lines...)

   urls = _fetch_crawlable_sites()
   start_response("200 OK", [("Content-Type", "text/plain")])
   return "\n".join(urls)
 
 def _find_site_id(site_url):
   cursor = _get_cursor()
   cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
   result = cursor.fetchone()
   return result["id"] if result else None
 
+def _find_filter_id(filter_str):
+  if filter_str is None:
+    return None
+
+  cursor = _get_cursor()
+  query = "SELECT id FROM crawler_filters WHERE filter_hash = sha1(%s)"
+  cursor.execute(query, filter_str)
+  result = cursor.fetchone()
+  return result["id"] if result else None
+
 def _read_multipart_lines(environ, line_callback):
   data_file = environ["wsgi.input"]
   content_type = environ.get("CONTENT_TYPE")
   if not content_type:
     raise ValueError("Content-Type missing from header")
 
   match = re.search(r"boundary=(.*)", content_type)
   if not match:
     raise ValueError("Multipart form data or boundary declaration missing")
 

(...skipping 18 matching lines...)

       break
 
       if line:
         line_callback(line)
 
 def _create_run():
   cursor = _get_cursor()
   cursor.execute("INSERT INTO crawler_runs () VALUES ()")
   return cursor.lastrowid
 
-def _insert_data(run_id, site, url, filtered):
+def _insert_data(run_id, site, url, filtered, filter_str):
   site_id = _find_site_id(site)
   if site_id is None:
     print >>sys.stderr, "Unable to find site '%s' in the database" % site
     return
 
   cursor = _get_cursor()
-  cursor.execute("""
-    INSERT INTO crawler_data (run, site, url, filtered)
-    VALUES (%s, %s, %s, %s)""",
-    (run_id, site_id, url, filtered))
+  insert = """
+    INSERT INTO crawler_requests (run, site, url, filtered)
+    VALUES (%s, %s, %s, %s)"""
+  cursor.execute(insert, (run_id, site_id, url, filtered))
 
-@url_handler("/crawlerData")
+  filter_id = _find_filter_id(filter_str)
+  if filter_id is not None:
+    update = "UPDATE crawler_requests SET filter = %s WHERE id = %s";
+    cursor.execute(update, (filter_id, cursor.lastrowid))
+
+@url_handler("/crawlerRequests")
 @basic_auth("crawler")
-def crawler_data(environ, start_response):
+def crawler_requests(environ, start_response):
   def line_callback(line):
     try:
-      url, site, filtered = simplejson.loads(line)
-      _insert_data(run_id, site, url, filtered)
+      data = simplejson.loads(line)
+      if len(data) < 3:
+        print >>sys.stderr, "Not enough elements in line '%s'" % line
+        return
+      url = data[0]
+      site = data[1]
+      filtered = data[2]
+      filter_str = data[3] if len(data) >= 4 else None
+      _insert_data(run_id, site, url, filtered, filter_str)
     except simplejson.JSONDecodeError:
       print >>sys.stderr, "Unable to parse JSON from '%s'" % line
 
   run_id = _create_run()
   try:
     _read_multipart_lines(environ, line_callback)
     start_response("200 OK", [("Content-Type", "text/plain")])
     return ""
   except ValueError as e:
     start_response("401 Bad Request", [("Content-Type", "text/plain")])
+    print >>sys.stderr, "Unable to read multipart data: %s" % e
     return e
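
Note on the wire format: each line of the multipart body is a JSON array of the form [url, site, filtered], with an optional fourth element naming the filter that matched the request; crawler_requests tolerates the short form by falling back to None. Below is a minimal client sketch posting two such lines. The host, the credentials, the sample filter text, and the bare-part multipart layout (no per-part headers, which the elided parser above appears to accept) are illustrative assumptions, not part of this patch.

import base64, simplejson, urllib2

lines = [
  # [url, site, filtered, filter]; the fourth element may be omitted
  simplejson.dumps(["http://ads.example.com/banner.js",
                    "http://news.example.com/", True,
                    "||ads.example.com^"]),
  simplejson.dumps(["http://cdn.example.com/style.css",
                    "http://news.example.com/", False]),
]

# Assemble a bare multipart body: boundary, blank line, one JSON line.
boundary = "----crawlerboundary"
body = ""
for line in lines:
  body += "--%s\r\n\r\n%s\r\n" % (boundary, line)
body += "--%s--\r\n" % boundary

# Hypothetical endpoint and credentials for illustration only.
request = urllib2.Request("http://example.com/crawlerRequests", body)
request.add_header("Content-Type",
                   "multipart/form-data; boundary=%s" % boundary)
request.add_header("Authorization",
                   "Basic " + base64.b64encode("user:secret"))
print urllib2.urlopen(request).read()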
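Note on the hash lookup: _find_filter_id looks filters up by SHA-1 digest rather than by raw text, so the lookup can use a fixed-width indexed column even though filter strings have unbounded length. Here is a sketch of the client-side equivalent of MySQL's sha1(), assuming filter_hash stores the lowercase hex digest; the crawler_filters schema itself lives in the adjacent sitescripts/crawler/schema.sql and is not shown here.

import hashlib

def filter_hash(filter_str):
  # MySQL's sha1() returns the lowercase hex digest of the raw bytes,
  # which is exactly what hashlib produces here.
  return hashlib.sha1(filter_str).hexdigest()

print filter_hash("||ads.example.com^")  # 40-character hex string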
