Rietveld Code Review Tool

Unified diff: sitescripts/crawler/web/crawler.py

Issue 9045097: sitescripts: Unmerged changes (Closed)
Patch Set: Created Dec. 21, 2012, 9:39 a.m.
 import MySQLdb, os, re, simplejson, sys
 from sitescripts.utils import cached, get_config
 from sitescripts.web import url_handler, basic_auth
 
 @cached(600)
 def _get_db():
   database = get_config().get("crawler", "database")
   dbuser = get_config().get("crawler", "dbuser")
   dbpasswd = get_config().get("crawler", "dbpassword")
   if os.name == "nt":
(...skipping 65 matching lines...)
   return cursor.lastrowid
 
 def _insert_data(run_id, site, url, filtered):
   site_id = _find_site_id(site)
   if site_id is None:
     print >>sys.stderr, "Unable to find site '%s' in the database" % site
     return
 
   cursor = _get_cursor()
   cursor.execute("""
-    INSERT INTO crawler_data (run, site, url, filtered)
+    INSERT INTO crawler_requests (run, site, url, filtered)
     VALUES (%s, %s, %s, %s)""",
     (run_id, site_id, url, filtered))
 
-@url_handler("/crawlerData")
+@url_handler("/crawlerRequests")
 @basic_auth("crawler")
-def crawler_data(environ, start_response):
+def crawler_requests(environ, start_response):
   def line_callback(line):
     try:
-      url, site, filtered = simplejson.loads(line)
+      data = simplejson.loads(line)
+      if len(data) < 3:
+        print >>sys.stderr, "Not enough elements in line '%s'" % line
+        return
+      url = data[0]
+      site = data[1]
+      filtered = data[2]
       _insert_data(run_id, site, url, filtered)
     except simplejson.JSONDecodeError:
       print >>sys.stderr, "Unable to parse JSON from '%s'" % line
 
   run_id = _create_run()
   try:
     _read_multipart_lines(environ, line_callback)
     start_response("200 OK", [("Content-Type", "text/plain")])
     return ""
   except ValueError as e:
     start_response("401 Bad Request", [("Content-Type", "text/plain")])
+    print >>sys.stderr, "Unable to read multipart data: %s" % e
     return e
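
For reference, a minimal client-side sketch of the line format the renamed crawler_requests() handler consumes: each uploaded line is a JSON array [url, site, filtered], which line_callback() now indexes positionally. The helper name encode_request_line and the example URLs below are hypothetical; only the per-line array layout is taken from the patch itself.

# Hypothetical client-side sketch (not part of this patch): it illustrates
# the per-line payload that crawler_requests() expects, i.e. one JSON array
# [url, site, filtered] per observed request, matching the indices read in
# line_callback above.
import simplejson

def encode_request_line(url, site, filtered):
  # filtered is a boolean flag (presumably whether a filter matched the request)
  return simplejson.dumps([url, site, filtered])

lines = [
  encode_request_line("http://example.com/ad.js", "example.com", True),
  encode_request_line("http://example.com/style.css", "example.com", False),
]
payload = "\n".join(lines)
# The payload would then be uploaded as the multipart body that
# _read_multipart_lines() parses; that helper's exact field layout is not
# shown in this patch, so the upload step is omitted here.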
