Rietveld Code Review Tool

Unified Diff: sitescripts/crawler/web/crawler.py

Issue 29345242: Noissue - Adapt quotes for compliance with our coding style in sitescripts (Closed)
Patch Set: Fixed raw string (created May 30, 2016, 8:47 a.m.)
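The style rule the patch applies: string literals use single quotes by default; double quotes remain only where the literal itself contains a single quote (hence the unchanged print >>sys.stderr messages below), and raw string literals follow the same rule. A condensed illustration of the convention, assembled from lines in the diff rather than taken from any style document:

    database = get_config().get('crawler', 'database')   # default: single quotes
    message = "Unable to find site '%s'" % site          # contains an apostrophe: double quotes
    pattern = r'boundary=(.*)'                           # raw strings, same rule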
--- a/sitescripts/crawler/web/crawler.py
+++ b/sitescripts/crawler/web/crawler.py
@@ -1,127 +1,127 @@
 import MySQLdb
 import os
 import re
 import json
 import sys
 from sitescripts.utils import cached, get_config
 from sitescripts.web import url_handler, basic_auth


 @cached(600)
 def _get_db():
-    database = get_config().get("crawler", "database")
-    dbuser = get_config().get("crawler", "dbuser")
-    dbpasswd = get_config().get("crawler", "dbpassword")
-    if os.name == "nt":
+    database = get_config().get('crawler', 'database')
+    dbuser = get_config().get('crawler', 'dbuser')
+    dbpasswd = get_config().get('crawler', 'dbpassword')
+    if os.name == 'nt':
         return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
-                               use_unicode=True, charset="utf8", named_pipe=True )
+                               use_unicode=True, charset='utf8', named_pipe=True )
     else:
         return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
-                               use_unicode=True, charset="utf8")
+                               use_unicode=True, charset='utf8')


 def _get_cursor():
     return _get_db().cursor(MySQLdb.cursors.DictCursor)


 def _fetch_crawlable_sites():
     cursor = _get_cursor()
-    cursor.execute("SELECT url from crawler_sites")
+    cursor.execute('SELECT url from crawler_sites')
     results = cursor.fetchall()
-    sites = [result["url"] for result in results]
+    sites = [result['url'] for result in results]
     return sites


-@url_handler("/crawlableSites")
-@basic_auth("crawler")
+@url_handler('/crawlableSites')
+@basic_auth('crawler')
 def crawlable_sites(environ, start_response):
     urls = _fetch_crawlable_sites()
-    start_response("200 OK", [("Content-Type", "text/plain")])
-    return "\n".join(urls)
+    start_response('200 OK', [('Content-Type', 'text/plain')])
+    return '\n'.join(urls)


 def _find_site_id(site_url):
     cursor = _get_cursor()
-    cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
+    cursor.execute('SELECT id FROM crawler_sites WHERE url = %s', site_url)
     result = cursor.fetchone()
-    return result["id"] if result else None
+    return result['id'] if result else None


 def _read_multipart_lines(environ, line_callback):
-    data_file = environ["wsgi.input"]
-    content_type = environ.get("CONTENT_TYPE")
+    data_file = environ['wsgi.input']
+    content_type = environ.get('CONTENT_TYPE')
     if not content_type:
-        raise ValueError("Content-Type missing from header")
+        raise ValueError('Content-Type missing from header')

-    match = re.search(r"boundary=(.*)", content_type)
+    match = re.search(r'boundary=(.*)', content_type)
     if not match:
-        raise ValueError("Multipart form data or boundary declaration missing")
+        raise ValueError('Multipart form data or boundary declaration missing')

     boundary = match.group(1)
     boundary_passed = False
     header_passed = False

     for line in data_file:
         line = line.strip()

         if not boundary_passed:
-            if line == "--" + boundary:
+            if line == '--' + boundary:
                 boundary_passed = True
             continue

         if not header_passed:
             if not line:
                 header_passed = True
             continue

-        if line == "--" + boundary + "--":
+        if line == '--' + boundary + '--':
             break

         if line:
             line_callback(line)


 def _create_run():
     cursor = _get_cursor()
-    cursor.execute("INSERT INTO crawler_runs () VALUES ()")
+    cursor.execute('INSERT INTO crawler_runs () VALUES ()')
     return cursor.lastrowid


 def _insert_data(run_id, site, url, filtered):
     site_id = _find_site_id(site)
     if site_id is None:
         print >>sys.stderr, "Unable to find site '%s' in the database" % site
         return

     cursor = _get_cursor()
-    cursor.execute("""
+    cursor.execute('''
         INSERT INTO crawler_requests (run, site, url, filtered)
-        VALUES (%s, %s, %s, %s)""",
+        VALUES (%s, %s, %s, %s)''',
                    (run_id, site_id, url, filtered))


-@url_handler("/crawlerRequests")
-@basic_auth("crawler")
+@url_handler('/crawlerRequests')
+@basic_auth('crawler')
 def crawler_requests(environ, start_response):
     def line_callback(line):
         try:
             data = json.loads(line)
             if len(data) < 3:
                 print >>sys.stderr, "Not enough elements in line '%s'" % line
                 return
             url = data[0]
             site = data[1]
             filtered = data[2]
             _insert_data(run_id, site, url, filtered)
         except json.JSONDecodeError:
             print >>sys.stderr, "Unable to parse JSON from '%s'" % line

     run_id = _create_run()
     try:
         _read_multipart_lines(environ, line_callback)
-        start_response("200 OK", [("Content-Type", "text/plain")])
-        return ""
+        start_response('200 OK', [('Content-Type', 'text/plain')])
+        return ''
     except ValueError as e:
-        start_response("401 Bad Request", [("Content-Type", "text/plain")])
-        print >>sys.stderr, "Unable to read multipart data: %s" % e
+        start_response('401 Bad Request', [('Content-Type', 'text/plain')])
+        print >>sys.stderr, 'Unable to read multipart data: %s' % e
         return e
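For reviewers tracing _read_multipart_lines and crawler_requests above: the parser skips everything up to the boundary line and the part headers, then feeds each remaining non-empty line to line_callback, which expects a JSON array of at least three elements in the order [url, site, filtered]. A request body of the following shape would satisfy it; the boundary string and field name are illustrative assumptions, not taken from this patch:

    --exampleboundary
    Content-Disposition: form-data; name="data"

    ["http://example.com/ad.js", "example.com", true]
    ["http://example.com/logo.png", "example.com", false]
    --exampleboundary--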
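And a minimal client sketch in the same Python 2 dialect as the module, assuming a hypothetical deployment URL and credentials (neither appears in this patch); the /crawlerRequests endpoint and the 'crawler' basic-auth realm are the ones defined above:

    import base64
    import json
    import urllib2

    # Assemble one multipart part whose body is JSON lines of
    # [url, site, filtered], matching what line_callback expects.
    boundary = 'exampleboundary'
    records = [
        ['http://example.com/ad.js', 'example.com', True],
        ['http://example.com/logo.png', 'example.com', False],
    ]
    body = '--%s\r\n' % boundary
    body += 'Content-Disposition: form-data; name="data"\r\n\r\n'
    body += '\r\n'.join(json.dumps(record) for record in records)
    body += '\r\n--%s--\r\n' % boundary

    # Hypothetical host and credentials, for illustration only.
    request = urllib2.Request('https://example.com/crawlerRequests', body)
    request.add_header('Content-Type',
                       'multipart/form-data; boundary=' + boundary)
    request.add_header('Authorization',
                       'Basic ' + base64.b64encode('user:password'))
    print urllib2.urlopen(request).read()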
