
Delta Between Two Patch Sets: run.py

Issue 5288886037118976: Adblock Plus Crawler rewrite (Closed)
Left Patch Set: created April 24, 2015, 3:38 p.m.
Right Patch Set ("Addressed comments"): created May 7, 2015, 12:04 a.m.

--- run.py (Left Patch Set)
+++ run.py (Right Patch Set)
 #!/usr/bin/env python
 # coding: utf-8
 
 import argparse
 import datetime
 import errno
 import hashlib
 import io
 import json
 import os
 import random
 import subprocess
 import sys
 import tempfile
 import threading
 import urllib
 import urlparse
 from wsgiref.simple_server import make_server
 
 from mozprofile import FirefoxProfile
 from mozrunner import FirefoxRunner
 
 class CrawlerApp:
-  server = None
   def __init__(self, parameters):
     self.parameters = parameters
-    with io.open(self.parameters.list, 'r', encoding='utf-8') as handle:
-      self.urls = map(unicode.strip, handle.readlines())
+    with io.open(self.parameters.list, 'r', encoding='utf-8') as file:
+      self.urls = [line.strip() for line in file]
 
   def __call__(self, environ, start_response):
     path = environ.get('PATH_INFO', '')
     if path == '/parameters':
       start_response('200 OK', [('Content-Type', 'application/json')])
       return [json.dumps({
         'urls': self.urls,
         'timeout': self.parameters.timeout * 1000,
         'maxtabs': self.parameters.maxtabs,
       })]
-    elif path == '/save':
+    if path == '/save':
       try:
         request_body_size = int(environ.get('CONTENT_LENGTH', 0))
-      except (ValueError):
-        start_response('400 Bad Request', [])
-        return ''
+      except ValueError:
+        start_response('411 Length Required', [])
+        return []
 
       data = json.loads(environ['wsgi.input'].read(request_body_size))
       self.urls.remove(data['url'])
 
       parsedurl = urlparse.urlparse(data['url'])
       urlhash = hashlib.new('md5', data['url']).hexdigest()
       timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f')
       basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash)
       datapath = os.path.join(self.parameters.outdir, basename + ".json")
       screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg")
       sourcepath = os.path.join(self.parameters.outdir, basename + ".xml")
 
       try:
         os.makedirs(self.parameters.outdir)
       except OSError as e:
         if e.errno != errno.EEXIST:
           raise
 
-      if "screenshot" in data:
-        with open(screenshotpath, 'wb') as handle:
-          handle.write(urllib.urlopen(data["screenshot"]).read())
-        del data["screenshot"]
-
-      if "source" in data:
-        with io.open(sourcepath, 'w', encoding='utf-8') as handle:
-          handle.write(data["source"])
-        del data["source"]
-
-      with io.open(datapath, 'w', encoding='utf-8') as handle:
-        handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n')
Sebastian Noack 2015/04/27 14:55:50 How about json.dump(data, handle, ..)?
+      screenshot = data.pop("screenshot", None)
+      if screenshot:
+        with open(screenshotpath, 'wb') as file:
+          response = urllib.urlopen(screenshot)
+          try:
+            file.write(response.read())
+          finally:
+            response.close()
+
+      source = data.pop("source", None)
+      if source:
+        with io.open(sourcepath, 'w', encoding='utf-8') as file:
+          file.write(source)
+
+      with io.open(datapath, 'w', encoding='utf-8') as file:
+        file.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n')
       start_response('204 No Content', [])
-      return ''
+      return []
 
     start_response('404 Not Found', [])
-    return ''
+    return []
 
 def run():
   parser = argparse.ArgumentParser(description='Run crawler')
   parser.add_argument(
     '-b', '--binary', type=str,
     help='path to the Firefox binary'
   )
   parser.add_argument(
     '-a', '--abpdir', type=str,
     help='path to the Adblock Plus repository'
(...skipping 18 matching lines...)
   parser.add_argument(
     'outdir', type=str,
     help='directory to write data into'
   )
   parameters = parser.parse_args()
 
   import buildtools.packagerGecko as packager
   cleanup = []
   try:
     base_dir = os.path.dirname(__file__)
-    handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi')
-    os.close(handle)
+    file, crawlerxpi = tempfile.mkstemp(suffix='.xpi')
+    os.close(file)
     cleanup.append(crawlerxpi)
     packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True)
 
     abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi'
     if parameters.abpdir:
-      handle, abpxpi = tempfile.mkstemp(suffix='.xpi')
-      os.close(handle)
+      file, abpxpi = tempfile.mkstemp(suffix='.xpi')
+      os.close(file)
       cleanup.append(abpxpi)
       packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True)
 
     profile = FirefoxProfile(
       addons=[
         crawlerxpi,
         abpxpi,
       ],
       preferences={
         'browser.uitour.enabled': False,
         'prompts.tab_modal.enabled': False,
       }
     )
 
     abpsettings = os.path.join(profile.profile, 'adblockplus')
     os.makedirs(abpsettings)
-    with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as handle:
-      print >>handle, '# Adblock Plus preferences'
-      print >>handle, 'version=4'
+    with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as file:
+      print >>file, '# Adblock Plus preferences'
+      print >>file, 'version=4'
       for url in parameters.filters:
         if '=' in url:
           path, url = url.split('=', 1)
           with open(path, 'r') as source:
             data = source.read()
         else:
           data = urllib.urlopen(url).read()
-        print >>handle, '[Subscription]'
-        print >>handle, 'url=%s' % url
-        print >>handle, '[Subscription filters]'
-        print >>handle, '\n'.join(data.splitlines()[1:])
+        print >>file, '[Subscription]'
+        print >>file, 'url=%s' % url
+        print >>file, '[Subscription filters]'
+        print >>file, '\n'.join(data.splitlines()[1:])
   finally:
     for path in cleanup:
       os.unlink(path)
 
   server = None
   try:
     port = random.randrange(2000, 60000)
     print "Communicating with client on port %i" % port
 
     app = CrawlerApp(parameters)
     server = make_server('localhost', port, app)
-    app.server = server
Wladimir Palant 2015/05/07 00:04:59 Done.
-    threading.Thread(target=lambda: server.serve_forever()).start()
+
+    thread = threading.Thread(target=server.serve_forever)
+    thread.daemon = True
+    thread.start()
 
     runner = FirefoxRunner(
       profile=profile,
       binary=parameters.binary,
       cmdargs=['--crawler-port', str(port)],
       env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'),
     )
     while app.urls:
       runner.start()
       runner.wait()
   finally:
     if server:
       server.shutdown()
     profile.cleanup()
 
 if __name__ == '__main__':
   BASE_DIR = os.path.dirname(os.path.abspath(__file__))
   DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py")
 
   try:
     subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR])
   except subprocess.CalledProcessError as e:
     print >>sys.stderr, e
     print >>sys.stderr, "Failed to ensure dependencies being up-to-date!"
 
   run()
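
A note on Sebastian Noack's json.dump() suggestion above: it is not quite a drop-in replacement here. With ensure_ascii=False, Python 2's json.dump() can hand a mix of str and unicode chunks to file.write(), and the unicode-only text handles returned by io.open() reject plain str — presumably why the patch keeps the unicode(json.dumps(...)) wrapper. A minimal sketch of a streaming variant that sidesteps the issue without building the whole document in memory (untested against this codebase; the write_data() helper name is hypothetical):

import io
import json

def write_data(datapath, data):
  # Stream the encoded JSON chunk by chunk instead of materializing
  # the full string with json.dumps() first.
  encoder = json.JSONEncoder(indent=2, ensure_ascii=False, sort_keys=True)
  with io.open(datapath, 'w', encoding='utf-8') as file:
    for chunk in encoder.iterencode(data):
      # iterencode() may yield str (ASCII-only) or unicode chunks;
      # the text-mode handle requires unicode, so convert explicitly.
      file.write(unicode(chunk))
    file.write(u'\n')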
