Rietveld Code Review Tool

Delta Between Two Patch Sets: run.py

Issue 29338442: Issue 3815 - Fix TabAllocator. Now it returns a tab with an initialized outerWindowID (Closed)
Left Patch Set: eliminate race conditions (created April 11, 2016, 3:06 p.m.)
Right Patch Set: remove additional empty line (created Sept. 16, 2016, 12:33 p.m.)
The delta between the two patch sets is purely cosmetic: the right patch set reindents the file from two-space to four-space indentation and adjusts blank lines (two blank lines before top-level definitions, one blank line after the class-level server attribute), without changing behavior. The listing below is the right (final) patch set.
#!/usr/bin/env python
# coding: utf-8

import argparse
import datetime
import errno
import hashlib
import io
import json
import os
import random
import subprocess
import sys
import tempfile
import threading
import urllib
import urlparse
from wsgiref.simple_server import make_server

from mozprofile import FirefoxProfile
from mozrunner import FirefoxRunner


class CrawlerApp:
    server = None

    def __init__(self, parameters):
        self.parameters = parameters
        with io.open(self.parameters.list, 'r', encoding='utf-8') as handle:
            self.urls = map(unicode.strip, handle.readlines())

    def __call__(self, environ, start_response):
        path = environ.get('PATH_INFO', '')
        if path == '/parameters':
            start_response('200 OK', [('Content-Type', 'application/json')])
            return [json.dumps({
                'urls': self.urls,
                'timeout': self.parameters.timeout * 1000,
                'maxtabs': self.parameters.maxtabs,
            })]
        elif path == '/save':
            try:
                request_body_size = int(environ.get('CONTENT_LENGTH', 0))
            except ValueError:
                start_response('400 Bad Request', [])
                return ''

            data = json.loads(environ['wsgi.input'].read(request_body_size))
            self.urls.remove(data['url'])

            fullurl = data['url']
            if not urlparse.urlparse(fullurl).scheme:
                fullurl = 'http://' + fullurl
            parsedurl = urlparse.urlparse(fullurl)
            urlhash = hashlib.new('md5', data['url']).hexdigest()
            timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f')
            basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash)
            datapath = os.path.join(self.parameters.outdir, basename + ".json")
            screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg")
            sourcepath = os.path.join(self.parameters.outdir, basename + ".xml")

            try:
                os.makedirs(self.parameters.outdir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

            if "screenshot" in data:
                with open(screenshotpath, 'wb') as handle:
                    handle.write(urllib.urlopen(data["screenshot"]).read())
                del data["screenshot"]

            if "source" in data:
                with io.open(sourcepath, 'w', encoding='utf-8') as handle:
                    handle.write(data["source"])
                del data["source"]

            with io.open(datapath, 'w', encoding='utf-8') as handle:
                handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n')
            start_response('204 No Content', [])
            return ''

        start_response('404 Not Found', [])
        return ''

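# Illustrative sketch (not part of the patch): the crawler extension is
# expected to POST a JSON body to /save roughly like the following; all
# values here are made up.
#
#     {
#         "url": "example.com",
#         "startTime": 1460386543000,
#         "screenshot": "data:image/jpeg;base64,...",
#         "source": "<html>...</html>"
#     }
#
# "screenshot" and "source" are optional: when present, the handler writes
# them to <basename>.jpg and <basename>.xml and deletes the keys, so only
# the remaining fields end up in <basename>.json.
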
def run():
    parser = argparse.ArgumentParser(description='Run crawler')
    parser.add_argument(
        '-b', '--binary', type=str,
        help='path to the Firefox binary'
    )
    parser.add_argument(
        '-a', '--abpdir', type=str,
        help='path to the Adblock Plus repository'
    )
    parser.add_argument(
        '-f', '--filters', metavar='url', type=str, nargs='+',
        default=["https://easylist-downloads.adblockplus.org/easylist.txt", "https://easylist-downloads.adblockplus.org/exceptionrules.txt"],
        help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, the data will be read from the specified path then.'
    )
    parser.add_argument(
        '-t', '--timeout', type=int, default=300,
        help='Load timeout (seconds)'
    )
    parser.add_argument(
        '-x', '--maxtabs', type=int, default=15,
        help='Maximal number of tabs to open in parallel'
    )
    parser.add_argument(
        'list', type=str,
        help='URL list to process'
    )
    parser.add_argument(
        'outdir', type=str,
        help='directory to write data into'
    )
    parameters = parser.parse_args()
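
    # Illustrative invocation (made-up paths and values), run from the
    # crawler repository so that buildtools and ensure_dependencies.py are
    # available:
    #
    #     python run.py -b /usr/bin/firefox -t 120 -x 10 urls.txt crawler-data
    #
    # urls.txt contains one URL per line; per-page .json/.jpg/.xml results
    # are written into crawler-data/.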

    import buildtools.packagerGecko as packager
    cleanup = []
    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
        handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi')
        os.close(handle)
        cleanup.append(crawlerxpi)
        packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True)

        abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi'
        if parameters.abpdir:
            handle, abpxpi = tempfile.mkstemp(suffix='.xpi')
            os.close(handle)
            cleanup.append(abpxpi)
            packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True)
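
        # At this point crawlerxpi is a freshly built crawler extension, and
        # abpxpi is either the AMO download URL for the latest Adblock Plus
        # release or, if --abpdir was given, a locally built development xpi.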
        profile = FirefoxProfile(
            addons=[
                crawlerxpi,
                abpxpi,
            ],
            preferences={
                'browser.startup.homepage': 'about:blank',
                'browser.tabs.warnOnCloseOtherTabs': False,
                'browser.uitour.enabled': False,
                'prompts.tab_modal.enabled': False,
                'startup.homepage_welcome_url': 'about:blank',
                'startup.homepage_welcome_url.additional': 'about:blank',
                'xpinstall.signatures.required': False,
            }
        )

        abpsettings = os.path.join(profile.profile, 'adblockplus')
        os.makedirs(abpsettings)
        with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as handle:
            print >>handle, '# Adblock Plus preferences'
            print >>handle, 'version=4'
            for url in parameters.filters:
                if '=' in url:
                    path, url = url.split('=', 1)
                    with open(path, 'r') as source:
                        data = source.read()
                else:
                    data = urllib.urlopen(url).read()
                print >>handle, '[Subscription]'
                print >>handle, 'url=%s' % url
                print >>handle, '[Subscription filters]'
                print >>handle, '\n'.join(data.splitlines()[1:])
    finally:
        for path in cleanup:
            os.unlink(path)
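
    # For illustration, with the default --filters each subscription ends up
    # in patterns.ini roughly as follows (the filter line is made up):
    #
    #     # Adblock Plus preferences
    #     version=4
    #     [Subscription]
    #     url=https://easylist-downloads.adblockplus.org/easylist.txt
    #     [Subscription filters]
    #     ||example-ads.invalid^
    #
    # data.splitlines()[1:] drops the first line of each downloaded list,
    # i.e. its "[Adblock Plus ...]" header.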

    server = None
    try:
        port = random.randrange(2000, 60000)
        print "Communicating with client on port %i" % port

        app = CrawlerApp(parameters)
        server = make_server('localhost', port, app)
        app.server = server
        threading.Thread(target=lambda: server.serve_forever()).start()

        runner = FirefoxRunner(
            profile=profile,
            binary=parameters.binary,
            cmdargs=['--crawler-port', str(port)],
            env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'),
        )
        while app.urls:
            runner.start()
            runner.wait()
    finally:
        if server:
            server.shutdown()
        profile.cleanup()
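    # The while loop above relaunches Firefox every time it exits, until the
    # /save handler has removed every entry from app.urls; presumably this is
    # what lets the crawl survive browser crashes and deliberate restarts.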

if __name__ == '__main__':
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py")

    try:
        subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR])
    except subprocess.CalledProcessError as e:
        print >>sys.stderr, e
        print >>sys.stderr, "Failed to ensure dependencies are up-to-date!"

    run()
