Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: run.py

Issue 5288886037118976: Adblock Plus Crawler rewrite (Closed)
Patch Set: Addressed comments Created May 7, 2015, 12:04 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« lib/debug.js ('K') | « metadata.gecko ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 import argparse
5 import datetime
6 import errno
7 import hashlib
8 import io
9 import json
10 import os
11 import random
12 import subprocess
13 import sys
14 import tempfile
15 import threading
16 import urllib
17 import urlparse
18 from wsgiref.simple_server import make_server
19
20 from mozprofile import FirefoxProfile
21 from mozrunner import FirefoxRunner
22
class CrawlerApp:
  """WSGI application the crawler extension talks to.

  Endpoints:
    /parameters - returns the crawl configuration (URL list, timeout in
                  milliseconds, maximum parallel tabs) as JSON.
    /save       - accepts the JSON result for one URL and writes the data,
                  optional screenshot and optional page source to outdir.
  """

  def __init__(self, parameters):
    # parameters: parsed command line options; this class uses .list,
    # .timeout, .maxtabs and .outdir.
    self.parameters = parameters
    with io.open(self.parameters.list, 'r', encoding='utf-8') as handle:
      # Skip blank lines so trailing newlines in the list file don't
      # produce empty URL entries.
      self.urls = [line.strip() for line in handle if line.strip()]

  def __call__(self, environ, start_response):
    path = environ.get('PATH_INFO', '')
    if path == '/parameters':
      start_response('200 OK', [('Content-Type', 'application/json')])
      return [json.dumps({
        'urls': self.urls,
        'timeout': self.parameters.timeout * 1000,  # client expects ms
        'maxtabs': self.parameters.maxtabs,
      })]
    if path == '/save':
      try:
        request_body_size = int(environ.get('CONTENT_LENGTH', 0))
      except ValueError:
        start_response('411 Length Required', [])
        return []

      data = json.loads(environ['wsgi.input'].read(request_body_size))
      # Mark the URL as processed. Tolerate duplicate or unknown reports
      # instead of letting ValueError kill the request handler.
      try:
        self.urls.remove(data['url'])
      except ValueError:
        pass

      parsedurl = urlparse.urlparse(data['url'])
      # Encode explicitly: hashing a unicode string would raise
      # UnicodeEncodeError for non-ASCII URLs.
      urlhash = hashlib.new('md5', data['url'].encode('utf-8')).hexdigest()
      timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f')
      basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash)
      datapath = os.path.join(self.parameters.outdir, basename + ".json")
      screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg")
      sourcepath = os.path.join(self.parameters.outdir, basename + ".xml")

      # Create the output directory on demand; ignore the "already exists"
      # race, re-raise anything else.
      try:
        os.makedirs(self.parameters.outdir)
      except OSError as e:
        if e.errno != errno.EEXIST:
          raise

      screenshot = data.pop("screenshot", None)
      if screenshot:
        # The screenshot is fetched as a URL (presumably a data: URI
        # produced by the extension - confirm against lib/ code).
        with open(screenshotpath, 'wb') as handle:
          response = urllib.urlopen(screenshot)
          try:
            handle.write(response.read())
          finally:
            response.close()

      source = data.pop("source", None)
      if source:
        with io.open(sourcepath, 'w', encoding='utf-8') as handle:
          handle.write(source)

      with io.open(datapath, 'w', encoding='utf-8') as handle:
        handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n')
      start_response('204 No Content', [])
      return []

    start_response('404 Not Found', [])
    return []
83
def run():
  """Build the extensions, prepare a Firefox profile and drive the crawl.

  Builds the crawler extension (and, if --abpdir is given, Adblock Plus
  from a local repository) into temporary XPI files, creates a profile
  with the filter lists pre-installed, then repeatedly launches Firefox
  while a local WSGI server hands out crawl parameters and collects
  results (see CrawlerApp).
  """
  parser = argparse.ArgumentParser(description='Run crawler')
  parser.add_argument(
    '-b', '--binary', type=str,
    help='path to the Firefox binary'
  )
  parser.add_argument(
    '-a', '--abpdir', type=str,
    help='path to the Adblock Plus repository'
  )
  parser.add_argument(
    '-f', '--filters', metavar='url', type=str, nargs='+',
    default=["https://easylist-downloads.adblockplus.org/easylist.txt", "https://easylist-downloads.adblockplus.org/exceptionrules.txt"],
    help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, the data will be read from the specified path then.'
  )
  parser.add_argument(
    '-t', '--timeout', type=int, default=300,
    help='Load timeout (seconds)'
  )
  parser.add_argument(
    '-x', '--maxtabs', type=int, default=15,
    help='Maximal number of tabs to open in parallel'
  )
  parser.add_argument(
    'list', type=str,
    help='URL list to process'
  )
  parser.add_argument(
    'outdir', type=str,
    help='directory to write data into'
  )
  parameters = parser.parse_args()

  # Imported here rather than at module level: the buildtools dependency is
  # only ensured right before run() is called (see the __main__ block).
  import buildtools.packagerGecko as packager
  cleanup = []
  try:
    base_dir = os.path.dirname(__file__)
    # Package the crawler extension from this repository into a temp XPI.
    file, crawlerxpi = tempfile.mkstemp(suffix='.xpi')
    os.close(file)
    cleanup.append(crawlerxpi)
    packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True)

    # Default to the latest Adblock Plus release from AMO; if a local
    # repository was given, build an XPI from it instead.
    abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi'
    if parameters.abpdir:
      file, abpxpi = tempfile.mkstemp(suffix='.xpi')
      os.close(file)
      cleanup.append(abpxpi)
      packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True)

    profile = FirefoxProfile(
      addons=[
        crawlerxpi,
        abpxpi,
      ],
      preferences={
        # Disable UI features that could interfere with automated runs.
        'browser.uitour.enabled': False,
        'prompts.tab_modal.enabled': False,
      }
    )

    # Pre-populate Adblock Plus filter storage so the crawl doesn't depend
    # on subscriptions being downloaded at browser startup.
    abpsettings = os.path.join(profile.profile, 'adblockplus')
    os.makedirs(abpsettings)
    with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as file:
      print >>file, '# Adblock Plus preferences'
      print >>file, 'version=4'
      for url in parameters.filters:
        if '=' in url:
          # path=url form: read filter data from the local path but record
          # the given url as the subscription URL.
          path, url = url.split('=', 1)
          with open(path, 'r') as source:
            data = source.read()
        else:
          data = urllib.urlopen(url).read()
        print >>file, '[Subscription]'
        print >>file, 'url=%s' % url
        print >>file, '[Subscription filters]'
        # The first line is dropped - presumably the filter list header;
        # confirm against the patterns.ini format.
        print >>file, '\n'.join(data.splitlines()[1:])
  finally:
    # The temporary XPI files are no longer needed once the profile exists.
    for path in cleanup:
      os.unlink(path)

  server = None
  try:
    port = random.randrange(2000, 60000)
    print "Communicating with client on port %i" % port

    app = CrawlerApp(parameters)
    server = make_server('localhost', port, app)

    # Serve /parameters and /save in the background while Firefox runs.
    thread = threading.Thread(target=server.serve_forever)
    thread.daemon = True
    thread.start()

    runner = FirefoxRunner(
      profile=profile,
      binary=parameters.binary,
      cmdargs=['--crawler-port', str(port)],
      env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'),
    )
    # CrawlerApp removes each URL when its result is saved; keep restarting
    # the browser until every URL has been reported back.
    while app.urls:
      runner.start()
      runner.wait()
  finally:
    if server:
      server.shutdown()
    profile.cleanup()
189
if __name__ == '__main__':
  # Make sure dependency repositories are up to date before importing
  # anything from them (run() imports buildtools).
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
  DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py")

  try:
    subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR])
  except subprocess.CalledProcessError as e:
    # Best effort: report the failure but still attempt the crawl.
    sys.stderr.write("%s\n" % e)
    sys.stderr.write("Failed to ensure dependencies being up-to-date!\n")

  run()
OLDNEW
« lib/debug.js ('K') | « metadata.gecko ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld