Side by Side Diff: run.py

Issue 5288886037118976: Adblock Plus Crawler rewrite (Closed)
Patch Set: Created April 24, 2015, 3:38 p.m.
(The left side of the diff is empty: run.py is a new file.)
#!/usr/bin/env python
# coding: utf-8

import argparse
import datetime
import errno
import hashlib
import io
import json
import os
import random
import subprocess
import sys
import tempfile
import threading
import urllib
import urlparse
from wsgiref.simple_server import make_server

from mozprofile import FirefoxProfile
from mozrunner import FirefoxRunner

class CrawlerApp:
  # WSGI application that hands the URL list to the crawler extension and
  # stores the results it sends back.
  server = None

  def __init__(self, parameters):
    self.parameters = parameters
    with io.open(self.parameters.list, 'r', encoding='utf-8') as handle:
      self.urls = map(unicode.strip, handle.readlines())

  def __call__(self, environ, start_response):
    path = environ.get('PATH_INFO', '')
    if path == '/parameters':
      # Hand the crawler extension its configuration; the timeout is
      # converted from seconds to milliseconds here.
      start_response('200 OK', [('Content-Type', 'application/json')])
      return [json.dumps({
        'urls': self.urls,
        'timeout': self.parameters.timeout * 1000,
        'maxtabs': self.parameters.maxtabs,
      })]
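    # For reference (editorial note, not part of the patch): a client could
    # fetch this configuration roughly like so, with `port` being whatever
    # run() prints at startup:
    #
    #   import json, urllib
    #   config = json.loads(
    #     urllib.urlopen('http://localhost:%d/parameters' % port).read())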
    elif path == '/save':
      try:
        request_body_size = int(environ.get('CONTENT_LENGTH', 0))
      except ValueError:
        start_response('400 Bad Request', [])
        return ''

      data = json.loads(environ['wsgi.input'].read(request_body_size))
      self.urls.remove(data['url'])

      # Derive a unique file name from the host name, the crawl start time
      # and a hash of the full URL. The URL is hashed as UTF-8 bytes;
      # hashing the unicode string would fail for non-ASCII URLs.
      parsedurl = urlparse.urlparse(data['url'])
      urlhash = hashlib.new('md5', data['url'].encode('utf-8')).hexdigest()
      timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f')
      basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash)
      datapath = os.path.join(self.parameters.outdir, basename + ".json")
      screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg")
      sourcepath = os.path.join(self.parameters.outdir, basename + ".xml")

      try:
        os.makedirs(self.parameters.outdir)
      except OSError as e:
        if e.errno != errno.EEXIST:
          raise

      # The screenshot arrives as a URL (e.g. a data: URI) that urllib
      # can fetch and write out as-is.
      if "screenshot" in data:
        with open(screenshotpath, 'wb') as handle:
          handle.write(urllib.urlopen(data["screenshot"]).read())
        del data["screenshot"]

      if "source" in data:
        with io.open(sourcepath, 'w', encoding='utf-8') as handle:
          handle.write(data["source"])
        del data["source"]

      with io.open(datapath, 'w', encoding='utf-8') as handle:
        handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n')
Sebastian Noack 2015/04/27 14:55:50 How about json.dump(data, handle, ..)?
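A minimal sketch of that suggestion, with one Python 2 caveat: given ensure_ascii=False, json.dump emits a mix of str and unicode chunks, and the io.open text handle rejects str with a TypeError, which the unicode(json.dumps(...)) form above sidesteps:

  with io.open(datapath, 'w', encoding='utf-8') as handle:
    json.dump(data, handle, indent=2, ensure_ascii=False, sort_keys=True)
    handle.write(u'\n')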
      start_response('204 No Content', [])
      return ''

    start_response('404 Not Found', [])
    return ''
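For context, a hypothetical /save payload, shaped after the fields the handler reads; the actual payload is produced by the crawler extension, which is not part of this file:

  payload = {
    "url": "http://example.com/",                # removed from the pending list
    "startTime": 1429888693000,                  # milliseconds since the epoch
    "screenshot": "data:image/jpeg;base64,...",  # optional, fetched via urllib.urlopen
    "source": "<html>...</html>",                # optional, written as UTF-8 .xml
    # any remaining keys end up in the .json file
  }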

def run():
  parser = argparse.ArgumentParser(description='Run crawler')
  parser.add_argument(
    '-b', '--binary', type=str,
    help='path to the Firefox binary'
  )
  parser.add_argument(
    '-a', '--abpdir', type=str,
    help='path to the Adblock Plus repository'
  )
  parser.add_argument(
    '-f', '--filters', metavar='url', type=str, nargs='+',
    default=["https://easylist-downloads.adblockplus.org/easylist.txt", "https://easylist-downloads.adblockplus.org/exceptionrules.txt"],
    help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, in which case the data is read from the specified path instead.'
  )
  parser.add_argument(
    '-t', '--timeout', type=int, default=300,
    help='load timeout (seconds)'
  )
  parser.add_argument(
    '-x', '--maxtabs', type=int, default=15,
    help='maximum number of tabs to open in parallel'
  )
  parser.add_argument(
    'list', type=str,
    help='URL list to process'
  )
  parser.add_argument(
    'outdir', type=str,
    help='directory to write data into'
  )
  parameters = parser.parse_args()

  # Imported late so that the dependency check in the __main__ block below
  # runs before buildtools is needed.
  import buildtools.packagerGecko as packager
  cleanup = []
  try:
    # Build the crawler extension from this repository.
    base_dir = os.path.dirname(__file__)
    handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi')
    os.close(handle)
    cleanup.append(crawlerxpi)
    packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True)

    # Use the latest Adblock Plus release unless a local repository is given.
    abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi'
    if parameters.abpdir:
      handle, abpxpi = tempfile.mkstemp(suffix='.xpi')
      os.close(handle)
      cleanup.append(abpxpi)
      packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True)

    profile = FirefoxProfile(
      addons=[
        crawlerxpi,
        abpxpi,
      ],
      preferences={
        'browser.uitour.enabled': False,
        'prompts.tab_modal.enabled': False,
      }
    )

    # Preseed patterns.ini so the configured filter lists are active from
    # the first browser start.
    abpsettings = os.path.join(profile.profile, 'adblockplus')
    os.makedirs(abpsettings)
    with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as handle:
      print >>handle, '# Adblock Plus preferences'
      print >>handle, 'version=4'
      for url in parameters.filters:
        if '=' in url:
          # path=url form: read the filter data from a local file instead.
          path, url = url.split('=', 1)
          with open(path, 'r') as source:
            data = source.read()
        else:
          data = urllib.urlopen(url).read()
        print >>handle, '[Subscription]'
        print >>handle, 'url=%s' % url
        print >>handle, '[Subscription filters]'
        print >>handle, '\n'.join(data.splitlines()[1:])
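    # Editorial note: the loop above yields a patterns.ini of roughly this
    # shape (subscription contents below are illustrative; the first line of
    # each list is dropped, presumably its "[Adblock Plus ...]" header):
    #
    #   # Adblock Plus preferences
    #   version=4
    #   [Subscription]
    #   url=https://easylist-downloads.adblockplus.org/easylist.txt
    #   [Subscription filters]
    #   ||example.com/ad.js
    #   ##.ad-banner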
  finally:
    for path in cleanup:
      os.unlink(path)

  server = None
  try:
    # Serve the crawler extension on a random local port.
    port = random.randrange(2000, 60000)
    print "Communicating with client on port %i" % port

    app = CrawlerApp(parameters)
    server = make_server('localhost', port, app)
    app.server = server
Wladimir Palant 2015/05/07 00:04:59 Done.
    threading.Thread(target=server.serve_forever).start()

    runner = FirefoxRunner(
      profile=profile,
      binary=parameters.binary,
      cmdargs=['--crawler-port', str(port)],
      env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'),
    )
    # Keep restarting Firefox until the URL list has been worked off; each
    # iteration ends when the browser process exits.
    while app.urls:
      runner.start()
      runner.wait()
  finally:
    if server:
      server.shutdown()
    profile.cleanup()

if __name__ == '__main__':
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
  DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py")

  try:
    subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR])
  except subprocess.CalledProcessError as e:
    print >>sys.stderr, e
    print >>sys.stderr, "Failed to ensure dependencies are up to date!"

  run()
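For reference, a hypothetical invocation of this script, with placeholder paths:

  python run.py --binary /usr/local/bin/firefox --timeout 120 --maxtabs 10 urls.txt outdir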