#!/usr/bin/env python
# coding: utf-8

import argparse
import datetime
import errno
import hashlib
import io
import json
import os
import random
import subprocess
import sys
import tempfile
import threading
import urllib
import urlparse
from wsgiref.simple_server import make_server

from mozprofile import FirefoxProfile
from mozrunner import FirefoxRunner

class CrawlerApp:
  server = None
  def __init__(self, parameters):
    self.parameters = parameters
    with io.open(self.parameters.list, 'r', encoding='utf-8') as handle:
      self.urls = map(unicode.strip, handle.readlines())

  def __call__(self, environ, start_response):
    path = environ.get('PATH_INFO', '')
    if path == '/parameters':
      start_response('200 OK', [('Content-Type', 'application/json')])
      return [json.dumps({
        'urls': self.urls,
        'timeout': self.parameters.timeout * 1000,
        'maxtabs': self.parameters.maxtabs,
      })]
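    # Illustrative only (not part of the original file): with the default
    # parameters defined in run() below and a short URL list, the JSON served
    # here would look roughly like
    #   {"urls": ["http://example.com/", ...], "timeout": 300000, "maxtabs": 15}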
    elif path == '/save':
      try:
        request_body_size = int(environ.get('CONTENT_LENGTH', 0))
      except ValueError:
        start_response('400 Bad Request', [])
        return ''

      data = json.loads(environ['wsgi.input'].read(request_body_size))
      self.urls.remove(data['url'])

      parsedurl = urlparse.urlparse(data['url'])
      urlhash = hashlib.new('md5', data['url']).hexdigest()
      timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f')
      basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash)
      datapath = os.path.join(self.parameters.outdir, basename + ".json")
      screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg")
      sourcepath = os.path.join(self.parameters.outdir, basename + ".xml")

      try:
        os.makedirs(self.parameters.outdir)
      except OSError as e:
        if e.errno != errno.EEXIST:
          raise

      if "screenshot" in data:
        with open(screenshotpath, 'wb') as handle:
          handle.write(urllib.urlopen(data["screenshot"]).read())
        del data["screenshot"]

      if "source" in data:
        with io.open(sourcepath, 'w', encoding='utf-8') as handle:
          handle.write(data["source"])
        del data["source"]

      with io.open(datapath, 'w', encoding='utf-8') as handle:
        handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n')
      # Review comment (Sebastian Noack, 2015/04/27 14:55:50):
      # How about json.dump(data, handle, ..)?
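      # A minimal sketch of the suggested alternative (same data and handle as
      # above; json.dump streams the serialization into the file object instead
      # of building the whole string first):
      #   json.dump(data, handle, indent=2, ensure_ascii=False, sort_keys=True)
      #   handle.write(u'\n')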
      start_response('204 No Content', [])
      return ''

    start_response('404 Not Found', [])
    return ''

def run():
  parser = argparse.ArgumentParser(description='Run crawler')
  parser.add_argument(
    '-b', '--binary', type=str,
    help='path to the Firefox binary'
  )
  parser.add_argument(
    '-a', '--abpdir', type=str,
    help='path to the Adblock Plus repository'
  )
  parser.add_argument(
    '-f', '--filters', metavar='url', type=str, nargs='+',
    default=["https://easylist-downloads.adblockplus.org/easylist.txt", "https://easylist-downloads.adblockplus.org/exceptionrules.txt"],
    help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, in which case the data will be read from the specified path.'
  )
  parser.add_argument(
    '-t', '--timeout', type=int, default=300,
    help='load timeout (seconds)'
  )
  parser.add_argument(
    '-x', '--maxtabs', type=int, default=15,
    help='maximal number of tabs to open in parallel'
  )
  parser.add_argument(
    'list', type=str,
    help='URL list to process'
  )
  parser.add_argument(
    'outdir', type=str,
    help='directory to write data into'
  )
  parameters = parser.parse_args()

  import buildtools.packagerGecko as packager
  cleanup = []
  try:
    base_dir = os.path.dirname(__file__)
    handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi')
    os.close(handle)
    cleanup.append(crawlerxpi)
    packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True)

    abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi'
    if parameters.abpdir:
      handle, abpxpi = tempfile.mkstemp(suffix='.xpi')
      os.close(handle)
      cleanup.append(abpxpi)
      packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True)

    profile = FirefoxProfile(
      addons=[
        crawlerxpi,
        abpxpi,
      ],
      preferences={
        'browser.uitour.enabled': False,
        'prompts.tab_modal.enabled': False,
      }
    )

    abpsettings = os.path.join(profile.profile, 'adblockplus')
    os.makedirs(abpsettings)
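    # The block below writes a minimal patterns.ini so that Adblock Plus starts
    # with the requested filter lists preinstalled; the [1:] drops each list's
    # leading "[Adblock Plus ...]" header line. Illustrative result (the filter
    # shown is a made-up example):
    #   # Adblock Plus preferences
    #   version=4
    #   [Subscription]
    #   url=https://easylist-downloads.adblockplus.org/easylist.txt
    #   [Subscription filters]
    #   ||ads.example.com^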
    with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as handle:
      print >>handle, '# Adblock Plus preferences'
      print >>handle, 'version=4'
      for url in parameters.filters:
        if '=' in url:
          path, url = url.split('=', 1)
          with open(path, 'r') as source:
            data = source.read()
        else:
          data = urllib.urlopen(url).read()
        print >>handle, '[Subscription]'
        print >>handle, 'url=%s' % url
        print >>handle, '[Subscription filters]'
        print >>handle, '\n'.join(data.splitlines()[1:])
  finally:
    for path in cleanup:
      os.unlink(path)

  server = None
  try:
    port = random.randrange(2000, 60000)
    print "Communicating with client on port %i" % port

    app = CrawlerApp(parameters)
    server = make_server('localhost', port, app)
    app.server = server
    # Review comment (Wladimir Palant, 2015/05/07 00:04:59):
    # Done.
    threading.Thread(target=lambda: server.serve_forever()).start()

    runner = FirefoxRunner(
      profile=profile,
      binary=parameters.binary,
      cmdargs=['--crawler-port', str(port)],
      env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'),
    )
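    # Restart the browser until every URL has been processed; the /save
    # handler above removes each URL from app.urls once its data is written,
    # so a browser that exits or crashes early is simply relaunched.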
    while app.urls:
      runner.start()
      runner.wait()
  finally:
    if server:
      server.shutdown()
    profile.cleanup()

if __name__ == '__main__':
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
  DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py")

  try:
    subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR])
  except subprocess.CalledProcessError as e:
    print >>sys.stderr, e
    print >>sys.stderr, "Failed to ensure dependencies are up-to-date!"

  run()
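A hypothetical invocation (the script and list file names are assumptions;
only the flags and positional arguments defined in run() are taken from the
code above):

  python run.py -b /usr/local/bin/firefox -t 120 -x 10 urls.txt crawler-output

Here urls.txt contains one URL per line, and crawler-output is created on
demand by the /save handler once the first result comes in.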