LEFT | RIGHT |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # coding: utf-8 | 2 # coding: utf-8 |
3 | 3 |
4 import argparse | 4 import argparse |
5 import datetime | 5 import datetime |
6 import errno | 6 import errno |
7 import hashlib | 7 import hashlib |
8 import io | 8 import io |
9 import json | 9 import json |
10 import os | 10 import os |
11 import random | 11 import random |
12 import subprocess | 12 import subprocess |
13 import sys | 13 import sys |
14 import tempfile | 14 import tempfile |
15 import threading | 15 import threading |
16 import urllib | 16 import urllib |
17 import urlparse | 17 import urlparse |
18 from wsgiref.simple_server import make_server | 18 from wsgiref.simple_server import make_server |
19 | 19 |
20 from mozprofile import FirefoxProfile | 20 from mozprofile import FirefoxProfile |
21 from mozrunner import FirefoxRunner | 21 from mozrunner import FirefoxRunner |
22 | 22 |
23 class CrawlerApp: | 23 class CrawlerApp: |
24 server = None | |
25 def __init__(self, parameters): | 24 def __init__(self, parameters): |
26 self.parameters = parameters | 25 self.parameters = parameters |
27 with io.open(self.parameters.list, 'r', encoding='utf-8') as handle: | 26 with io.open(self.parameters.list, 'r', encoding='utf-8') as file: |
28 self.urls = map(unicode.strip, handle.readlines()) | 27 self.urls = [line.strip() for line in file] |
29 | 28 |
30 def __call__(self, environ, start_response): | 29 def __call__(self, environ, start_response): |
31 path = environ.get('PATH_INFO', '') | 30 path = environ.get('PATH_INFO', '') |
32 if path == '/parameters': | 31 if path == '/parameters': |
33 start_response('200 OK', [('Content-Type', 'application/json')]) | 32 start_response('200 OK', [('Content-Type', 'application/json')]) |
34 return [json.dumps({ | 33 return [json.dumps({ |
35 'urls': self.urls, | 34 'urls': self.urls, |
36 'timeout': self.parameters.timeout * 1000, | 35 'timeout': self.parameters.timeout * 1000, |
37 'maxtabs': self.parameters.maxtabs, | 36 'maxtabs': self.parameters.maxtabs, |
38 })] | 37 })] |
39 elif path == '/save': | 38 if path == '/save': |
40 try: | 39 try: |
41 request_body_size = int(environ.get('CONTENT_LENGTH', 0)) | 40 request_body_size = int(environ.get('CONTENT_LENGTH', 0)) |
42 except (ValueError): | 41 except ValueError: |
43 start_response('400 Bad Request', []) | 42 start_response('411 Length Required', []) |
44 return '' | 43 return [] |
45 | 44 |
46 data = json.loads(environ['wsgi.input'].read(request_body_size)) | 45 data = json.loads(environ['wsgi.input'].read(request_body_size)) |
47 self.urls.remove(data['url']) | 46 self.urls.remove(data['url']) |
48 | 47 |
49 parsedurl = urlparse.urlparse(data['url']) | 48 parsedurl = urlparse.urlparse(data['url']) |
50 urlhash = hashlib.new('md5', data['url']).hexdigest() | 49 urlhash = hashlib.new('md5', data['url']).hexdigest() |
51 timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f') | 50 timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f') |
52 basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash) | 51 basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash) |
53 datapath = os.path.join(self.parameters.outdir, basename + ".json") | 52 datapath = os.path.join(self.parameters.outdir, basename + ".json") |
54 screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg") | 53 screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg") |
55 sourcepath = os.path.join(self.parameters.outdir, basename + ".xml") | 54 sourcepath = os.path.join(self.parameters.outdir, basename + ".xml") |
56 | 55 |
57 try: | 56 try: |
58 os.makedirs(self.parameters.outdir) | 57 os.makedirs(self.parameters.outdir) |
59 except OSError as e: | 58 except OSError as e: |
60 if e.errno != errno.EEXIST: | 59 if e.errno != errno.EEXIST: |
61 raise | 60 raise |
62 | 61 |
63 if "screenshot" in data: | 62 screenshot = data.pop("screenshot", None) |
64 with open(screenshotpath, 'wb') as handle: | 63 if screenshot: |
65 handle.write(urllib.urlopen(data["screenshot"]).read()) | 64 with open(screenshotpath, 'wb') as file: |
66 del data["screenshot"] | 65 response = urllib.urlopen(screenshot) |
67 | 66 try: |
68 if "source" in data: | 67 file.write(response.read()) |
69 with io.open(sourcepath, 'w', encoding='utf-8') as handle: | 68 finally: |
70 handle.write(data["source"]) | 69 response.close() |
71 del data["source"] | 70 |
72 | 71 source = data.pop("source", None) |
73 with io.open(datapath, 'w', encoding='utf-8') as handle: | 72 if source: |
74 handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n') | 73 with io.open(sourcepath, 'w', encoding='utf-8') as file: |
Sebastian Noack
2015/04/27 14:55:50
How about json.dump(data, handle, ..)?
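A minimal sketch of what the suggested json.dump() call could look like, assuming the same Python 2 io.open() text handle used in the patch; write_data is a hypothetical helper name used only for illustration:

    # Hypothetical sketch of the reviewer's json.dump() suggestion, not the
    # patch's actual code.
    import io
    import json

    def write_data(datapath, data):
        with io.open(datapath, 'w', encoding='utf-8') as handle:
            # Caveat: in Python 2, json.dump() with ensure_ascii=False may emit
            # a mix of str and unicode chunks, which an io.open() text handle
            # typically rejects with a TypeError; that is a common reason to
            # keep json.dumps() plus an explicit unicode() conversion instead.
            json.dump(data, handle, indent=2, ensure_ascii=False, sort_keys=True)
            handle.write(u'\n')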
| 74 file.write(source) |
| 75 |
| 76 with io.open(datapath, 'w', encoding='utf-8') as file: |
| 77 file.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n') |
75 start_response('204 No Content', []) | 78 start_response('204 No Content', []) |
76 return '' | 79 return [] |
77 | 80 |
78 start_response('404 Not Found', []) | 81 start_response('404 Not Found', []) |
79 return '' | 82 return [] |
80 | 83 |
81 def run(): | 84 def run(): |
82 parser = argparse.ArgumentParser(description='Run crawler') | 85 parser = argparse.ArgumentParser(description='Run crawler') |
83 parser.add_argument( | 86 parser.add_argument( |
84 '-b', '--binary', type=str, | 87 '-b', '--binary', type=str, |
85 help='path to the Firefox binary' | 88 help='path to the Firefox binary' |
86 ) | 89 ) |
87 parser.add_argument( | 90 parser.add_argument( |
88 '-a', '--abpdir', type=str, | 91 '-a', '--abpdir', type=str, |
89 help='path to the Adblock Plus repository' | 92 help='path to the Adblock Plus repository' |
(...skipping 18 matching lines...) | |
108 parser.add_argument( | 111 parser.add_argument( |
109 'outdir', type=str, | 112 'outdir', type=str, |
110 help='directory to write data into' | 113 help='directory to write data into' |
111 ) | 114 ) |
112 parameters = parser.parse_args() | 115 parameters = parser.parse_args() |
113 | 116 |
114 import buildtools.packagerGecko as packager | 117 import buildtools.packagerGecko as packager |
115 cleanup = [] | 118 cleanup = [] |
116 try: | 119 try: |
117 base_dir = os.path.dirname(__file__) | 120 base_dir = os.path.dirname(__file__) |
118 handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi') | 121 file, crawlerxpi = tempfile.mkstemp(suffix='.xpi') |
119 os.close(handle) | 122 os.close(file) |
120 cleanup.append(crawlerxpi) | 123 cleanup.append(crawlerxpi) |
121 packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True) | 124 packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True) |
122 | 125 |
123 abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi' | 126 abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi' |
124 if parameters.abpdir: | 127 if parameters.abpdir: |
125 handle, abpxpi = tempfile.mkstemp(suffix='.xpi') | 128 file, abpxpi = tempfile.mkstemp(suffix='.xpi') |
126 os.close(handle) | 129 os.close(file) |
127 cleanup.append(abpxpi) | 130 cleanup.append(abpxpi) |
128 packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True) | 131 packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True) |
129 | 132 |
130 profile = FirefoxProfile( | 133 profile = FirefoxProfile( |
131 addons=[ | 134 addons=[ |
132 crawlerxpi, | 135 crawlerxpi, |
133 abpxpi, | 136 abpxpi, |
134 ], | 137 ], |
135 preferences={ | 138 preferences={ |
136 'browser.uitour.enabled': False, | 139 'browser.uitour.enabled': False, |
137 'prompts.tab_modal.enabled': False, | 140 'prompts.tab_modal.enabled': False, |
138 } | 141 } |
139 ) | 142 ) |
140 | 143 |
141 abpsettings = os.path.join(profile.profile, 'adblockplus') | 144 abpsettings = os.path.join(profile.profile, 'adblockplus') |
142 os.makedirs(abpsettings) | 145 os.makedirs(abpsettings) |
143 with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as handle: | 146 with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as file: |
144 print >>handle, '# Adblock Plus preferences' | 147 print >>file, '# Adblock Plus preferences' |
145 print >>handle, 'version=4' | 148 print >>file, 'version=4' |
146 for url in parameters.filters: | 149 for url in parameters.filters: |
147 if '=' in url: | 150 if '=' in url: |
148 path, url = url.split('=', 1) | 151 path, url = url.split('=', 1) |
149 with open(path, 'r') as source: | 152 with open(path, 'r') as source: |
150 data = source.read() | 153 data = source.read() |
151 else: | 154 else: |
152 data = urllib.urlopen(url).read() | 155 data = urllib.urlopen(url).read() |
153 print >>handle, '[Subscription]' | 156 print >>file, '[Subscription]' |
154 print >>handle, 'url=%s' % url | 157 print >>file, 'url=%s' % url |
155 print >>handle, '[Subscription filters]' | 158 print >>file, '[Subscription filters]' |
156 print >>handle, '\n'.join(data.splitlines()[1:]) | 159 print >>file, '\n'.join(data.splitlines()[1:]) |
157 finally: | 160 finally: |
158 for path in cleanup: | 161 for path in cleanup: |
159 os.unlink(path) | 162 os.unlink(path) |
160 | 163 |
161 server = None | 164 server = None |
162 try: | 165 try: |
163 port = random.randrange(2000, 60000) | 166 port = random.randrange(2000, 60000) |
164 print "Communicating with client on port %i" % port | 167 print "Communicating with client on port %i" % port |
165 | 168 |
166 app = CrawlerApp(parameters) | 169 app = CrawlerApp(parameters) |
167 server = make_server('localhost', port, app) | 170 server = make_server('localhost', port, app) |
168 app.server = server | 171 |
Wladimir Palant
2015/05/07 00:04:59
Done.
169 threading.Thread(target=lambda: server.serve_forever()).start() | 172 thread = threading.Thread(target=server.serve_forever) |
| 173 thread.daemon = True |
| 174 thread.start() |
170 | 175 |
171 runner = FirefoxRunner( | 176 runner = FirefoxRunner( |
172 profile=profile, | 177 profile=profile, |
173 binary=parameters.binary, | 178 binary=parameters.binary, |
174 cmdargs=['--crawler-port', str(port)], | 179 cmdargs=['--crawler-port', str(port)], |
175 env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'), | 180 env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'), |
176 ) | 181 ) |
177 while app.urls: | 182 while app.urls: |
178 runner.start() | 183 runner.start() |
179 runner.wait() | 184 runner.wait() |
180 finally: | 185 finally: |
181 if server: | 186 if server: |
182 server.shutdown() | 187 server.shutdown() |
183 profile.cleanup() | 188 profile.cleanup() |
184 | 189 |
185 if __name__ == '__main__': | 190 if __name__ == '__main__': |
186 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | 191 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) |
187 DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py") | 192 DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py") |
188 | 193 |
189 try: | 194 try: |
190 subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR]) | 195 subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR]) |
191 except subprocess.CalledProcessError as e: | 196 except subprocess.CalledProcessError as e: |
192 print >>sys.stderr, e | 197 print >>sys.stderr, e |
193 print >>sys.stderr, "Failed to ensure dependencies being up-to-date!" | 198 print >>sys.stderr, "Failed to ensure dependencies being up-to-date!" |
194 | 199 |
195 run() | 200 run() |