#!/usr/bin/env python
# coding: utf-8

import argparse
import datetime
import errno
import hashlib
import io
import json
import os
import random
import subprocess
import sys
import tempfile
import threading
import urllib
import urlparse
from wsgiref.simple_server import make_server

from mozprofile import FirefoxProfile
from mozrunner import FirefoxRunner

class CrawlerApp:
    def __init__(self, parameters):
        self.parameters = parameters
        with io.open(self.parameters.list, 'r', encoding='utf-8') as file:
            self.urls = [line.strip() for line in file]

    def __call__(self, environ, start_response):
        path = environ.get('PATH_INFO', '')
        if path == '/parameters':
            start_response('200 OK', [('Content-Type', 'application/json')])
            return [json.dumps({
                'urls': self.urls,
                'timeout': self.parameters.timeout * 1000,
                'maxtabs': self.parameters.maxtabs,
            })]
        if path == '/save':
            try:
                request_body_size = int(environ.get('CONTENT_LENGTH', 0))
            except ValueError:
                start_response('411 Length Required', [])
                return []

            data = json.loads(environ['wsgi.input'].read(request_body_size))
            self.urls.remove(data['url'])

            parsedurl = urlparse.urlparse(data['url'])
            urlhash = hashlib.new('md5', data['url']).hexdigest()
            timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f')
            basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash)
            datapath = os.path.join(self.parameters.outdir, basename + ".json")
            screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg")
            sourcepath = os.path.join(self.parameters.outdir, basename + ".xml")

            try:
                os.makedirs(self.parameters.outdir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

            screenshot = data.pop("screenshot", None)
            if screenshot:
                with open(screenshotpath, 'wb') as file:
                    response = urllib.urlopen(screenshot)
                    try:
                        file.write(response.read())
                    finally:
                        response.close()

            source = data.pop("source", None)
            if source:
                with io.open(sourcepath, 'w', encoding='utf-8') as file:
                    file.write(source)

            with io.open(datapath, 'w', encoding='utf-8') as file:
                file.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n')
            start_response('204 No Content', [])
            return []

        start_response('404 Not Found', [])
        return []

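# A sketch of the HTTP exchange between CrawlerApp above and the crawler
# extension that run() loads into Firefox (endpoint paths and field names come
# from the handlers above; all values are illustrative):
#
#   GET  /parameters  ->  {"urls": [...], "timeout": <ms>, "maxtabs": <count>}
#   POST /save        <-  {"url": "<processed URL>",
#                          "startTime": <milliseconds since the epoch>,
#                          "screenshot": "<URL to read the image from>",  # optional
#                          "source": "<page source>",                     # optional
#                          ...}
#
# Whatever remains in the POSTed object after "screenshot" and "source" are
# popped is written to <hostname>-<timestamp>-<md5>.json in the output
# directory; the screenshot and source go to matching .jpg and .xml files.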
def run():
    parser = argparse.ArgumentParser(description='Run crawler')
    parser.add_argument(
        '-b', '--binary', type=str,
        help='path to the Firefox binary'
    )
    parser.add_argument(
        '-a', '--abpdir', type=str,
        help='path to the Adblock Plus repository'
    )
    parser.add_argument(
        '-f', '--filters', metavar='url', type=str, nargs='+',
        default=["https://easylist-downloads.adblockplus.org/easylist.txt", "https://easylist-downloads.adblockplus.org/exceptionrules.txt"],
        help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, the data will be read from the specified path then.'
    )
    parser.add_argument(
        '-t', '--timeout', type=int, default=300,
        help='Load timeout (seconds)'
    )
    parser.add_argument(
        '-x', '--maxtabs', type=int, default=15,
        help='Maximal number of tabs to open in parallel'
    )
    parser.add_argument(
        'list', type=str,
        help='URL list to process'
    )
    parser.add_argument(
        'outdir', type=str,
        help='directory to write data into'
    )
    parameters = parser.parse_args()

    import buildtools.packagerGecko as packager
    cleanup = []
    try:
        base_dir = os.path.dirname(__file__)
        file, crawlerxpi = tempfile.mkstemp(suffix='.xpi')
        os.close(file)
        cleanup.append(crawlerxpi)
        packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True)

        abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi'
        if parameters.abpdir:
            file, abpxpi = tempfile.mkstemp(suffix='.xpi')
            os.close(file)
            cleanup.append(abpxpi)
            packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True)

        profile = FirefoxProfile(
            addons=[
                crawlerxpi,
                abpxpi,
            ],
            preferences={
                'browser.uitour.enabled': False,
                'prompts.tab_modal.enabled': False,
            }
        )

        abpsettings = os.path.join(profile.profile, 'adblockplus')
        os.makedirs(abpsettings)
        with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as file:
            print >>file, '# Adblock Plus preferences'
            print >>file, 'version=4'
            for url in parameters.filters:
                if '=' in url:
                    path, url = url.split('=', 1)
                    with open(path, 'r') as source:
                        data = source.read()
                else:
                    data = urllib.urlopen(url).read()
                print >>file, '[Subscription]'
                print >>file, 'url=%s' % url
                print >>file, '[Subscription filters]'
                print >>file, '\n'.join(data.splitlines()[1:])
    finally:
        for path in cleanup:
            os.unlink(path)

    server = None
    try:
        port = random.randrange(2000, 60000)
        print "Communicating with client on port %i" % port

        app = CrawlerApp(parameters)
        server = make_server('localhost', port, app)

        thread = threading.Thread(target=server.serve_forever)
        thread.daemon = True
        thread.start()

        runner = FirefoxRunner(
            profile=profile,
            binary=parameters.binary,
            cmdargs=['--crawler-port', str(port)],
            env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'),
        )
        while app.urls:
            runner.start()
            runner.wait()
    finally:
        if server:
            server.shutdown()
        profile.cleanup()

if __name__ == '__main__':
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py")

    try:
        subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR])
    except subprocess.CalledProcessError as e:
        print >>sys.stderr, e
        print >>sys.stderr, "Failed to ensure dependencies being up-to-date!"

    run()
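# Example invocation (assuming this script is saved as run.py; all paths are
# illustrative):
#
#   python run.py -b /usr/bin/firefox -t 120 -x 10 urls.txt output/
#
# urls.txt contains one URL per line; results are written into output/ as
# <hostname>-<timestamp>-<md5>.json/.jpg/.xml files. Filter lists default to
# EasyList plus the exception rules; a local copy can be supplied with the
# path=url form, e.g.:
#
#   python run.py urls.txt output/ -f easylist.txt=https://easylist-downloads.adblockplus.org/easylist.txt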