Index: run.py |
=================================================================== |
new file mode 100755 |
--- /dev/null |
+++ b/run.py |
@@ -0,0 +1,195 @@ |
+#!/usr/bin/env python |
+# coding: utf-8 |
+ |
+import argparse |
+import datetime |
+import errno |
+import hashlib |
+import io |
+import json |
+import os |
+import random |
+import subprocess |
+import sys |
+import tempfile |
+import threading |
+import urllib |
+import urlparse |
+from wsgiref.simple_server import make_server |
+ |
+from mozprofile import FirefoxProfile |
+from mozrunner import FirefoxRunner |
+ |
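+# WSGI application serving crawl parameters to the browser side and storing the results it posts back. |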
+class CrawlerApp: |
+ server = None |
+ def __init__(self, parameters): |
+ self.parameters = parameters |
+ with io.open(self.parameters.list, 'r', encoding='utf-8') as handle: |
+ self.urls = map(unicode.strip, handle.readlines()) |
+ |
+ def __call__(self, environ, start_response): |
+ path = environ.get('PATH_INFO', '') |
+ if path == '/parameters': |
+ start_response('200 OK', [('Content-Type', 'application/json')]) |
+ return [json.dumps({ |
+ 'urls': self.urls, |
+ 'timeout': self.parameters.timeout * 1000, |
+ 'maxtabs': self.parameters.maxtabs, |
+ })] |
+ elif path == '/save': |
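+      # One result is posted per crawled URL; store its JSON data, screenshot and page source in outdir. |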
+ try: |
+ request_body_size = int(environ.get('CONTENT_LENGTH', 0)) |
+      except ValueError: |
+ start_response('400 Bad Request', []) |
+ return '' |
+ |
+ data = json.loads(environ['wsgi.input'].read(request_body_size)) |
+ self.urls.remove(data['url']) |
+ |
+ parsedurl = urlparse.urlparse(data['url']) |
+      urlhash = hashlib.new('md5', data['url'].encode('utf-8')).hexdigest() |
+ timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f') |
+ basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash) |
+ datapath = os.path.join(self.parameters.outdir, basename + ".json") |
+ screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg") |
+ sourcepath = os.path.join(self.parameters.outdir, basename + ".xml") |
+ |
+ try: |
+ os.makedirs(self.parameters.outdir) |
+ except OSError as e: |
+ if e.errno != errno.EEXIST: |
+ raise |
+ |
+ if "screenshot" in data: |
+ with open(screenshotpath, 'wb') as handle: |
+ handle.write(urllib.urlopen(data["screenshot"]).read()) |
+ del data["screenshot"] |
+ |
+ if "source" in data: |
+ with io.open(sourcepath, 'w', encoding='utf-8') as handle: |
+ handle.write(data["source"]) |
+ del data["source"] |
+ |
+ with io.open(datapath, 'w', encoding='utf-8') as handle: |
+ handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n') |
Sebastian Noack 2015/04/27 14:55:50: How about json.dump(data, handle, ..)?
+ start_response('204 No Content', []) |
+ return '' |
+ |
+ start_response('404 Not Found', []) |
+ return '' |
+ |
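+# Parse the command line, build the required XPIs, prepare the Firefox profile and run the crawl. |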
+def run(): |
+ parser = argparse.ArgumentParser(description='Run crawler') |
+ parser.add_argument( |
+ '-b', '--binary', type=str, |
+ help='path to the Firefox binary' |
+ ) |
+ parser.add_argument( |
+ '-a', '--abpdir', type=str, |
+ help='path to the Adblock Plus repository' |
+ ) |
+ parser.add_argument( |
+ '-f', '--filters', metavar='url', type=str, nargs='+', |
+ default=["https://easylist-downloads.adblockplus.org/easylist.txt", "https://easylist-downloads.adblockplus.org/exceptionrules.txt"], |
+    help='filter lists to install in Adblock Plus. An argument can also have the form path=url, in which case the data is read from the specified path.' |
+ ) |
+ parser.add_argument( |
+ '-t', '--timeout', type=int, default=300, |
+ help='Load timeout (seconds)' |
+ ) |
+ parser.add_argument( |
+ '-x', '--maxtabs', type=int, default=15, |
+    help='Maximum number of tabs to open in parallel' |
+ ) |
+ parser.add_argument( |
+ 'list', type=str, |
+ help='URL list to process' |
+ ) |
+ parser.add_argument( |
+ 'outdir', type=str, |
+ help='directory to write data into' |
+ ) |
+ parameters = parser.parse_args() |
+ |
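+  # Build the crawler extension as a temporary XPI; with --abpdir, also build Adblock Plus from that checkout. |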
+ import buildtools.packagerGecko as packager |
+ cleanup = [] |
+ try: |
+ base_dir = os.path.dirname(__file__) |
+ handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi') |
+ os.close(handle) |
+ cleanup.append(crawlerxpi) |
+ packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True) |
+ |
+ abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi' |
+ if parameters.abpdir: |
+ handle, abpxpi = tempfile.mkstemp(suffix='.xpi') |
+ os.close(handle) |
+ cleanup.append(abpxpi) |
+ packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True) |
+ |
+ profile = FirefoxProfile( |
+ addons=[ |
+ crawlerxpi, |
+ abpxpi, |
+ ], |
+ preferences={ |
+ 'browser.uitour.enabled': False, |
+ 'prompts.tab_modal.enabled': False, |
+ } |
+ ) |
+ |
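+  # Pre-populate Adblock Plus' patterns.ini so the configured filter lists are active when the browser starts. |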
+ abpsettings = os.path.join(profile.profile, 'adblockplus') |
+ os.makedirs(abpsettings) |
+ with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as handle: |
+ print >>handle, '# Adblock Plus preferences' |
+ print >>handle, 'version=4' |
+ for url in parameters.filters: |
+ if '=' in url: |
+ path, url = url.split('=', 1) |
+ with open(path, 'r') as source: |
+ data = source.read() |
+ else: |
+ data = urllib.urlopen(url).read() |
+ print >>handle, '[Subscription]' |
+ print >>handle, 'url=%s' % url |
+ print >>handle, '[Subscription filters]' |
+ print >>handle, '\n'.join(data.splitlines()[1:]) |
+ finally: |
+ for path in cleanup: |
+ os.unlink(path) |
+ |
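+  # Serve parameters and collect results on a local HTTP port, which is passed to Firefox below. |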
+ server = None |
+ try: |
+ port = random.randrange(2000, 60000) |
+ print "Communicating with client on port %i" % port |
+ |
+ app = CrawlerApp(parameters) |
+ server = make_server('localhost', port, app) |
+ app.server = server |
Wladimir Palant 2015/05/07 00:04:59: Done.
+    threading.Thread(target=server.serve_forever).start() |
+ |
+ runner = FirefoxRunner( |
+ profile=profile, |
+ binary=parameters.binary, |
+ cmdargs=['--crawler-port', str(port)], |
+ env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'), |
+ ) |
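+    # Restart Firefox until all URLs have been processed; entries are removed from the list as /save is called. |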
+ while app.urls: |
+ runner.start() |
+ runner.wait() |
+ finally: |
+ if server: |
+ server.shutdown() |
+ profile.cleanup() |
+ |
+if __name__ == '__main__': |
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__)) |
+ DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py") |
+ |
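+  # Make sure dependencies such as buildtools are checked out and up to date before run() imports them. |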
+ try: |
+ subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR]) |
+ except subprocess.CalledProcessError as e: |
+ print >>sys.stderr, e |
+ print >>sys.stderr, "Failed to ensure dependencies being up-to-date!" |
+ |
+ run() |