#!/usr/bin/env python
# coding: utf-8

import argparse
import datetime
import errno
import hashlib
import io
import json
import os
import random
import subprocess
import sys
import tempfile
import threading
import urllib
import urlparse
from wsgiref.simple_server import make_server

from mozprofile import FirefoxProfile
from mozrunner import FirefoxRunner

class CrawlerApp:
  server = None
  def __init__(self, parameters):
    self.parameters = parameters
    with io.open(self.parameters.list, 'r', encoding='utf-8') as handle:
      self.urls = map(unicode.strip, handle.readlines())

  def __call__(self, environ, start_response):
    path = environ.get('PATH_INFO', '')
    if path == '/parameters':
      start_response('200 OK', [('Content-Type', 'application/json')])
      return [json.dumps({
        'urls': self.urls,
        'timeout': self.parameters.timeout * 1000,
        'maxtabs': self.parameters.maxtabs,
      })]
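    # Illustrative only (not part of the original file): with the default
    # parameters defined in run() below and a short URL list, the JSON served
    # here would look roughly like
    #   {"urls": ["http://example.com/", ...], "timeout": 300000, "maxtabs": 15}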
    elif path == '/save':
      try:
        request_body_size = int(environ.get('CONTENT_LENGTH', 0))
      except ValueError:
        start_response('400 Bad Request', [])
        return ''

      data = json.loads(environ['wsgi.input'].read(request_body_size))
      self.urls.remove(data['url'])

      parsedurl = urlparse.urlparse(data['url'])
      urlhash = hashlib.new('md5', data['url']).hexdigest()
      timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f')
      basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash)
      datapath = os.path.join(self.parameters.outdir, basename + ".json")
      screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg")
      sourcepath = os.path.join(self.parameters.outdir, basename + ".xml")

      try:
        os.makedirs(self.parameters.outdir)
      except OSError as e:
        if e.errno != errno.EEXIST:
          raise

      if "screenshot" in data:
        with open(screenshotpath, 'wb') as handle:
          handle.write(urllib.urlopen(data["screenshot"]).read())
        del data["screenshot"]

      if "source" in data:
        with io.open(sourcepath, 'w', encoding='utf-8') as handle:
          handle.write(data["source"])
        del data["source"]

      with io.open(datapath, 'w', encoding='utf-8') as handle:
        handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n')
      # Review comment (Sebastian Noack, 2015/04/27 14:55:50):
      # How about json.dump(data, handle, ..)?
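      # A minimal sketch of the suggested alternative (same data and handle as
      # above; json.dump streams the serialization into the file object instead
      # of building the whole string first):
      #   json.dump(data, handle, indent=2, ensure_ascii=False, sort_keys=True)
      #   handle.write(u'\n')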
      start_response('204 No Content', [])
      return ''

    start_response('404 Not Found', [])
    return ''

def run():
  parser = argparse.ArgumentParser(description='Run crawler')
  parser.add_argument(
    '-b', '--binary', type=str,
    help='path to the Firefox binary'
  )
  parser.add_argument(
    '-a', '--abpdir', type=str,
    help='path to the Adblock Plus repository'
  )
  parser.add_argument(
    '-f', '--filters', metavar='url', type=str, nargs='+',
    default=["https://easylist-downloads.adblockplus.org/easylist.txt", "https://easylist-downloads.adblockplus.org/exceptionrules.txt"],
    help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, in which case the data will be read from the specified path.'
  )
  parser.add_argument(
    '-t', '--timeout', type=int, default=300,
    help='load timeout (seconds)'
  )
  parser.add_argument(
    '-x', '--maxtabs', type=int, default=15,
    help='maximal number of tabs to open in parallel'
  )
  parser.add_argument(
    'list', type=str,
    help='URL list to process'
  )
  parser.add_argument(
    'outdir', type=str,
    help='directory to write data into'
  )
  parameters = parser.parse_args()

  import buildtools.packagerGecko as packager
  cleanup = []
  try:
    base_dir = os.path.dirname(__file__)
    handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi')
    os.close(handle)
    cleanup.append(crawlerxpi)
    packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True)

    abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi'
    if parameters.abpdir:
      handle, abpxpi = tempfile.mkstemp(suffix='.xpi')
      os.close(handle)
      cleanup.append(abpxpi)
      packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True)

    profile = FirefoxProfile(
      addons=[
        crawlerxpi,
        abpxpi,
      ],
      preferences={
        'browser.uitour.enabled': False,
        'prompts.tab_modal.enabled': False,
      }
    )

    abpsettings = os.path.join(profile.profile, 'adblockplus')
    os.makedirs(abpsettings)
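    # The block below writes a minimal patterns.ini so that Adblock Plus starts
    # with the requested filter lists preinstalled; the [1:] drops each list's
    # leading "[Adblock Plus ...]" header line. Illustrative result (the filter
    # shown is a made-up example):
    #   # Adblock Plus preferences
    #   version=4
    #   [Subscription]
    #   url=https://easylist-downloads.adblockplus.org/easylist.txt
    #   [Subscription filters]
    #   ||ads.example.com^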
    with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as handle:
      print >>handle, '# Adblock Plus preferences'
      print >>handle, 'version=4'
      for url in parameters.filters:
        if '=' in url:
          path, url = url.split('=', 1)
          with open(path, 'r') as source:
            data = source.read()
        else:
          data = urllib.urlopen(url).read()
        print >>handle, '[Subscription]'
        print >>handle, 'url=%s' % url
        print >>handle, '[Subscription filters]'
        print >>handle, '\n'.join(data.splitlines()[1:])
  finally:
    for path in cleanup:
      os.unlink(path)

  server = None
  try:
    port = random.randrange(2000, 60000)
    print "Communicating with client on port %i" % port

    app = CrawlerApp(parameters)
    server = make_server('localhost', port, app)
    app.server = server
    # Review comment (Wladimir Palant, 2015/05/07 00:04:59):
    # Done.
    threading.Thread(target=lambda: server.serve_forever()).start()

    runner = FirefoxRunner(
      profile=profile,
      binary=parameters.binary,
      cmdargs=['--crawler-port', str(port)],
      env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'),
    )
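    # Restart the browser until every URL has been processed; the /save
    # handler above removes each URL from app.urls once its data is written,
    # so a browser that exits or crashes early is simply relaunched.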
    while app.urls:
      runner.start()
      runner.wait()
  finally:
    if server:
      server.shutdown()
    profile.cleanup()

if __name__ == '__main__':
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
  DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py")

  try:
    subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR])
  except subprocess.CalledProcessError as e:
    print >>sys.stderr, e
    print >>sys.stderr, "Failed to ensure dependencies are up-to-date!"

  run()
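A hypothetical invocation (the script and list file names are assumptions;
only the flags and positional arguments defined in run() are taken from the
code above):

  python run.py -b /usr/local/bin/firefox -t 120 -x 10 urls.txt crawler-output

Here urls.txt contains one URL per line, and crawler-output is created on
demand by the /save handler once the first result comes in.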