run.py - Issue 29338442: Issue 3815 - Fix TabAllocator. Now it returns tab with initialized outerWindowID

Delta Between Two Patch Sets: run.py

Issue 29338442: Issue 3815 - Fix TabAllocator. Now it returns tab with initialized outerWindowID (Closed)

Left Patch Set: eliminate race conditions Created April 11, 2016, 3:06 p.m.

Right Patch Set: remove additional empty line Created Sept. 16, 2016, 12:33 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 #!/usr/bin/env python	1 #!/usr/bin/env python

2 # coding: utf-8	2 # coding: utf-8

3	3

4 import argparse	4 import argparse

5 import datetime	5 import datetime

6 import errno	6 import errno

7 import hashlib	7 import hashlib

8 import io	8 import io

9 import json	9 import json

10 import os	10 import os

11 import random	11 import random

12 import subprocess	12 import subprocess

13 import sys	13 import sys

14 import tempfile	14 import tempfile

15 import threading	15 import threading

16 import urllib	16 import urllib

17 import urlparse	17 import urlparse

18 from wsgiref.simple_server import make_server	18 from wsgiref.simple_server import make_server

19	19

20 from mozprofile import FirefoxProfile	20 from mozprofile import FirefoxProfile

21 from mozrunner import FirefoxRunner	21 from mozrunner import FirefoxRunner

22	22

	23

23 class CrawlerApp:	24 class CrawlerApp:

24 server = None	25 server = None

25 def __init__(self, parameters):	26

26 self.parameters = parameters	27 def __init__(self, parameters):

27 with io.open(self.parameters.list, 'r', encoding='utf-8') as handle:	28 self.parameters = parameters

28 self.urls = map(unicode.strip, handle.readlines())	29 with io.open(self.parameters.list, 'r', encoding='utf-8') as handle:

29	30 self.urls = map(unicode.strip, handle.readlines())

30 def __call__(self, environ, start_response):	31

31 path = environ.get('PATH_INFO', '')	32 def __call__(self, environ, start_response):

32 if path == '/parameters':	33 path = environ.get('PATH_INFO', '')

33 start_response('200 OK', [('Content-Type', 'application/json')])	34 if path == '/parameters':

34 return [json.dumps({	35 start_response('200 OK', [('Content-Type', 'application/json')])

35 'urls': self.urls,	36 return [json.dumps({

36 'timeout': self.parameters.timeout * 1000,	37 'urls': self.urls,

37 'maxtabs': self.parameters.maxtabs,	38 'timeout': self.parameters.timeout * 1000,

38 })]	39 'maxtabs': self.parameters.maxtabs,

39 elif path == '/save':	40 })]

40 try:	41 elif path == '/save':

41 request_body_size = int(environ.get('CONTENT_LENGTH', 0))	42 try:

42 except (ValueError):	43 request_body_size = int(environ.get('CONTENT_LENGTH', 0))

43 start_response('400 Bad Request', [])	44 except (ValueError):

	45 start_response('400 Bad Request', [])

	46 return ''

	47

	48 data = json.loads(environ['wsgi.input'].read(request_body_size))

	49 self.urls.remove(data['url'])

	50

	51 fullurl = data['url']

	52 if not urlparse.urlparse(fullurl).scheme:

	53 fullurl = 'http://' + fullurl

	54 parsedurl = urlparse.urlparse(fullurl)

	55 urlhash = hashlib.new('md5', data['url']).hexdigest()

	56 timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f')

	57 basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash)

	58 datapath = os.path.join(self.parameters.outdir, basename + ".json")

	59 screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg")

	60 sourcepath = os.path.join(self.parameters.outdir, basename + ".xml")

	61

	62 try:

	63 os.makedirs(self.parameters.outdir)

	64 except OSError as e:

	65 if e.errno != errno.EEXIST:

	66 raise

	67

	68 if "screenshot" in data:

	69 with open(screenshotpath, 'wb') as handle:

	70 handle.write(urllib.urlopen(data["screenshot"]).read())

	71 del data["screenshot"]

	72

	73 if "source" in data:

	74 with io.open(sourcepath, 'w', encoding='utf-8') as handle:

	75 handle.write(data["source"])

	76 del data["source"]

	77

	78 with io.open(datapath, 'w', encoding='utf-8') as handle:

	79 handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n')

	80 start_response('204 No Content', [])

	81 return ''

	82

	83 start_response('404 Not Found', [])

44 return ''	84 return ''

45	85

46 data = json.loads(environ['wsgi.input'].read(request_body_size))

47 self.urls.remove(data['url'])

48

49 fullurl = data['url']

50 if not urlparse.urlparse(fullurl).scheme:

51 fullurl = 'http://' + fullurl

52 parsedurl = urlparse.urlparse(fullurl)

53 urlhash = hashlib.new('md5', data['url']).hexdigest()

54 timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f')

55 basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash)

56 datapath = os.path.join(self.parameters.outdir, basename + ".json")

57 screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg")

58 sourcepath = os.path.join(self.parameters.outdir, basename + ".xml")

59

60 try:

61 os.makedirs(self.parameters.outdir)

62 except OSError as e:

63 if e.errno != errno.EEXIST:

64 raise

65

66 if "screenshot" in data:

67 with open(screenshotpath, 'wb') as handle:

68 handle.write(urllib.urlopen(data["screenshot"]).read())

69 del data["screenshot"]

70

71 if "source" in data:

72 with io.open(sourcepath, 'w', encoding='utf-8') as handle:

73 handle.write(data["source"])

74 del data["source"]

75

76 with io.open(datapath, 'w', encoding='utf-8') as handle:

77 handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n')

78 start_response('204 No Content', [])

79 return ''

80

81 start_response('404 Not Found', [])

82 return ''

83	86

84 def run():	87 def run():

85 parser = argparse.ArgumentParser(description='Run crawler')	88 parser = argparse.ArgumentParser(description='Run crawler')

86 parser.add_argument(	89 parser.add_argument(

87 '-b', '--binary', type=str,	90 '-b', '--binary', type=str,

88 help='path to the Firefox binary'	91 help='path to the Firefox binary'

89 )	92 )

90 parser.add_argument(	93 parser.add_argument(

91 '-a', '--abpdir', type=str,	94 '-a', '--abpdir', type=str,

92 help='path to the Adblock Plus repository'	95 help='path to the Adblock Plus repository'

93 )	96 )

94 parser.add_argument(	97 parser.add_argument(

95 '-f', '--filters', metavar='url', type=str, nargs='+',	98 '-f', '--filters', metavar='url', type=str, nargs='+',

96 default=["https://easylist-downloads.adblockplus.org/easylist.txt", "https://easylist-downloads.adblockplus.org/exceptionrules.txt"],	99 default=["https://easylist-downloads.adblockplus.org/easylist.txt", "https://easylist-downloads.adblockplus.org/exceptionrules.txt"],

97 help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, the data will be read from the specified path then.'	100 help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, the data will be read from the specified path then.'

98 )	101 )

99 parser.add_argument(	102 parser.add_argument(

100 '-t', '--timeout', type=int, default=300,	103 '-t', '--timeout', type=int, default=300,

101 help='Load timeout (seconds)'	104 help='Load timeout (seconds)'

102 )	105 )

103 parser.add_argument(	106 parser.add_argument(

104 '-x', '--maxtabs', type=int, default=15,	107 '-x', '--maxtabs', type=int, default=15,

105 help='Maximal number of tabs to open in parallel'	108 help='Maximal number of tabs to open in parallel'

106 )	109 )

107 parser.add_argument(	110 parser.add_argument(

108 'list', type=str,	111 'list', type=str,

109 help='URL list to process'	112 help='URL list to process'

110 )	113 )

111 parser.add_argument(	114 parser.add_argument(

112 'outdir', type=str,	115 'outdir', type=str,

113 help='directory to write data into'	116 help='directory to write data into'

114 )	117 )

115 parameters = parser.parse_args()	118 parameters = parser.parse_args()

116	119

117 import buildtools.packagerGecko as packager	120 import buildtools.packagerGecko as packager

118 cleanup = []	121 cleanup = []

119 try:	122 try:

120 base_dir = os.path.dirname(os.path.abspath(__file__))	123 base_dir = os.path.dirname(os.path.abspath(__file__))

121 handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi')	124 handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi')

122 os.close(handle)	125 os.close(handle)

123 cleanup.append(crawlerxpi)	126 cleanup.append(crawlerxpi)

124 packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True)	127 packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True)

125	128

126 abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi'	129 abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi'

127 if parameters.abpdir:	130 if parameters.abpdir:

128 handle, abpxpi = tempfile.mkstemp(suffix='.xpi')	131 handle, abpxpi = tempfile.mkstemp(suffix='.xpi')

129 os.close(handle)	132 os.close(handle)

130 cleanup.append(abpxpi)	133 cleanup.append(abpxpi)

131 packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True)	134 packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True)

132	135

133 profile = FirefoxProfile(	136 profile = FirefoxProfile(

134 addons=[	137 addons=[

135 crawlerxpi,	138 crawlerxpi,

136 abpxpi,	139 abpxpi,

137 ],	140 ],

138 preferences={	141 preferences={

139 'browser.startup.homepage': 'about:blank',	142 'browser.startup.homepage': 'about:blank',

140 'browser.tabs.warnOnCloseOtherTabs': False,	143 'browser.tabs.warnOnCloseOtherTabs': False,

141 'browser.uitour.enabled': False,	144 'browser.uitour.enabled': False,

142 'prompts.tab_modal.enabled': False,	145 'prompts.tab_modal.enabled': False,

143 'startup.homepage_welcome_url': 'about:blank',	146 'startup.homepage_welcome_url': 'about:blank',

144 'startup.homepage_welcome_url.additional': 'about:blank',	147 'startup.homepage_welcome_url.additional': 'about:blank',

145 'xpinstall.signatures.required': False,	148 'xpinstall.signatures.required': False,

146 }	149 }

147 )	150 )

148	151

149 abpsettings = os.path.join(profile.profile, 'adblockplus')	152 abpsettings = os.path.join(profile.profile, 'adblockplus')

150 os.makedirs(abpsettings)	153 os.makedirs(abpsettings)

151 with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as handle:	154 with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as handle:

152 print >>handle, '# Adblock Plus preferences'	155 print >>handle, '# Adblock Plus preferences'

153 print >>handle, 'version=4'	156 print >>handle, 'version=4'

154 for url in parameters.filters:	157 for url in parameters.filters:

155 if '=' in url:	158 if '=' in url:

156 path, url = url.split('=', 1)	159 path, url = url.split('=', 1)

157 with open(path, 'r') as source:	160 with open(path, 'r') as source:

158 data = source.read()	161 data = source.read()

159 else:	162 else:

160 data = urllib.urlopen(url).read()	163 data = urllib.urlopen(url).read()

161 print >>handle, '[Subscription]'	164 print >>handle, '[Subscription]'

162 print >>handle, 'url=%s' % url	165 print >>handle, 'url=%s' % url

163 print >>handle, '[Subscription filters]'	166 print >>handle, '[Subscription filters]'

164 print >>handle, '\n'.join(data.splitlines()[1:])	167 print >>handle, '\n'.join(data.splitlines()[1:])

165 finally:	168 finally:

166 for path in cleanup:	169 for path in cleanup:

167 os.unlink(path)	170 os.unlink(path)

168	171

169 server = None	172 server = None

170 try:	173 try:

171 port = random.randrange(2000, 60000)	174 port = random.randrange(2000, 60000)

172 print "Communicating with client on port %i" % port	175 print "Communicating with client on port %i" % port

173	176

174 app = CrawlerApp(parameters)	177 app = CrawlerApp(parameters)

175 server = make_server('localhost', port, app)	178 server = make_server('localhost', port, app)

176 app.server = server	179 app.server = server

177 threading.Thread(target=lambda: server.serve_forever()).start()	180 threading.Thread(target=lambda: server.serve_forever()).start()

178	181

179 runner = FirefoxRunner(	182 runner = FirefoxRunner(

180 profile=profile,	183 profile=profile,

181 binary=parameters.binary,	184 binary=parameters.binary,

182 cmdargs=['--crawler-port', str(port)],	185 cmdargs=['--crawler-port', str(port)],

183 env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'),	186 env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'),

184 )	187 )

185 while app.urls:	188 while app.urls:

186 runner.start()	189 runner.start()

187 runner.wait()	190 runner.wait()

188 finally:	191 finally:

189 if server:	192 if server:

190 server.shutdown()	193 server.shutdown()

191 profile.cleanup()	194 profile.cleanup()

192	195

193 if __name__ == '__main__':	196 if __name__ == '__main__':

194 BASE_DIR = os.path.dirname(os.path.abspath(__file__))	197 BASE_DIR = os.path.dirname(os.path.abspath(__file__))

195 DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py")	198 DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py")

196	199

197 try:	200 try:

198 subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR])	201 subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR])

199 except subprocess.CalledProcessError as e:	202 except subprocess.CalledProcessError as e:

200 print >>sys.stderr, e	203 print >>sys.stderr, e

201 print >>sys.stderr, "Failed to ensure dependencies being up-to-date!"	204 print >>sys.stderr, "Failed to ensure dependencies being up-to-date!"

202	205

203 run()	206 run()

LEFT	RIGHT