updateRules.py - Issue 8401090: Modified server-side rules generation to produce prioritized domain lists instead of complete suffi…

Side by Side Diff: updateRules.py

Issue 8401090: Modified server-side rules generation to produce prioritized domain lists instead of complete suffi… (Closed)

Patch Set: Created Sept. 25, 2012, 11:32 a.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 #!/usr/bin/env python	1 #!/usr/bin/env python

2 # coding: utf-8	2 # coding: utf-8

3	3

4 # This Source Code is subject to the terms of the Mozilla Public License	4 # This Source Code is subject to the terms of the Mozilla Public License

5 # version 2.0 (the "License"). You can obtain a copy of the License at	5 # version 2.0 (the "License"). You can obtain a copy of the License at

6 # http://mozilla.org/MPL/2.0/.	6 # http://mozilla.org/MPL/2.0/.

7	7

8 """	8 """

9 Update the dictionaries in the rules	9 Update the dictionaries in the rules

10 ====================================	10 ====================================

(...skipping 220 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
231 'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za',	231 'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za',

232 'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za', 'olivetti.za', 'pix.za',	232 'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za', 'olivetti.za', 'pix.za',

233	233

234 # From http://en.wikipedia.org/wiki/.zm	234 # From http://en.wikipedia.org/wiki/.zm

235 'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm',	235 'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm',

236	236

237 # From http://en.wikipedia.org/wiki/.zw	237 # From http://en.wikipedia.org/wiki/.zw

238 'co.zw', 'ac.zw', 'org.zw',	238 'co.zw', 'ac.zw', 'org.zw',

239 ]	239 ]

240	240

241 def getSuffixes(target, items):

242 suffixes = {}

243 for item, priority in items.iteritems():

244 suffix = item[-1] if len(item) else ''

245 if not suffix in suffixes:

246 suffixes[suffix] = {}

247 suffixes[suffix][item[:-1]] = priority

248 for suffix, items in suffixes.iteritems():

249 if len(items.keys()) == 1:

250 item, priority = items.items()[0]

251 target[suffix] = ''.join(reversed(item)) + ' ' + str(priority)

252 else:

253 target[suffix] = {}

254 getSuffixes(target[suffix], items)

255

256 def urlopen(url, attempts=3):	241 def urlopen(url, attempts=3):

257 """	242 """

258 Tries to open a particular URL, retries on failure.	243 Tries to open a particular URL, retries on failure.

259 """	244 """

260 for i in range(attempts):	245 for i in range(attempts):

261 try:	246 try:

262 return urllib.urlopen(url)	247 return urllib.urlopen(url)

263 except IOError, e:	248 except IOError, e:

264 error = e	249 error = e

265 time.sleep(5)	250 time.sleep(5)

(...skipping 18 matching lines...) Expand all Loading...
284 tld = line	269 tld = line

285	270

286 if tld:	271 if tld:

287 yield tld	272 yield tld

288	273

289 def getTLDs(domains, tldPriority):	274 def getTLDs(domains, tldPriority):

290 for tld in iterateTLDs():	275 for tld in iterateTLDs():

291 if not tld in domains:	276 if not tld in domains:

292 domains[tld] = tldPriority	277 domains[tld] = tldPriority

293	278

294 def updateSchemes(rules):

295 rules['scheme'] = {}

296 getSuffixes(rules['scheme'], schemes)

297

298 def updateDomains(rules):	279 def updateDomains(rules):

299 domains = {}	280 domains = {}

300 reader = codecs.getreader('utf-8')(sys.stdin)	281 reader = codecs.getreader('utf-8')(sys.stdin)

301 i = 0	282 i = 0

302 for domain in itertools.chain(reader.readlines(), additionalDomains):	283 for domain in itertools.chain(reader.readlines(), additionalDomains):

303 domain = domain.rstrip()	284 domain = domain.rstrip()

304 if not domain or domain in domains:	285 if not domain or domain in domains:

305 continue	286 continue

306 domains[domain] = i	287 domains[domain] = i

307 i += 1	288 i += 1

308	289

309 maxPriority = i	290 maxPriority = i

310 for domain in domains.iterkeys():	291 for domain in domains.iterkeys():

311 domains[domain] = maxPriority - domains[domain]	292 domains[domain] = maxPriority - domains[domain]

312	293

313 # Extract TLDs from domain list	294 # Extract TLDs from domain list

314 for domain, priority in domains.items():	295 for domain, priority in domains.items():

315 while True:	296 while True:

316 if not re.search(r'^[^.]+\.+', domain):	297 if not re.search(r'^[^.]+\.+', domain):

317 break	298 break

318 domain = re.sub(r'^[^.]+\.+', '', domain)	299 domain = re.sub(r'^[^.]+\.+', '', domain)

319 if not domain:	300 if not domain:

320 break	301 break

321 if not domain in domains or domains[domain] < priority - maxPriority:	302 if not domain in domains or domains[domain] < priority - maxPriority:

322 domains[domain] = priority - maxPriority	303 domains[domain] = priority - maxPriority

323	304

324 # Fill up with "official" TLDs	305 # Fill up with "official" TLDs

325 getTLDs(domains, -maxPriority)	306 getTLDs(domains, -maxPriority)

326	307

327 rules['domain'] = {}	308 rules['domain'] = domains

328 getSuffixes(rules['domain'], domains)

329	309

330 def writeRules(rules):	310 def writeRules(rules):

331 path = os.path.join('defaults', 'rules.json')	311 path = os.path.join('defaults', 'rules.json')
	Thomas Greiner 2012/09/25 12:31:37: Do you really want the file to be located at /defaults/rules.json? Wladimir Palant 2012/09/25 13:26:00: On 2012/09/25 12:31:37, Thomas Greiner wrote: > Do you really want the file to be located at /defaults/rules.json? Actually, it's defaults/rules.json (relative to the current working directory). This is a bit ugly because it assumes that the script will always be run from the extension root, but other than that it should be fine.
332 file = codecs.open(path, 'rb', encoding='utf-8')	312 file = codecs.open(path, 'rb', encoding='utf-8')

333 data = file.read()	313 data = file.read()

334 file.close()	314 file.close()

335	315

336 marker = '// Automatically generated dictionaries'	316 marker = '// Automatically generated dictionaries'

337 markerIndex = data.find(marker)	317 markerIndex = data.find(marker)

338 if markerIndex < 0:	318 if markerIndex < 0:

339 raise Exception('Insertion marker not found in %s' % path)	319 raise Exception('Insertion marker not found in %s' % path)

340 data = data[0:markerIndex + len(marker)] + '\n'	320 data = data[0:markerIndex + len(marker)] + '\n'

341 data += ' ' + json.dumps(rules, ensure_ascii=False, sort_keys=True, separators = (',', ':'))[1:-1] + '\n}\n'	321 data += ' ' + json.dumps(rules, ensure_ascii=False, sort_keys=True, separators = (',', ':'))[1:-1] + '\n}\n'

342	322

343 file = codecs.open(path, 'wb', encoding='utf-8')	323 file = codecs.open(path, 'wb', encoding='utf-8')

344 file.write(data)	324 file.write(data)

345 file.close()	325 file.close()

346	326

347 def updateRules():	327 def updateRules():

348 rules = {}	328 rules = {}

349 rules['domainReferrals'] = domainReferrals	329 rules['domainReferrals'] = domainReferrals

350 updateSchemes(rules)	330 rules['scheme'] = schemes

351 updateDomains(rules)	331 updateDomains(rules)

352 writeRules(rules)	332 writeRules(rules)

353	333

354 if __name__ == "__main__":	334 if __name__ == "__main__":

355 updateRules()	335 updateRules()

OLD	NEW

« no previous file with comments | « defaults/rules.json ('k') | no next file » | no next file with comments »