Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: updateRules.py

Issue 8382011: Applied changes from emailed code review (Closed)
Patch Set: Created Sept. 28, 2012, 1:40 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « lib/updateRules.js ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: updateRules.py
===================================================================
new file mode 100644
--- /dev/null
+++ b/updateRules.py
@@ -0,0 +1,355 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# This Source Code is subject to the terms of the Mozilla Public License
+# version 2.0 (the "License"). You can obtain a copy of the License at
+# http://mozilla.org/MPL/2.0/.
+
+"""
+Update the dictionaries in the rules
+====================================
+
+ This script generates the dictionaries in the defaults/rules.json file based
+ on various sources like the list of public suffixes (http://publicsuffix.org/).
+"""
+
+import codecs
+import itertools
+import json
+import os
+import re
+import sys
+import time
+import urllib
+
+# URL schemes (including the trailing colon) mapped to an integer priority.
+# updateSchemes() passes this table to getSuffixes() to build rules['scheme'].
+# NOTE(review): presumably a larger number marks a more likely scheme -
+# confirm against the code that consumes the rules.
+schemes = {
+ 'http:': 4,
+ 'https:': 3,
+ 'ftp:': 2,
+ 'irc:': 1,
+}
+
+# Per-domain "key=value" strings. updateRules() stores this table unchanged
+# under rules['domainReferrals'].
+# NOTE(review): these look like referral/affiliate query parameters for the
+# listed shops - confirm against the code that consumes the rules.
+domainReferrals = {
+ 'amazon.co.uk': 'tag=uf07d-21',
+ 'amazon.com': 'tag=uf024-20',
+ 'amazon.de': 'tag=uf0e6-21',
+ 'amazon.fr': 'tag=uf02b-21',
+ 'amazon.es': 'tag=uf07-21',
+ 'amazon.it': 'tag=uf08d-21',
+ 'ozon.ru': 'partner=urlfixer',
+}
+
+# Hand-maintained domain names that updateDomains() processes after the
+# domains read from standard input. Names that were already seen are skipped
+# there, so repeated entries in this list have no effect.
+additionalDomains = [
+ 'fab.com',
+ 'ku.dk',
+ 'google.cz',
+ 'komplett.ie',
+ 'lotto.ie',
+ 'bt.yahoo.com',
+ 'o.co',
+ 'bet.hu',
+ 'haz.de',
+ 'sas.com',
+ 'nic.ir',
+ 'tomtop.com',
+ 'uwa.edu.au',
+ 'spacex.com',
+ 'eif.org',
+ 'geld.de',
+ # From http://www.wikipedia.org/
+ 'en.wikipedia.org', 'ja.wikipedia.org', 'de.wikipedia.org', 'es.wikipedia.org', 'ru.wikipedia.org', 'fr.wikipedia.org', 'it.wikipedia.org', 'pl.wikipedia.org', 'pt.wikipedia.org', 'zh.wikipedia.org', 'ar.wikipedia.org', 'bg.wikipedia.org', 'ca.wikipedia.org', 'cs.wikipedia.org', 'da.wikipedia.org', 'de.wikipedia.org', 'en.wikipedia.org', 'es.wikipedia.org', 'eo.wikipedia.org', 'eu.wikipedia.org', 'fa.wikipedia.org', 'fr.wikipedia.org', 'ko.wikipedia.org', 'hi.wikipedia.org', 'hr.wikipedia.org', 'id.wikipedia.org', 'it.wikipedia.org', 'he.wikipedia.org', 'lt.wikipedia.org', 'hu.wikipedia.org', 'ms.wikipedia.org', 'nl.wikipedia.org', 'ja.wikipedia.org', 'no.wikipedia.org', 'pl.wikipedia.org', 'pt.wikipedia.org', 'kk.wikipedia.org', 'ro.wikipedia.org', 'ru.wikipedia.org', 'sk.wikipedia.org', 'sl.wikipedia.org', 'sr.wikipedia.org', 'fi.wikipedia.org', 'sv.wikipedia.org', 'tr.wikipedia.org', 'uk.wikipedia.org', 'vi.wikipedia.org', 'vo.wikipedia.org', 'war.wikipedia.org', 'zh.wikipedia.org', 'af.wikipedia.org', 'als.wikipedia.org', 'am.wikipedia.org', 'an.wikipedia.org', 'ast.wikipedia.org', 'ht.wikipedia.org', 'az.wikipedia.org', 'bn.wikipedia.org', 'ba.wikipedia.org', 'be.wikipedia.org', 'bpy.wikipedia.org', 'bs.wikipedia.org', 'br.wikipedia.org', 'cv.wikipedia.org', 'cy.wikipedia.org', 'et.wikipedia.org', 'el.wikipedia.org', 'fy.wikipedia.org', 'ga.wikipedia.org', 'gl.wikipedia.org', 'gu.wikipedia.org', 'hy.wikipedia.org', 'io.wikipedia.org', 'ia.wikipedia.org', 'is.wikipedia.org', 'jv.wikipedia.org', 'kn.wikipedia.org', 'ka.wikipedia.org', 'ku.wikipedia.org', 'la.wikipedia.org', 'lv.wikipedia.org', 'lb.wikipedia.org', 'lmo.wikipedia.org', 'mk.wikipedia.org', 'mg.wikipedia.org', 'ml.wikipedia.org', 'mr.wikipedia.org', 'my.wikipedia.org', 'new.wikipedia.org', 'ne.wikipedia.org', 'nn.wikipedia.org', 'nap.wikipedia.org', 'oc.wikipedia.org', 'pms.wikipedia.org', 'nds.wikipedia.org', 'qu.wikipedia.org', 'pnb.wikipedia.org', 'sq.wikipedia.org', 'scn.wikipedia.org', 
'simple.wikipedia.org', 'ceb.wikipedia.org', 'sh.wikipedia.org', 'su.wikipedia.org', 'sw.wikipedia.org', 'tl.wikipedia.org', 'ta.wikipedia.org', 'tt.wikipedia.org', 'te.wikipedia.org', 'th.wikipedia.org', 'bug.wikipedia.org', 'ur.wikipedia.org', 'wa.wikipedia.org', 'yo.wikipedia.org', 'diq.wikipedia.org', 'ace.wikipedia.org', 'frp.wikipedia.org', 'arc.wikipedia.org', 'gn.wikipedia.org', 'av.wikipedia.org', 'ay.wikipedia.org', 'bjn.wikipedia.org', 'bh.wikipedia.org', 'bcl.wikipedia.org', 'bar.wikipedia.org', 'bo.wikipedia.org', 'co.wikipedia.org', 'pdc.wikipedia.org', 'dv.wikipedia.org', 'nv.wikipedia.org', 'ang.wikipedia.org', 'eml.wikipedia.org', 'myv.wikipedia.org', 'ext.wikipedia.org', 'hif.wikipedia.org', 'fo.wikipedia.org', 'frr.wikipedia.org', 'fur.wikipedia.org', 'gv.wikipedia.org', 'gag.wikipedia.org', 'gd.wikipedia.org', 'gan.wikipedia.org', 'glk.wikipedia.org', 'hak.wikipedia.org', 'xal.wikipedia.org', 'haw.wikipedia.org', 'hsb.wikipedia.org', 'ilo.wikipedia.org', 'ie.wikipedia.org', 'os.wikipedia.org', 'kl.wikipedia.org', 'pam.wikipedia.org', 'csb.wikipedia.org', 'kw.wikipedia.org', 'km.wikipedia.org', 'rw.wikipedia.org', 'kv.wikipedia.org', 'ky.wikipedia.org', 'mrj.wikipedia.org', 'lad.wikipedia.org', 'lbe.wikipedia.org', 'lij.wikipedia.org', 'li.wikipedia.org', 'ln.wikipedia.org', 'jbo.wikipedia.org', 'mt.wikipedia.org', 'mi.wikipedia.org', 'xmf.wikipedia.org', 'arz.wikipedia.org', 'mzn.wikipedia.org', 'mdf.wikipedia.org', 'mn.wikipedia.org', 'nah.wikipedia.org', 'nrm.wikipedia.org', 'nov.wikipedia.org', 'ce.wikipedia.org', 'mhr.wikipedia.org', 'or.wikipedia.org', 'as.wikipedia.org', 'uz.wikipedia.org', 'pi.wikipedia.org', 'pag.wikipedia.org', 'pa.wikipedia.org', 'pap.wikipedia.org', 'ps.wikipedia.org', 'koi.wikipedia.org', 'pfl.wikipedia.org', 'pcd.wikipedia.org', 'krc.wikipedia.org', 'crh.wikipedia.org', 'ksh.wikipedia.org', 'rm.wikipedia.org', 'rue.wikipedia.org', 'sa.wikipedia.org', 'se.wikipedia.org', 'sc.wikipedia.org', 'sah.wikipedia.org', 
'sco.wikipedia.org', 'stq.wikipedia.org', 'si.wikipedia.org', 'szl.wikipedia.org', 'so.wikipedia.org', 'ckb.wikipedia.org', 'tg.wikipedia.org', 'tpi.wikipedia.org', 'to.wikipedia.org', 'tk.wikipedia.org', 'udm.wikipedia.org', 'ug.wikipedia.org', 'vec.wikipedia.org', 'vls.wikipedia.org', 'wo.wikipedia.org', 'wuu.wikipedia.org', 'yi.wikipedia.org', 'zea.wikipedia.org', 'kbd.wikipedia.org', 'ak.wikipedia.org', 'ab.wikipedia.org', 'bm.wikipedia.org', 'bi.wikipedia.org', 'bxr.wikipedia.org', 'ch.wikipedia.org', 'ny.wikipedia.org', 'za.wikipedia.org', 'dsb.wikipedia.org', 'ee.wikipedia.org', 'ff.wikipedia.org', 'ki.wikipedia.org', 'got.wikipedia.org', 'ha.wikipedia.org', 'ig.wikipedia.org', 'iu.wikipedia.org', 'ik.wikipedia.org', 'ks.wikipedia.org', 'kg.wikipedia.org', 'lo.wikipedia.org', 'ltg.wikipedia.org', 'lg.wikipedia.org', 'cdo.wikipedia.org', 'mwl.wikipedia.org', 'mo.wikipedia.org', 'fj.wikipedia.org', 'na.wikipedia.org', 'cr.wikipedia.org', 'pih.wikipedia.org', 'om.wikipedia.org', 'pnt.wikipedia.org', 'kaa.wikipedia.org', 'dz.wikipedia.org', 'rmy.wikipedia.org', 'rn.wikipedia.org', 'sm.wikipedia.org', 'sg.wikipedia.org', 'st.wikipedia.org', 'nso.wikipedia.org', 'tn.wikipedia.org', 'sn.wikipedia.org', 'sd.wikipedia.org', 'cu.wikipedia.org', 'ss.wikipedia.org', 'srn.wikipedia.org', 'ty.wikipedia.org', 'kab.wikipedia.org', 'tet.wikipedia.org', 'ti.wikipedia.org', 'chr.wikipedia.org', 'tum.wikipedia.org', 'ts.wikipedia.org', 'chy.wikipedia.org', 've.wikipedia.org', 'tw.wikipedia.org', 'vep.wikipedia.org', 'xh.wikipedia.org', 'zu.wikipedia.org', 'de.wikipedia.org', 'pl.wikipedia.org', 'ja.wikipedia.org', 'zh.wikipedia.org', 'ru.wikipedia.org', 'eo.wikipedia.org', 'vi.wikipedia.org',
+ # From http://www.google.com/supported_domains
+ 'google.com', 'google.ad', 'google.ae', 'google.com.af', 'google.com.ag', 'google.com.ai', 'google.am', 'google.co.ao', 'google.com.ar', 'google.as', 'google.at', 'google.com.au', 'google.az', 'google.ba', 'google.com.bd', 'google.be', 'google.bf', 'google.bg', 'google.com.bh', 'google.bi', 'google.bj', 'google.com.bn', 'google.com.bo', 'google.com.br', 'google.bs', 'google.co.bw', 'google.by', 'google.com.bz', 'google.ca', 'google.cd', 'google.cf', 'google.cg', 'google.ch', 'google.ci', 'google.co.ck', 'google.cl', 'google.cm', 'google.cn', 'google.com.co', 'google.co.cr', 'google.com.cu', 'google.cv', 'google.com.cy', 'google.cz', 'google.de', 'google.dj', 'google.dk', 'google.dm', 'google.com.do', 'google.dz', 'google.com.ec', 'google.ee', 'google.com.eg', 'google.es', 'google.com.et', 'google.fi', 'google.com.fj', 'google.fm', 'google.fr', 'google.ga', 'google.ge', 'google.gg', 'google.com.gh', 'google.com.gi', 'google.gl', 'google.gm', 'google.gp', 'google.gr', 'google.com.gt', 'google.gy', 'google.com.hk', 'google.hn', 'google.hr', 'google.ht', 'google.hu', 'google.co.id', 'google.ie', 'google.co.il', 'google.im', 'google.co.in', 'google.iq', 'google.is', 'google.it', 'google.je', 'google.com.jm', 'google.jo', 'google.co.jp', 'google.co.ke', 'google.com.kh', 'google.ki', 'google.kg', 'google.co.kr', 'google.com.kw', 'google.kz', 'google.la', 'google.com.lb', 'google.li', 'google.lk', 'google.co.ls', 'google.lt', 'google.lu', 'google.lv', 'google.com.ly', 'google.co.ma', 'google.md', 'google.me', 'google.mg', 'google.mk', 'google.ml', 'google.mn', 'google.ms', 'google.com.mt', 'google.mu', 'google.mv', 'google.mw', 'google.com.mx', 'google.com.my', 'google.co.mz', 'google.com.na', 'google.com.nf', 'google.com.ng', 'google.com.ni', 'google.ne', 'google.nl', 'google.no', 'google.com.np', 'google.nr', 'google.nu', 'google.co.nz', 'google.com.om', 'google.com.pa', 'google.com.pe', 'google.com.ph', 'google.com.pk', 'google.pl', 'google.pn', 'google.com.pr', 
'google.ps', 'google.pt', 'google.com.py', 'google.com.qa', 'google.ro', 'google.ru', 'google.rw', 'google.com.sa', 'google.com.sb', 'google.sc', 'google.se', 'google.com.sg', 'google.sh', 'google.si', 'google.sk', 'google.com.sl', 'google.sn', 'google.so', 'google.sm', 'google.st', 'google.com.sv', 'google.td', 'google.tg', 'google.co.th', 'google.com.tj', 'google.tk', 'google.tl', 'google.tm', 'google.tn', 'google.to', 'google.com.tr', 'google.tt', 'google.com.tw', 'google.co.tz', 'google.com.ua', 'google.co.ug', 'google.co.uk', 'google.com.uy', 'google.co.uz', 'google.com.vc', 'google.co.ve', 'google.vg', 'google.co.vi', 'google.com.vn', 'google.vu', 'google.ws', 'google.rs', 'google.co.za', 'google.co.zm', 'google.co.zw', 'google.cat',
+ # From http://www.ebay.ch/ (eBay-Websites)
+ 'mercadolibre.com.ar', 'ebay.com.au', 'ebay.at', 'ebay.be', 'mercadolivre.com.br', 'ebay.com.cn', 'ebay.dk', 'ebay.de', 'ebay.fr', 'ebay.gr', 'ebay.co.uk', 'ebay.com.hk', 'ebay.in', 'ebay.ie', 'ebay.it', 'ebay.ca', 'auction.co.kr', 'ebay.com.my', 'mercadolibre.com.mx', 'pages.ebay.com', 'ebay.nl', 'ebay.no', 'ebay.ph', 'ebay.pl', 'ebay.ru', 'ebay.se', 'ebay.com.sg', 'ebay.es', 'ruten.com.tw', 'ebay.co.th', 'gittigidiyor.com', 'ebay.cz', 'ebay.com', 'ebay.vn',
+ # From http://www.amazon.com/ (footer)
+ 'amazon.ca', 'amazon.cn', 'amazon.fr', 'amazon.de', 'amazon.it', 'amazon.co.jp', 'amazon.es', 'amazon.co.uk',
+
+ # From http://en.wikipedia.org/wiki/.gov#States_in_GOV
+ 'al.gov', 'alabama.gov',
+ 'alaska.gov',
+ 'az.gov',
+ 'ar.gov', 'arkansas.gov',
+ 'ca.gov', 'california.gov',
+ 'colorado.gov',
+ 'ct.gov',
+ 'delaware.gov',
+ 'florida.gov', 'fl.gov',
+ 'georgia.gov', 'ga.gov',
+ 'guam.gov',
+ 'hawaii.gov',
+ 'idaho.gov',
+ 'illinois.gov',
+ 'in.gov',
+ 'iowa.gov', 'ia.gov',
+ 'ks.gov', 'kansas.gov',
+ 'ky.gov', 'kentucky.gov',
+ 'louisiana.gov',
+ 'maine.gov',
+ 'maryland.gov',
+ 'mass.gov',
+ 'michigan.gov',
+ 'mn.gov',
+ 'mississippi.gov',
+ 'mo.gov',
+ 'mt.gov', 'montana.gov',
+ 'nebraska.gov',
+ 'nv.gov',
+ 'nh.gov', 'visitnh.gov',
+ 'nj.gov', 'newjersey.gov',
+ 'newmexico.gov',
+ 'ny.gov',
+ 'nc.gov', 'northcarolina.gov',
+ 'nd.gov',
+ 'ohio.gov', 'oh.gov',
+ 'ok.gov',
+ 'oregon.gov',
+ 'pa.gov', 'pennsylvania.gov',
+ 'pr.gov',
+ 'ri.gov',
+ 'sc.gov',
+ 'sd.gov',
+ 'tennessee.gov', 'tn.gov',
+ 'texas.gov',
+ 'utah.gov',
+ 'vermont.gov',
+ 'virginia.gov',
+ 'wa.gov', 'washington.gov',
+ 'wv.gov',
+ 'wisconsin.gov',
+ 'wyoming.gov',
+ 'dc.gov',
+]
+
+# Hand-maintained multi-label suffixes (e.g. 'co.uk'). iterateTLDs() yields
+# these before the entries of the downloaded effective_tld_names.dat file.
+additionalTLDs = [
+ # From http://en.wikipedia.org/wiki/.ar
+ 'com.ar', 'edu.ar', 'gob.ar', 'gov.ar', 'int.ar', 'mil.ar', 'net.ar', 'org.ar', 'tur.ar',
+
+ # From http://en.wikipedia.org/wiki/.au
+ 'com.au', 'net.au', 'org.au', 'edu.au', 'gov.au', 'csiro.au', 'asn.au', 'id.au',
+
+ # From http://en.wikipedia.org/wiki/.bd
+ 'com.bd', 'edu.bd', 'ac.bd', 'net.bd', 'gov.bd', 'org.bd', 'mil.bd',
+
+ # From http://en.wikipedia.org/wiki/.bn
+ 'com.bn', 'edu.bn', 'gov.bn', 'net.bn', 'org.bn',
+
+ # From http://en.wikipedia.org/wiki/.ck
+ 'co.ck', 'org.ck', 'edu.ck', 'gov.ck', 'net.ck', 'gen.ck', 'biz.ck', 'info.ck',
+
+ # From http://en.wikipedia.org/wiki/.cy
+ 'ac.cy', 'net.cy', 'gov.cy', 'org.cy', 'pro.cy', 'name.cy', 'ekloges.cy',
+ 'tm.cy', 'ltd.cy', 'biz.cy', 'press.cy', 'parliament.cy', 'com.cy',
+
+ # From http://en.wikipedia.org/wiki/.er
+ 'com.er', 'edu.er', 'gov.er', 'mil.er', 'net.er', 'org.er', 'ind.er',
+
+ # From http://en.wikipedia.org/wiki/.et
+ 'com.et', 'gov.et', 'org.et', 'edu.et', 'net.et', 'biz.et', 'name.et', 'info.et',
+
+ # From http://en.wikipedia.org/wiki/.fj
+ 'ac.fj', 'biz.fj', 'com.fj', 'info.fj', 'mil.fj', 'name.fj', 'net.fj', 'org.fj', 'pro.fj',
+
+ # From http://en.wikipedia.org/wiki/.fk
+ 'co.fk', 'org.fk', 'gov.fk', 'ac.fk', 'nom.fk', 'net.fk',
+
+ # From http://en.wikipedia.org/wiki/.gt
+ 'com.gt', 'edu.gt', 'net.gt', 'gob.gt', 'org.gt', 'mil.gt', 'ind.gt',
+
+ # From http://en.wikipedia.org/wiki/.gu
+ 'com.gu', 'net.gu', 'gov.gu', 'org.gu', 'edu.gu',
+
+ # From http://en.wikipedia.org/wiki/.il
+ 'ac.il', 'co.il', 'org.il', 'net.il', 'k12.il', 'gov.il', 'muni.il', 'idf.il',
+
+ # From http://en.wikipedia.org/wiki/.jm
+ 'com.jm', 'net.jm', 'org.jm', 'edu.jm', 'gov.jm', 'mil.jm',
+
+ # From http://en.wikipedia.org/wiki/.ke
+ 'co.ke', 'or.ke', 'ne.ke', 'go.ke', 'ac.ke', 'sc.ke', 'me.ke', 'mobi.ke', 'info.ke',
+
+ # From http://en.wikipedia.org/wiki/.kh
+ 'per.kh', 'com.kh', 'edu.kh', 'gov.kh', 'mil.kh', 'net.kh', 'org.kh',
+
+ # From http://en.wikipedia.org/wiki/.kw
+ 'edu.kw', 'com.kw', 'net.kw', 'org.kw', 'gov.kw',
+
+ # From http://en.wikipedia.org/wiki/.mm
+ 'net.mm', 'com.mm', 'edu.mm', 'org.mm', 'gov.mm',
+
+ # From http://en.wikipedia.org/wiki/.mt
+ 'com.mt', 'org.mt', 'net.mt', 'edu.mt', 'gov.mt',
+
+ # From http://en.wikipedia.org/wiki/.mz
+ 'adv.mz', 'ac.mz', 'co.mz', 'org.mz', 'gov.mz', 'edu.mz',
+
+ # From http://en.wikipedia.org/wiki/.ni
+ 'gob.ni', 'co.ni', 'com.ni', 'ac.ni', 'edu.ni', 'org.ni', 'nom.ni', 'net.ni', 'mil.ni',
+
+ # From http://en.wikipedia.org/wiki/.np
+ 'com.np', 'edu.np', 'gov.np', 'mil.np', 'net.np', 'org.np',
+
+ # From http://en.wikipedia.org/wiki/.nz
+ 'ac.nz', 'co.nz', 'geek.nz', 'gen.nz', 'maori.nz', 'net.nz', 'org.nz', 'school.nz',
+ 'cri.nz', 'govt.nz', 'iwi.nz', 'parliament.nz', 'mil.nz', 'health.nz',
+
+ # From http://en.wikipedia.org/wiki/.om
+ 'com.om', 'co.om', 'edu.om', 'ac.om', 'sch.om', 'gov.om', 'net.om', 'org.om',
+ 'mil.om', 'museum.om', 'biz.om', 'pro.om', 'med.om',
+
+ # From http://en.wikipedia.org/wiki/.pg
+ 'com.pg', 'net.pg', 'ac.pg', 'gov.pg', 'mil.pg', 'org.pg',
+
+ # From http://en.wikipedia.org/wiki/.py
+ 'org.py', 'edu.py', 'mil.py', 'gov.py', 'net.py', 'com.py', 'coop.py',
+
+ # From http://en.wikipedia.org/wiki/.qa
+ 'com.qa', 'net.qa', 'org.qa', 'gov.qa', 'edu.qa', 'mil.qa', 'name.qa', 'sch.qa',
+
+ # From http://en.wikipedia.org/wiki/.sv
+ 'edu.sv', 'gob.sv', 'com.sv', 'org.sv', 'red.sv',
+
+ # From http://en.wikipedia.org/wiki/.tr
+ 'com.tr', 'gen.tr', 'org.tr', 'biz.tr', 'info.tr', 'av.tr', 'dr.tr', 'pol.tr',
+ 'bel.tr', 'tsk.tr', 'bbs.tr', 'k12.tr', 'edu.tr', 'name.tr', 'net.tr', 'gov.tr',
+ 'web.tr', 'tel.tr', 'tv.tr', 'nc.tr',
+
+ # From http://en.wikipedia.org/wiki/.uk
+ 'ac.uk', 'co.uk', 'gov.uk', 'judiciary.uk', 'ltd.uk', 'me.uk', 'mod.uk', 'net.uk',
+ 'nhs.uk', 'nic.uk', 'org.uk', 'parliament.uk', 'plc.uk', 'police.uk', 'sch.uk',
+
+ # From http://en.wikipedia.org/wiki/.uy
+ 'com.uy', 'edu.uy', 'gub.uy', 'net.uy', 'mil.uy', 'org.uy',
+
+ # From http://en.wikipedia.org/wiki/.ve
+ 'com.ve', 'net.ve', 'org.ve', 'info.ve', 'co.ve', 'web.ve', 'gob.ve', 'edu.ve', 'mil.ve', 'tec.ve',
+
+ # From http://en.wikipedia.org/wiki/.ye
+ 'com.ye', 'co.ye', 'ltd.ye', 'me.ye', 'net.ye', 'org.ye', 'plc.ye', 'gov.ye',
+
+ # From http://en.wikipedia.org/wiki/.za
+ 'ac.za', 'city.za', 'co.za', 'edu.za', 'gov.za', 'law.za', 'mil.za', 'nom.za', 'org.za', 'school.za',
+ 'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za',
+ 'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za', 'olivetti.za', 'pix.za',
+
+ # From http://en.wikipedia.org/wiki/.zm
+ 'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm',
+
+ # From http://en.wikipedia.org/wiki/.zw
+ 'co.zw', 'ac.zw', 'org.zw',
+]
+
+def getSuffixes(target, items):
+    """
+    Fills target with a nested dictionary keyed by trailing characters.
+
+    items maps strings to priorities. The strings are grouped by their last
+    character; an empty string forms its own group under the key ''. A group
+    holding a single string becomes a leaf value: the remainder of that string
+    in reversed order, a space, and the priority. Any larger group becomes a
+    nested dictionary that is built the same way from the strings with their
+    last character removed.
+
+    target is modified in place and nothing is returned.
+    """
+    groups = {}
+    for item, priority in items.items():
+        # item[-1:] is the last character, or '' when item is empty
+        groups.setdefault(item[-1:], {})[item[:-1]] = priority
+
+    for suffix, group in groups.items():
+        if len(group) == 1:
+            # Leaf: the remainder is spelled back to front, the same
+            # direction in which the nested keys consume the string.
+            rest, priority = next(iter(group.items()))
+            target[suffix] = ''.join(reversed(rest)) + ' ' + str(priority)
+        else:
+            target[suffix] = {}
+            getSuffixes(target[suffix], group)
+
+def urlopen(url, attempts=3, delay=5):
+    """
+    Tries to open a particular URL, retries on failure.
+
+    url is passed to urllib.urlopen() up to `attempts` times. When a call
+    raises IOError the function waits `delay` seconds before the next try.
+    The object returned by the first successful call is returned.
+
+    Raises the IOError of the last failed attempt once all attempts are used
+    up. If attempts is less than 1 an IOError is raised without any request
+    being made.
+    """
+    error = IOError('No attempt was made to open %s' % url)
+    for attempt in range(attempts):
+        if attempt:
+            # Pause between attempts only, not after the final failure
+            time.sleep(delay)
+        try:
+            return urllib.urlopen(url)
+        except IOError as e:
+            error = e
+    raise error
+
+def iterateTLDs():
+    """
+    Yields every entry of additionalTLDs, followed by the suffixes listed in
+    the effective_tld_names.dat file downloaded from mxr.mozilla.org.
+
+    In the downloaded file, lines starting with "//" and lines that are empty
+    after removing trailing whitespace are skipped. A leading "*." or "!" is
+    removed so that only the bare suffix is yielded.
+    """
+    for tld in additionalTLDs:
+        yield tld
+
+    url = 'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1'
+    resource = urlopen(url)
+    try:
+        data = resource.read().decode('utf-8')
+    finally:
+        # Release the connection even if reading or decoding fails
+        resource.close()
+
+    for line in data.splitlines():
+        line = line.rstrip()
+        if line.startswith("//"):
+            continue
+
+        if line.startswith('*.'):
+            tld = line[2:]
+        elif line.startswith('!'):
+            tld = line[1:]
+        else:
+            tld = line
+
+        if tld:
+            yield tld
+
+def getTLDs(domains, tldPriority):
+    """
+    Adds every suffix produced by iterateTLDs() to domains with the priority
+    tldPriority. Keys that are already present keep their current value.
+    """
+    for tld in iterateTLDs():
+        domains.setdefault(tld, tldPriority)
+
+def updateSchemes(rules):
+    """
+    Stores the suffix tree built from the schemes table under rules['scheme'].
+    """
+    tree = rules['scheme'] = {}
+    getSuffixes(tree, schemes)
+
+def updateDomains(rules):
+    """
+    Builds the suffix tree stored under rules['domain'].
+
+    Domains are read from standard input (UTF-8, one per line), followed by
+    the entries of additionalDomains; empty lines and repeated domains are
+    ignored. The first domain gets the highest priority (the number of
+    distinct domains) and each later one a priority lower by one.
+
+    Every parent suffix of a listed domain that is not itself a listed domain
+    gets that domain's priority minus the number of domains, i.e. a value of
+    zero or below; when several domains share a suffix the largest such value
+    is kept. Finally the suffixes from getTLDs() are added with the lowest
+    priority and the result is converted with getSuffixes().
+    """
+    domains = {}
+    stdinReader = codecs.getreader('utf-8')(sys.stdin)
+    for name in itertools.chain(stdinReader.readlines(), additionalDomains):
+        name = name.rstrip()
+        if name and name not in domains:
+            # Position of the first occurrence in the combined input
+            domains[name] = len(domains)
+
+    # Invert the positions so that earlier domains rank higher
+    maxPriority = len(domains)
+    for name in domains:
+        domains[name] = maxPriority - domains[name]
+
+    # Extract TLDs from domain list
+    firstLabel = re.compile(r'^[^.]+\.+')
+    # Iterate over a snapshot because new keys are added inside the loop
+    for name, priority in list(domains.items()):
+        inherited = priority - maxPriority
+        match = firstLabel.search(name)
+        while match:
+            # Drop the leading label together with the dots following it
+            name = name[match.end():]
+            if not name:
+                break
+            if name not in domains or domains[name] < inherited:
+                domains[name] = inherited
+            match = firstLabel.search(name)
+
+    # Fill up with "official" TLDs
+    getTLDs(domains, -maxPriority)
+
+    tree = rules['domain'] = {}
+    getSuffixes(tree, domains)
+
+def writeRules(rules, path=None):
+    """
+    Replaces the generated part of the rules file with the given rules.
+
+    The file (defaults/rules.json unless path is given) is read as UTF-8 and
+    everything following the '// Automatically generated dictionaries' marker
+    is discarded. The rules are then appended as compact JSON with sorted keys
+    and without the outermost braces, followed by a closing brace, and the
+    result is written back to the same file.
+
+    Raises an Exception if the marker is not present in the file.
+    """
+    if path is None:
+        path = os.path.join('defaults', 'rules.json')
+
+    # The with blocks close the file even if reading or writing fails
+    with codecs.open(path, 'rb', encoding='utf-8') as source:
+        data = source.read()
+
+    marker = '// Automatically generated dictionaries'
+    markerIndex = data.find(marker)
+    if markerIndex < 0:
+        raise Exception('Insertion marker not found in %s' % path)
+
+    # [1:-1] removes the braces around the serialized object; the closing
+    # brace is re-added explicitly below.
+    generated = json.dumps(rules, ensure_ascii=False, sort_keys=True, separators=(',', ':'))[1:-1]
+    data = data[0:markerIndex + len(marker)] + '\n'
+    data += ' ' + generated + '\n}\n'
+
+    with codecs.open(path, 'wb', encoding='utf-8') as output:
+        output.write(data)
+
+def updateRules():
+    """
+    Assembles the rules dictionary from the referral table, the scheme tree
+    and the domain tree, then writes it to the rules file.
+    """
+    rules = {'domainReferrals': domainReferrals}
+    updateSchemes(rules)
+    updateDomains(rules)
+    writeRules(rules)
+
+# Regenerate the rules only when this file is executed as a script
+if __name__ == "__main__":
+ updateRules()
« no previous file with comments | « lib/updateRules.js ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld