| OLD | NEW | 
|---|
| (Empty) |  | 
|  | 1 #!/usr/bin/env python | 
|  | 2 # coding: utf-8 | 
|  | 3 | 
|  | 4 # This Source Code is subject to the terms of the Mozilla Public License | 
|  | 5 # version 2.0 (the "License"). You can obtain a copy of the License at | 
|  | 6 # http://mozilla.org/MPL/2.0/. | 
|  | 7 | 
|  | 8 """ | 
|  | 9 Update the dictionaries in the rules | 
|  | 10 ==================================== | 
|  | 11 | 
|  | 12   This script generates the dictionaries in the defaults/rules.json file based | 
|  | 13   on various sources like the list of public suffixes (http://publicsuffix.org/). | 
|  | 14 """ | 
|  | 15 | 
|  | 16 import sys | 
|  | 17 import os | 
|  | 18 import re | 
|  | 19 import urllib | 
|  | 20 import codecs | 
|  | 21 import json | 
|  | 22 import itertools | 
|  | 23 | 
# Known URL schemes (including the trailing colon) mapped to their priority.
# updateSchemes() turns this mapping into the suffix tree stored as
# rules['scheme']. Presumably a higher number means a more preferred scheme -
# TODO confirm against the consumer of the rules file.
schemes = {
 'http:': 4,
 'https:': 3,
 'ftp:': 2,
 'irc:': 1,
}
|  | 30 | 
# Domain name mapped to a "name=value" string, stored unchanged as
# rules['domainReferrals'] by updateRules(). Presumably the value is a query
# parameter identifying the referrer - TODO confirm against the consumer of
# the rules file.
domainReferrals = {
  'amazon.co.uk': 'tag=uf07d-21',
  'amazon.com': 'tag=uf024-20',
  'amazon.de': 'tag=uf0e6-21',
  'amazon.fr': 'tag=uf02b-21',
  'amazon.es': 'tag=uf07-21',
  'amazon.it': 'tag=uf08d-21',
  'ozon.ru': 'partner=urlfixer',
}
|  | 40 | 
# Domains that updateDomains() appends to the domains read from standard
# input. Since they come last they receive the lowest priorities of all
# explicitly listed domains. Duplicate entries are harmless, updateDomains()
# only considers the first occurrence of a domain.
additionalDomains = [
  'fab.com',
  'ku.dk',
  'google.cz',
  'komplett.ie',
  'lotto.ie',
  'bt.yahoo.com',
  'o.co',
  'bet.hu',
  'haz.de',
  'sas.com',
  'nic.ir',
  'tomtop.com',
  'uwa.edu.au',
  'spacex.com',
  'eif.org',
  'geld.de',
  # From http://www.wikipedia.org/
  'en.wikipedia.org', 'ja.wikipedia.org', 'de.wikipedia.org', 'es.wikipedia.org', 'ru.wikipedia.org', 'fr.wikipedia.org', 'it.wikipedia.org', 'pl.wikipedia.org', 'pt.wikipedia.org', 'zh.wikipedia.org', 'ar.wikipedia.org', 'bg.wikipedia.org', 'ca.wikipedia.org', 'cs.wikipedia.org', 'da.wikipedia.org', 'de.wikipedia.org', 'en.wikipedia.org', 'es.wikipedia.org', 'eo.wikipedia.org', 'eu.wikipedia.org', 'fa.wikipedia.org', 'fr.wikipedia.org', 'ko.wikipedia.org', 'hi.wikipedia.org', 'hr.wikipedia.org', 'id.wikipedia.org', 'it.wikipedia.org', 'he.wikipedia.org', 'lt.wikipedia.org', 'hu.wikipedia.org', 'ms.wikipedia.org', 'nl.wikipedia.org', 'ja.wikipedia.org', 'no.wikipedia.org', 'pl.wikipedia.org', 'pt.wikipedia.org', 'kk.wikipedia.org', 'ro.wikipedia.org', 'ru.wikipedia.org', 'sk.wikipedia.org', 'sl.wikipedia.org', 'sr.wikipedia.org', 'fi.wikipedia.org', 'sv.wikipedia.org', 'tr.wikipedia.org', 'uk.wikipedia.org', 'vi.wikipedia.org', 'vo.wikipedia.org', 'war.wikipedia.org', 'zh.wikipedia.org', 'af.wikipedia.org', 'als.wikipedia.org', 'am.wikipedia.org', 'an.wikipedia.org', 'ast.wikipedia.org', 'ht.wikipedia.org', 'az.wikipedia.org', 'bn.wikipedia.org', 'ba.wikipedia.org', 'be.wikipedia.org', 'bpy.wikipedia.org', 'bs.wikipedia.org', 'br.wikipedia.org', 'cv.wikipedia.org', 'cy.wikipedia.org', 'et.wikipedia.org', 'el.wikipedia.org', 'fy.wikipedia.org', 'ga.wikipedia.org', 'gl.wikipedia.org', 'gu.wikipedia.org', 'hy.wikipedia.org', 'io.wikipedia.org', 'ia.wikipedia.org', 'is.wikipedia.org', 'jv.wikipedia.org', 'kn.wikipedia.org', 'ka.wikipedia.org', 'ku.wikipedia.org', 'la.wikipedia.org', 'lv.wikipedia.org', 'lb.wikipedia.org', 'lmo.wikipedia.org', 'mk.wikipedia.org', 'mg.wikipedia.org', 'ml.wikipedia.org', 'mr.wikipedia.org', 'my.wikipedia.org', 'new.wikipedia.org', 'ne.wikipedia.org', 'nn.wikipedia.org', 'nap.wikipedia.org', 'oc.wikipedia.org', 'pms.wikipedia.org', 'nds.wikipedia.org', 'qu.wikipedia.org', 'pnb.wikipedia.org', 'sq.wikipedia.org', 'scn.wikipedia.org', 'simple.wikipedia.org', 'ceb.wikipedia.org', 'sh.wikipedia.org', 'su.wikipedia.org', 'sw.wikipedia.org', 'tl.wikipedia.org', 'ta.wikipedia.org', 'tt.wikipedia.org', 'te.wikipedia.org', 'th.wikipedia.org', 'bug.wikipedia.org', 'ur.wikipedia.org', 'wa.wikipedia.org', 'yo.wikipedia.org', 'diq.wikipedia.org', 'ace.wikipedia.org', 'frp.wikipedia.org', 'arc.wikipedia.org', 'gn.wikipedia.org', 'av.wikipedia.org', 'ay.wikipedia.org', 'bjn.wikipedia.org', 'bh.wikipedia.org', 'bcl.wikipedia.org', 'bar.wikipedia.org', 'bo.wikipedia.org', 'co.wikipedia.org', 'pdc.wikipedia.org', 'dv.wikipedia.org', 'nv.wikipedia.org', 'ang.wikipedia.org', 'eml.wikipedia.org', 'myv.wikipedia.org', 'ext.wikipedia.org', 'hif.wikipedia.org', 'fo.wikipedia.org', 'frr.wikipedia.org', 'fur.wikipedia.org', 'gv.wikipedia.org', 'gag.wikipedia.org', 'gd.wikipedia.org', 'gan.wikipedia.org', 'glk.wikipedia.org', 'hak.wikipedia.org', 'xal.wikipedia.org', 'haw.wikipedia.org', 'hsb.wikipedia.org', 'ilo.wikipedia.org', 'ie.wikipedia.org', 'os.wikipedia.org', 'kl.wikipedia.org', 'pam.wikipedia.org', 'csb.wikipedia.org', 'kw.wikipedia.org', 'km.wikipedia.org', 'rw.wikipedia.org', 'kv.wikipedia.org', 'ky.wikipedia.org', 'mrj.wikipedia.org', 'lad.wikipedia.org', 'lbe.wikipedia.org', 'lij.wikipedia.org', 'li.wikipedia.org', 'ln.wikipedia.org', 'jbo.wikipedia.org', 'mt.wikipedia.org', 'mi.wikipedia.org', 'xmf.wikipedia.org', 'arz.wikipedia.org', 'mzn.wikipedia.org', 'mdf.wikipedia.org', 'mn.wikipedia.org', 'nah.wikipedia.org', 'nrm.wikipedia.org', 'nov.wikipedia.org', 'ce.wikipedia.org', 'mhr.wikipedia.org', 'or.wikipedia.org', 'as.wikipedia.org', 'uz.wikipedia.org', 'pi.wikipedia.org', 'pag.wikipedia.org', 'pa.wikipedia.org', 'pap.wikipedia.org', 'ps.wikipedia.org', 'koi.wikipedia.org', 'pfl.wikipedia.org', 'pcd.wikipedia.org', 'krc.wikipedia.org', 'crh.wikipedia.org', 'ksh.wikipedia.org', 'rm.wikipedia.org', 'rue.wikipedia.org', 'sa.wikipedia.org', 'se.wikipedia.org', 'sc.wikipedia.org', 'sah.wikipedia.org', 'sco.wikipedia.org', 'stq.wikipedia.org', 'si.wikipedia.org', 'szl.wikipedia.org', 'so.wikipedia.org', 'ckb.wikipedia.org', 'tg.wikipedia.org', 'tpi.wikipedia.org', 'to.wikipedia.org', 'tk.wikipedia.org', 'udm.wikipedia.org', 'ug.wikipedia.org', 'vec.wikipedia.org', 'vls.wikipedia.org', 'wo.wikipedia.org', 'wuu.wikipedia.org', 'yi.wikipedia.org', 'zea.wikipedia.org', 'kbd.wikipedia.org', 'ak.wikipedia.org', 'ab.wikipedia.org', 'bm.wikipedia.org', 'bi.wikipedia.org', 'bxr.wikipedia.org', 'ch.wikipedia.org', 'ny.wikipedia.org', 'za.wikipedia.org', 'dsb.wikipedia.org', 'ee.wikipedia.org', 'ff.wikipedia.org', 'ki.wikipedia.org', 'got.wikipedia.org', 'ha.wikipedia.org', 'ig.wikipedia.org', 'iu.wikipedia.org', 'ik.wikipedia.org', 'ks.wikipedia.org', 'kg.wikipedia.org', 'lo.wikipedia.org', 'ltg.wikipedia.org', 'lg.wikipedia.org', 'cdo.wikipedia.org', 'mwl.wikipedia.org', 'mo.wikipedia.org', 'fj.wikipedia.org', 'na.wikipedia.org', 'cr.wikipedia.org', 'pih.wikipedia.org', 'om.wikipedia.org', 'pnt.wikipedia.org', 'kaa.wikipedia.org', 'dz.wikipedia.org', 'rmy.wikipedia.org', 'rn.wikipedia.org', 'sm.wikipedia.org', 'sg.wikipedia.org', 'st.wikipedia.org', 'nso.wikipedia.org', 'tn.wikipedia.org', 'sn.wikipedia.org', 'sd.wikipedia.org', 'cu.wikipedia.org', 'ss.wikipedia.org', 'srn.wikipedia.org', 'ty.wikipedia.org', 'kab.wikipedia.org', 'tet.wikipedia.org', 'ti.wikipedia.org', 'chr.wikipedia.org', 'tum.wikipedia.org', 'ts.wikipedia.org', 'chy.wikipedia.org', 've.wikipedia.org', 'tw.wikipedia.org', 'vep.wikipedia.org', 'xh.wikipedia.org', 'zu.wikipedia.org', 'de.wikipedia.org', 'pl.wikipedia.org', 'ja.wikipedia.org', 'zh.wikipedia.org', 'ru.wikipedia.org', 'eo.wikipedia.org', 'vi.wikipedia.org',
  # From http://www.google.com/supported_domains
  'google.com', 'google.ad', 'google.ae', 'google.com.af', 'google.com.ag', 'google.com.ai', 'google.am', 'google.co.ao', 'google.com.ar', 'google.as', 'google.at', 'google.com.au', 'google.az', 'google.ba', 'google.com.bd', 'google.be', 'google.bf', 'google.bg', 'google.com.bh', 'google.bi', 'google.bj', 'google.com.bn', 'google.com.bo', 'google.com.br', 'google.bs', 'google.co.bw', 'google.by', 'google.com.bz', 'google.ca', 'google.cd', 'google.cf', 'google.cg', 'google.ch', 'google.ci', 'google.co.ck', 'google.cl', 'google.cm', 'google.cn', 'google.com.co', 'google.co.cr', 'google.com.cu', 'google.cv', 'google.com.cy', 'google.cz', 'google.de', 'google.dj', 'google.dk', 'google.dm', 'google.com.do', 'google.dz', 'google.com.ec', 'google.ee', 'google.com.eg', 'google.es', 'google.com.et', 'google.fi', 'google.com.fj', 'google.fm', 'google.fr', 'google.ga', 'google.ge', 'google.gg', 'google.com.gh', 'google.com.gi', 'google.gl', 'google.gm', 'google.gp', 'google.gr', 'google.com.gt', 'google.gy', 'google.com.hk', 'google.hn', 'google.hr', 'google.ht', 'google.hu', 'google.co.id', 'google.ie', 'google.co.il', 'google.im', 'google.co.in', 'google.iq', 'google.is', 'google.it', 'google.je', 'google.com.jm', 'google.jo', 'google.co.jp', 'google.co.ke', 'google.com.kh', 'google.ki', 'google.kg', 'google.co.kr', 'google.com.kw', 'google.kz', 'google.la', 'google.com.lb', 'google.li', 'google.lk', 'google.co.ls', 'google.lt', 'google.lu', 'google.lv', 'google.com.ly', 'google.co.ma', 'google.md', 'google.me', 'google.mg', 'google.mk', 'google.ml', 'google.mn', 'google.ms', 'google.com.mt', 'google.mu', 'google.mv', 'google.mw', 'google.com.mx', 'google.com.my', 'google.co.mz', 'google.com.na', 'google.com.nf', 'google.com.ng', 'google.com.ni', 'google.ne', 'google.nl', 'google.no', 'google.com.np', 'google.nr', 'google.nu', 'google.co.nz', 'google.com.om', 'google.com.pa', 'google.com.pe', 'google.com.ph', 'google.com.pk', 'google.pl', 'google.pn', 'google.com.pr', 'google.ps', 'google.pt', 'google.com.py', 'google.com.qa', 'google.ro', 'google.ru', 'google.rw', 'google.com.sa', 'google.com.sb', 'google.sc', 'google.se', 'google.com.sg', 'google.sh', 'google.si', 'google.sk', 'google.com.sl', 'google.sn', 'google.so', 'google.sm', 'google.st', 'google.com.sv', 'google.td', 'google.tg', 'google.co.th', 'google.com.tj', 'google.tk', 'google.tl', 'google.tm', 'google.tn', 'google.to', 'google.com.tr', 'google.tt', 'google.com.tw', 'google.co.tz', 'google.com.ua', 'google.co.ug', 'google.co.uk', 'google.com.uy', 'google.co.uz', 'google.com.vc', 'google.co.ve', 'google.vg', 'google.co.vi', 'google.com.vn', 'google.vu', 'google.ws', 'google.rs', 'google.co.za', 'google.co.zm', 'google.co.zw', 'google.cat',
  # From http://www.ebay.ch/ (eBay-Websites)
  'mercadolibre.com.ar', 'ebay.com.au', 'ebay.at', 'ebay.be', 'mercadolivre.com.br', 'ebay.com.cn', 'ebay.dk', 'ebay.de', 'ebay.fr', 'ebay.gr', 'ebay.co.uk', 'ebay.com.hk', 'ebay.in', 'ebay.ie', 'ebay.it', 'ebay.ca', 'auction.co.kr', 'ebay.com.my', 'mercadolibre.com.mx', 'pages.ebay.com', 'ebay.nl', 'ebay.no', 'ebay.ph', 'ebay.pl', 'ebay.ru', 'ebay.se', 'ebay.com.sg', 'ebay.es', 'ruten.com.tw', 'ebay.co.th', 'gittigidiyor.com', 'ebay.cz', 'ebay.com', 'ebay.vn',
  # From http://www.amazon.com/ (footer)
  'amazon.ca', 'amazon.cn', 'amazon.fr', 'amazon.de', 'amazon.it', 'amazon.co.jp', 'amazon.es', 'amazon.co.uk',

  # From http://en.wikipedia.org/wiki/.gov#States_in_GOV
  'al.gov', 'alabama.gov',
  'alaska.gov',
  'az.gov',
  'ar.gov', 'arkansas.gov',
  'ca.gov', 'california.gov',
  'colorado.gov',
  'ct.gov',
  'delaware.gov',
  'florida.gov', 'fl.gov',
  'georgia.gov', 'ga.gov',
  'guam.gov',
  'hawaii.gov',
  'idaho.gov',
  'illinois.gov',
  'in.gov',
  'iowa.gov', 'ia.gov',
  'ks.gov', 'kansas.gov',
  'ky.gov', 'kentucky.gov',
  'louisiana.gov',
  'maine.gov',
  'maryland.gov',
  'mass.gov',
  'michigan.gov',
  'mn.gov',
  'mississippi.gov',
  'mo.gov',
  'mt.gov', 'montana.gov',
  'nebraska.gov',
  'nv.gov',
  'nh.gov', 'visitnh.gov',
  'nj.gov', 'newjersey.gov',
  'newmexico.gov',
  'ny.gov',
  'nc.gov', 'northcarolina.gov',
  'nd.gov',
  'ohio.gov', 'oh.gov',
  'ok.gov',
  'oregon.gov',
  'pa.gov', 'pennsylvania.gov',
  'pr.gov',
  'ri.gov',
  'sc.gov',
  'sd.gov',
  'tennessee.gov', 'tn.gov',
  'texas.gov',
  'utah.gov',
  'vermont.gov',
  'virginia.gov',
  'wa.gov', 'washington.gov',
  'wv.gov',
  'wisconsin.gov',
  'wyoming.gov',
  'dc.gov',
]
|  | 122 | 
# Suffixes that iterateTLDs() yields before the entries of the downloaded
# public suffix list. getTLDs() adds them to the domain dictionary unless they
# are present already.
additionalTLDs = [
  # From http://en.wikipedia.org/wiki/.ar
  'com.ar', 'edu.ar', 'gob.ar', 'gov.ar', 'int.ar', 'mil.ar', 'net.ar', 'org.ar', 'tur.ar',

  # From http://en.wikipedia.org/wiki/.au
  'com.au', 'net.au', 'org.au', 'edu.au', 'gov.au', 'csiro.au', 'asn.au', 'id.au',

  # From http://en.wikipedia.org/wiki/.bd
  'com.bd', 'edu.bd', 'ac.bd', 'net.bd', 'gov.bd', 'org.bd', 'mil.bd',

  # From http://en.wikipedia.org/wiki/.bn
  'com.bn', 'edu.bn', 'gov.bn', 'net.bn', 'org.bn',

  # From http://en.wikipedia.org/wiki/.ck
  'co.ck', 'org.ck', 'edu.ck', 'gov.ck', 'net.ck', 'gen.ck', 'biz.ck', 'info.ck',

  # From http://en.wikipedia.org/wiki/.cy
  'ac.cy', 'net.cy', 'gov.cy', 'org.cy', 'pro.cy', 'name.cy', 'ekloges.cy',
  'tm.cy', 'ltd.cy', 'biz.cy', 'press.cy', 'parliament.cy', 'com.cy',

  # From http://en.wikipedia.org/wiki/.er
  'com.er', 'edu.er', 'gov.er', 'mil.er', 'net.er', 'org.er', 'ind.er',

  # From http://en.wikipedia.org/wiki/.et
  'com.et', 'gov.et', 'org.et', 'edu.et', 'net.et', 'biz.et', 'name.et', 'info.et',

  # From http://en.wikipedia.org/wiki/.fj
  'ac.fj', 'biz.fj', 'com.fj', 'info.fj', 'mil.fj', 'name.fj', 'net.fj', 'org.fj', 'pro.fj',

  # From http://en.wikipedia.org/wiki/.fk
  'co.fk', 'org.fk', 'gov.fk', 'ac.fk', 'nom.fk', 'net.fk',

  # From http://en.wikipedia.org/wiki/.gt
  'com.gt', 'edu.gt', 'net.gt', 'gob.gt', 'org.gt', 'mil.gt', 'ind.gt',

  # From http://en.wikipedia.org/wiki/.gu
  'com.gu', 'net.gu', 'gov.gu', 'org.gu', 'edu.gu',

  # From http://en.wikipedia.org/wiki/.il
  'ac.il', 'co.il', 'org.il', 'net.il', 'k12.il', 'gov.il', 'muni.il', 'idf.il',

  # From http://en.wikipedia.org/wiki/.jm
  'com.jm', 'net.jm', 'org.jm', 'edu.jm', 'gov.jm', 'mil.jm',

  # From http://en.wikipedia.org/wiki/.ke
  'co.ke', 'or.ke', 'ne.ke', 'go.ke', 'ac.ke', 'sc.ke', 'me.ke', 'mobi.ke', 'info.ke',

  # From http://en.wikipedia.org/wiki/.kh
  'per.kh', 'com.kh', 'edu.kh', 'gov.kh', 'mil.kh', 'net.kh', 'org.kh',

  # From http://en.wikipedia.org/wiki/.kw
  'edu.kw', 'com.kw', 'net.kw', 'org.kw', 'gov.kw',

  # From http://en.wikipedia.org/wiki/.mm
  'net.mm', 'com.mm', 'edu.mm', 'org.mm', 'gov.mm',

  # From http://en.wikipedia.org/wiki/.mt
  'com.mt', 'org.mt', 'net.mt', 'edu.mt', 'gov.mt',

  # From http://en.wikipedia.org/wiki/.mz
  'adv.mz', 'ac.mz', 'co.mz', 'org.mz', 'gov.mz', 'edu.mz',

  # From http://en.wikipedia.org/wiki/.ni
  'gob.ni', 'co.ni', 'com.ni', 'ac.ni', 'edu.ni', 'org.ni', 'nom.ni', 'net.ni', 'mil.ni',

  # From http://en.wikipedia.org/wiki/.np
  'com.np', 'edu.np', 'gov.np', 'mil.np', 'net.np', 'org.np',

  # From http://en.wikipedia.org/wiki/.nz
  'ac.nz', 'co.nz', 'geek.nz', 'gen.nz', 'maori.nz', 'net.nz', 'org.nz', 'school.nz',
  'cri.nz', 'govt.nz', 'iwi.nz', 'parliament.nz', 'mil.nz', 'health.nz',

  # From http://en.wikipedia.org/wiki/.om
  'com.om', 'co.om', 'edu.om', 'ac.om', 'sch.om', 'gov.om', 'net.om', 'org.om',
  'mil.om', 'museum.om', 'biz.om', 'pro.om', 'med.om',

  # From http://en.wikipedia.org/wiki/.pg
  'com.pg', 'net.pg', 'ac.pg', 'gov.pg', 'mil.pg', 'org.pg',

  # From http://en.wikipedia.org/wiki/.py
  'org.py', 'edu.py', 'mil.py', 'gov.py', 'net.py', 'com.py', 'coop.py',

  # From http://en.wikipedia.org/wiki/.qa
  'com.qa', 'net.qa', 'org.qa', 'gov.qa', 'edu.qa', 'mil.qa', 'name.qa', 'sch.qa',

  # From http://en.wikipedia.org/wiki/.sv
  'edu.sv', 'gob.sv', 'com.sv', 'org.sv', 'red.sv',

  # From http://en.wikipedia.org/wiki/.tr
  'com.tr', 'gen.tr', 'org.tr', 'biz.tr', 'info.tr', 'av.tr', 'dr.tr', 'pol.tr',
  'bel.tr', 'tsk.tr', 'bbs.tr', 'k12.tr', 'edu.tr', 'name.tr', 'net.tr', 'gov.tr',
  'web.tr', 'tel.tr', 'tv.tr', 'nc.tr',

  # From http://en.wikipedia.org/wiki/.uk
  'ac.uk', 'co.uk', 'gov.uk', 'judiciary.uk', 'ltd.uk', 'me.uk', 'mod.uk', 'net.uk',
  'nhs.uk', 'nic.uk', 'org.uk',  'parliament.uk', 'plc.uk', 'police.uk', 'sch.uk',

  # From http://en.wikipedia.org/wiki/.uy
  'com.uy', 'edu.uy', 'gub.uy', 'net.uy', 'mil.uy', 'org.uy',

  # From http://en.wikipedia.org/wiki/.ve
  'com.ve', 'net.ve', 'org.ve', 'info.ve', 'co.ve', 'web.ve', 'gob.ve', 'edu.ve', 'mil.ve', 'tec.ve',

  # From http://en.wikipedia.org/wiki/.ye
  'com.ye', 'co.ye', 'ltd.ye', 'me.ye', 'net.ye', 'org.ye', 'plc.ye', 'gov.ye',

  # From http://en.wikipedia.org/wiki/.za
  'ac.za', 'city.za', 'co.za', 'edu.za', 'gov.za', 'law.za', 'mil.za', 'nom.za', 'org.za', 'school.za',
  'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za',
  'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za', 'olivetti.za', 'pix.za',

  # From http://en.wikipedia.org/wiki/.zm
  'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm',

  # From http://en.wikipedia.org/wiki/.zw
  'co.zw', 'ac.zw', 'org.zw',
]
|  | 240 | 
def getSuffixes(target, items):
  """
  Builds a suffix tree for a set of strings and stores it in target.

  items maps each string to its priority. The strings are grouped by their
  last character and each group is stored in target under that character
  (the empty string is stored under the empty key). A group with a single
  string becomes a leaf "<remaining characters reversed> <priority>", a
  group with several strings becomes a nested dictionary built the same way
  from the strings with their last character removed.

  target is modified in place, the function doesn't return anything.
  """
  groups = {}
  # items() instead of iteritems() keeps this working in Python 2 and 3
  for item, priority in items.items():
    # Slicing yields '' for the empty string rather than raising IndexError
    groups.setdefault(item[-1:], {})[item[:-1]] = priority

  for suffix, group in groups.items():
    if len(group) == 1:
      rest, priority = next(iter(group.items()))
      target[suffix] = ''.join(reversed(rest)) + ' ' + str(priority)
    else:
      target[suffix] = {}
      getSuffixes(target[suffix], group)
|  | 255 | 
def urlopen(url, attempts=3):
  """
  Tries to open a particular URL, retries on failure.

  url is the address to open, attempts the maximal number of tries. Returns
  the response object of the first successful try. If a try fails with an
  IOError the function waits five seconds and tries again, the error of the
  last try is raised once all tries are used up. Raises ValueError if
  attempts is smaller than one.
  """
  # Imported locally: this module doesn't import time at the top, and the
  # function opening URLs has a different location in Python 2 and Python 3
  import time
  try:
    from urllib.request import urlopen as openURL
  except ImportError:
    from urllib import urlopen as openURL

  if attempts < 1:
    raise ValueError('attempts must be at least 1')

  for attempt in range(attempts):
    try:
      return openURL(url)
    except IOError as e:
      error = e
      # Waiting is pointless if there won't be another try
      if attempt < attempts - 1:
        time.sleep(5)
  raise error
|  | 267 | 
def iterateTLDs():
  """
  Yields all known domain suffixes.

  The entries of additionalTLDs come first, followed by the entries of the
  effective TLD list downloaded from mozilla-central. In the downloaded list
  lines starting with "//" are skipped, a leading "*." or "!" is removed
  from the entry and empty lines are ignored. The same suffix can be yielded
  more than once.

  NOTE(review): mxr.mozilla.org might no longer be available - confirm that
  the download URL still works.
  """
  for tld in additionalTLDs:
    yield tld

  url = 'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1'
  resource = urlopen(url)
  try:
    content = resource.read().decode('utf-8')
  finally:
    # Release the connection even if reading or decoding fails
    resource.close()

  for line in content.splitlines():
    line = line.rstrip()
    if line.startswith('//'):
      continue

    if line.startswith('*.'):
      tld = line[2:]
    elif line.startswith('!'):
      tld = line[1:]
    else:
      tld = line

    if tld:
      yield tld
|  | 288 | 
def getTLDs(domains, tldPriority):
  """
  Adds each suffix produced by iterateTLDs() to the domains dictionary with
  the priority tldPriority. Entries that exist already keep their priority.
  """
  for suffix in iterateTLDs():
    domains.setdefault(suffix, tldPriority)
|  | 293 | 
def updateSchemes(rules):
  """
  Stores the suffix tree of the known URL schemes as rules['scheme'], an
  existing entry is replaced.
  """
  schemeTree = {}
  rules['scheme'] = schemeTree
  getSuffixes(schemeTree, schemes)
|  | 297 | 
def updateDomains(rules):
  """
  Generates the domain dictionary and stores it as rules['domain'].

  The domains are read from standard input (UTF-8 encoded, one domain per
  line) and followed by the entries of additionalDomains. Empty lines and
  repeated domains are ignored. The first domain gets the highest priority
  (the number of distinct domains), the last one gets priority 1.

  Each parent domain that isn't listed itself is added with the priority of
  its best child reduced by the number of distinct domains, meaning zero or
  less. Finally the suffixes provided by getTLDs() are added with the lowest
  priority unless they are present already.
  """
  domains = {}
  # Python 3 exposes the bytes of standard input as sys.stdin.buffer, in
  # Python 2 sys.stdin itself is the byte stream
  stream = getattr(sys.stdin, 'buffer', sys.stdin)
  reader = codecs.getreader('utf-8')(stream)
  for domain in itertools.chain(reader.readlines(), additionalDomains):
    domain = domain.rstrip()
    if domain and domain not in domains:
      # Position in the list for now, converted into a priority below
      domains[domain] = len(domains)

  maxPriority = len(domains)
  # Only values of existing keys change here, so iterating the dictionary
  # directly is safe
  for domain in domains:
    domains[domain] = maxPriority - domains[domain]

  # Extract TLDs from domain list. The loop adds entries to the dictionary,
  # so it has to run over a snapshot of the original entries.
  for domain, priority in list(domains.items()):
    parentPriority = priority - maxPriority
    while True:
      parent = re.sub(r'^[^.]+\.+', '', domain)
      # Stop if there was no leading label to remove or nothing is left
      if parent == domain or not parent:
        break
      domain = parent
      if domain not in domains or domains[domain] < parentPriority:
        domains[domain] = parentPriority

  # Fill up with "official" TLDs
  getTLDs(domains, -maxPriority)

  rules['domain'] = {}
  getSuffixes(rules['domain'], domains)
|  | 329 | 
def writeRules(rules):
  """
  Writes the generated dictionaries into the file defaults/rules.json
  (relative to the current directory).

  The file content up to and including the insertion marker is kept. It is
  followed by the members of rules serialized as compact JSON with sorted
  keys, and by a closing brace. Raises an Exception if the file doesn't
  contain the insertion marker.
  """
  path = os.path.join('defaults', 'rules.json')
  # The with statement makes sure the file is closed even if reading fails
  with codecs.open(path, 'rb', encoding='utf-8') as handle:
    data = handle.read()

  marker = '// Automatically generated dictionaries'
  markerIndex = data.find(marker)
  if markerIndex < 0:
    raise Exception('Insertion marker not found in %s' % path)

  # The outer braces of the serialized object are removed, only its members
  # are inserted. The brace appended below presumably closes an object that
  # was opened before the marker.
  members = json.dumps(rules, ensure_ascii=False, sort_keys=True, separators=(',', ':'))[1:-1]
  data = data[:markerIndex + len(marker)] + '\n' + '  ' + members + '\n}\n'

  with codecs.open(path, 'wb', encoding='utf-8') as handle:
    handle.write(data)
|  | 346 | 
def updateRules():
  """
  Collects the referral data, the scheme dictionary and the domain
  dictionary in one rules object and writes it to the rules file.
  """
  rules = {'domainReferrals': domainReferrals}
  updateSchemes(rules)
  updateDomains(rules)
  writeRules(rules)
|  | 353 | 
# Regenerate the rules only when executed as a script, not when imported
if __name__ == "__main__":
  updateRules()
| OLD | NEW | 
|---|