| Index: updateRules.py | 
| =================================================================== | 
| new file mode 100644 | 
| --- /dev/null | 
| +++ b/updateRules.py | 
| @@ -0,0 +1,355 @@ | 
| +#!/usr/bin/env python | 
| +# coding: utf-8 | 
| + | 
| +# This Source Code is subject to the terms of the Mozilla Public License | 
| +# version 2.0 (the "License"). You can obtain a copy of the License at | 
| +# http://mozilla.org/MPL/2.0/. | 
| + | 
| +""" | 
| +Update the dictionaries in the rules | 
| +==================================== | 
| + | 
| + This script generates the dictionaries in the defaults/rules.js file based | 
| + on various sources like the list of public suffixes (http://publicsuffix.org/). | 
| +""" | 
| + | 
| +import sys | 
| +import os | 
| +import re | 
| +import urllib | 
| +import codecs | 
| +import json | 
| +import itertools | 
| + | 
# URL schemes mapped to a numeric priority. updateSchemes() passes this
# dictionary to getSuffixes() to build rules['scheme'].
schemes = {
    'http:': 4,
    'https:': 3,
    'ftp:': 2,
    'irc:': 1,
}
| + | 
# Per-domain parameter strings. updateRules() stores this dictionary unchanged
# as rules['domainReferrals'].
# NOTE(review): presumably referral/affiliate query parameters appended to URLs
# on these domains - confirm against the consumer of the generated rules.
domainReferrals = {
    'amazon.co.uk': 'tag=uf07d-21',
    'amazon.com': 'tag=uf024-20',
    'amazon.de': 'tag=uf0e6-21',
    'amazon.fr': 'tag=uf02b-21',
    'amazon.es': 'tag=uf07-21',
    'amazon.it': 'tag=uf08d-21',
    'ozon.ru': 'partner=urlfixer',
}
| + | 
# Extra domains that updateDomains() appends after the domains read from
# standard input. Because priority decreases with position, these entries rank
# below every domain supplied on stdin. Entries that were already seen are
# skipped by updateDomains(), so the duplicates in the lists below are harmless.
additionalDomains = [
    'fab.com',
    'ku.dk',
    'google.cz',
    'komplett.ie',
    'lotto.ie',
    'bt.yahoo.com',
    'o.co',
    'bet.hu',
    'haz.de',
    'sas.com',
    'nic.ir',
    'tomtop.com',
    'uwa.edu.au',
    'spacex.com',
    'eif.org',
    'geld.de',
    # From http://www.wikipedia.org/
    'en.wikipedia.org', 'ja.wikipedia.org', 'de.wikipedia.org', 'es.wikipedia.org', 'ru.wikipedia.org', 'fr.wikipedia.org', 'it.wikipedia.org', 'pl.wikipedia.org', 'pt.wikipedia.org', 'zh.wikipedia.org', 'ar.wikipedia.org', 'bg.wikipedia.org', 'ca.wikipedia.org', 'cs.wikipedia.org', 'da.wikipedia.org', 'de.wikipedia.org', 'en.wikipedia.org', 'es.wikipedia.org', 'eo.wikipedia.org', 'eu.wikipedia.org', 'fa.wikipedia.org', 'fr.wikipedia.org', 'ko.wikipedia.org', 'hi.wikipedia.org', 'hr.wikipedia.org', 'id.wikipedia.org', 'it.wikipedia.org', 'he.wikipedia.org', 'lt.wikipedia.org', 'hu.wikipedia.org', 'ms.wikipedia.org', 'nl.wikipedia.org', 'ja.wikipedia.org', 'no.wikipedia.org', 'pl.wikipedia.org', 'pt.wikipedia.org', 'kk.wikipedia.org', 'ro.wikipedia.org', 'ru.wikipedia.org', 'sk.wikipedia.org', 'sl.wikipedia.org', 'sr.wikipedia.org', 'fi.wikipedia.org', 'sv.wikipedia.org', 'tr.wikipedia.org', 'uk.wikipedia.org', 'vi.wikipedia.org', 'vo.wikipedia.org', 'war.wikipedia.org', 'zh.wikipedia.org', 'af.wikipedia.org', 'als.wikipedia.org', 'am.wikipedia.org', 'an.wikipedia.org', 'ast.wikipedia.org', 'ht.wikipedia.org', 'az.wikipedia.org', 'bn.wikipedia.org', 'ba.wikipedia.org', 'be.wikipedia.org', 'bpy.wikipedia.org', 'bs.wikipedia.org', 'br.wikipedia.org', 'cv.wikipedia.org', 'cy.wikipedia.org', 'et.wikipedia.org', 'el.wikipedia.org', 'fy.wikipedia.org', 'ga.wikipedia.org', 'gl.wikipedia.org', 'gu.wikipedia.org', 'hy.wikipedia.org', 'io.wikipedia.org', 'ia.wikipedia.org', 'is.wikipedia.org', 'jv.wikipedia.org', 'kn.wikipedia.org', 'ka.wikipedia.org', 'ku.wikipedia.org', 'la.wikipedia.org', 'lv.wikipedia.org', 'lb.wikipedia.org', 'lmo.wikipedia.org', 'mk.wikipedia.org', 'mg.wikipedia.org', 'ml.wikipedia.org', 'mr.wikipedia.org', 'my.wikipedia.org', 'new.wikipedia.org', 'ne.wikipedia.org', 'nn.wikipedia.org', 'nap.wikipedia.org', 'oc.wikipedia.org', 'pms.wikipedia.org', 'nds.wikipedia.org', 'qu.wikipedia.org', 'pnb.wikipedia.org', 'sq.wikipedia.org', 'scn.wikipedia.org',
    'simple.wikipedia.org', 'ceb.wikipedia.org', 'sh.wikipedia.org', 'su.wikipedia.org', 'sw.wikipedia.org', 'tl.wikipedia.org', 'ta.wikipedia.org', 'tt.wikipedia.org', 'te.wikipedia.org', 'th.wikipedia.org', 'bug.wikipedia.org', 'ur.wikipedia.org', 'wa.wikipedia.org', 'yo.wikipedia.org', 'diq.wikipedia.org', 'ace.wikipedia.org', 'frp.wikipedia.org', 'arc.wikipedia.org', 'gn.wikipedia.org', 'av.wikipedia.org', 'ay.wikipedia.org', 'bjn.wikipedia.org', 'bh.wikipedia.org', 'bcl.wikipedia.org', 'bar.wikipedia.org', 'bo.wikipedia.org', 'co.wikipedia.org', 'pdc.wikipedia.org', 'dv.wikipedia.org', 'nv.wikipedia.org', 'ang.wikipedia.org', 'eml.wikipedia.org', 'myv.wikipedia.org', 'ext.wikipedia.org', 'hif.wikipedia.org', 'fo.wikipedia.org', 'frr.wikipedia.org', 'fur.wikipedia.org', 'gv.wikipedia.org', 'gag.wikipedia.org', 'gd.wikipedia.org', 'gan.wikipedia.org', 'glk.wikipedia.org', 'hak.wikipedia.org', 'xal.wikipedia.org', 'haw.wikipedia.org', 'hsb.wikipedia.org', 'ilo.wikipedia.org', 'ie.wikipedia.org', 'os.wikipedia.org', 'kl.wikipedia.org', 'pam.wikipedia.org', 'csb.wikipedia.org', 'kw.wikipedia.org', 'km.wikipedia.org', 'rw.wikipedia.org', 'kv.wikipedia.org', 'ky.wikipedia.org', 'mrj.wikipedia.org', 'lad.wikipedia.org', 'lbe.wikipedia.org', 'lij.wikipedia.org', 'li.wikipedia.org', 'ln.wikipedia.org', 'jbo.wikipedia.org', 'mt.wikipedia.org', 'mi.wikipedia.org', 'xmf.wikipedia.org', 'arz.wikipedia.org', 'mzn.wikipedia.org', 'mdf.wikipedia.org', 'mn.wikipedia.org', 'nah.wikipedia.org', 'nrm.wikipedia.org', 'nov.wikipedia.org', 'ce.wikipedia.org', 'mhr.wikipedia.org', 'or.wikipedia.org', 'as.wikipedia.org', 'uz.wikipedia.org', 'pi.wikipedia.org', 'pag.wikipedia.org', 'pa.wikipedia.org', 'pap.wikipedia.org', 'ps.wikipedia.org', 'koi.wikipedia.org', 'pfl.wikipedia.org', 'pcd.wikipedia.org', 'krc.wikipedia.org', 'crh.wikipedia.org', 'ksh.wikipedia.org', 'rm.wikipedia.org', 'rue.wikipedia.org', 'sa.wikipedia.org', 'se.wikipedia.org', 'sc.wikipedia.org', 'sah.wikipedia.org',
    'sco.wikipedia.org', 'stq.wikipedia.org', 'si.wikipedia.org', 'szl.wikipedia.org', 'so.wikipedia.org', 'ckb.wikipedia.org', 'tg.wikipedia.org', 'tpi.wikipedia.org', 'to.wikipedia.org', 'tk.wikipedia.org', 'udm.wikipedia.org', 'ug.wikipedia.org', 'vec.wikipedia.org', 'vls.wikipedia.org', 'wo.wikipedia.org', 'wuu.wikipedia.org', 'yi.wikipedia.org', 'zea.wikipedia.org', 'kbd.wikipedia.org', 'ak.wikipedia.org', 'ab.wikipedia.org', 'bm.wikipedia.org', 'bi.wikipedia.org', 'bxr.wikipedia.org', 'ch.wikipedia.org', 'ny.wikipedia.org', 'za.wikipedia.org', 'dsb.wikipedia.org', 'ee.wikipedia.org', 'ff.wikipedia.org', 'ki.wikipedia.org', 'got.wikipedia.org', 'ha.wikipedia.org', 'ig.wikipedia.org', 'iu.wikipedia.org', 'ik.wikipedia.org', 'ks.wikipedia.org', 'kg.wikipedia.org', 'lo.wikipedia.org', 'ltg.wikipedia.org', 'lg.wikipedia.org', 'cdo.wikipedia.org', 'mwl.wikipedia.org', 'mo.wikipedia.org', 'fj.wikipedia.org', 'na.wikipedia.org', 'cr.wikipedia.org', 'pih.wikipedia.org', 'om.wikipedia.org', 'pnt.wikipedia.org', 'kaa.wikipedia.org', 'dz.wikipedia.org', 'rmy.wikipedia.org', 'rn.wikipedia.org', 'sm.wikipedia.org', 'sg.wikipedia.org', 'st.wikipedia.org', 'nso.wikipedia.org', 'tn.wikipedia.org', 'sn.wikipedia.org', 'sd.wikipedia.org', 'cu.wikipedia.org', 'ss.wikipedia.org', 'srn.wikipedia.org', 'ty.wikipedia.org', 'kab.wikipedia.org', 'tet.wikipedia.org', 'ti.wikipedia.org', 'chr.wikipedia.org', 'tum.wikipedia.org', 'ts.wikipedia.org', 'chy.wikipedia.org', 've.wikipedia.org', 'tw.wikipedia.org', 'vep.wikipedia.org', 'xh.wikipedia.org', 'zu.wikipedia.org', 'de.wikipedia.org', 'pl.wikipedia.org', 'ja.wikipedia.org', 'zh.wikipedia.org', 'ru.wikipedia.org', 'eo.wikipedia.org', 'vi.wikipedia.org',
    # From http://www.google.com/supported_domains
    'google.com', 'google.ad', 'google.ae', 'google.com.af', 'google.com.ag', 'google.com.ai', 'google.am', 'google.co.ao', 'google.com.ar', 'google.as', 'google.at', 'google.com.au', 'google.az', 'google.ba', 'google.com.bd', 'google.be', 'google.bf', 'google.bg', 'google.com.bh', 'google.bi', 'google.bj', 'google.com.bn', 'google.com.bo', 'google.com.br', 'google.bs', 'google.co.bw', 'google.by', 'google.com.bz', 'google.ca', 'google.cd', 'google.cf', 'google.cg', 'google.ch', 'google.ci', 'google.co.ck', 'google.cl', 'google.cm', 'google.cn', 'google.com.co', 'google.co.cr', 'google.com.cu', 'google.cv', 'google.com.cy', 'google.cz', 'google.de', 'google.dj', 'google.dk', 'google.dm', 'google.com.do', 'google.dz', 'google.com.ec', 'google.ee', 'google.com.eg', 'google.es', 'google.com.et', 'google.fi', 'google.com.fj', 'google.fm', 'google.fr', 'google.ga', 'google.ge', 'google.gg', 'google.com.gh', 'google.com.gi', 'google.gl', 'google.gm', 'google.gp', 'google.gr', 'google.com.gt', 'google.gy', 'google.com.hk', 'google.hn', 'google.hr', 'google.ht', 'google.hu', 'google.co.id', 'google.ie', 'google.co.il', 'google.im', 'google.co.in', 'google.iq', 'google.is', 'google.it', 'google.je', 'google.com.jm', 'google.jo', 'google.co.jp', 'google.co.ke', 'google.com.kh', 'google.ki', 'google.kg', 'google.co.kr', 'google.com.kw', 'google.kz', 'google.la', 'google.com.lb', 'google.li', 'google.lk', 'google.co.ls', 'google.lt', 'google.lu', 'google.lv', 'google.com.ly', 'google.co.ma', 'google.md', 'google.me', 'google.mg', 'google.mk', 'google.ml', 'google.mn', 'google.ms', 'google.com.mt', 'google.mu', 'google.mv', 'google.mw', 'google.com.mx', 'google.com.my', 'google.co.mz', 'google.com.na', 'google.com.nf', 'google.com.ng', 'google.com.ni', 'google.ne', 'google.nl', 'google.no', 'google.com.np', 'google.nr', 'google.nu', 'google.co.nz', 'google.com.om', 'google.com.pa', 'google.com.pe', 'google.com.ph', 'google.com.pk', 'google.pl', 'google.pn', 'google.com.pr',
    'google.ps', 'google.pt', 'google.com.py', 'google.com.qa', 'google.ro', 'google.ru', 'google.rw', 'google.com.sa', 'google.com.sb', 'google.sc', 'google.se', 'google.com.sg', 'google.sh', 'google.si', 'google.sk', 'google.com.sl', 'google.sn', 'google.so', 'google.sm', 'google.st', 'google.com.sv', 'google.td', 'google.tg', 'google.co.th', 'google.com.tj', 'google.tk', 'google.tl', 'google.tm', 'google.tn', 'google.to', 'google.com.tr', 'google.tt', 'google.com.tw', 'google.co.tz', 'google.com.ua', 'google.co.ug', 'google.co.uk', 'google.com.uy', 'google.co.uz', 'google.com.vc', 'google.co.ve', 'google.vg', 'google.co.vi', 'google.com.vn', 'google.vu', 'google.ws', 'google.rs', 'google.co.za', 'google.co.zm', 'google.co.zw', 'google.cat',
    # From http://www.ebay.ch/ (eBay-Websites)
    'mercadolibre.com.ar', 'ebay.com.au', 'ebay.at', 'ebay.be', 'mercadolivre.com.br', 'ebay.com.cn', 'ebay.dk', 'ebay.de', 'ebay.fr', 'ebay.gr', 'ebay.co.uk', 'ebay.com.hk', 'ebay.in', 'ebay.ie', 'ebay.it', 'ebay.ca', 'auction.co.kr', 'ebay.com.my', 'mercadolibre.com.mx', 'pages.ebay.com', 'ebay.nl', 'ebay.no', 'ebay.ph', 'ebay.pl', 'ebay.ru', 'ebay.se', 'ebay.com.sg', 'ebay.es', 'ruten.com.tw', 'ebay.co.th', 'gittigidiyor.com', 'ebay.cz', 'ebay.com', 'ebay.vn',
    # From http://www.amazon.com/ (footer)
    'amazon.ca', 'amazon.cn', 'amazon.fr', 'amazon.de', 'amazon.it', 'amazon.co.jp', 'amazon.es', 'amazon.co.uk',

    # From http://en.wikipedia.org/wiki/.gov#States_in_GOV
    'al.gov', 'alabama.gov',
    'alaska.gov',
    'az.gov',
    'ar.gov', 'arkansas.gov',
    'ca.gov', 'california.gov',
    'colorado.gov',
    'ct.gov',
    'delaware.gov',
    'florida.gov', 'fl.gov',
    'georgia.gov', 'ga.gov',
    'guam.gov',
    'hawaii.gov',
    'idaho.gov',
    'illinois.gov',
    'in.gov',
    'iowa.gov', 'ia.gov',
    'ks.gov', 'kansas.gov',
    'ky.gov', 'kentucky.gov',
    'louisiana.gov',
    'maine.gov',
    'maryland.gov',
    'mass.gov',
    'michigan.gov',
    'mn.gov',
    'mississippi.gov',
    'mo.gov',
    'mt.gov', 'montana.gov',
    'nebraska.gov',
    'nv.gov',
    'nh.gov', 'visitnh.gov',
    'nj.gov', 'newjersey.gov',
    'newmexico.gov',
    'ny.gov',
    'nc.gov', 'northcarolina.gov',
    'nd.gov',
    'ohio.gov', 'oh.gov',
    'ok.gov',
    'oregon.gov',
    'pa.gov', 'pennsylvania.gov',
    'pr.gov',
    'ri.gov',
    'sc.gov',
    'sd.gov',
    'tennessee.gov', 'tn.gov',
    'texas.gov',
    'utah.gov',
    'vermont.gov',
    'virginia.gov',
    'wa.gov', 'washington.gov',
    'wv.gov',
    'wisconsin.gov',
    'wyoming.gov',
    'dc.gov',
]
| + | 
# Extra domain suffixes that iterateTLDs() yields before the entries of the
# downloaded suffix list. getTLDs() adds each suffix that is not yet present
# in the domain dictionary, using the priority it is given.
additionalTLDs = [
    # From http://en.wikipedia.org/wiki/.ar
    'com.ar', 'edu.ar', 'gob.ar', 'gov.ar', 'int.ar', 'mil.ar', 'net.ar', 'org.ar', 'tur.ar',

    # From http://en.wikipedia.org/wiki/.au
    'com.au', 'net.au', 'org.au', 'edu.au', 'gov.au', 'csiro.au', 'asn.au', 'id.au',

    # From http://en.wikipedia.org/wiki/.bd
    'com.bd', 'edu.bd', 'ac.bd', 'net.bd', 'gov.bd', 'org.bd', 'mil.bd',

    # From http://en.wikipedia.org/wiki/.bn
    'com.bn', 'edu.bn', 'gov.bn', 'net.bn', 'org.bn',

    # From http://en.wikipedia.org/wiki/.ck
    'co.ck', 'org.ck', 'edu.ck', 'gov.ck', 'net.ck', 'gen.ck', 'biz.ck', 'info.ck',

    # From http://en.wikipedia.org/wiki/.cy
    'ac.cy', 'net.cy', 'gov.cy', 'org.cy', 'pro.cy', 'name.cy', 'ekloges.cy',
    'tm.cy', 'ltd.cy', 'biz.cy', 'press.cy', 'parliament.cy', 'com.cy',

    # From http://en.wikipedia.org/wiki/.er
    'com.er', 'edu.er', 'gov.er', 'mil.er', 'net.er', 'org.er', 'ind.er',

    # From http://en.wikipedia.org/wiki/.et
    'com.et', 'gov.et', 'org.et', 'edu.et', 'net.et', 'biz.et', 'name.et', 'info.et',

    # From http://en.wikipedia.org/wiki/.fj
    'ac.fj', 'biz.fj', 'com.fj', 'info.fj', 'mil.fj', 'name.fj', 'net.fj', 'org.fj', 'pro.fj',

    # From http://en.wikipedia.org/wiki/.fk
    'co.fk', 'org.fk', 'gov.fk', 'ac.fk', 'nom.fk', 'net.fk',

    # From http://en.wikipedia.org/wiki/.gt
    'com.gt', 'edu.gt', 'net.gt', 'gob.gt', 'org.gt', 'mil.gt', 'ind.gt',

    # From http://en.wikipedia.org/wiki/.gu
    'com.gu', 'net.gu', 'gov.gu', 'org.gu', 'edu.gu',

    # From http://en.wikipedia.org/wiki/.il
    'ac.il', 'co.il', 'org.il', 'net.il', 'k12.il', 'gov.il', 'muni.il', 'idf.il',

    # From http://en.wikipedia.org/wiki/.jm
    'com.jm', 'net.jm', 'org.jm', 'edu.jm', 'gov.jm', 'mil.jm',

    # From http://en.wikipedia.org/wiki/.ke
    'co.ke', 'or.ke', 'ne.ke', 'go.ke', 'ac.ke', 'sc.ke', 'me.ke', 'mobi.ke', 'info.ke',

    # From http://en.wikipedia.org/wiki/.kh
    'per.kh', 'com.kh', 'edu.kh', 'gov.kh', 'mil.kh', 'net.kh', 'org.kh',

    # From http://en.wikipedia.org/wiki/.kw
    'edu.kw', 'com.kw', 'net.kw', 'org.kw', 'gov.kw',

    # From http://en.wikipedia.org/wiki/.mm
    'net.mm', 'com.mm', 'edu.mm', 'org.mm', 'gov.mm',

    # From http://en.wikipedia.org/wiki/.mt
    'com.mt', 'org.mt', 'net.mt', 'edu.mt', 'gov.mt',

    # From http://en.wikipedia.org/wiki/.mz
    'adv.mz', 'ac.mz', 'co.mz', 'org.mz', 'gov.mz', 'edu.mz',

    # From http://en.wikipedia.org/wiki/.ni
    'gob.ni', 'co.ni', 'com.ni', 'ac.ni', 'edu.ni', 'org.ni', 'nom.ni', 'net.ni', 'mil.ni',

    # From http://en.wikipedia.org/wiki/.np
    'com.np', 'edu.np', 'gov.np', 'mil.np', 'net.np', 'org.np',

    # From http://en.wikipedia.org/wiki/.nz
    'ac.nz', 'co.nz', 'geek.nz', 'gen.nz', 'maori.nz', 'net.nz', 'org.nz', 'school.nz',
    'cri.nz', 'govt.nz', 'iwi.nz', 'parliament.nz', 'mil.nz', 'health.nz',

    # From http://en.wikipedia.org/wiki/.om
    'com.om', 'co.om', 'edu.om', 'ac.om', 'sch.om', 'gov.om', 'net.om', 'org.om',
    'mil.om', 'museum.om', 'biz.om', 'pro.om', 'med.om',

    # From http://en.wikipedia.org/wiki/.pg
    'com.pg', 'net.pg', 'ac.pg', 'gov.pg', 'mil.pg', 'org.pg',

    # From http://en.wikipedia.org/wiki/.py
    'org.py', 'edu.py', 'mil.py', 'gov.py', 'net.py', 'com.py', 'coop.py',

    # From http://en.wikipedia.org/wiki/.qa
    'com.qa', 'net.qa', 'org.qa', 'gov.qa', 'edu.qa', 'mil.qa', 'name.qa', 'sch.qa',

    # From http://en.wikipedia.org/wiki/.sv
    'edu.sv', 'gob.sv', 'com.sv', 'org.sv', 'red.sv',

    # From http://en.wikipedia.org/wiki/.tr
    'com.tr', 'gen.tr', 'org.tr', 'biz.tr', 'info.tr', 'av.tr', 'dr.tr', 'pol.tr',
    'bel.tr', 'tsk.tr', 'bbs.tr', 'k12.tr', 'edu.tr', 'name.tr', 'net.tr', 'gov.tr',
    'web.tr', 'tel.tr', 'tv.tr', 'nc.tr',

    # From http://en.wikipedia.org/wiki/.uk
    'ac.uk', 'co.uk', 'gov.uk', 'judiciary.uk', 'ltd.uk', 'me.uk', 'mod.uk', 'net.uk',
    'nhs.uk', 'nic.uk', 'org.uk', 'parliament.uk', 'plc.uk', 'police.uk', 'sch.uk',

    # From http://en.wikipedia.org/wiki/.uy
    'com.uy', 'edu.uy', 'gub.uy', 'net.uy', 'mil.uy', 'org.uy',

    # From http://en.wikipedia.org/wiki/.ve
    'com.ve', 'net.ve', 'org.ve', 'info.ve', 'co.ve', 'web.ve', 'gob.ve', 'edu.ve', 'mil.ve', 'tec.ve',

    # From http://en.wikipedia.org/wiki/.ye
    'com.ye', 'co.ye', 'ltd.ye', 'me.ye', 'net.ye', 'org.ye', 'plc.ye', 'gov.ye',

    # From http://en.wikipedia.org/wiki/.za
    'ac.za', 'city.za', 'co.za', 'edu.za', 'gov.za', 'law.za', 'mil.za', 'nom.za', 'org.za', 'school.za',
    'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za',
    'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za', 'olivetti.za', 'pix.za',

    # From http://en.wikipedia.org/wiki/.zm
    'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm',

    # From http://en.wikipedia.org/wiki/.zw
    'co.zw', 'ac.zw', 'org.zw',
]
| + | 
def getSuffixes(target, items):
    """
    Converts a flat dictionary into a tree keyed by trailing characters.

    The keys of items are grouped by their last character (an empty key is
    grouped under the empty string). A group with a single entry becomes a
    leaf: the remaining characters of the key in reversed order, a space and
    the priority, e.g. 'ptth 3'. A group with several entries becomes a nested
    dictionary built recursively from the keys with their last character
    removed.

    target -- dictionary that receives the tree (modified in place)
    items  -- dictionary mapping strings to their priority
    """
    groups = {}
    for item, priority in items.items():
        # item[-1:] is the last character, or '' for an empty key
        groups.setdefault(item[-1:], {})[item[:-1]] = priority

    for suffix, group in groups.items():
        if len(group) == 1:
            # Unambiguous from here on: store the rest of the key as a leaf
            (rest, priority), = group.items()
            target[suffix] = ''.join(reversed(rest)) + ' ' + str(priority)
        else:
            target[suffix] = {}
            getSuffixes(target[suffix], group)
| + | 
def urlopen(url, attempts=3, delay=5):
    """
    Tries to open a particular URL, retries on failure.

    url      -- address to open
    attempts -- maximum number of tries, must be at least 1
    delay    -- seconds to wait between two tries

    Returns the response object of the first successful try. If every try
    fails, the IOError of the last one is raised. A ValueError is raised if
    attempts is smaller than 1.
    """
    # Imported here because the module's import block provides neither time
    # nor a urlopen implementation that exists on both Python 2 and Python 3.
    import time
    try:
        from urllib.request import urlopen as openURL  # Python 3
    except ImportError:
        from urllib import urlopen as openURL  # Python 2

    if attempts < 1:
        raise ValueError('attempts must be at least 1')

    for remaining in range(attempts - 1, -1, -1):
        try:
            return openURL(url)
        except IOError:
            if not remaining:
                # Last try failed, no point in waiting any longer
                raise
            time.sleep(delay)
| + | 
def iterateTLDs():
    """
    Generates domain suffixes: first the entries of additionalTLDs, then the
    entries of the effective TLD list downloaded from mozilla-central.

    Comment lines (starting with "//") and empty lines of the downloaded list
    are skipped. The "*." prefix of wildcard rules and the "!" prefix of
    exception rules are stripped, so only the bare suffix is produced.
    """
    for tld in additionalTLDs:
        yield tld

    # NOTE(review): mxr.mozilla.org may no longer be served; the list is also
    # published at https://publicsuffix.org/list/public_suffix_list.dat -
    # confirm before switching.
    url = 'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1'
    resource = urlopen(url)
    try:
        # Read everything up front so that the connection can be closed even
        # if the consumer abandons the generator early.
        data = resource.read().decode('utf-8')
    finally:
        resource.close()

    for line in data.splitlines():
        line = line.rstrip()
        if line.startswith("//"):
            continue

        if line.startswith('*.'):
            tld = line[2:]
        elif line.startswith('!'):
            tld = line[1:]
        else:
            tld = line

        if tld:
            yield tld
| + | 
def getTLDs(domains, tldPriority):
    """
    Adds every suffix produced by iterateTLDs() to the domains dictionary
    with the priority tldPriority. Entries that exist already keep their
    current priority.
    """
    for suffix in iterateTLDs():
        domains.setdefault(suffix, tldPriority)
| + | 
def updateSchemes(rules):
    """
    Stores the suffix tree built from the schemes dictionary as
    rules['scheme'].
    """
    tree = rules['scheme'] = {}
    getSuffixes(tree, schemes)
| + | 
def updateDomains(rules):
    """
    Builds the domain dictionary and stores its suffix tree as
    rules['domain'].

    Domains are read from standard input (UTF-8, one per line), followed by
    the entries of additionalDomains. Only the first occurrence of a domain
    counts. The earlier a domain appears, the higher its priority: the first
    one gets the number of distinct domains, the last one gets 1.

    Every parent suffix of a listed domain ("b.c" and "c" for "a.b.c") is
    added with the priority of that domain minus the maximum priority, which
    is never positive. If several domains share a suffix the highest value
    wins, and a suffix that is a listed domain itself keeps its own priority.
    Finally, suffixes from iterateTLDs() that are still missing are added
    with the lowest priority.
    """
    domains = {}
    # sys.stdin is a byte stream on Python 2 but a text wrapper on Python 3;
    # the UTF-8 reader needs the underlying bytes in both cases.
    stream = getattr(sys.stdin, 'buffer', sys.stdin)
    reader = codecs.getreader('utf-8')(stream)
    position = 0
    for domain in itertools.chain(reader.readlines(), additionalDomains):
        domain = domain.rstrip()
        if not domain or domain in domains:
            continue
        domains[domain] = position
        position += 1

    # Turn list positions into priorities (first entry is the highest)
    maxPriority = position
    for domain in list(domains):
        domains[domain] = maxPriority - domains[domain]

    # Extract TLDs from domain list. The loop adds keys, so it has to run
    # over a snapshot of the listed domains.
    leadingLabel = re.compile(r'^[^.]+\.+')
    for domain, priority in list(domains.items()):
        inherited = priority - maxPriority
        while True:
            parent = leadingLabel.sub('', domain, 1)
            if parent == domain or not parent:
                # No label left to strip, or nothing remains after stripping
                break
            domain = parent
            if domain not in domains or domains[domain] < inherited:
                domains[domain] = inherited

    # Fill up with "official" TLDs
    getTLDs(domains, -maxPriority)

    rules['domain'] = {}
    getSuffixes(rules['domain'], domains)
| + | 
def writeRules(rules, path=None):
    """
    Replaces the generated part of the rules file with the given rules.

    Everything in the file up to and including the marker comment is kept.
    The rest is replaced by the JSON serialization of rules (sorted keys,
    compact separators, outer braces removed) followed by a closing brace.

    rules -- dictionary to be serialized
    path  -- file to update, defaults/rules.json relative to the current
             directory if omitted

    Raises an Exception if the file does not contain the marker.
    """
    # NOTE(review): the module documentation mentions defaults/rules.js while
    # the default below is rules.json - confirm which name is intended.
    if path is None:
        path = os.path.join('defaults', 'rules.json')

    with codecs.open(path, 'rb', encoding='utf-8') as handle:
        data = handle.read()

    marker = '// Automatically generated dictionaries'
    markerIndex = data.find(marker)
    if markerIndex < 0:
        raise Exception('Insertion marker not found in %s' % path)

    serialized = json.dumps(rules, ensure_ascii=False, sort_keys=True, separators=(',', ':'))
    data = data[0:markerIndex + len(marker)] + '\n'
    # The file supplies the opening brace before the marker, so only the
    # content between the braces of the serialized object is inserted.
    data += ' ' + serialized[1:-1] + '\n}\n'

    with codecs.open(path, 'wb', encoding='utf-8') as handle:
        handle.write(data)
| + | 
def updateRules():
    """
    Assembles all rule dictionaries (referrals, schemes, domains) and writes
    them to the rules file.
    """
    rules = {'domainReferrals': domainReferrals}
    updateSchemes(rules)
    updateDomains(rules)
    writeRules(rules)
| + | 
# Regenerate the rules only when executed as a script, not when imported.
if __name__ == "__main__":
    updateRules()