| Index: updateRules.py |
| =================================================================== |
| new file mode 100644 |
| --- /dev/null |
| +++ b/updateRules.py |
| @@ -0,0 +1,355 @@ |
| +#!/usr/bin/env python |
| +# coding: utf-8 |
| + |
| +# This Source Code is subject to the terms of the Mozilla Public License |
| +# version 2.0 (the "License"). You can obtain a copy of the License at |
| +# http://mozilla.org/MPL/2.0/. |
| + |
| +""" |
| +Update the dictionaries in the rules |
| +==================================== |
| + |
| + This script generates the dictionaries in the defaults/rules.json file based |
| + on various sources like the list of public suffixes (http://publicsuffix.org/). |
| +""" |
| + |
| +import sys |
| +import os |
| +import re |
| +import urllib |
| +import codecs |
| +import json |
| +import itertools |
| + |
# URL schemes and the priority value stored for each of them; updateSchemes()
# turns this mapping into rules['scheme'].
schemes = dict([
    ('http:', 4),
    ('https:', 3),
    ('ftp:', 2),
    ('irc:', 1),
])
| + |
# Referral string stored for each listed domain; updateRules() copies this
# mapping unchanged into rules['domainReferrals'].
domainReferrals = dict([
    ('amazon.co.uk', 'tag=uf07d-21'),
    ('amazon.com', 'tag=uf024-20'),
    ('amazon.de', 'tag=uf0e6-21'),
    ('amazon.fr', 'tag=uf02b-21'),
    ('amazon.es', 'tag=uf07-21'),
    ('amazon.it', 'tag=uf08d-21'),
    ('ozon.ru', 'partner=urlfixer'),
])
| + |
# Domains that updateDomains() appends to the list read from standard input.
# Some entries occur more than once; that is harmless because updateDomains()
# only keeps the first occurrence of a domain.
additionalDomains = [
    'fab.com', 'ku.dk', 'google.cz', 'komplett.ie', 'lotto.ie',
    'bt.yahoo.com', 'o.co', 'bet.hu', 'haz.de', 'sas.com', 'nic.ir',
    'tomtop.com', 'uwa.edu.au', 'spacex.com', 'eif.org', 'geld.de',
]

# <language>.wikipedia.org, language codes from http://www.wikipedia.org/
additionalDomains.extend(code + '.wikipedia.org' for code in '''
    en ja de es ru fr it pl pt zh
    ar bg ca cs da de en es eo eu
    fa fr ko hi hr id it he lt hu
    ms nl ja no pl pt kk ro ru sk
    sl sr fi sv tr uk vi vo war zh
    af als am an ast ht az bn ba be
    bpy bs br cv cy et el fy ga gl
    gu hy io ia is jv kn ka ku la
    lv lb lmo mk mg ml mr my new ne
    nn nap oc pms nds qu pnb sq scn
    simple ceb sh su sw tl ta tt te th
    bug ur wa yo diq ace frp arc gn av
    ay bjn bh bcl bar bo co pdc dv nv
    ang eml myv ext hif fo frr fur gv gag
    gd gan glk hak xal haw hsb ilo ie os
    kl pam csb kw km rw kv ky mrj lad
    lbe lij li ln jbo mt mi xmf arz mzn
    mdf mn nah nrm nov ce mhr or as uz
    pi pag pa pap ps koi pfl pcd krc crh
    ksh rm rue sa se sc sah
    sco stq si szl so ckb tg tpi to tk
    udm ug vec vls wo wuu yi zea kbd ak
    ab bm bi bxr ch ny za dsb ee ff
    ki got ha ig iu ik ks kg lo ltg
    lg cdo mwl mo fj na cr pih om pnt
    kaa dz rmy rn sm sg st nso tn sn
    sd cu ss srn ty kab tet ti chr tum
    ts chy ve tw vep xh zu
    de pl ja zh ru eo vi
'''.split())

# google.<suffix>, suffixes from http://www.google.com/supported_domains
additionalDomains.extend('google.' + suffix for suffix in '''
    com ad ae com.af com.ag com.ai am co.ao com.ar as
    at com.au az ba com.bd be bf bg com.bh bi
    bj com.bn com.bo com.br bs co.bw by com.bz ca cd
    cf cg ch ci co.ck cl cm cn com.co co.cr
    com.cu cv com.cy cz de dj dk dm com.do dz
    com.ec ee com.eg es com.et fi com.fj fm fr ga
    ge gg com.gh com.gi gl gm gp gr com.gt gy
    com.hk hn hr ht hu co.id ie co.il im co.in
    iq is it je com.jm jo co.jp co.ke com.kh ki
    kg co.kr com.kw kz la com.lb li lk co.ls lt
    lu lv com.ly co.ma md me mg mk ml mn
    ms com.mt mu mv mw com.mx com.my co.mz com.na com.nf
    com.ng com.ni ne nl no com.np nr nu co.nz com.om
    com.pa com.pe com.ph com.pk pl pn com.pr
    ps pt com.py com.qa ro ru rw com.sa com.sb sc
    se com.sg sh si sk com.sl sn so sm st
    com.sv td tg co.th com.tj tk tl tm tn to
    com.tr tt com.tw co.tz com.ua co.ug co.uk com.uy co.uz com.vc
    co.ve vg co.vi com.vn vu ws rs co.za co.zm co.zw
    cat
'''.split())

additionalDomains.extend([
    # From http://www.ebay.ch/ (eBay-Websites)
    'mercadolibre.com.ar', 'ebay.com.au', 'ebay.at', 'ebay.be',
    'mercadolivre.com.br', 'ebay.com.cn', 'ebay.dk', 'ebay.de', 'ebay.fr',
    'ebay.gr', 'ebay.co.uk', 'ebay.com.hk', 'ebay.in', 'ebay.ie', 'ebay.it',
    'ebay.ca', 'auction.co.kr', 'ebay.com.my', 'mercadolibre.com.mx',
    'pages.ebay.com', 'ebay.nl', 'ebay.no', 'ebay.ph', 'ebay.pl', 'ebay.ru',
    'ebay.se', 'ebay.com.sg', 'ebay.es', 'ruten.com.tw', 'ebay.co.th',
    'gittigidiyor.com', 'ebay.cz', 'ebay.com', 'ebay.vn',
    # From http://www.amazon.com/ (footer)
    'amazon.ca', 'amazon.cn', 'amazon.fr', 'amazon.de', 'amazon.it',
    'amazon.co.jp', 'amazon.es', 'amazon.co.uk',
])

# <name>.gov, names from http://en.wikipedia.org/wiki/.gov#States_in_GOV
additionalDomains.extend(name + '.gov' for name in '''
    al alabama alaska az ar arkansas ca california colorado ct
    delaware florida fl georgia ga guam hawaii idaho illinois in
    iowa ia ks kansas ky kentucky louisiana maine maryland mass
    michigan mn mississippi mo mt montana nebraska nv nh visitnh
    nj newjersey newmexico ny nc northcarolina nd ohio oh ok
    oregon pa pennsylvania pr ri sc sd tennessee tn texas
    utah vermont virginia wa washington wv wisconsin wyoming dc
'''.split())
| + |
# Second-level suffixes that iterateTLDs() yields ahead of the downloaded
# list. Each row holds a top-level domain and the prefixes registered below
# it, taken from http://en.wikipedia.org/wiki/.<tld> for the respective tld.
additionalTLDs = list(
    prefix + '.' + tld
    for tld, prefixes in (
        ('ar', 'com edu gob gov int mil net org tur'),
        ('au', 'com net org edu gov csiro asn id'),
        ('bd', 'com edu ac net gov org mil'),
        ('bn', 'com edu gov net org'),
        ('ck', 'co org edu gov net gen biz info'),
        ('cy', 'ac net gov org pro name ekloges '
               'tm ltd biz press parliament com'),
        ('er', 'com edu gov mil net org ind'),
        ('et', 'com gov org edu net biz name info'),
        ('fj', 'ac biz com info mil name net org pro'),
        ('fk', 'co org gov ac nom net'),
        ('gt', 'com edu net gob org mil ind'),
        ('gu', 'com net gov org edu'),
        ('il', 'ac co org net k12 gov muni idf'),
        ('jm', 'com net org edu gov mil'),
        ('ke', 'co or ne go ac sc me mobi info'),
        ('kh', 'per com edu gov mil net org'),
        ('kw', 'edu com net org gov'),
        ('mm', 'net com edu org gov'),
        ('mt', 'com org net edu gov'),
        ('mz', 'adv ac co org gov edu'),
        ('ni', 'gob co com ac edu org nom net mil'),
        ('np', 'com edu gov mil net org'),
        ('nz', 'ac co geek gen maori net org school '
               'cri govt iwi parliament mil health'),
        ('om', 'com co edu ac sch gov net org '
               'mil museum biz pro med'),
        ('pg', 'com net ac gov mil org'),
        ('py', 'org edu mil gov net com coop'),
        ('qa', 'com net org gov edu mil name sch'),
        ('sv', 'edu gob com org red'),
        ('tr', 'com gen org biz info av dr pol '
               'bel tsk bbs k12 edu name net gov '
               'web tel tv nc'),
        ('uk', 'ac co gov judiciary ltd me mod net '
               'nhs nic org parliament plc police sch'),
        ('uy', 'com edu gub net mil org'),
        ('ve', 'com net org info co web gob edu mil tec'),
        ('ye', 'com co ltd me net org plc gov'),
        ('za', 'ac city co edu gov law mil nom org school '
               'alt net.work ngo tm web bourse '
               'agric cybernet grondar iaccess inca nis olivetti pix'),
        ('zm', 'ac co com edu gov net org sch'),
        ('zw', 'co ac org'),
    )
    for prefix in prefixes.split()
)
| + |
def getSuffixes(target, items):
    """
    Fills target with a suffix tree built from the keys of items.

    The keys are grouped by their last character (an empty key is filed
    under ''). A group holding a single entry is stored as the string
    "<remaining characters in reverse order> <priority>". A group holding
    several entries becomes a nested dictionary that is filled recursively
    from the keys with their last character removed.

    target -- dictionary receiving the tree, modified in place
    items -- dictionary mapping strings to their priority
    """
    groups = {}
    for item, priority in items.items():
        # item[-1:] gives the last character, or '' for an empty string
        groups.setdefault(item[-1:], {})[item[:-1]] = priority

    for suffix, group in groups.items():
        if len(group) == 1:
            # Unpacking works for the list returned on Python 2 as well as
            # for the view returned on Python 3
            (remainder, priority), = group.items()
            target[suffix] = remainder[::-1] + ' ' + str(priority)
        else:
            target[suffix] = {}
            getSuffixes(target[suffix], group)
| + |
def urlopen(url, attempts=3, delay=5):
    """
    Tries to open a particular URL, retries on failure.

    url -- address to open
    attempts -- total number of tries; values below 1 still give one try
    delay -- number of seconds to wait between two tries

    Returns the file-like object produced by urllib. If every try fails,
    the IOError raised by the last one is passed on to the caller.
    """
    # Imported locally because the module-level imports don't include time
    import time
    try:
        from urllib.request import urlopen as openURL  # Python 3
    except ImportError:
        openURL = urllib.urlopen  # Python 2

    attempts = max(attempts, 1)
    for attempt in range(1, attempts + 1):
        try:
            return openURL(url)
        except IOError:
            # Don't wait after the final try, just report the failure
            if attempt == attempts:
                raise
            time.sleep(delay)
| + |
def iterateTLDs():
    """
    Generates the suffixes to be treated as top-level domains.

    All entries of additionalTLDs come first, followed by the entries of the
    effective TLD list downloaded from the Mozilla source tree. In that list
    lines starting with "//" and empty lines are skipped, a leading "*."
    (wildcard rule) or "!" (exception rule) is removed from the entry.

    The IOError raised by urlopen() is passed on if the download fails.
    """
    for tld in additionalTLDs:
        yield tld

    # NOTE(review): confirm that this URL is still being served; the list is
    # also published via http://publicsuffix.org/.
    url = 'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1'
    resource = urlopen(url)
    try:
        # Read everything up front so that the connection is closed even if
        # the caller stops iterating early.
        data = resource.read().decode('utf-8')
    finally:
        resource.close()

    for line in data.splitlines():
        line = line.rstrip()
        if line.startswith('//'):
            continue

        if line.startswith('*.'):
            tld = line[2:]
        elif line.startswith('!'):
            tld = line[1:]
        else:
            tld = line

        if tld:
            yield tld
| + |
def getTLDs(domains, tldPriority):
    """
    Adds every suffix produced by iterateTLDs() to the domains dictionary
    with the given priority. Suffixes already present keep their value.
    """
    for suffix in iterateTLDs():
        domains.setdefault(suffix, tldPriority)
| + |
def updateSchemes(rules):
    """
    Stores the suffix tree of the known URL schemes as rules['scheme'].
    """
    tree = rules['scheme'] = {}
    getSuffixes(tree, schemes)
| + |
def updateDomains(rules):
    """
    Builds the domain dictionary and stores it as rules['domain'].

    Domains are read from standard input (UTF-8, one per line), followed by
    the entries of additionalDomains. Empty lines and repeated domains are
    ignored. The first domain gets the highest priority (the number of
    distinct domains), the last one gets priority 1. Every parent domain of
    a listed domain that isn't listed itself is added with the priority of
    the best-ranked domain below it reduced by the maximum priority.
    Finally, the suffixes from getTLDs() not yet present are added with the
    negated maximum priority and the result is converted into a suffix tree.
    """
    # Python 3 exposes the bytes of standard input as sys.stdin.buffer, on
    # Python 2 sys.stdin itself delivers bytes.
    stream = getattr(sys.stdin, 'buffer', sys.stdin)
    reader = codecs.getreader('utf-8')(stream)

    positions = {}
    for domain in itertools.chain(reader.readlines(), additionalDomains):
        domain = domain.rstrip()
        if domain and domain not in positions:
            # Number of domains seen so far == position of this domain
            positions[domain] = len(positions)

    maxPriority = len(positions)
    domains = dict(
        (domain, maxPriority - position)
        for domain, position in positions.items()
    )

    # Extract TLDs from domain list
    firstLabel = re.compile(r'^[^.]+\.+')
    # Iterate over a snapshot, the loop adds new keys to the dictionary
    for domain, priority in list(domains.items()):
        parentPriority = priority - maxPriority
        while True:
            match = firstLabel.search(domain)
            if not match:
                break
            # Drop the first label together with the dots following it
            domain = domain[match.end():]
            if not domain:
                break
            if domain not in domains or domains[domain] < parentPriority:
                domains[domain] = parentPriority

    # Fill up with "official" TLDs
    getTLDs(domains, -maxPriority)

    rules['domain'] = {}
    getSuffixes(rules['domain'], domains)
| + |
def writeRules(rules):
    """
    Writes the rules into defaults/rules.json (relative to the current
    directory).

    The file is read as UTF-8 and everything following the marker comment
    "// Automatically generated dictionaries" is replaced by the serialized
    entries of rules (compact JSON, keys sorted) and a closing brace. The
    part of the file up to and including the marker is left untouched.

    rules -- dictionary to be serialized

    Raises an Exception if the file doesn't contain the marker; the file
    isn't modified in that case.
    """
    path = os.path.join('defaults', 'rules.json')
    with codecs.open(path, 'rb', encoding='utf-8') as handle:
        data = handle.read()

    marker = '// Automatically generated dictionaries'
    markerIndex = data.find(marker)
    if markerIndex < 0:
        raise Exception('Insertion marker not found in %s' % path)

    serialized = json.dumps(rules, ensure_ascii=False, sort_keys=True, separators=(',', ':'))
    # [1:-1] removes the outer braces: the entries are inserted into the
    # object that is already open in the file, '\n}\n' closes that object.
    data = data[0:markerIndex + len(marker)] + '\n'
    data += ' ' + serialized[1:-1] + '\n}\n'

    with codecs.open(path, 'wb', encoding='utf-8') as handle:
        handle.write(data)
| + |
def updateRules():
    """
    Regenerates all dictionaries and writes them out: the referral table is
    taken over as is, the scheme and domain trees are rebuilt, then the
    result is handed to writeRules().
    """
    rules = {'domainReferrals': domainReferrals}
    updateSchemes(rules)
    updateDomains(rules)
    writeRules(rules)
| + |
# Regenerate the rules only when run as a script, not when imported
if '__main__' == __name__:
    updateRules()