Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: updateRules.py

Issue 8382011: Applied changes from emailed code review (Closed)
Patch Set: Created Sept. 28, 2012, 1:40 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « lib/updateRules.js ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 # This Source Code is subject to the terms of the Mozilla Public License
5 # version 2.0 (the "License"). You can obtain a copy of the License at
6 # http://mozilla.org/MPL/2.0/.
7
8 """
9 Update the dictionaries in the rules
10 ====================================
11
12 This script generates the dictionaries in the defaults/rules.json file based
13 on various sources like the list of public suffixes (http://publicsuffix.org/).
14 """
15
16 import sys
17 import os
18 import re
19 import urllib
20 import codecs
21 import json
22 import itertools
23
# URL scheme prefixes mapped to an integer priority. updateSchemes() passes
# this dictionary to getSuffixes(), which writes each priority after the space
# in the generated entry.
24 schemes = {
25 'http:': 4,
26 'https:': 3,
27 'ftp:': 2,
28 'irc:': 1,
29 }
30
# Per-domain strings that updateRules() copies unchanged into
# rules['domainReferrals'].
# NOTE(review): the values look like URL query parameters, presumably referral
# tags - confirm against the code that consumes the generated rules.
# NOTE(review): the amazon.es value is one character shorter than the other
# amazon tags - confirm it is not truncated.
31 domainReferrals = {
32 'amazon.co.uk': 'tag=uf07d-21',
33 'amazon.com': 'tag=uf024-20',
34 'amazon.de': 'tag=uf0e6-21',
35 'amazon.fr': 'tag=uf02b-21',
36 'amazon.es': 'tag=uf07-21',
37 'amazon.it': 'tag=uf08d-21',
38 'ozon.ru': 'partner=urlfixer',
39 }
40
# Hand-maintained domains that updateDomains() appends after the list read
# from stdin. Order matters: names seen earlier receive a higher priority, and
# a name that shows up again later is ignored, so the repeated entries below
# are harmless. Entries must not contain whitespace; updateDomains() only
# strips trailing whitespace.
additionalDomains = [
    'fab.com',
    'ku.dk',
    'google.cz',
    'komplett.ie',
    'lotto.ie',
    'bt.yahoo.com',
    'o.co',
    'bet.hu',
    'haz.de',
    'sas.com',
    'nic.ir',
    'tomtop.com',
    'uwa.edu.au',
    'spacex.com',
    'eif.org',
    'geld.de',
    # From http://www.wikipedia.org/
    'en.wikipedia.org', 'ja.wikipedia.org', 'de.wikipedia.org', 'es.wikipedia.org',
    'ru.wikipedia.org', 'fr.wikipedia.org', 'it.wikipedia.org', 'pl.wikipedia.org',
    'pt.wikipedia.org', 'zh.wikipedia.org', 'ar.wikipedia.org', 'bg.wikipedia.org',
    'ca.wikipedia.org', 'cs.wikipedia.org', 'da.wikipedia.org', 'de.wikipedia.org',
    'en.wikipedia.org', 'es.wikipedia.org', 'eo.wikipedia.org', 'eu.wikipedia.org',
    'fa.wikipedia.org', 'fr.wikipedia.org', 'ko.wikipedia.org', 'hi.wikipedia.org',
    'hr.wikipedia.org', 'id.wikipedia.org', 'it.wikipedia.org', 'he.wikipedia.org',
    'lt.wikipedia.org', 'hu.wikipedia.org', 'ms.wikipedia.org', 'nl.wikipedia.org',
    'ja.wikipedia.org', 'no.wikipedia.org', 'pl.wikipedia.org', 'pt.wikipedia.org',
    'kk.wikipedia.org', 'ro.wikipedia.org', 'ru.wikipedia.org', 'sk.wikipedia.org',
    'sl.wikipedia.org', 'sr.wikipedia.org', 'fi.wikipedia.org', 'sv.wikipedia.org',
    'tr.wikipedia.org', 'uk.wikipedia.org', 'vi.wikipedia.org', 'vo.wikipedia.org',
    'war.wikipedia.org', 'zh.wikipedia.org', 'af.wikipedia.org', 'als.wikipedia.org',
    'am.wikipedia.org', 'an.wikipedia.org', 'ast.wikipedia.org', 'ht.wikipedia.org',
    'az.wikipedia.org', 'bn.wikipedia.org', 'ba.wikipedia.org', 'be.wikipedia.org',
    'bpy.wikipedia.org', 'bs.wikipedia.org', 'br.wikipedia.org', 'cv.wikipedia.org',
    'cy.wikipedia.org', 'et.wikipedia.org', 'el.wikipedia.org', 'fy.wikipedia.org',
    'ga.wikipedia.org', 'gl.wikipedia.org', 'gu.wikipedia.org', 'hy.wikipedia.org',
    'io.wikipedia.org', 'ia.wikipedia.org', 'is.wikipedia.org', 'jv.wikipedia.org',
    'kn.wikipedia.org', 'ka.wikipedia.org', 'ku.wikipedia.org', 'la.wikipedia.org',
    'lv.wikipedia.org', 'lb.wikipedia.org', 'lmo.wikipedia.org', 'mk.wikipedia.org',
    'mg.wikipedia.org', 'ml.wikipedia.org', 'mr.wikipedia.org', 'my.wikipedia.org',
    'new.wikipedia.org', 'ne.wikipedia.org', 'nn.wikipedia.org', 'nap.wikipedia.org',
    'oc.wikipedia.org', 'pms.wikipedia.org', 'nds.wikipedia.org', 'qu.wikipedia.org',
    'pnb.wikipedia.org', 'sq.wikipedia.org', 'scn.wikipedia.org', 'simple.wikipedia.org',
    'ceb.wikipedia.org', 'sh.wikipedia.org', 'su.wikipedia.org', 'sw.wikipedia.org',
    'tl.wikipedia.org', 'ta.wikipedia.org', 'tt.wikipedia.org', 'te.wikipedia.org',
    'th.wikipedia.org', 'bug.wikipedia.org', 'ur.wikipedia.org', 'wa.wikipedia.org',
    'yo.wikipedia.org', 'diq.wikipedia.org', 'ace.wikipedia.org', 'frp.wikipedia.org',
    'arc.wikipedia.org', 'gn.wikipedia.org', 'av.wikipedia.org', 'ay.wikipedia.org',
    'bjn.wikipedia.org', 'bh.wikipedia.org', 'bcl.wikipedia.org', 'bar.wikipedia.org',
    'bo.wikipedia.org', 'co.wikipedia.org', 'pdc.wikipedia.org', 'dv.wikipedia.org',
    'nv.wikipedia.org', 'ang.wikipedia.org', 'eml.wikipedia.org', 'myv.wikipedia.org',
    'ext.wikipedia.org', 'hif.wikipedia.org', 'fo.wikipedia.org', 'frr.wikipedia.org',
    'fur.wikipedia.org', 'gv.wikipedia.org', 'gag.wikipedia.org', 'gd.wikipedia.org',
    'gan.wikipedia.org', 'glk.wikipedia.org', 'hak.wikipedia.org', 'xal.wikipedia.org',
    'haw.wikipedia.org', 'hsb.wikipedia.org', 'ilo.wikipedia.org', 'ie.wikipedia.org',
    'os.wikipedia.org', 'kl.wikipedia.org', 'pam.wikipedia.org', 'csb.wikipedia.org',
    'kw.wikipedia.org', 'km.wikipedia.org', 'rw.wikipedia.org', 'kv.wikipedia.org',
    'ky.wikipedia.org', 'mrj.wikipedia.org', 'lad.wikipedia.org', 'lbe.wikipedia.org',
    'lij.wikipedia.org', 'li.wikipedia.org', 'ln.wikipedia.org', 'jbo.wikipedia.org',
    'mt.wikipedia.org', 'mi.wikipedia.org', 'xmf.wikipedia.org', 'arz.wikipedia.org',
    'mzn.wikipedia.org', 'mdf.wikipedia.org', 'mn.wikipedia.org', 'nah.wikipedia.org',
    'nrm.wikipedia.org', 'nov.wikipedia.org', 'ce.wikipedia.org', 'mhr.wikipedia.org',
    'or.wikipedia.org', 'as.wikipedia.org', 'uz.wikipedia.org', 'pi.wikipedia.org',
    'pag.wikipedia.org', 'pa.wikipedia.org', 'pap.wikipedia.org', 'ps.wikipedia.org',
    'koi.wikipedia.org', 'pfl.wikipedia.org', 'pcd.wikipedia.org', 'krc.wikipedia.org',
    'crh.wikipedia.org', 'ksh.wikipedia.org', 'rm.wikipedia.org', 'rue.wikipedia.org',
    'sa.wikipedia.org', 'se.wikipedia.org', 'sc.wikipedia.org', 'sah.wikipedia.org',
    'sco.wikipedia.org', 'stq.wikipedia.org', 'si.wikipedia.org', 'szl.wikipedia.org',
    'so.wikipedia.org', 'ckb.wikipedia.org', 'tg.wikipedia.org', 'tpi.wikipedia.org',
    'to.wikipedia.org', 'tk.wikipedia.org', 'udm.wikipedia.org', 'ug.wikipedia.org',
    'vec.wikipedia.org', 'vls.wikipedia.org', 'wo.wikipedia.org', 'wuu.wikipedia.org',
    'yi.wikipedia.org', 'zea.wikipedia.org', 'kbd.wikipedia.org', 'ak.wikipedia.org',
    'ab.wikipedia.org', 'bm.wikipedia.org', 'bi.wikipedia.org', 'bxr.wikipedia.org',
    'ch.wikipedia.org', 'ny.wikipedia.org', 'za.wikipedia.org', 'dsb.wikipedia.org',
    'ee.wikipedia.org', 'ff.wikipedia.org', 'ki.wikipedia.org', 'got.wikipedia.org',
    'ha.wikipedia.org', 'ig.wikipedia.org', 'iu.wikipedia.org', 'ik.wikipedia.org',
    'ks.wikipedia.org', 'kg.wikipedia.org', 'lo.wikipedia.org', 'ltg.wikipedia.org',
    'lg.wikipedia.org', 'cdo.wikipedia.org', 'mwl.wikipedia.org', 'mo.wikipedia.org',
    'fj.wikipedia.org', 'na.wikipedia.org', 'cr.wikipedia.org', 'pih.wikipedia.org',
    'om.wikipedia.org', 'pnt.wikipedia.org', 'kaa.wikipedia.org', 'dz.wikipedia.org',
    'rmy.wikipedia.org', 'rn.wikipedia.org', 'sm.wikipedia.org', 'sg.wikipedia.org',
    'st.wikipedia.org', 'nso.wikipedia.org', 'tn.wikipedia.org', 'sn.wikipedia.org',
    'sd.wikipedia.org', 'cu.wikipedia.org', 'ss.wikipedia.org', 'srn.wikipedia.org',
    'ty.wikipedia.org', 'kab.wikipedia.org', 'tet.wikipedia.org', 'ti.wikipedia.org',
    'chr.wikipedia.org', 'tum.wikipedia.org', 'ts.wikipedia.org', 'chy.wikipedia.org',
    've.wikipedia.org', 'tw.wikipedia.org', 'vep.wikipedia.org', 'xh.wikipedia.org',
    'zu.wikipedia.org', 'de.wikipedia.org', 'pl.wikipedia.org', 'ja.wikipedia.org',
    'zh.wikipedia.org', 'ru.wikipedia.org', 'eo.wikipedia.org', 'vi.wikipedia.org',
    # From http://www.google.com/supported_domains
    'google.com', 'google.ad', 'google.ae', 'google.com.af', 'google.com.ag',
    'google.com.ai', 'google.am', 'google.co.ao', 'google.com.ar', 'google.as',
    'google.at', 'google.com.au', 'google.az', 'google.ba', 'google.com.bd',
    'google.be', 'google.bf', 'google.bg', 'google.com.bh', 'google.bi',
    'google.bj', 'google.com.bn', 'google.com.bo', 'google.com.br', 'google.bs',
    'google.co.bw', 'google.by', 'google.com.bz', 'google.ca', 'google.cd',
    'google.cf', 'google.cg', 'google.ch', 'google.ci', 'google.co.ck',
    'google.cl', 'google.cm', 'google.cn', 'google.com.co', 'google.co.cr',
    'google.com.cu', 'google.cv', 'google.com.cy', 'google.cz', 'google.de',
    'google.dj', 'google.dk', 'google.dm', 'google.com.do', 'google.dz',
    'google.com.ec', 'google.ee', 'google.com.eg', 'google.es', 'google.com.et',
    'google.fi', 'google.com.fj', 'google.fm', 'google.fr', 'google.ga',
    'google.ge', 'google.gg', 'google.com.gh', 'google.com.gi', 'google.gl',
    'google.gm', 'google.gp', 'google.gr', 'google.com.gt', 'google.gy',
    'google.com.hk', 'google.hn', 'google.hr', 'google.ht', 'google.hu',
    'google.co.id', 'google.ie', 'google.co.il', 'google.im', 'google.co.in',
    'google.iq', 'google.is', 'google.it', 'google.je', 'google.com.jm',
    'google.jo', 'google.co.jp', 'google.co.ke', 'google.com.kh', 'google.ki',
    'google.kg', 'google.co.kr', 'google.com.kw', 'google.kz', 'google.la',
    'google.com.lb', 'google.li', 'google.lk', 'google.co.ls', 'google.lt',
    'google.lu', 'google.lv', 'google.com.ly', 'google.co.ma', 'google.md',
    'google.me', 'google.mg', 'google.mk', 'google.ml', 'google.mn',
    'google.ms', 'google.com.mt', 'google.mu', 'google.mv', 'google.mw',
    'google.com.mx', 'google.com.my', 'google.co.mz', 'google.com.na', 'google.com.nf',
    'google.com.ng', 'google.com.ni', 'google.ne', 'google.nl', 'google.no',
    'google.com.np', 'google.nr', 'google.nu', 'google.co.nz', 'google.com.om',
    'google.com.pa', 'google.com.pe', 'google.com.ph', 'google.com.pk', 'google.pl',
    'google.pn', 'google.com.pr', 'google.ps', 'google.pt', 'google.com.py',
    'google.com.qa', 'google.ro', 'google.ru', 'google.rw', 'google.com.sa',
    'google.com.sb', 'google.sc', 'google.se', 'google.com.sg', 'google.sh',
    'google.si', 'google.sk', 'google.com.sl', 'google.sn', 'google.so',
    'google.sm', 'google.st', 'google.com.sv', 'google.td', 'google.tg',
    'google.co.th', 'google.com.tj', 'google.tk', 'google.tl', 'google.tm',
    'google.tn', 'google.to', 'google.com.tr', 'google.tt', 'google.com.tw',
    'google.co.tz', 'google.com.ua', 'google.co.ug', 'google.co.uk', 'google.com.uy',
    'google.co.uz', 'google.com.vc', 'google.co.ve', 'google.vg', 'google.co.vi',
    'google.com.vn', 'google.vu', 'google.ws', 'google.rs', 'google.co.za',
    'google.co.zm', 'google.co.zw', 'google.cat',
    # From http://www.ebay.ch/ (eBay-Websites)
    'mercadolibre.com.ar', 'ebay.com.au', 'ebay.at', 'ebay.be', 'mercadolivre.com.br',
    'ebay.com.cn', 'ebay.dk', 'ebay.de', 'ebay.fr', 'ebay.gr',
    'ebay.co.uk', 'ebay.com.hk', 'ebay.in', 'ebay.ie', 'ebay.it',
    'ebay.ca', 'auction.co.kr', 'ebay.com.my', 'mercadolibre.com.mx', 'pages.ebay.com',
    'ebay.nl', 'ebay.no', 'ebay.ph', 'ebay.pl', 'ebay.ru',
    'ebay.se', 'ebay.com.sg', 'ebay.es', 'ruten.com.tw', 'ebay.co.th',
    'gittigidiyor.com', 'ebay.cz', 'ebay.com', 'ebay.vn',
    # From http://www.amazon.com/ (footer)
    'amazon.ca', 'amazon.cn', 'amazon.fr', 'amazon.de', 'amazon.it',
    'amazon.co.jp', 'amazon.es', 'amazon.co.uk',

    # From http://en.wikipedia.org/wiki/.gov#States_in_GOV
    'al.gov', 'alabama.gov',
    'alaska.gov',
    'az.gov',
    'ar.gov', 'arkansas.gov',
    'ca.gov', 'california.gov',
    'colorado.gov',
    'ct.gov',
    'delaware.gov',
    'florida.gov', 'fl.gov',
    'georgia.gov', 'ga.gov',
    'guam.gov',
    'hawaii.gov',
    'idaho.gov',
    'illinois.gov',
    'in.gov',
    'iowa.gov', 'ia.gov',
    'ks.gov', 'kansas.gov',
    'ky.gov', 'kentucky.gov',
    'louisiana.gov',
    'maine.gov',
    'maryland.gov',
    'mass.gov',
    'michigan.gov',
    'mn.gov',
    'mississippi.gov',
    'mo.gov',
    'mt.gov', 'montana.gov',
    'nebraska.gov',
    'nv.gov',
    'nh.gov', 'visitnh.gov',
    'nj.gov', 'newjersey.gov',
    'newmexico.gov',
    'ny.gov',
    'nc.gov', 'northcarolina.gov',
    'nd.gov',
    'ohio.gov', 'oh.gov',
    'ok.gov',
    'oregon.gov',
    'pa.gov', 'pennsylvania.gov',
    'pr.gov',
    'ri.gov',
    'sc.gov',
    'sd.gov',
    'tennessee.gov', 'tn.gov',
    'texas.gov',
    'utah.gov',
    'vermont.gov',
    'virginia.gov',
    'wa.gov', 'washington.gov',
    'wv.gov',
    'wisconsin.gov',
    'wyoming.gov',
    'dc.gov',
]
122
# Hand-maintained public suffixes that iterateTLDs() yields before the entries
# of the downloaded effective TLD list. iterateTLDs() passes these on without
# stripping, so no entry may contain whitespace.
additionalTLDs = [
    # From http://en.wikipedia.org/wiki/.ar
    'com.ar', 'edu.ar', 'gob.ar', 'gov.ar', 'int.ar', 'mil.ar', 'net.ar', 'org.ar', 'tur.ar',

    # From http://en.wikipedia.org/wiki/.au
    'com.au', 'net.au', 'org.au', 'edu.au', 'gov.au', 'csiro.au', 'asn.au', 'id.au',

    # From http://en.wikipedia.org/wiki/.bd
    'com.bd', 'edu.bd', 'ac.bd', 'net.bd', 'gov.bd', 'org.bd', 'mil.bd',

    # From http://en.wikipedia.org/wiki/.bn
    'com.bn', 'edu.bn', 'gov.bn', 'net.bn', 'org.bn',

    # From http://en.wikipedia.org/wiki/.ck
    'co.ck', 'org.ck', 'edu.ck', 'gov.ck', 'net.ck', 'gen.ck', 'biz.ck', 'info.ck',

    # From http://en.wikipedia.org/wiki/.cy
    'ac.cy', 'net.cy', 'gov.cy', 'org.cy', 'pro.cy', 'name.cy', 'ekloges.cy',
    'tm.cy', 'ltd.cy', 'biz.cy', 'press.cy', 'parliament.cy', 'com.cy',

    # From http://en.wikipedia.org/wiki/.er
    'com.er', 'edu.er', 'gov.er', 'mil.er', 'net.er', 'org.er', 'ind.er',

    # From http://en.wikipedia.org/wiki/.et
    'com.et', 'gov.et', 'org.et', 'edu.et', 'net.et', 'biz.et', 'name.et', 'info.et',

    # From http://en.wikipedia.org/wiki/.fj
    'ac.fj', 'biz.fj', 'com.fj', 'info.fj', 'mil.fj', 'name.fj', 'net.fj', 'org.fj', 'pro.fj',

    # From http://en.wikipedia.org/wiki/.fk
    'co.fk', 'org.fk', 'gov.fk', 'ac.fk', 'nom.fk', 'net.fk',

    # From http://en.wikipedia.org/wiki/.gt
    'com.gt', 'edu.gt', 'net.gt', 'gob.gt', 'org.gt', 'mil.gt', 'ind.gt',

    # From http://en.wikipedia.org/wiki/.gu
    'com.gu', 'net.gu', 'gov.gu', 'org.gu', 'edu.gu',

    # From http://en.wikipedia.org/wiki/.il
    'ac.il', 'co.il', 'org.il', 'net.il', 'k12.il', 'gov.il', 'muni.il', 'idf.il',

    # From http://en.wikipedia.org/wiki/.jm
    'com.jm', 'net.jm', 'org.jm', 'edu.jm', 'gov.jm', 'mil.jm',

    # From http://en.wikipedia.org/wiki/.ke
    'co.ke', 'or.ke', 'ne.ke', 'go.ke', 'ac.ke', 'sc.ke', 'me.ke', 'mobi.ke', 'info.ke',

    # From http://en.wikipedia.org/wiki/.kh
    'per.kh', 'com.kh', 'edu.kh', 'gov.kh', 'mil.kh', 'net.kh', 'org.kh',

    # From http://en.wikipedia.org/wiki/.kw
    'edu.kw', 'com.kw', 'net.kw', 'org.kw', 'gov.kw',

    # From http://en.wikipedia.org/wiki/.mm
    'net.mm', 'com.mm', 'edu.mm', 'org.mm', 'gov.mm',

    # From http://en.wikipedia.org/wiki/.mt
    'com.mt', 'org.mt', 'net.mt', 'edu.mt', 'gov.mt',

    # From http://en.wikipedia.org/wiki/.mz
    'adv.mz', 'ac.mz', 'co.mz', 'org.mz', 'gov.mz', 'edu.mz',

    # From http://en.wikipedia.org/wiki/.ni
    'gob.ni', 'co.ni', 'com.ni', 'ac.ni', 'edu.ni', 'org.ni', 'nom.ni', 'net.ni', 'mil.ni',

    # From http://en.wikipedia.org/wiki/.np
    'com.np', 'edu.np', 'gov.np', 'mil.np', 'net.np', 'org.np',

    # From http://en.wikipedia.org/wiki/.nz
    'ac.nz', 'co.nz', 'geek.nz', 'gen.nz', 'maori.nz', 'net.nz', 'org.nz', 'school.nz',
    'cri.nz', 'govt.nz', 'iwi.nz', 'parliament.nz', 'mil.nz', 'health.nz',

    # From http://en.wikipedia.org/wiki/.om
    'com.om', 'co.om', 'edu.om', 'ac.om', 'sch.om', 'gov.om', 'net.om', 'org.om',
    'mil.om', 'museum.om', 'biz.om', 'pro.om', 'med.om',

    # From http://en.wikipedia.org/wiki/.pg
    'com.pg', 'net.pg', 'ac.pg', 'gov.pg', 'mil.pg', 'org.pg',

    # From http://en.wikipedia.org/wiki/.py
    'org.py', 'edu.py', 'mil.py', 'gov.py', 'net.py', 'com.py', 'coop.py',

    # From http://en.wikipedia.org/wiki/.qa
    'com.qa', 'net.qa', 'org.qa', 'gov.qa', 'edu.qa', 'mil.qa', 'name.qa', 'sch.qa',

    # From http://en.wikipedia.org/wiki/.sv
    'edu.sv', 'gob.sv', 'com.sv', 'org.sv', 'red.sv',

    # From http://en.wikipedia.org/wiki/.tr
    'com.tr', 'gen.tr', 'org.tr', 'biz.tr', 'info.tr', 'av.tr', 'dr.tr', 'pol.tr',
    'bel.tr', 'tsk.tr', 'bbs.tr', 'k12.tr', 'edu.tr', 'name.tr', 'net.tr', 'gov.tr',
    'web.tr', 'tel.tr', 'tv.tr', 'nc.tr',

    # From http://en.wikipedia.org/wiki/.uk
    'ac.uk', 'co.uk', 'gov.uk', 'judiciary.uk', 'ltd.uk', 'me.uk', 'mod.uk', 'net.uk',
    'nhs.uk', 'nic.uk', 'org.uk', 'parliament.uk', 'plc.uk', 'police.uk', 'sch.uk',

    # From http://en.wikipedia.org/wiki/.uy
    'com.uy', 'edu.uy', 'gub.uy', 'net.uy', 'mil.uy', 'org.uy',

    # From http://en.wikipedia.org/wiki/.ve
    'com.ve', 'net.ve', 'org.ve', 'info.ve', 'co.ve', 'web.ve', 'gob.ve', 'edu.ve', 'mil.ve', 'tec.ve',

    # From http://en.wikipedia.org/wiki/.ye
    'com.ye', 'co.ye', 'ltd.ye', 'me.ye', 'net.ye', 'org.ye', 'plc.ye', 'gov.ye',

    # From http://en.wikipedia.org/wiki/.za
    # NOTE(review): 'net.work.za' may be two entries ('net.za', 'work.za') that
    # got fused - confirm against the page above.
    'ac.za', 'city.za', 'co.za', 'edu.za', 'gov.za', 'law.za', 'mil.za', 'nom.za', 'org.za', 'school.za',
    'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za',
    'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za', 'olivetti.za', 'pix.za',

    # From http://en.wikipedia.org/wiki/.zm
    'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm',

    # From http://en.wikipedia.org/wiki/.zw
    'co.zw', 'ac.zw', 'org.zw',
]
240
def getSuffixes(target, items):
    """
    Fills target with a nested dictionary that indexes the keys of items by
    their trailing characters.

    target: dictionary to fill, modified in place
    items: dictionary mapping strings to priorities

    The keys are grouped by their last character (the empty string for an
    empty key). A group with a single member becomes a string entry made of
    the member's remaining characters in reversed order, a space and the
    priority. A group with several members becomes a nested dictionary built
    the same way from the members with their last character removed.
    """
    groups = {}
    for item, priority in items.items():
        # item[-1:] is the last character, or '' if item is empty
        groups.setdefault(item[-1:], {})[item[:-1]] = priority

    for suffix, group in groups.items():
        if len(group) == 1:
            (item, priority), = group.items()
            target[suffix] = item[::-1] + ' ' + str(priority)
        else:
            target[suffix] = {}
            getSuffixes(target[suffix], group)
255
def urlopen(url, attempts=3, delay=5):
    """
    Tries to open a particular URL, retries on failure.

    url: address to open
    attempts: maximum number of tries before giving up
    delay: number of seconds to wait before each retry

    Returns the response object of the first successful try. Raises the
    IOError of the last failed try once all attempts are used up, or an
    IOError right away if attempts is zero or negative.
    """
    # Imported locally: the module's import block does not provide "time", so
    # the previous time.sleep() call failed with a NameError on the first
    # download error instead of retrying.
    import time
    try:
        from urllib.request import urlopen as openUrl  # Python 3
    except ImportError:
        from urllib import urlopen as openUrl  # Python 2

    error = IOError('No attempt was made to open %s' % url)
    for attempt in range(attempts):
        # Wait before a retry only, never after the final failure
        if attempt > 0:
            time.sleep(delay)
        try:
            return openUrl(url)
        except IOError as e:
            error = e
    raise error
267
def iterateTLDs():
    """
    Yields public suffixes: first the entries of additionalTLDs, then the
    entries of the effective TLD list downloaded from mxr.mozilla.org.

    In the downloaded list, lines starting with "//" and empty lines are
    skipped. A leading "*." or "!" is removed so that only the bare suffix is
    yielded. No de-duplication is done, so a suffix can be yielded more than
    once.

    Raises whatever urlopen() raises if the list cannot be downloaded.
    """
    for tld in additionalTLDs:
        yield tld

    url = 'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1'
    resource = urlopen(url)
    try:
        data = resource.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails
        resource.close()

    for line in data.splitlines():
        line = line.rstrip()
        if line.startswith('//'):
            continue

        if line.startswith('*.'):
            tld = line[2:]
        elif line.startswith('!'):
            tld = line[1:]
        else:
            tld = line

        if tld:
            yield tld
288
def getTLDs(domains, tldPriority):
    """
    Adds every suffix produced by iterateTLDs() to domains with the priority
    tldPriority. Suffixes that are already present keep their current value.
    """
    for suffix in iterateTLDs():
        domains.setdefault(suffix, tldPriority)
293
def updateSchemes(rules):
    """
    Stores the suffix dictionary built from schemes as rules['scheme'],
    replacing any previous value.
    """
    schemeTree = {}
    rules['scheme'] = schemeTree
    getSuffixes(schemeTree, schemes)
297
def updateDomains(rules):
    """
    Builds the domain dictionary and stores it as rules['domain'].

    Domain names are read from stdin (UTF-8, one per line) and followed by
    additionalDomains. Each distinct name gets a positive priority: the
    first name seen gets the highest one, the last one gets 1. The parent
    domains of all names and the suffixes from getTLDs() are added with
    priorities of zero or below.
    """
    reader = codecs.getreader('utf-8')(sys.stdin)

    # Remember every distinct non-empty name in the order it is first seen
    ordered = []
    seen = set()
    for entry in itertools.chain(reader.readlines(), additionalDomains):
        entry = entry.rstrip()
        if entry and entry not in seen:
            seen.add(entry)
            ordered.append(entry)

    maxPriority = len(ordered)
    domains = dict((name, maxPriority - rank) for rank, name in enumerate(ordered))

    # Extract TLDs from domain list. The loop works on a snapshot because it
    # adds new keys to the dictionary.
    firstLabel = re.compile(r'^[^.]+\.+')
    for name, priority in list(domains.items()):
        inherited = priority - maxPriority
        parent = name
        while firstLabel.search(parent):
            # Drop the leftmost label together with the dots following it
            parent = firstLabel.sub('', parent)
            if not parent:
                break
            if parent not in domains or domains[parent] < inherited:
                domains[parent] = inherited

    # Fill up with "official" TLDs
    getTLDs(domains, -maxPriority)

    domainTree = {}
    rules['domain'] = domainTree
    getSuffixes(domainTree, domains)
329
def writeRules(rules):
    """
    Writes the rules into defaults/rules.json (relative to the current
    directory).

    rules: JSON-serializable dictionary

    The existing file is kept up to and including the marker line
    "// Automatically generated dictionaries". Everything after the marker is
    replaced by the serialized members of rules (sorted by key, compact
    separators, without the outer braces) and a closing brace.

    Raises an Exception if the file doesn't contain the marker. In this case
    the file is left untouched.
    """
    path = os.path.join('defaults', 'rules.json')
    with codecs.open(path, 'rb', encoding='utf-8') as handle:
        data = handle.read()

    marker = '// Automatically generated dictionaries'
    markerIndex = data.find(marker)
    if markerIndex < 0:
        raise Exception('Insertion marker not found in %s' % path)

    # [1:-1] removes the outer braces, the members are inserted into the
    # object that the file's hand-written part has opened already
    members = json.dumps(rules, ensure_ascii=False, sort_keys=True, separators=(',', ':'))[1:-1]
    data = data[0:markerIndex + len(marker)] + '\n'
    data += ' ' + members + '\n}\n'

    with codecs.open(path, 'wb', encoding='utf-8') as handle:
        handle.write(data)
346
def updateRules():
    """
    Regenerates all dictionaries: starts with the referral table, adds the
    scheme and domain dictionaries and writes the result with writeRules().
    """
    rules = {'domainReferrals': domainReferrals}
    for update in (updateSchemes, updateDomains):
        update(rules)
    writeRules(rules)


# Run the update only when executed as a script
if __name__ == "__main__":
    updateRules()
OLDNEW
« no previous file with comments | « lib/updateRules.js ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld