Left: | ||
Right: |
OLD | NEW |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # coding: utf-8 | 2 # coding: utf-8 |
3 | 3 |
4 # This Source Code is subject to the terms of the Mozilla Public License | 4 # This Source Code is subject to the terms of the Mozilla Public License |
5 # version 2.0 (the "License"). You can obtain a copy of the License at | 5 # version 2.0 (the "License"). You can obtain a copy of the License at |
6 # http://mozilla.org/MPL/2.0/. | 6 # http://mozilla.org/MPL/2.0/. |
7 | 7 |
8 """ | 8 """ |
9 Update the dictionaries in the rules | 9 Update the dictionaries in the rules |
10 ==================================== | 10 ==================================== |
(...skipping 220 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
231 'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za', | 231 'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za', |
232 'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za', 'olivetti.za', 'pix.za', | 232 'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za', 'olivetti.za', 'pix.za', |
233 | 233 |
234 # From http://en.wikipedia.org/wiki/.zm | 234 # From http://en.wikipedia.org/wiki/.zm |
235 'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm', | 235 'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm', |
236 | 236 |
237 # From http://en.wikipedia.org/wiki/.zw | 237 # From http://en.wikipedia.org/wiki/.zw |
238 'co.zw', 'ac.zw', 'org.zw', | 238 'co.zw', 'ac.zw', 'org.zw', |
239 ] | 239 ] |
240 | 240 |
241 def getSuffixes(target, items): | |
242 suffixes = {} | |
243 for item, priority in items.iteritems(): | |
244 suffix = item[-1] if len(item) else '' | |
245 if not suffix in suffixes: | |
246 suffixes[suffix] = {} | |
247 suffixes[suffix][item[:-1]] = priority | |
248 for suffix, items in suffixes.iteritems(): | |
249 if len(items.keys()) == 1: | |
250 item, priority = items.items()[0] | |
251 target[suffix] = ''.join(reversed(item)) + ' ' + str(priority) | |
252 else: | |
253 target[suffix] = {} | |
254 getSuffixes(target[suffix], items) | |
255 | |
256 def urlopen(url, attempts=3): | 241 def urlopen(url, attempts=3): |
257 """ | 242 """ |
258 Tries to open a particular URL, retries on failure. | 243 Tries to open a particular URL, retries on failure. |
259 """ | 244 """ |
260 for i in range(attempts): | 245 for i in range(attempts): |
261 try: | 246 try: |
262 return urllib.urlopen(url) | 247 return urllib.urlopen(url) |
263 except IOError, e: | 248 except IOError, e: |
264 error = e | 249 error = e |
265 time.sleep(5) | 250 time.sleep(5) |
(...skipping 18 matching lines...) Expand all Loading... | |
284 tld = line | 269 tld = line |
285 | 270 |
286 if tld: | 271 if tld: |
287 yield tld | 272 yield tld |
288 | 273 |
289 def getTLDs(domains, tldPriority): | 274 def getTLDs(domains, tldPriority): |
290 for tld in iterateTLDs(): | 275 for tld in iterateTLDs(): |
291 if not tld in domains: | 276 if not tld in domains: |
292 domains[tld] = tldPriority | 277 domains[tld] = tldPriority |
293 | 278 |
294 def updateSchemes(rules): | |
295 rules['scheme'] = {} | |
296 getSuffixes(rules['scheme'], schemes) | |
297 | |
298 def updateDomains(rules): | 279 def updateDomains(rules): |
299 domains = {} | 280 domains = {} |
300 reader = codecs.getreader('utf-8')(sys.stdin) | 281 reader = codecs.getreader('utf-8')(sys.stdin) |
301 i = 0 | 282 i = 0 |
302 for domain in itertools.chain(reader.readlines(), additionalDomains): | 283 for domain in itertools.chain(reader.readlines(), additionalDomains): |
303 domain = domain.rstrip() | 284 domain = domain.rstrip() |
304 if not domain or domain in domains: | 285 if not domain or domain in domains: |
305 continue | 286 continue |
306 domains[domain] = i | 287 domains[domain] = i |
307 i += 1 | 288 i += 1 |
308 | 289 |
309 maxPriority = i | 290 maxPriority = i |
310 for domain in domains.iterkeys(): | 291 for domain in domains.iterkeys(): |
311 domains[domain] = maxPriority - domains[domain] | 292 domains[domain] = maxPriority - domains[domain] |
312 | 293 |
313 # Extract TLDs from domain list | 294 # Extract TLDs from domain list |
314 for domain, priority in domains.items(): | 295 for domain, priority in domains.items(): |
315 while True: | 296 while True: |
316 if not re.search(r'^[^.]+\.+', domain): | 297 if not re.search(r'^[^.]+\.+', domain): |
317 break | 298 break |
318 domain = re.sub(r'^[^.]+\.+', '', domain) | 299 domain = re.sub(r'^[^.]+\.+', '', domain) |
319 if not domain: | 300 if not domain: |
320 break | 301 break |
321 if not domain in domains or domains[domain] < priority - maxPriority: | 302 if not domain in domains or domains[domain] < priority - maxPriority: |
322 domains[domain] = priority - maxPriority | 303 domains[domain] = priority - maxPriority |
323 | 304 |
324 # Fill up with "official" TLDs | 305 # Fill up with "official" TLDs |
325 getTLDs(domains, -maxPriority) | 306 getTLDs(domains, -maxPriority) |
326 | 307 |
327 rules['domain'] = {} | 308 rules['domain'] = domains |
328 getSuffixes(rules['domain'], domains) | |
329 | 309 |
330 def writeRules(rules): | 310 def writeRules(rules): |
331 path = os.path.join('defaults', 'rules.json') | 311 path = os.path.join('defaults', 'rules.json') |
Thomas Greiner
2012/09/25 12:31:37
Do you really want the file to be located at /defa
Wladimir Palant
2012/09/25 13:26:00
Actually, it's defaults/rules.json (relative to th
| |
332 file = codecs.open(path, 'rb', encoding='utf-8') | 312 file = codecs.open(path, 'rb', encoding='utf-8') |
333 data = file.read() | 313 data = file.read() |
334 file.close() | 314 file.close() |
335 | 315 |
336 marker = '// Automatically generated dictionaries' | 316 marker = '// Automatically generated dictionaries' |
337 markerIndex = data.find(marker) | 317 markerIndex = data.find(marker) |
338 if markerIndex < 0: | 318 if markerIndex < 0: |
339 raise Exception('Insertion marker not found in %s' % path) | 319 raise Exception('Insertion marker not found in %s' % path) |
340 data = data[0:markerIndex + len(marker)] + '\n' | 320 data = data[0:markerIndex + len(marker)] + '\n' |
341 data += ' ' + json.dumps(rules, ensure_ascii=False, sort_keys=True, separators = (',', ':'))[1:-1] + '\n}\n' | 321 data += ' ' + json.dumps(rules, ensure_ascii=False, sort_keys=True, separators = (',', ':'))[1:-1] + '\n}\n' |
342 | 322 |
343 file = codecs.open(path, 'wb', encoding='utf-8') | 323 file = codecs.open(path, 'wb', encoding='utf-8') |
344 file.write(data) | 324 file.write(data) |
345 file.close() | 325 file.close() |
346 | 326 |
347 def updateRules(): | 327 def updateRules(): |
348 rules = {} | 328 rules = {} |
349 rules['domainReferrals'] = domainReferrals | 329 rules['domainReferrals'] = domainReferrals |
350 updateSchemes(rules) | 330 rules['scheme'] = schemes |
351 updateDomains(rules) | 331 updateDomains(rules) |
352 writeRules(rules) | 332 writeRules(rules) |
353 | 333 |
354 if __name__ == "__main__": | 334 if __name__ == "__main__": |
355 updateRules() | 335 updateRules() |
OLD | NEW |