| Left: | ||
| Right: |
| OLD | NEW |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # coding: utf-8 | 2 # coding: utf-8 |
| 3 | 3 |
| 4 # This Source Code is subject to the terms of the Mozilla Public License | 4 # This Source Code is subject to the terms of the Mozilla Public License |
| 5 # version 2.0 (the "License"). You can obtain a copy of the License at | 5 # version 2.0 (the "License"). You can obtain a copy of the License at |
| 6 # http://mozilla.org/MPL/2.0/. | 6 # http://mozilla.org/MPL/2.0/. |
| 7 | 7 |
| 8 """ | 8 """ |
| 9 Update the dictionaries in the rules | 9 Update the dictionaries in the rules |
| 10 ==================================== | 10 ==================================== |
| (...skipping 220 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 231 'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za', | 231 'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za', |
| 232 'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za', 'olivetti.za', 'pix.za', | 232 'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za', 'olivetti.za', 'pix.za', |
| 233 | 233 |
| 234 # From http://en.wikipedia.org/wiki/.zm | 234 # From http://en.wikipedia.org/wiki/.zm |
| 235 'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm', | 235 'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm', |
| 236 | 236 |
| 237 # From http://en.wikipedia.org/wiki/.zw | 237 # From http://en.wikipedia.org/wiki/.zw |
| 238 'co.zw', 'ac.zw', 'org.zw', | 238 'co.zw', 'ac.zw', 'org.zw', |
| 239 ] | 239 ] |
| 240 | 240 |
| 241 def getSuffixes(target, items): | |
| 242 suffixes = {} | |
| 243 for item, priority in items.iteritems(): | |
| 244 suffix = item[-1] if len(item) else '' | |
| 245 if not suffix in suffixes: | |
| 246 suffixes[suffix] = {} | |
| 247 suffixes[suffix][item[:-1]] = priority | |
| 248 for suffix, items in suffixes.iteritems(): | |
| 249 if len(items.keys()) == 1: | |
| 250 item, priority = items.items()[0] | |
| 251 target[suffix] = ''.join(reversed(item)) + ' ' + str(priority) | |
| 252 else: | |
| 253 target[suffix] = {} | |
| 254 getSuffixes(target[suffix], items) | |
| 255 | |
| 256 def urlopen(url, attempts=3): | 241 def urlopen(url, attempts=3): |
| 257 """ | 242 """ |
| 258 Tries to open a particular URL, retries on failure. | 243 Tries to open a particular URL, retries on failure. |
| 259 """ | 244 """ |
| 260 for i in range(attempts): | 245 for i in range(attempts): |
| 261 try: | 246 try: |
| 262 return urllib.urlopen(url) | 247 return urllib.urlopen(url) |
| 263 except IOError, e: | 248 except IOError, e: |
| 264 error = e | 249 error = e |
| 265 time.sleep(5) | 250 time.sleep(5) |
| (...skipping 18 matching lines...) Expand all Loading... | |
| 284 tld = line | 269 tld = line |
| 285 | 270 |
| 286 if tld: | 271 if tld: |
| 287 yield tld | 272 yield tld |
| 288 | 273 |
| 289 def getTLDs(domains, tldPriority): | 274 def getTLDs(domains, tldPriority): |
| 290 for tld in iterateTLDs(): | 275 for tld in iterateTLDs(): |
| 291 if not tld in domains: | 276 if not tld in domains: |
| 292 domains[tld] = tldPriority | 277 domains[tld] = tldPriority |
| 293 | 278 |
| 294 def updateSchemes(rules): | |
| 295 rules['scheme'] = {} | |
| 296 getSuffixes(rules['scheme'], schemes) | |
| 297 | |
| 298 def updateDomains(rules): | 279 def updateDomains(rules): |
| 299 domains = {} | 280 domains = {} |
| 300 reader = codecs.getreader('utf-8')(sys.stdin) | 281 reader = codecs.getreader('utf-8')(sys.stdin) |
| 301 i = 0 | 282 i = 0 |
| 302 for domain in itertools.chain(reader.readlines(), additionalDomains): | 283 for domain in itertools.chain(reader.readlines(), additionalDomains): |
| 303 domain = domain.rstrip() | 284 domain = domain.rstrip() |
| 304 if not domain or domain in domains: | 285 if not domain or domain in domains: |
| 305 continue | 286 continue |
| 306 domains[domain] = i | 287 domains[domain] = i |
| 307 i += 1 | 288 i += 1 |
| 308 | 289 |
| 309 maxPriority = i | 290 maxPriority = i |
| 310 for domain in domains.iterkeys(): | 291 for domain in domains.iterkeys(): |
| 311 domains[domain] = maxPriority - domains[domain] | 292 domains[domain] = maxPriority - domains[domain] |
| 312 | 293 |
| 313 # Extract TLDs from domain list | 294 # Extract TLDs from domain list |
| 314 for domain, priority in domains.items(): | 295 for domain, priority in domains.items(): |
| 315 while True: | 296 while True: |
| 316 if not re.search(r'^[^.]+\.+', domain): | 297 if not re.search(r'^[^.]+\.+', domain): |
| 317 break | 298 break |
| 318 domain = re.sub(r'^[^.]+\.+', '', domain) | 299 domain = re.sub(r'^[^.]+\.+', '', domain) |
| 319 if not domain: | 300 if not domain: |
| 320 break | 301 break |
| 321 if not domain in domains or domains[domain] < priority - maxPriority: | 302 if not domain in domains or domains[domain] < priority - maxPriority: |
| 322 domains[domain] = priority - maxPriority | 303 domains[domain] = priority - maxPriority |
| 323 | 304 |
| 324 # Fill up with "official" TLDs | 305 # Fill up with "official" TLDs |
| 325 getTLDs(domains, -maxPriority) | 306 getTLDs(domains, -maxPriority) |
| 326 | 307 |
| 327 rules['domain'] = {} | 308 rules['domain'] = domains |
| 328 getSuffixes(rules['domain'], domains) | |
| 329 | 309 |
| 330 def writeRules(rules): | 310 def writeRules(rules): |
| 331 path = os.path.join('defaults', 'rules.json') | 311 path = os.path.join('defaults', 'rules.json') |
|
Thomas Greiner
2012/09/25 12:31:37
Do you really want the file to be located at /defa
Wladimir Palant
2012/09/25 13:26:00
Actually, it's defaults/rules.json (relative to th
| |
| 332 file = codecs.open(path, 'rb', encoding='utf-8') | 312 file = codecs.open(path, 'rb', encoding='utf-8') |
| 333 data = file.read() | 313 data = file.read() |
| 334 file.close() | 314 file.close() |
| 335 | 315 |
| 336 marker = '// Automatically generated dictionaries' | 316 marker = '// Automatically generated dictionaries' |
| 337 markerIndex = data.find(marker) | 317 markerIndex = data.find(marker) |
| 338 if markerIndex < 0: | 318 if markerIndex < 0: |
| 339 raise Exception('Insertion marker not found in %s' % path) | 319 raise Exception('Insertion marker not found in %s' % path) |
| 340 data = data[0:markerIndex + len(marker)] + '\n' | 320 data = data[0:markerIndex + len(marker)] + '\n' |
| 341 data += ' ' + json.dumps(rules, ensure_ascii=False, sort_keys=True, separators = (',', ':'))[1:-1] + '\n}\n' | 321 data += ' ' + json.dumps(rules, ensure_ascii=False, sort_keys=True, separators = (',', ':'))[1:-1] + '\n}\n' |
| 342 | 322 |
| 343 file = codecs.open(path, 'wb', encoding='utf-8') | 323 file = codecs.open(path, 'wb', encoding='utf-8') |
| 344 file.write(data) | 324 file.write(data) |
| 345 file.close() | 325 file.close() |
| 346 | 326 |
| 347 def updateRules(): | 327 def updateRules(): |
| 348 rules = {} | 328 rules = {} |
| 349 rules['domainReferrals'] = domainReferrals | 329 rules['domainReferrals'] = domainReferrals |
| 350 updateSchemes(rules) | 330 rules['scheme'] = schemes |
| 351 updateDomains(rules) | 331 updateDomains(rules) |
| 352 writeRules(rules) | 332 writeRules(rules) |
| 353 | 333 |
| 354 if __name__ == "__main__": | 334 if __name__ == "__main__": |
| 355 updateRules() | 335 updateRules() |
| OLD | NEW |