Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: updateRules.py

Issue 8401090: Modified server-side rules generation to produce prioritized domain lists instead of complete suffi… (Closed)
Patch Set: Created Sept. 25, 2012, 11:32 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « defaults/rules.json ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # coding: utf-8 2 # coding: utf-8
3 3
4 # This Source Code is subject to the terms of the Mozilla Public License 4 # This Source Code is subject to the terms of the Mozilla Public License
5 # version 2.0 (the "License"). You can obtain a copy of the License at 5 # version 2.0 (the "License"). You can obtain a copy of the License at
6 # http://mozilla.org/MPL/2.0/. 6 # http://mozilla.org/MPL/2.0/.
7 7
8 """ 8 """
9 Update the dictionaries in the rules 9 Update the dictionaries in the rules
10 ==================================== 10 ====================================
(...skipping 220 matching lines...) Expand 10 before | Expand all | Expand 10 after
231 'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za', 231 'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za',
232 'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za', 'olivetti.za', 'pix.za', 232 'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za', 'olivetti.za', 'pix.za',
233 233
234 # From http://en.wikipedia.org/wiki/.zm 234 # From http://en.wikipedia.org/wiki/.zm
235 'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm', 235 'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm',
236 236
237 # From http://en.wikipedia.org/wiki/.zw 237 # From http://en.wikipedia.org/wiki/.zw
238 'co.zw', 'ac.zw', 'org.zw', 238 'co.zw', 'ac.zw', 'org.zw',
239 ] 239 ]
240 240
241 def getSuffixes(target, items):
242 suffixes = {}
243 for item, priority in items.iteritems():
244 suffix = item[-1] if len(item) else ''
245 if not suffix in suffixes:
246 suffixes[suffix] = {}
247 suffixes[suffix][item[:-1]] = priority
248 for suffix, items in suffixes.iteritems():
249 if len(items.keys()) == 1:
250 item, priority = items.items()[0]
251 target[suffix] = ''.join(reversed(item)) + ' ' + str(priority)
252 else:
253 target[suffix] = {}
254 getSuffixes(target[suffix], items)
255
256 def urlopen(url, attempts=3): 241 def urlopen(url, attempts=3):
257 """ 242 """
258 Tries to open a particular URL, retries on failure. 243 Tries to open a particular URL, retries on failure.
259 """ 244 """
260 for i in range(attempts): 245 for i in range(attempts):
261 try: 246 try:
262 return urllib.urlopen(url) 247 return urllib.urlopen(url)
263 except IOError, e: 248 except IOError, e:
264 error = e 249 error = e
265 time.sleep(5) 250 time.sleep(5)
(...skipping 18 matching lines...) Expand all
284 tld = line 269 tld = line
285 270
286 if tld: 271 if tld:
287 yield tld 272 yield tld
288 273
289 def getTLDs(domains, tldPriority): 274 def getTLDs(domains, tldPriority):
290 for tld in iterateTLDs(): 275 for tld in iterateTLDs():
291 if not tld in domains: 276 if not tld in domains:
292 domains[tld] = tldPriority 277 domains[tld] = tldPriority
293 278
294 def updateSchemes(rules):
295 rules['scheme'] = {}
296 getSuffixes(rules['scheme'], schemes)
297
298 def updateDomains(rules): 279 def updateDomains(rules):
299 domains = {} 280 domains = {}
300 reader = codecs.getreader('utf-8')(sys.stdin) 281 reader = codecs.getreader('utf-8')(sys.stdin)
301 i = 0 282 i = 0
302 for domain in itertools.chain(reader.readlines(), additionalDomains): 283 for domain in itertools.chain(reader.readlines(), additionalDomains):
303 domain = domain.rstrip() 284 domain = domain.rstrip()
304 if not domain or domain in domains: 285 if not domain or domain in domains:
305 continue 286 continue
306 domains[domain] = i 287 domains[domain] = i
307 i += 1 288 i += 1
308 289
309 maxPriority = i 290 maxPriority = i
310 for domain in domains.iterkeys(): 291 for domain in domains.iterkeys():
311 domains[domain] = maxPriority - domains[domain] 292 domains[domain] = maxPriority - domains[domain]
312 293
313 # Extract TLDs from domain list 294 # Extract TLDs from domain list
314 for domain, priority in domains.items(): 295 for domain, priority in domains.items():
315 while True: 296 while True:
316 if not re.search(r'^[^.]+\.+', domain): 297 if not re.search(r'^[^.]+\.+', domain):
317 break 298 break
318 domain = re.sub(r'^[^.]+\.+', '', domain) 299 domain = re.sub(r'^[^.]+\.+', '', domain)
319 if not domain: 300 if not domain:
320 break 301 break
321 if not domain in domains or domains[domain] < priority - maxPriority: 302 if not domain in domains or domains[domain] < priority - maxPriority:
322 domains[domain] = priority - maxPriority 303 domains[domain] = priority - maxPriority
323 304
324 # Fill up with "official" TLDs 305 # Fill up with "official" TLDs
325 getTLDs(domains, -maxPriority) 306 getTLDs(domains, -maxPriority)
326 307
327 rules['domain'] = {} 308 rules['domain'] = domains
328 getSuffixes(rules['domain'], domains)
329 309
330 def writeRules(rules): 310 def writeRules(rules):
331 path = os.path.join('defaults', 'rules.json') 311 path = os.path.join('defaults', 'rules.json')
Thomas Greiner 2012/09/25 12:31:37 Do you really want the file to be located at /defa
Wladimir Palant 2012/09/25 13:26:00 Actually, it's defaults/rules.json (relative to th
332 file = codecs.open(path, 'rb', encoding='utf-8') 312 file = codecs.open(path, 'rb', encoding='utf-8')
333 data = file.read() 313 data = file.read()
334 file.close() 314 file.close()
335 315
336 marker = '// Automatically generated dictionaries' 316 marker = '// Automatically generated dictionaries'
337 markerIndex = data.find(marker) 317 markerIndex = data.find(marker)
338 if markerIndex < 0: 318 if markerIndex < 0:
339 raise Exception('Insertion marker not found in %s' % path) 319 raise Exception('Insertion marker not found in %s' % path)
340 data = data[0:markerIndex + len(marker)] + '\n' 320 data = data[0:markerIndex + len(marker)] + '\n'
341 data += ' ' + json.dumps(rules, ensure_ascii=False, sort_keys=True, separators = (',', ':'))[1:-1] + '\n}\n' 321 data += ' ' + json.dumps(rules, ensure_ascii=False, sort_keys=True, separators = (',', ':'))[1:-1] + '\n}\n'
342 322
343 file = codecs.open(path, 'wb', encoding='utf-8') 323 file = codecs.open(path, 'wb', encoding='utf-8')
344 file.write(data) 324 file.write(data)
345 file.close() 325 file.close()
346 326
347 def updateRules(): 327 def updateRules():
348 rules = {} 328 rules = {}
349 rules['domainReferrals'] = domainReferrals 329 rules['domainReferrals'] = domainReferrals
350 updateSchemes(rules) 330 rules['scheme'] = schemes
351 updateDomains(rules) 331 updateDomains(rules)
352 writeRules(rules) 332 writeRules(rules)
353 333
354 if __name__ == "__main__": 334 if __name__ == "__main__":
355 updateRules() 335 updateRules()
OLDNEW
« no previous file with comments | « defaults/rules.json ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld