Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: sitescripts/urlfixer/bin/topDomains.py

Issue 8943045: Implemented extraction of URL Fixer data (Closed)
Left Patch Set: Created Nov. 23, 2012, 4:36 p.m.
Right Patch Set: Larger blacklist Created Nov. 28, 2012, 11:29 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « sitescripts/urlfixer/bin/forceDomains.py ('k') | sitescripts/urlfixer/schema.sql » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFT | RIGHT
1 # coding: utf-8 1 # coding: utf-8
2 2
3 # This file is part of the Adblock Plus web scripts, 3 # This file is part of the Adblock Plus web scripts,
4 # Copyright (C) 2006-2012 Eyeo GmbH 4 # Copyright (C) 2006-2012 Eyeo GmbH
5 # 5 #
6 # Adblock Plus is free software: you can redistribute it and/or modify 6 # Adblock Plus is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License version 3 as 7 # it under the terms of the GNU General Public License version 3 as
8 # published by the Free Software Foundation. 8 # published by the Free Software Foundation.
9 # 9 #
10 # Adblock Plus is distributed in the hope that it will be useful, 10 # Adblock Plus is distributed in the hope that it will be useful,
(...skipping 10 matching lines...) Expand all
21 """ 21 """
22 This script produces the list of top correct domain names currently in the 22 This script produces the list of top correct domain names currently in the
23 database. 23 database.
24 """ 24 """
25 25
26 STATUS_TYPED = 1 26 STATUS_TYPED = 1
27 STATUS_TYPO = 2 27 STATUS_TYPO = 2
28 STATUS_CORRECTION = 3 28 STATUS_CORRECTION = 3
29 STATUS_FALSE_POSITIVE = 4 29 STATUS_FALSE_POSITIVE = 4
30 30
31 def getTopDomains(count=1000): 31 def getTopDomains(count=5000):
Wladimir Palant 2012/11/28 10:49:24 Note that this is count=5000 on the server current
Thomas Greiner 2012/11/28 11:17:39 Ok.
32 db = _get_db() 32 db = _get_db()
33 cursor = db.cursor(MySQLdb.cursors.DictCursor) 33 cursor = db.cursor(MySQLdb.cursors.DictCursor)
34 cursor.execute("SELECT id, domain, forceinclusion FROM domains") 34 cursor.execute("SELECT id, domain, forceinclusion FROM domains")
35 35
36 domains = {} 36 domains = {}
37 mandatory = [] 37 mandatory = []
38 for result in cursor: 38 for result in cursor:
39 domain = result["domain"] 39 domain = result["domain"]
40 if ("." not in domain or not re.search(r"[a-zA-Z]", domain) or 40 if ("." not in domain or not re.search(r"[a-zA-Z]", domain) or
41 re.search(r"['\"_,<>;]|^\.|\.$|\.\.", domain)): 41 re.search(r"['\"_,<>:;!$%&/()*+#~]|^\.|\.$|\.\.", domain)):
Thomas Greiner 2012/11/28 10:05:42 That's a small selection of special characters. It
Wladimir Palant 2012/11/28 10:49:24 What about international domain names? I explicitl
Thomas Greiner 2012/11/28 11:17:39 Ok. For now I think it would be sufficient to chec
42 continue 42 continue
43 43
44 typed = _get_weighted_count(db, result["id"], STATUS_TYPED) 44 typed = _get_weighted_count(db, result["id"], STATUS_TYPED)
45 correction = _get_weighted_count(db, result["id"], STATUS_CORRECTION) 45 correction = _get_weighted_count(db, result["id"], STATUS_CORRECTION)
46 typo = _get_weighted_count(db, result["id"], STATUS_TYPO) 46 typo = _get_weighted_count(db, result["id"], STATUS_TYPO)
47 fp = _get_weighted_count(db, result["id"], STATUS_FALSE_POSITIVE) 47 fp = _get_weighted_count(db, result["id"], STATUS_FALSE_POSITIVE)
48 correctness = _calculate_correctness(typed + correction, typo + fp) 48 correctness = _calculate_correctness(typed + correction, typo + fp)
49 49
50 domains[domain] = correctness 50 domains[domain] = correctness
51 if result["forceinclusion"]: 51 if result["forceinclusion"]:
52 mandatory.append(domain) 52 mandatory.append(domain)
53 return sorted(domains.iterkeys(), key=lambda d: domains[d], reverse=True)[:count] + mandatory 53 return sorted(domains.iterkeys(), key=lambda d: domains[d], reverse=True)[:count] + mandatory
54 54
55 def _get_weighted_count(db, domain, status): 55 def _get_weighted_count(db, domain, status):
56 cursor = db.cursor(MySQLdb.cursors.DictCursor) 56 cursor = db.cursor(MySQLdb.cursors.DictCursor)
57 cursor.execute("""SELECT curr_month * 0.4 + prev_month * 0.3 + 57 cursor.execute("""SELECT curr_month * 0.4 + prev_month * 0.3 +
58 curr_year * 0.2 + prev_year * 0.1 AS result 58 curr_year * 0.2 + prev_year * 0.1 AS weighted_count
Thomas Greiner 2012/11/28 10:05:42 |result| is not very descriptive here. Please rena
59 FROM corrections WHERE domain = %s AND status = %s""", 59 FROM corrections WHERE domain = %s AND status = %s""",
60 (domain, status)) 60 (domain, status))
61 result = cursor.fetchone() 61 result = cursor.fetchone()
62 if result == None: 62 if result == None:
63 return 0 63 return 0
64 else: 64 else:
65 return result["result"] 65 return result["weighted_count"]
Thomas Greiner 2012/11/28 10:05:42 |result["result"]| is not very descriptive here. P
66 66
67 def _calculate_correctness(positive, negative): 67 def _calculate_correctness(positive, negative):
68 if positive + negative > 0: 68 if positive + negative > 0:
69 # Determine the correctness score with a confidence level of 0.95 69 # Determine the correctness score with a confidence level of 0.95
70 # (see http://www.evanmiller.org/how-not-to-sort-by-average-rating.html) 70 # (see http://www.evanmiller.org/how-not-to-sort-by-average-rating.html)
71 fp = float(positive) 71 fp = float(positive)
72 fn = float(negative) 72 fn = float(negative)
73 total = fp + fn 73 total = fp + fn
74 return ((fp + 1.9208) / total - 1.96 * math.sqrt((fp * fn) / total + 74 return ((fp + 1.9208) / total - 1.96 * math.sqrt((fp * fn) / total +
75 0.9604) / total) / (1 + 3.8416 / total) 75 0.9604) / total) / (1 + 3.8416 / total)
(...skipping 10 matching lines...) Expand all
86 else: 86 else:
87 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, 87 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
88 use_unicode=True, charset="utf8") 88 use_unicode=True, charset="utf8")
89 89
90 if __name__ == '__main__': 90 if __name__ == '__main__':
91 setupStderr() 91 setupStderr()
92 92
93 domains = getTopDomains() 93 domains = getTopDomains()
94 for domain in domains: 94 for domain in domains:
95 print domain.encode("utf-8") 95 print domain.encode("utf-8")
LEFT | RIGHT

Powered by Google App Engine
This is Rietveld