Left: | ||
Right: |
OLD | NEW |
---|---|
(Empty) | |
1 # coding: utf-8 | |
2 | |
3 # This file is part of the Adblock Plus web scripts, | |
4 # Copyright (C) 2006-2012 Eyeo GmbH | |
5 # | |
6 # Adblock Plus is free software: you can redistribute it and/or modify | |
7 # it under the terms of the GNU General Public License version 3 as | |
8 # published by the Free Software Foundation. | |
9 # | |
10 # Adblock Plus is distributed in the hope that it will be useful, | |
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 # GNU General Public License for more details. | |
14 # | |
15 # You should have received a copy of the GNU General Public License | |
16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | |
17 | |
18 import sys, os, re, math, MySQLdb | |
19 from sitescripts.utils import get_config, setupStderr | |
20 | |
21 """ | |
22 This script produces the list of top correct domain names currently in the | |
23 database. | |
24 """ | |
25 | |
26 STATUS_TYPED = 1 | |
27 STATUS_TYPO = 2 | |
28 STATUS_CORRECTION = 3 | |
29 STATUS_FALSE_POSITIVE = 4 | |
30 | |
def getTopDomains(count=1000):
    """Return the best-scoring domain names plus all force-included ones.

    Every row of the ``domains`` table whose name passes a plausibility
    check is scored with ``_calculate_correctness``: its weighted "typed"
    and "correction" counts are the positive votes, its weighted "typo" and
    "false positive" counts the negative ones.

    :param count: maximum number of domains taken from the score ranking.
    :returns: list holding the ``count`` best-scoring domains in descending
        score order, followed by every domain flagged ``forceinclusion``
        that is not already part of that ranking.
    """
    # NOTE(review): according to the code review the server currently runs
    # this with count=5000; the default is kept so callers see no change.
    scores = {}
    mandatory = []

    db = _get_db()
    try:
        cursor = db.cursor(MySQLdb.cursors.DictCursor)
        try:
            cursor.execute("SELECT id, domain, forceinclusion FROM domains")
            for row in cursor:
                domain = row["domain"]
                # Skip entries that cannot be host names: no dot, no letter
                # at all, a listed special character, or a leading,
                # trailing or doubled dot.
                # NOTE(review): the character blacklist is intentionally
                # narrow, apparently so that international domain names
                # are not rejected -- confirm before tightening it.
                if ("." not in domain or
                        not re.search(r"[a-zA-Z]", domain) or
                        re.search(r"['\"_,<>;]|^\.|\.$|\.\.", domain)):
                    continue

                domain_id = row["id"]
                typed = _get_weighted_count(db, domain_id, STATUS_TYPED)
                correction = _get_weighted_count(db, domain_id,
                                                 STATUS_CORRECTION)
                typo = _get_weighted_count(db, domain_id, STATUS_TYPO)
                false_positive = _get_weighted_count(db, domain_id,
                                                     STATUS_FALSE_POSITIVE)

                scores[domain] = _calculate_correctness(
                    typed + correction, typo + false_positive)
                if row["forceinclusion"]:
                    mandatory.append(domain)
        finally:
            cursor.close()
    finally:
        # The connection is only needed while scoring; release it even if
        # one of the queries fails.
        db.close()

    top_domains = sorted(scores, key=scores.get, reverse=True)[:count]

    # Append force-included domains, but never emit a domain twice: a
    # mandatory domain may already have made it into the ranking.
    seen = set(top_domains)
    for domain in mandatory:
        if domain not in seen:
            seen.add(domain)
            top_domains.append(domain)
    return top_domains
54 | |
def _get_weighted_count(db, domain, status):
    """Return the time-weighted report count for one domain and status.

    The count is read from the ``corrections`` table and weights the
    current month with 0.4, the previous month with 0.3, the current year
    with 0.2 and the previous year with 0.1.

    :param db: open database connection used to create the cursor.
    :param domain: value matched against ``corrections.domain``; the caller
        in this file passes the ``id`` of a ``domains`` row.
    :param status: status value to match; the caller in this file passes
        the ``STATUS_*`` constants.
    :returns: the weighted count of the first matching row, or 0 when no
        row matches.
    """
    cursor = db.cursor(MySQLdb.cursors.DictCursor)
    try:
        cursor.execute(
            """SELECT curr_month * 0.4 + prev_month * 0.3 +
                      curr_year * 0.2 + prev_year * 0.1 AS weighted_count
               FROM corrections WHERE domain = %s AND status = %s""",
            (domain, status))
        row = cursor.fetchone()
    finally:
        # This helper runs several times per domain, so do not leave the
        # cursor for the garbage collector to clean up.
        cursor.close()

    if row is None:
        return 0
    return row["weighted_count"]
66 | |
67 def _calculate_correctness(positive, negative): | |
68 if positive + negative > 0: | |
69 # Determine the correctness score with a confidence level of 0.95 | |
70 # (see http://www.evanmiller.org/how-not-to-sort-by-average-rating.html) | |
71 fp = float(positive) | |
72 fn = float(negative) | |
73 total = fp + fn | |
74 return ((fp + 1.9208) / total - 1.96 * math.sqrt((fp * fn) / total + | |
75 0.9604) / total) / (1 + 3.8416 / total) | |
76 else: | |
77 return 0 | |
78 | |
def _get_db():
    """Open a MySQL connection using the ``urlfixer`` config section.

    Database name, user and password are read from the ``database``,
    ``dbuser`` and ``dbpassword`` options. The connection uses unicode
    strings with the utf8 charset; on Windows (``os.name == "nt"``) it is
    additionally requested with ``named_pipe=True``.

    :returns: the connection object returned by ``MySQLdb.connect``.
    """
    connect_args = {
        "db": get_config().get("urlfixer", "database"),
        "user": get_config().get("urlfixer", "dbuser"),
        "passwd": get_config().get("urlfixer", "dbpassword"),
        "use_unicode": True,
        "charset": "utf8",
    }
    if os.name == "nt":
        connect_args["named_pipe"] = True
    return MySQLdb.connect(**connect_args)
89 | |
# Script entry point: write the selected domains to stdout, one per line,
# encoded as UTF-8.
if __name__ == '__main__':
    setupStderr()

    for top_domain in getTopDomains():
        print(top_domain.encode("utf-8"))
OLD | NEW |