Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: sitescripts/urlfixer/bin/topDomains.py

Issue 8943045: Implemented extraction of URL Fixer data (Closed)
Patch Set: Larger blacklist Created Nov. 28, 2012, 11:29 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « sitescripts/urlfixer/bin/forceDomains.py ('k') | sitescripts/urlfixer/schema.sql » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # coding: utf-8
2
3 # This file is part of the Adblock Plus web scripts,
4 # Copyright (C) 2006-2012 Eyeo GmbH
5 #
6 # Adblock Plus is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License version 3 as
8 # published by the Free Software Foundation.
9 #
10 # Adblock Plus is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
17
18 import sys, os, re, math, MySQLdb
19 from sitescripts.utils import get_config, setupStderr
20
21 """
22 This script produces the list of top correct domain names currently in the
23 database.
24 """
25
# Values of the `status` column in the `corrections` table, as passed to
# _get_weighted_count(). getTopDomains() treats TYPED and CORRECTION counts as
# evidence that a domain is correct, and TYPO and FALSE_POSITIVE counts as
# evidence against it.
STATUS_TYPED = 1
STATUS_TYPO = 2
STATUS_CORRECTION = 3
STATUS_FALSE_POSITIVE = 4
30
def getTopDomains(count=5000):
    """Return the best-scoring domain names plus all force-included ones.

    Every row of the `domains` table is read. Names that do not look like a
    domain are skipped, and the rest are scored with
    _calculate_correctness() from their weighted per-status counts.

    Args:
        count: maximum number of domains taken from the score ranking.

    Returns:
        A list holding the `count` highest-scoring domains (best first),
        followed by any domain flagged `forceinclusion` that is not already
        in that list. No domain appears twice.
    """
    # Compiled once instead of on every row.
    has_letter = re.compile(r"[a-zA-Z]")
    # Characters that are rejected in a domain name, plus a leading dot, a
    # trailing dot, or two consecutive dots.
    malformed = re.compile(r"['\"_,<>:;!$%&/()*+#~]|^\.|\.$|\.\.")

    db = _get_db()
    try:
        cursor = db.cursor(MySQLdb.cursors.DictCursor)
        cursor.execute("SELECT id, domain, forceinclusion FROM domains")

        domains = {}
        mandatory = []
        for result in cursor:
            domain = result["domain"]
            if ("." not in domain or not has_letter.search(domain) or
                    malformed.search(domain)):
                continue

            domain_id = result["id"]
            typed = _get_weighted_count(db, domain_id, STATUS_TYPED)
            correction = _get_weighted_count(db, domain_id, STATUS_CORRECTION)
            typo = _get_weighted_count(db, domain_id, STATUS_TYPO)
            fp = _get_weighted_count(db, domain_id, STATUS_FALSE_POSITIVE)

            # Typed and corrected-to counts speak for the domain; typo and
            # false-positive counts speak against it.
            domains[domain] = _calculate_correctness(typed + correction,
                                                     typo + fp)
            if result["forceinclusion"]:
                mandatory.append(domain)
    finally:
        # The connection is private to this call, so release it even when a
        # query fails.
        db.close()

    top = sorted(domains, key=domains.get, reverse=True)[:count]

    # Append forced domains only when the ranking has not already included
    # them; otherwise they would be listed twice.
    included = set(top)
    for domain in mandatory:
        if domain not in included:
            included.add(domain)
            top.append(domain)
    return top
54
def _get_weighted_count(db, domain, status):
    """Return the time-weighted correction count for one domain and status.

    The weighting is done in SQL: current month 0.4, previous month 0.3,
    current year 0.2, previous year 0.1.

    Args:
        db: open MySQLdb connection.
        domain: value matched against `corrections.domain`; getTopDomains()
            passes the `id` of the `domains` row here, not the domain name.
        status: one of the STATUS_* constants.

    Returns:
        The weighted count of the first matching row, or 0 when there is no
        matching row or the computed value is NULL.
    """
    cursor = db.cursor(MySQLdb.cursors.DictCursor)
    try:
        cursor.execute("""SELECT curr_month * 0.4 + prev_month * 0.3 +
                          curr_year * 0.2 + prev_year * 0.1 AS weighted_count
                          FROM corrections WHERE domain = %s AND status = %s""",
                       (domain, status))
        result = cursor.fetchone()
    finally:
        cursor.close()

    # fetchone() yields None when nothing matched. A NULL in any of the summed
    # columns makes the whole expression NULL, which would break the caller's
    # arithmetic, so treat it as "no data" as well.
    if result is None or result["weighted_count"] is None:
        return 0
    return result["weighted_count"]
66
def _calculate_correctness(positive, negative, z=1.96):
    """Return the lower bound of the Wilson score interval for a domain.

    See http://www.evanmiller.org/how-not-to-sort-by-average-rating.html

    Args:
        positive: (weighted) amount of evidence that the domain is correct.
        negative: (weighted) amount of evidence that it is not.
        z: normal quantile for the desired confidence level. The default of
            1.96 corresponds to a confidence level of 0.95.

    Returns:
        The correctness score as a float, or 0 when there is no evidence at
        all (positive + negative <= 0).
    """
    # Inputs may be ints, floats or Decimals coming from the database.
    positive = float(positive)
    negative = float(negative)
    z = float(z)

    total = positive + negative
    if total <= 0:
        return 0

    z_squared = z * z
    centre = (positive + z_squared / 2) / total
    margin = z * math.sqrt(positive * negative / total + z_squared / 4) / total
    return (centre - margin) / (1 + z_squared / total)
78
def _get_db():
    """Open a new MySQL connection using the [urlfixer] configuration section.

    Reads the `database`, `dbuser` and `dbpassword` options. The connection
    uses UTF-8 with unicode results; on Windows (os.name == "nt") it is made
    with `named_pipe=True`.

    Returns:
        The connection object returned by MySQLdb.connect().
    """
    # NOTE(review): get_config() is now called once instead of three times;
    # this assumes repeated calls return equivalent configuration.
    config = get_config()
    database = config.get("urlfixer", "database")
    dbuser = config.get("urlfixer", "dbuser")
    dbpasswd = config.get("urlfixer", "dbpassword")

    options = {
        "user": dbuser,
        "passwd": dbpasswd,
        "db": database,
        "use_unicode": True,
        "charset": "utf8",
    }
    if os.name == "nt":
        options["named_pipe"] = True
    return MySQLdb.connect(**options)
89
if __name__ == '__main__':
    setupStderr()

    # Emit one UTF-8 encoded domain per line. Writing bytes directly (to the
    # underlying binary stream where one exists) produces the same output as
    # the Python 2 print statement while remaining valid syntax on Python 3.
    output = getattr(sys.stdout, "buffer", sys.stdout)
    for domain in getTopDomains():
        output.write(domain.encode("utf-8") + b"\n")
OLDNEW
« no previous file with comments | « sitescripts/urlfixer/bin/forceDomains.py ('k') | sitescripts/urlfixer/schema.sql » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld