Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: sitescripts/urlfixer/bin/topDomains.py

Issue 29345242: Noissue - Adapt quotes for compliance with our coding style in sitescripts (Closed)
Patch Set: Fixed raw string Created May 30, 2016, 8:47 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « sitescripts/urlfixer/bin/forceDomains.py ('k') | sitescripts/urlfixer/web/submitData.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 # This file is part of the Adblock Plus web scripts, 1 # This file is part of the Adblock Plus web scripts,
2 # Copyright (C) 2006-2016 Eyeo GmbH 2 # Copyright (C) 2006-2016 Eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details. 11 # GNU General Public License for more details.
12 # 12 #
13 # You should have received a copy of the GNU General Public License 13 # You should have received a copy of the GNU General Public License
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
15 15
16 import sys 16 import sys
17 import os 17 import os
18 import re 18 import re
19 import math 19 import math
20 import MySQLdb 20 import MySQLdb
21 from sitescripts.utils import get_config, setupStderr 21 from sitescripts.utils import get_config, setupStderr
22 22
23 """ 23 '''
24 This script produces the list of top correct domain names currently in the 24 This script produces the list of top correct domain names currently in the
25 database. 25 database.
26 """ 26 '''
27 27
28 STATUS_TYPED = 1 28 STATUS_TYPED = 1
29 STATUS_TYPO = 2 29 STATUS_TYPO = 2
30 STATUS_CORRECTION = 3 30 STATUS_CORRECTION = 3
31 STATUS_FALSE_POSITIVE = 4 31 STATUS_FALSE_POSITIVE = 4
32 32
33 33
34 def getTopDomains(count=5000): 34 def getTopDomains(count=5000):
35 db = _get_db() 35 db = _get_db()
36 cursor = db.cursor(MySQLdb.cursors.DictCursor) 36 cursor = db.cursor(MySQLdb.cursors.DictCursor)
37 cursor.execute("SELECT id, domain, forceinclusion FROM domains") 37 cursor.execute('SELECT id, domain, forceinclusion FROM domains')
38 38
39 domains = {} 39 domains = {}
40 mandatory = [] 40 mandatory = []
41 for result in cursor: 41 for result in cursor:
42 domain = result["domain"] 42 domain = result['domain']
43 if "." not in domain or not re.search(r"[a-zA-Z]", domain): 43 if '.' not in domain or not re.search(r'[a-zA-Z]', domain):
44 continue 44 continue
45 if re.search(r"['\"_,<>:;!$%&/()*+#~]|^\.|\.$|\.\.", domain): 45 if re.search(r"['\"_,<>:;!$%&/()*+#~]|^\.|\.$|\.\.", domain):
Vasily Kuznetsov 2016/05/30 10:01:41 I now get A110 on this line. It seems that convert
Sebastian Noack 2016/05/30 10:27:18 Well, there is also the fourth option to fix flake
Vasily Kuznetsov 2016/05/30 12:16:34 I agree that moving the backslashes around is not
46 continue 46 continue
47 47
48 typed = _get_weighted_count(db, result["id"], STATUS_TYPED) 48 typed = _get_weighted_count(db, result['id'], STATUS_TYPED)
49 correction = _get_weighted_count(db, result["id"], STATUS_CORRECTION) 49 correction = _get_weighted_count(db, result['id'], STATUS_CORRECTION)
50 typo = _get_weighted_count(db, result["id"], STATUS_TYPO) 50 typo = _get_weighted_count(db, result['id'], STATUS_TYPO)
51 fp = _get_weighted_count(db, result["id"], STATUS_FALSE_POSITIVE) 51 fp = _get_weighted_count(db, result['id'], STATUS_FALSE_POSITIVE)
52 correctness = _calculate_correctness(typed + correction, typo + fp) 52 correctness = _calculate_correctness(typed + correction, typo + fp)
53 53
54 domains[domain] = correctness 54 domains[domain] = correctness
55 if result["forceinclusion"]: 55 if result['forceinclusion']:
56 mandatory.append(domain) 56 mandatory.append(domain)
57 return sorted(domains.iterkeys(), key=lambda d: domains[d], reverse=True)[:count] + mandatory 57 return sorted(domains.iterkeys(), key=lambda d: domains[d], reverse=True)[:count] + mandatory
58 58
59 59
60 def _get_weighted_count(db, domain, status): 60 def _get_weighted_count(db, domain, status):
61 cursor = db.cursor(MySQLdb.cursors.DictCursor) 61 cursor = db.cursor(MySQLdb.cursors.DictCursor)
62 cursor.execute("""SELECT curr_month * 0.4 + prev_month * 0.3 + 62 cursor.execute('''SELECT curr_month * 0.4 + prev_month * 0.3 +
63 curr_year * 0.2 + prev_year * 0.1 AS weighted_count 63 curr_year * 0.2 + prev_year * 0.1 AS weighted_count
64 FROM corrections WHERE domain = %s AND status = %s""", 64 FROM corrections WHERE domain = %s AND status = %s''',
65 (domain, status)) 65 (domain, status))
66 result = cursor.fetchone() 66 result = cursor.fetchone()
67 if result == None: 67 if result == None:
68 return 0 68 return 0
69 else: 69 else:
70 return result["weighted_count"] 70 return result['weighted_count']
71 71
72 72
73 def _calculate_correctness(positive, negative): 73 def _calculate_correctness(positive, negative):
74 if positive + negative > 0: 74 if positive + negative > 0:
75 # Determine the correctness score with a confidence level of 0.95 75 # Determine the correctness score with a confidence level of 0.95
76 # (see http://www.evanmiller.org/how-not-to-sort-by-average-rating.html) 76 # (see http://www.evanmiller.org/how-not-to-sort-by-average-rating.html)
77 fp = float(positive) 77 fp = float(positive)
78 fn = float(negative) 78 fn = float(negative)
79 total = fp + fn 79 total = fp + fn
80 return ((fp + 1.9208) / total - 1.96 * math.sqrt((fp * fn) / total + 80 return ((fp + 1.9208) / total - 1.96 * math.sqrt((fp * fn) / total +
81 0.9604) / total) / (1 + 3.8416 / total) 81 0.9604) / total) / (1 + 3.8416 / total)
82 else: 82 else:
83 return 0 83 return 0
84 84
85 85
86 def _get_db(): 86 def _get_db():
87 database = get_config().get("urlfixer", "database") 87 database = get_config().get('urlfixer', 'database')
88 dbuser = get_config().get("urlfixer", "dbuser") 88 dbuser = get_config().get('urlfixer', 'dbuser')
89 dbpasswd = get_config().get("urlfixer", "dbpassword") 89 dbpasswd = get_config().get('urlfixer', 'dbpassword')
90 if os.name == "nt": 90 if os.name == 'nt':
91 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, 91 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
92 use_unicode=True, charset="utf8", named_pipe=True) 92 use_unicode=True, charset='utf8', named_pipe=True)
93 else: 93 else:
94 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, 94 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
95 use_unicode=True, charset="utf8") 95 use_unicode=True, charset='utf8')
96 96
97 if __name__ == '__main__': 97 if __name__ == '__main__':
98 setupStderr() 98 setupStderr()
99 99
100 domains = getTopDomains() 100 domains = getTopDomains()
101 for domain in domains: 101 for domain in domains:
102 print domain.encode("utf-8") 102 print domain.encode('utf-8')
OLD | NEW
« no previous file with comments | « sitescripts/urlfixer/bin/forceDomains.py ('k') | sitescripts/urlfixer/web/submitData.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld