Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: sitescripts/urlfixer/web/submitData.py

Issue 8483155: Added typo data collection (/submitData) (Closed)
Left Patch Set: Created Oct. 9, 2012, 10:58 a.m.
Right Patch Set: Created Oct. 9, 2012, 2:27 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « sitescripts/urlfixer/web/__init__.py ('k') | sitescripts/web.py » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFTRIGHT
1 # coding: utf-8 1 # coding: utf-8
2 2
3 # This Source Code is subject to the terms of the Mozilla Public License 3 # This Source Code is subject to the terms of the Mozilla Public License
4 # version 2.0 (the "License"). You can obtain a copy of the License at 4 # version 2.0 (the "License"). You can obtain a copy of the License at
5 # http://mozilla.org/MPL/2.0/. 5 # http://mozilla.org/MPL/2.0/.
6 6
7 # import MySQLdb, os, simplejson, sys, re
8 # from sitescripts.utils import cached, get_config
9 # from sitescripts.web import url_handler, basic_auth
Wladimir Palant 2012/10/09 11:51:14 Any reason why the lines above are required? ;)
10
11 import os, MySQLdb, simplejson as json 7 import os, MySQLdb, simplejson as json
12 from urlparse import parse_qs 8 from urlparse import parse_qs
13 from math import sqrt
Wladimir Palant 2012/10/09 11:51:14 I would prefer to just import math and use the fun
14 from sitescripts.web import url_handler 9 from sitescripts.web import url_handler
15 from sitescripts.utils import get_config, setupStderr 10 from sitescripts.utils import cached, get_config, setupStderr
16
17 STATUS_TYPED = 1
18 STATUS_TYPO = 2
19 STATUS_CORRECTION = 3
20 STATUS_FALSE_POSITIVE = 4
21 11
22 @url_handler("/submitData") 12 @url_handler("/submitData")
23 def submit_data(environ, start_response): 13 def submit_data(environ, start_response):
24 setupStderr(environ['wsgi.errors']) 14 setupStderr(environ['wsgi.errors'])
25 15
26 if environ['REQUEST_METHOD'].upper() != 'POST': 16 if environ["REQUEST_METHOD"].upper() != "POST":
27 return showError('Unsupported request method', start_response) 17 return showError("Unsupported request method", start_response)
28
29 output = ""
30 18
31 params = parse_qs(environ.get("QUERY_STRING", "")) 19 params = parse_qs(environ.get("QUERY_STRING", ""))
32 requestVersion = params.get("version", ["0"])[0] 20 requestVersion = params.get("version", ["0"])[0]
33 data = "{}" 21 data = "{}"
34 try: 22 try:
35 data_length = int(environ.get("CONTENT_LENGTH", "0")) 23 data_length = int(environ.get("CONTENT_LENGTH", "0"))
36 except ValueError: 24 except ValueError:
37 data_length = 0 25 data_length = 0
38 if data_length != 0: 26 if data_length != 0:
39 data = environ["wsgi.input"].read(data_length) 27 data = environ["wsgi.input"].read(data_length)
40 data = json.loads(data, ["{}"]) 28 try:
Wladimir Palant 2012/10/09 11:51:14 Not sure I understand the significance of the seco
29 data = json.loads(data)
30 except json.decoder.JSONDecodeError:
31 return showError("Error while parsing JSON data.", start_response)
41 32
42 try: 33 db = _get_db()
43 database = get_config().get("urlfixer", "database")
44 dbuser = get_config().get("urlfixer", "dbuser")
45 dbpasswd = get_config().get("urlfixer", "dbpassword")
46 except Exception:
47 return showError("Error while reading config.", start_response)
Wladimir Palant 2012/10/09 11:51:14 That's a server configuration issue, not something
48 34
49 if os.name == "nt": 35 for domain, status in data.iteritems():
50 db = MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, 36 process_domain(db, domain, status)
51 use_unicode=True, charset="utf8", named_pipe=True)
52 else:
53 db = MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
54 use_unicode=True, charset="utf8")
Wladimir Palant 2012/10/09 11:51:14 I suggest caching the database connection similarl
55
56 for domain in data:
Wladimir Palant 2012/10/09 11:51:14 I suggest: for domain, status in data.iteritems
57 process_domain(db, domain, data[domain])
58 37
59 db.commit() 38 db.commit()
60 db.close()
Wladimir Palant 2012/10/09 11:51:14 Database shouldn't be closed if you share the data
61 39
62 response_headers = [('Content-type', 'text/plain'), 40 response_headers = [("Content-type", "text/plain")]
63 ('Content-Length', str(len(output)))]
Wladimir Palant 2012/10/09 11:51:14 Output is empty and this won't change (nobody can
64 start_response("200 OK", response_headers) 41 start_response("200 OK", response_headers)
65 return [output] 42 return []
66 43
67 def process_domain(db, domain, status): 44 def process_domain(db, domain, status):
68 domain_id = _get_domain_id(db, domain) 45 domain_id = _get_domain_id(db, domain)
69
70 _increment_entry(db, domain_id, status) 46 _increment_entry(db, domain_id, status)
71
72 typed = _get_entry(db, domain_id, STATUS_TYPED)
73 typo = _get_entry(db, domain_id, STATUS_TYPO)
74 correction = _get_entry(db, domain_id, STATUS_CORRECTION)
75 false_positive = _get_entry(db, domain_id, STATUS_FALSE_POSITIVE)
76
77 # Determine the occurences of the domain being correct as is
78 typed_count = get_weighted_count(typed)
79 # Determine the occurences of the domain as a typo
80 typo_count = get_weighted_count(typo)
81 # Determine the occurences of the domain as a correction
82 correction_count = get_weighted_count(correction)
83 # Determine the occurences of the domain as a false positive
84 false_positive_count = get_weighted_count(false_positive)
85
86 # Determine the correctness of the domain and of the corrections (with a confi dence level of 0.95)
87 # http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
88 domain_correctness_score = calculateCorrectness(correction_count + typed_count , false_positive_count + typo_count)
89 correction_correctness_score = calculateCorrectness(correction_count, false_po sitive_count)
90
91 _update_scores(db, domain_id, domain_correctness_score, correction_correctness _score)
92
93 def calculateCorrectness(positive, negative):
94 score = 0
95
96 if positive + negative > 0:
97 score = ((positive + 1.9208) / (positive + negative) -
98 1.96 * sqrt((positive * negative) / (positive + negative) + 0.9604) /
99 (positive + negative)) / (1 + 3.8416 / (positive + negative))
100
101 return score
102
103 def get_weighted_count(result):
104 if result == None:
105 return 0
106
107 return result["curr_month"] * 0.4 + result["prev_month"] * 0.3 + result["curr_ year"] * 0.2 + result["prev_year"] * 0.1
108 47
109 def showError(message, start_response): 48 def showError(message, start_response):
110 start_response('400 Processing Error', [('Content-Type', 'text/plain; charset= utf-8')]) 49 start_response("400 Processing Error", [("Content-Type", "text/plain; charset= utf-8")])
111 return [message.encode('utf-8')] 50 return [message.encode("utf-8")]
51
52 @cached(600)
53 def _get_db():
54 database = get_config().get("urlfixer", "database")
55 dbuser = get_config().get("urlfixer", "dbuser")
56 dbpasswd = get_config().get("urlfixer", "dbpassword")
57 if os.name == "nt":
58 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
59 use_unicode=True, charset="utf8", named_pipe=True)
60 else:
61 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
62 use_unicode=True, charset="utf8")
112 63
113 def _get_domain_id(db, domain): 64 def _get_domain_id(db, domain):
114 cursor = db.cursor(MySQLdb.cursors.DictCursor) 65 cursor = db.cursor(MySQLdb.cursors.DictCursor)
115 cursor.execute("SELECT id FROM domains WHERE domain='%s'" % (domain)) 66 cursor.execute("SELECT id FROM domains WHERE domain = %s", (domain))
Wladimir Palant 2012/10/09 11:51:14 Please don't ever generate queries dynamically, us
Thomas Greiner 2012/10/09 13:47:02 Actually, I thought that this was the syntax for p
116 result = cursor.fetchone() 67 result = cursor.fetchone()
117 if result == None: 68 if result == None:
118 cursor.execute("INSERT INTO domains(domain) VALUES ('%s')" % (domain)) 69 cursor.execute("INSERT INTO domains(domain) VALUES (%s)", (domain))
119 return _get_domain_id(db, domain) 70 return db.insert_id()
Wladimir Palant 2012/10/09 11:51:14 Should be: return db.insert_id()
120 else: 71 else:
121 return result["id"] 72 return result["id"]
122
123 def _get_entry(db, domain_id, status):
124 cursor = db.cursor(MySQLdb.cursors.DictCursor)
125 cursor.execute("SELECT * FROM corrections WHERE domain=%d AND status=%d" % (do main_id, status))
126 return cursor.fetchone()
127 73
128 def _increment_entry(db, domain_id, status): 74 def _increment_entry(db, domain_id, status):
129 cursor = db.cursor(MySQLdb.cursors.DictCursor) 75 cursor = db.cursor(MySQLdb.cursors.DictCursor)
130 cursor.execute("SELECT COUNT(*) AS count FROM corrections WHERE domain=%d AND status=%d" % (domain_id, status)) 76 cursor.execute("INSERT INTO corrections(domain, status, curr_month, prev_month , curr_year, prev_year) VALUES (%s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE curr_month = curr_month + 1, curr_year = curr_year + 1", (domain_id, status, 1, 0, 1, 0))
131
132 if cursor.fetchone()["count"] < 1:
133 # Create and increment current period
134 cursor.execute("INSERT INTO corrections(domain, status, curr_month, prev_mon th, curr_year, prev_year) VALUES (%d, %d, %d, %d, %d, %d)" % (domain_id, status, 1, 0, 1, 0))
135 else:
136 # Increment current period
137 cursor.execute("UPDATE corrections SET curr_month=curr_month+1, curr_year=cu rr_year+1 WHERE domain=%d AND status=%d" % (domain_id, status))
Wladimir Palant 2012/10/09 11:51:14 http://dev.mysql.com/doc/refman/5.0/en/insert-on-d
Thomas Greiner 2012/10/09 13:47:02 Thanks for the tip!
138
139 def _update_scores(db, domain_id, domain_correctness, correction_correctness):
140 cursor = db.cursor()
141 cursor.execute("UPDATE domains SET domain_correct=%f, correction_correct=%f WH ERE id=%d" % (domain_correctness, correction_correctness, domain_id))
LEFTRIGHT

Powered by Google App Engine
This is Rietveld