Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: sitescripts/urlfixer/web/submitData.py

Issue 8483155: Added typo data collection (/submitData) (Closed)
Patch Set: Created Oct. 9, 2012, 10:58 a.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: sitescripts/urlfixer/web/submitData.py
===================================================================
new file mode 100644
--- /dev/null
+++ b/sitescripts/urlfixer/web/submitData.py
@@ -0,0 +1,141 @@
+# coding: utf-8
+
+# This Source Code is subject to the terms of the Mozilla Public License
+# version 2.0 (the "License"). You can obtain a copy of the License at
+# http://mozilla.org/MPL/2.0/.
+
+# import MySQLdb, os, simplejson, sys, re
+# from sitescripts.utils import cached, get_config
+# from sitescripts.web import url_handler, basic_auth
Wladimir Palant 2012/10/09 11:51:14 Any reason why the lines above are required? ;)
+
+import os, MySQLdb, simplejson as json
+from urlparse import parse_qs
+from math import sqrt
Wladimir Palant 2012/10/09 11:51:14 I would prefer to just import math and use the fun
+from sitescripts.web import url_handler
+from sitescripts.utils import get_config, setupStderr
+
+STATUS_TYPED = 1
+STATUS_TYPO = 2
+STATUS_CORRECTION = 3
+STATUS_FALSE_POSITIVE = 4
+
+@url_handler("/submitData")
+def submit_data(environ, start_response):
+ setupStderr(environ['wsgi.errors'])
+
+ if environ['REQUEST_METHOD'].upper() != 'POST':
+ return showError('Unsupported request method', start_response)
+
+ output = ""
+
+ params = parse_qs(environ.get("QUERY_STRING", ""))
+ requestVersion = params.get("version", ["0"])[0]
+ data = "{}"
+ try:
+ data_length = int(environ.get("CONTENT_LENGTH", "0"))
+ except ValueError:
+ data_length = 0
+ if data_length != 0:
+ data = environ["wsgi.input"].read(data_length)
+ data = json.loads(data, ["{}"])
Wladimir Palant 2012/10/09 11:51:14 Not sure I understand the significance of the seco
+
+ try:
+ database = get_config().get("urlfixer", "database")
+ dbuser = get_config().get("urlfixer", "dbuser")
+ dbpasswd = get_config().get("urlfixer", "dbpassword")
+ except Exception:
+ return showError("Error while reading config.", start_response)
Wladimir Palant 2012/10/09 11:51:14 That's a server configuration issue, not something
+
+ if os.name == "nt":
+ db = MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
+ use_unicode=True, charset="utf8", named_pipe=True)
+ else:
+ db = MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
+ use_unicode=True, charset="utf8")
Wladimir Palant 2012/10/09 11:51:14 I suggest caching the database connection similarl
+
+ for domain in data:
Wladimir Palant 2012/10/09 11:51:14 I suggest: for domain, status in data.iteritems
+ process_domain(db, domain, data[domain])
+
+ db.commit()
+ db.close()
Wladimir Palant 2012/10/09 11:51:14 Database shouldn't be closed if you share the data
+
+ response_headers = [('Content-type', 'text/plain'),
+ ('Content-Length', str(len(output)))]
Wladimir Palant 2012/10/09 11:51:14 Output is empty and this won't change (nobody can
+ start_response("200 OK", response_headers)
+ return [output]
+
+def process_domain(db, domain, status):
+ domain_id = _get_domain_id(db, domain)
+
+ _increment_entry(db, domain_id, status)
+
+ typed = _get_entry(db, domain_id, STATUS_TYPED)
+ typo = _get_entry(db, domain_id, STATUS_TYPO)
+ correction = _get_entry(db, domain_id, STATUS_CORRECTION)
+ false_positive = _get_entry(db, domain_id, STATUS_FALSE_POSITIVE)
+
+ # Determine the occurences of the domain being correct as is
+ typed_count = get_weighted_count(typed)
+ # Determine the occurences of the domain as a typo
+ typo_count = get_weighted_count(typo)
+ # Determine the occurences of the domain as a correction
+ correction_count = get_weighted_count(correction)
+ # Determine the occurences of the domain as a false positive
+ false_positive_count = get_weighted_count(false_positive)
+
+ # Determine the correctness of the domain and of the corrections (with a confidence level of 0.95)
+ # http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
+ domain_correctness_score = calculateCorrectness(correction_count + typed_count, false_positive_count + typo_count)
+ correction_correctness_score = calculateCorrectness(correction_count, false_positive_count)
+
+ _update_scores(db, domain_id, domain_correctness_score, correction_correctness_score)
+
+def calculateCorrectness(positive, negative):
+ score = 0
+
+ if positive + negative > 0:
+ score = ((positive + 1.9208) / (positive + negative) -
+ 1.96 * sqrt((positive * negative) / (positive + negative) + 0.9604) /
+ (positive + negative)) / (1 + 3.8416 / (positive + negative))
+
+ return score
+
+def get_weighted_count(result):
+ if result == None:
+ return 0
+
+ return result["curr_month"] * 0.4 + result["prev_month"] * 0.3 + result["curr_year"] * 0.2 + result["prev_year"] * 0.1
+
+def showError(message, start_response):
+ start_response('400 Processing Error', [('Content-Type', 'text/plain; charset=utf-8')])
+ return [message.encode('utf-8')]
+
+def _get_domain_id(db, domain):
+ cursor = db.cursor(MySQLdb.cursors.DictCursor)
+ cursor.execute("SELECT id FROM domains WHERE domain='%s'" % (domain))
Wladimir Palant 2012/10/09 11:51:14 Please don't ever generate queries dynamically, us
Thomas Greiner 2012/10/09 13:47:02 Actually, I thought that this was the syntax for p
+ result = cursor.fetchone()
+ if result == None:
+ cursor.execute("INSERT INTO domains(domain) VALUES ('%s')" % (domain))
+ return _get_domain_id(db, domain)
Wladimir Palant 2012/10/09 11:51:14 Should be: return db.insert_id()
+ else:
+ return result["id"]
+
+def _get_entry(db, domain_id, status):
+ cursor = db.cursor(MySQLdb.cursors.DictCursor)
+ cursor.execute("SELECT * FROM corrections WHERE domain=%d AND status=%d" % (domain_id, status))
+ return cursor.fetchone()
+
+def _increment_entry(db, domain_id, status):
+ cursor = db.cursor(MySQLdb.cursors.DictCursor)
+ cursor.execute("SELECT COUNT(*) AS count FROM corrections WHERE domain=%d AND status=%d" % (domain_id, status))
+
+ if cursor.fetchone()["count"] < 1:
+ # Create and increment current period
+ cursor.execute("INSERT INTO corrections(domain, status, curr_month, prev_month, curr_year, prev_year) VALUES (%d, %d, %d, %d, %d, %d)" % (domain_id, status, 1, 0, 1, 0))
+ else:
+ # Increment current period
+ cursor.execute("UPDATE corrections SET curr_month=curr_month+1, curr_year=curr_year+1 WHERE domain=%d AND status=%d" % (domain_id, status))
Wladimir Palant 2012/10/09 11:51:14 http://dev.mysql.com/doc/refman/5.0/en/insert-on-d
Thomas Greiner 2012/10/09 13:47:02 Thanks for the tip!
+
+def _update_scores(db, domain_id, domain_correctness, correction_correctness):
+ cursor = db.cursor()
+ cursor.execute("UPDATE domains SET domain_correct=%f, correction_correct=%f WHERE id=%d" % (domain_correctness, correction_correctness, domain_id))

Powered by Google App Engine
This is Rietveld