 Issue 8483155:
  Added typo data collection (/submitData)  (Closed)
    
  
    Issue 8483155:
  Added typo data collection (/submitData)  (Closed) 
  | Left: | ||
| Right: | 
| OLD | NEW | 
|---|---|
| (Empty) | |
| 1 # coding: utf-8 | |
| 2 | |
| 3 # This Source Code is subject to the terms of the Mozilla Public License | |
| 4 # version 2.0 (the "License"). You can obtain a copy of the License at | |
| 5 # http://mozilla.org/MPL/2.0/. | |
| 6 | |
| 7 # import MySQLdb, os, simplejson, sys, re | |
| 8 # from sitescripts.utils import cached, get_config | |
| 9 # from sitescripts.web import url_handler, basic_auth | |
| 
Wladimir Palant
2012/10/09 11:51:14
Any reason why the lines above are required? ;)
 | |
| 10 | |
| 11 import os, MySQLdb, simplejson as json | |
| 12 from urlparse import parse_qs | |
| 13 from math import sqrt | |
| 
Wladimir Palant
2012/10/09 11:51:14
I would prefer to just import math and use the fun
 | |
| 14 from sitescripts.web import url_handler | |
| 15 from sitescripts.utils import get_config, setupStderr | |
| 16 | |
| 17 STATUS_TYPED = 1 | |
| 18 STATUS_TYPO = 2 | |
| 19 STATUS_CORRECTION = 3 | |
| 20 STATUS_FALSE_POSITIVE = 4 | |
| 21 | |
| 22 @url_handler("/submitData") | |
| 23 def submit_data(environ, start_response): | |
| 24 setupStderr(environ['wsgi.errors']) | |
| 25 | |
| 26 if environ['REQUEST_METHOD'].upper() != 'POST': | |
| 27 return showError('Unsupported request method', start_response) | |
| 28 | |
| 29 output = "" | |
| 30 | |
| 31 params = parse_qs(environ.get("QUERY_STRING", "")) | |
| 32 requestVersion = params.get("version", ["0"])[0] | |
| 33 data = "{}" | |
| 34 try: | |
| 35 data_length = int(environ.get("CONTENT_LENGTH", "0")) | |
| 36 except ValueError: | |
| 37 data_length = 0 | |
| 38 if data_length != 0: | |
| 39 data = environ["wsgi.input"].read(data_length) | |
| 40 data = json.loads(data, ["{}"]) | |
| 
Wladimir Palant
2012/10/09 11:51:14
Not sure I understand the significance of the seco
 | |
| 41 | |
| 42 try: | |
| 43 database = get_config().get("urlfixer", "database") | |
| 44 dbuser = get_config().get("urlfixer", "dbuser") | |
| 45 dbpasswd = get_config().get("urlfixer", "dbpassword") | |
| 46 except Exception: | |
| 47 return showError("Error while reading config.", start_response) | |
| 
Wladimir Palant
2012/10/09 11:51:14
That's a server configuration issue, not something
 | |
| 48 | |
| 49 if os.name == "nt": | |
| 50 db = MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, | |
| 51 use_unicode=True, charset="utf8", named_pipe=True) | |
| 52 else: | |
| 53 db = MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, | |
| 54 use_unicode=True, charset="utf8") | |
| 
Wladimir Palant
2012/10/09 11:51:14
I suggest caching the database connection similarl
 | |
| 55 | |
| 56 for domain in data: | |
| 
Wladimir Palant
2012/10/09 11:51:14
I suggest:
  for domain, status in data.iteritems
 | |
| 57 process_domain(db, domain, data[domain]) | |
| 58 | |
| 59 db.commit() | |
| 60 db.close() | |
| 
Wladimir Palant
2012/10/09 11:51:14
Database shouldn't be closed if you share the data
 | |
| 61 | |
| 62 response_headers = [('Content-type', 'text/plain'), | |
| 63 ('Content-Length', str(len(output)))] | |
| 
Wladimir Palant
2012/10/09 11:51:14
Output is empty and this won't change (nobody can
 | |
| 64 start_response("200 OK", response_headers) | |
| 65 return [output] | |
| 66 | |
| 67 def process_domain(db, domain, status): | |
| 68 domain_id = _get_domain_id(db, domain) | |
| 69 | |
| 70 _increment_entry(db, domain_id, status) | |
| 71 | |
| 72 typed = _get_entry(db, domain_id, STATUS_TYPED) | |
| 73 typo = _get_entry(db, domain_id, STATUS_TYPO) | |
| 74 correction = _get_entry(db, domain_id, STATUS_CORRECTION) | |
| 75 false_positive = _get_entry(db, domain_id, STATUS_FALSE_POSITIVE) | |
| 76 | |
| 77 # Determine the occurences of the domain being correct as is | |
| 78 typed_count = get_weighted_count(typed) | |
| 79 # Determine the occurences of the domain as a typo | |
| 80 typo_count = get_weighted_count(typo) | |
| 81 # Determine the occurences of the domain as a correction | |
| 82 correction_count = get_weighted_count(correction) | |
| 83 # Determine the occurences of the domain as a false positive | |
| 84 false_positive_count = get_weighted_count(false_positive) | |
| 85 | |
| 86 # Determine the correctness of the domain and of the corrections (with a confi dence level of 0.95) | |
| 87 # http://www.evanmiller.org/how-not-to-sort-by-average-rating.html | |
| 88 domain_correctness_score = calculateCorrectness(correction_count + typed_count , false_positive_count + typo_count) | |
| 89 correction_correctness_score = calculateCorrectness(correction_count, false_po sitive_count) | |
| 90 | |
| 91 _update_scores(db, domain_id, domain_correctness_score, correction_correctness _score) | |
| 92 | |
| 93 def calculateCorrectness(positive, negative): | |
| 94 score = 0 | |
| 95 | |
| 96 if positive + negative > 0: | |
| 97 score = ((positive + 1.9208) / (positive + negative) - | |
| 98 1.96 * sqrt((positive * negative) / (positive + negative) + 0.9604) / | |
| 99 (positive + negative)) / (1 + 3.8416 / (positive + negative)) | |
| 100 | |
| 101 return score | |
| 102 | |
| 103 def get_weighted_count(result): | |
| 104 if result == None: | |
| 105 return 0 | |
| 106 | |
| 107 return result["curr_month"] * 0.4 + result["prev_month"] * 0.3 + result["curr_ year"] * 0.2 + result["prev_year"] * 0.1 | |
| 108 | |
| 109 def showError(message, start_response): | |
| 110 start_response('400 Processing Error', [('Content-Type', 'text/plain; charset= utf-8')]) | |
| 111 return [message.encode('utf-8')] | |
| 112 | |
| 113 def _get_domain_id(db, domain): | |
| 114 cursor = db.cursor(MySQLdb.cursors.DictCursor) | |
| 115 cursor.execute("SELECT id FROM domains WHERE domain='%s'" % (domain)) | |
| 
Wladimir Palant
2012/10/09 11:51:14
Please don't ever generate queries dynamically, us
 
Thomas Greiner
2012/10/09 13:47:02
Actually, I thought that this was the syntax for p
 | |
| 116 result = cursor.fetchone() | |
| 117 if result == None: | |
| 118 cursor.execute("INSERT INTO domains(domain) VALUES ('%s')" % (domain)) | |
| 119 return _get_domain_id(db, domain) | |
| 
Wladimir Palant
2012/10/09 11:51:14
Should be:
  return db.insert_id()
 | |
| 120 else: | |
| 121 return result["id"] | |
| 122 | |
| 123 def _get_entry(db, domain_id, status): | |
| 124 cursor = db.cursor(MySQLdb.cursors.DictCursor) | |
| 125 cursor.execute("SELECT * FROM corrections WHERE domain=%d AND status=%d" % (do main_id, status)) | |
| 126 return cursor.fetchone() | |
| 127 | |
| 128 def _increment_entry(db, domain_id, status): | |
| 129 cursor = db.cursor(MySQLdb.cursors.DictCursor) | |
| 130 cursor.execute("SELECT COUNT(*) AS count FROM corrections WHERE domain=%d AND status=%d" % (domain_id, status)) | |
| 131 | |
| 132 if cursor.fetchone()["count"] < 1: | |
| 133 # Create and increment current period | |
| 134 cursor.execute("INSERT INTO corrections(domain, status, curr_month, prev_mon th, curr_year, prev_year) VALUES (%d, %d, %d, %d, %d, %d)" % (domain_id, status, 1, 0, 1, 0)) | |
| 135 else: | |
| 136 # Increment current period | |
| 137 cursor.execute("UPDATE corrections SET curr_month=curr_month+1, curr_year=cu rr_year+1 WHERE domain=%d AND status=%d" % (domain_id, status)) | |
| 
Wladimir Palant
2012/10/09 11:51:14
http://dev.mysql.com/doc/refman/5.0/en/insert-on-d
 
Thomas Greiner
2012/10/09 13:47:02
Thanks for the tip!
 | |
| 138 | |
| 139 def _update_scores(db, domain_id, domain_correctness, correction_correctness): | |
| 140 cursor = db.cursor() | |
| 141 cursor.execute("UPDATE domains SET domain_correct=%f, correction_correct=%f WH ERE id=%d" % (domain_correctness, correction_correctness, domain_id)) | |
| OLD | NEW |