| Index: sitescripts/urlfixer/web/submitData.py |
| =================================================================== |
| new file mode 100644 |
| --- /dev/null |
| +++ b/sitescripts/urlfixer/web/submitData.py |
| @@ -0,0 +1,141 @@ |
| +# coding: utf-8 |
| + |
| +# This Source Code is subject to the terms of the Mozilla Public License |
| +# version 2.0 (the "License"). You can obtain a copy of the License at |
| +# http://mozilla.org/MPL/2.0/. |
| + |
| +# import MySQLdb, os, simplejson, sys, re |
| +# from sitescripts.utils import cached, get_config |
| +# from sitescripts.web import url_handler, basic_auth |
|
Wladimir Palant
2012/10/09 11:51:14
Any reason why the lines above are required? ;)
|
| + |
| +import os, MySQLdb, simplejson as json |
| +from urlparse import parse_qs |
| +from math import sqrt |
|
Wladimir Palant
2012/10/09 11:51:14
I would prefer to just import math and use the fun
|
| +from sitescripts.web import url_handler |
| +from sitescripts.utils import get_config, setupStderr |
| + |
| +STATUS_TYPED = 1 |
| +STATUS_TYPO = 2 |
| +STATUS_CORRECTION = 3 |
| +STATUS_FALSE_POSITIVE = 4 |
| + |
| +@url_handler("/submitData") |
| +def submit_data(environ, start_response): |
| + setupStderr(environ['wsgi.errors']) |
| + |
| + if environ['REQUEST_METHOD'].upper() != 'POST': |
| + return showError('Unsupported request method', start_response) |
| + |
| + output = "" |
| + |
| + params = parse_qs(environ.get("QUERY_STRING", "")) |
| + requestVersion = params.get("version", ["0"])[0] |
| + data = "{}" |
| + try: |
| + data_length = int(environ.get("CONTENT_LENGTH", "0")) |
| + except ValueError: |
| + data_length = 0 |
| + if data_length != 0: |
| + data = environ["wsgi.input"].read(data_length) |
| + data = json.loads(data, ["{}"]) |
|
Wladimir Palant
2012/10/09 11:51:14
Not sure I understand the significance of the seco
|
| + |
| + try: |
| + database = get_config().get("urlfixer", "database") |
| + dbuser = get_config().get("urlfixer", "dbuser") |
| + dbpasswd = get_config().get("urlfixer", "dbpassword") |
| + except Exception: |
| + return showError("Error while reading config.", start_response) |
|
Wladimir Palant
2012/10/09 11:51:14
That's a server configuration issue, not something
|
| + |
| + if os.name == "nt": |
| + db = MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
| + use_unicode=True, charset="utf8", named_pipe=True) |
| + else: |
| + db = MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
| + use_unicode=True, charset="utf8") |
|
Wladimir Palant
2012/10/09 11:51:14
I suggest caching the database connection similarl
|
| + |
| + for domain in data: |
|
Wladimir Palant
2012/10/09 11:51:14
I suggest:
for domain, status in data.iteritems
|
| + process_domain(db, domain, data[domain]) |
| + |
| + db.commit() |
| + db.close() |
|
Wladimir Palant
2012/10/09 11:51:14
Database shouldn't be closed if you share the data
|
| + |
| + response_headers = [('Content-type', 'text/plain'), |
| + ('Content-Length', str(len(output)))] |
|
Wladimir Palant
2012/10/09 11:51:14
Output is empty and this won't change (nobody can
|
| + start_response("200 OK", response_headers) |
| + return [output] |
| + |
| +def process_domain(db, domain, status): |
| + domain_id = _get_domain_id(db, domain) |
| + |
| + _increment_entry(db, domain_id, status) |
| + |
| + typed = _get_entry(db, domain_id, STATUS_TYPED) |
| + typo = _get_entry(db, domain_id, STATUS_TYPO) |
| + correction = _get_entry(db, domain_id, STATUS_CORRECTION) |
| + false_positive = _get_entry(db, domain_id, STATUS_FALSE_POSITIVE) |
| + |
| + # Determine the occurences of the domain being correct as is |
| + typed_count = get_weighted_count(typed) |
| + # Determine the occurences of the domain as a typo |
| + typo_count = get_weighted_count(typo) |
| + # Determine the occurences of the domain as a correction |
| + correction_count = get_weighted_count(correction) |
| + # Determine the occurences of the domain as a false positive |
| + false_positive_count = get_weighted_count(false_positive) |
| + |
| + # Determine the correctness of the domain and of the corrections (with a confidence level of 0.95) |
| + # http://www.evanmiller.org/how-not-to-sort-by-average-rating.html |
| + domain_correctness_score = calculateCorrectness(correction_count + typed_count, false_positive_count + typo_count) |
| + correction_correctness_score = calculateCorrectness(correction_count, false_positive_count) |
| + |
| + _update_scores(db, domain_id, domain_correctness_score, correction_correctness_score) |
| + |
| +def calculateCorrectness(positive, negative): |
| + score = 0 |
| + |
| + if positive + negative > 0: |
| + score = ((positive + 1.9208) / (positive + negative) - |
| + 1.96 * sqrt((positive * negative) / (positive + negative) + 0.9604) / |
| + (positive + negative)) / (1 + 3.8416 / (positive + negative)) |
| + |
| + return score |
| + |
| +def get_weighted_count(result): |
| + if result == None: |
| + return 0 |
| + |
| + return result["curr_month"] * 0.4 + result["prev_month"] * 0.3 + result["curr_year"] * 0.2 + result["prev_year"] * 0.1 |
| + |
| +def showError(message, start_response): |
| + start_response('400 Processing Error', [('Content-Type', 'text/plain; charset=utf-8')]) |
| + return [message.encode('utf-8')] |
| + |
| +def _get_domain_id(db, domain): |
| + cursor = db.cursor(MySQLdb.cursors.DictCursor) |
| + cursor.execute("SELECT id FROM domains WHERE domain='%s'" % (domain)) |
|
Wladimir Palant
2012/10/09 11:51:14
Please don't ever generate queries dynamically, us
Thomas Greiner
2012/10/09 13:47:02
Actually, I thought that this was the syntax for p
|
| + result = cursor.fetchone() |
| + if result == None: |
| + cursor.execute("INSERT INTO domains(domain) VALUES ('%s')" % (domain)) |
| + return _get_domain_id(db, domain) |
|
Wladimir Palant
2012/10/09 11:51:14
Should be:
return db.insert_id()
|
| + else: |
| + return result["id"] |
| + |
| +def _get_entry(db, domain_id, status): |
| + cursor = db.cursor(MySQLdb.cursors.DictCursor) |
| + cursor.execute("SELECT * FROM corrections WHERE domain=%d AND status=%d" % (domain_id, status)) |
| + return cursor.fetchone() |
| + |
| +def _increment_entry(db, domain_id, status): |
| + cursor = db.cursor(MySQLdb.cursors.DictCursor) |
| + cursor.execute("SELECT COUNT(*) AS count FROM corrections WHERE domain=%d AND status=%d" % (domain_id, status)) |
| + |
| + if cursor.fetchone()["count"] < 1: |
| + # Create and increment current period |
| + cursor.execute("INSERT INTO corrections(domain, status, curr_month, prev_month, curr_year, prev_year) VALUES (%d, %d, %d, %d, %d, %d)" % (domain_id, status, 1, 0, 1, 0)) |
| + else: |
| + # Increment current period |
| + cursor.execute("UPDATE corrections SET curr_month=curr_month+1, curr_year=curr_year+1 WHERE domain=%d AND status=%d" % (domain_id, status)) |
|
Wladimir Palant
2012/10/09 11:51:14
http://dev.mysql.com/doc/refman/5.0/en/insert-on-d
Thomas Greiner
2012/10/09 13:47:02
Thanks for the tip!
|
| + |
| +def _update_scores(db, domain_id, domain_correctness, correction_correctness): |
| + cursor = db.cursor() |
| + cursor.execute("UPDATE domains SET domain_correct=%f, correction_correct=%f WHERE id=%d" % (domain_correctness, correction_correctness, domain_id)) |