| Index: sitescripts/crawler/web/crawler.py |
| =================================================================== |
| --- a/sitescripts/crawler/web/crawler.py |
| +++ b/sitescripts/crawler/web/crawler.py |
| @@ -9,15 +9,15 @@ |
| @cached(600) |
| def _get_db(): |
| - database = get_config().get("crawler", "database") |
| - dbuser = get_config().get("crawler", "dbuser") |
| - dbpasswd = get_config().get("crawler", "dbpassword") |
| - if os.name == "nt": |
| + database = get_config().get('crawler', 'database') |
| + dbuser = get_config().get('crawler', 'dbuser') |
| + dbpasswd = get_config().get('crawler', 'dbpassword') |
| + if os.name == 'nt': |
| return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
| - use_unicode=True, charset="utf8", named_pipe=True) |
| + use_unicode=True, charset='utf8', named_pipe=True) |
| else: |
| return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
| - use_unicode=True, charset="utf8") |
| + use_unicode=True, charset='utf8') |
| def _get_cursor(): |
| @@ -26,36 +26,36 @@ |
| def _fetch_crawlable_sites(): |
| cursor = _get_cursor() |
| - cursor.execute("SELECT url from crawler_sites") |
| + cursor.execute('SELECT url from crawler_sites') |
| results = cursor.fetchall() |
| - sites = [result["url"] for result in results] |
| + sites = [result['url'] for result in results] |
| return sites |
| -@url_handler("/crawlableSites") |
| -@basic_auth("crawler") |
| +@url_handler('/crawlableSites') |
| +@basic_auth('crawler') |
| def crawlable_sites(environ, start_response): |
| urls = _fetch_crawlable_sites() |
| - start_response("200 OK", [("Content-Type", "text/plain")]) |
| - return "\n".join(urls) |
| + start_response('200 OK', [('Content-Type', 'text/plain')]) |
| + return '\n'.join(urls) |
| def _find_site_id(site_url): |
| cursor = _get_cursor() |
| - cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) |
| + cursor.execute('SELECT id FROM crawler_sites WHERE url = %s', site_url) |
| result = cursor.fetchone() |
| - return result["id"] if result else None |
| + return result['id'] if result else None |
| def _read_multipart_lines(environ, line_callback): |
| - data_file = environ["wsgi.input"] |
| - content_type = environ.get("CONTENT_TYPE") |
| + data_file = environ['wsgi.input'] |
| + content_type = environ.get('CONTENT_TYPE') |
| if not content_type: |
| - raise ValueError("Content-Type missing from header") |
| + raise ValueError('Content-Type missing from header') |
| - match = re.search(r"boundary=(.*)", content_type) |
| + match = re.search(r'boundary=(.*)', content_type) |
| if not match: |
| - raise ValueError("Multipart form data or boundary declaration missing") |
| + raise ValueError('Multipart form data or boundary declaration missing') |
| boundary = match.group(1) |
| boundary_passed = False |
| @@ -65,7 +65,7 @@ |
| line = line.strip() |
| if not boundary_passed: |
| - if line == "--" + boundary: |
| + if line == '--' + boundary: |
| boundary_passed = True |
| continue |
| @@ -74,7 +74,7 @@ |
| header_passed = True |
| continue |
| - if line == "--" + boundary + "--": |
| + if line == '--' + boundary + '--': |
| break |
| if line: |
| @@ -83,7 +83,7 @@ |
| def _create_run(): |
| cursor = _get_cursor() |
| - cursor.execute("INSERT INTO crawler_runs () VALUES ()") |
| + cursor.execute('INSERT INTO crawler_runs () VALUES ()') |
| return cursor.lastrowid |
| @@ -94,14 +94,14 @@ |
| return |
| cursor = _get_cursor() |
| - cursor.execute(""" |
| + cursor.execute(''' |
| INSERT INTO crawler_requests (run, site, url, filtered) |
| -VALUES (%s, %s, %s, %s)""", |
| +VALUES (%s, %s, %s, %s)''', |
| (run_id, site_id, url, filtered)) |
| -@url_handler("/crawlerRequests") |
| -@basic_auth("crawler") |
| +@url_handler('/crawlerRequests') |
| +@basic_auth('crawler') |
| def crawler_requests(environ, start_response): |
| def line_callback(line): |
| try: |
| @@ -119,9 +119,9 @@ |
| run_id = _create_run() |
| try: |
| _read_multipart_lines(environ, line_callback) |
| - start_response("200 OK", [("Content-Type", "text/plain")]) |
| - return "" |
| + start_response('200 OK', [('Content-Type', 'text/plain')]) |
| + return '' |
| except ValueError as e: |
| - start_response("401 Bad Request", [("Content-Type", "text/plain")]) |
| - print >>sys.stderr, "Unable to read multipart data: %s" % e |
| + start_response('401 Bad Request', [('Content-Type', 'text/plain')]) |
| + print >>sys.stderr, 'Unable to read multipart data: %s' % e |
| return e |