| OLD | NEW |
|---|---|
| 1 import MySQLdb, os | 1 import MySQLdb, os, re, simplejson, sys |
| 2 from sitescripts.utils import cached, get_config | 2 from sitescripts.utils import cached, get_config |
| 3 from sitescripts.web import url_handler | 3 from sitescripts.web import url_handler, basic_auth |
| 4 | |
| 5 @url_handler('/crawlableUrls') | |
| 6 def listUrls(environ, start_response): | |
| 7 urls = fetch_crawlable_urls() | |
| 8 start_response('200 OK', [('Content-Type', 'text/plain')]) | |
| 9 return '\n'.join(urls) | |
| 10 | |
| 11 def fetch_crawlable_urls(): | |
| 12 cursor = get_db().cursor(MySQLdb.cursors.DictCursor) | |
| 13 executeQuery(cursor, 'SELECT url from crawler_urls') | |
| 14 results = cursor.fetchall() | |
| 15 urls = [result['url'] for result in results] | |
| 16 return urls | |
| 17 | 4 |
| 18 @cached(600) | 5 @cached(600) |
| 19 def get_db(): | 6 def _get_db(): |
| 20 database = get_config().get('crawler', 'database') | 7 database = get_config().get("crawler", "database") |
| 21 dbuser = get_config().get('crawler', 'dbuser') | 8 dbuser = get_config().get("crawler", "dbuser") |
| 22 dbpasswd = get_config().get('crawler', 'dbpassword') | 9 dbpasswd = get_config().get("crawler", "dbpassword") |
| 23 if os.name == 'nt': | 10 if os.name == "nt": |
| 24   return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset='utf8', named_pipe=True) | 11   return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
| | 12                          use_unicode=True, charset="utf8", named_pipe=True) |
| 25 else: | 13 else: |
| 26   return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset='utf8') | 14   return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
| | 15                          use_unicode=True, charset="utf8") |
| 27 | 16 |
| 28 def executeQuery(cursor, query, args=None): | 17 def get_cursor(): |
| 29 cursor.execute(query, args) | 18 return _get_db().cursor(MySQLdb.cursors.DictCursor) |
| | 19 |
| | 20 def _fetch_crawlable_sites(): |
| | 21   cursor = get_cursor() |
| | 22   cursor.execute("SELECT url from crawler_sites") |
| | 23   results = cursor.fetchall() |
| | 24   sites = [result["url"] for result in results] |
| | 25   return sites |
| | 26 |
| | 27 @url_handler("/crawlableSites") |
| | 28 @basic_auth |
| | 29 def crawlable_sites(environ, start_response): |
| | 30   urls = _fetch_crawlable_sites() |
| | 31   start_response("200 OK", [("Content-Type", "text/plain")]) |
| | 32   return "\n".join(urls) |
| | 33 |
| | 34 def _find_site_id(site_url): |
| | 35   cursor = get_cursor() |
| | 36   cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) |
| | 37   result = cursor.fetchone() |
| | 38   return result["id"] if result else None |
| | 39 |
| | 40 def _read_multipart_lines(environ, line_callback): |
| | 41   data_file = environ["wsgi.input"] |
| | 42   boundary = re.search(r"boundary=(.*)", environ["CONTENT_TYPE"]).group(1) |

> Wladimir Palant, 2012/09/27 07:34:17:
> Assumptions here:
> 1) There is a CONTENT_TYPE header […]
>
> Felix Dahlke, 2012/09/27 09:26:24:
> Done. You're finding a lot of those because my min […]

A more defensive variant of this multipart parsing, which drops these assumptions, is sketched after the diff.

| | 43   boundary_passed = False |
| | 44   header_passed = False |
| | 45 |
| | 46   for line in data_file: |
| | 47     line = line.strip() |
| | 48 |
| | 49     if not boundary_passed: |
| | 50       if line == "--" + boundary: |
| | 51         boundary_passed = True |
| | 52       continue |
| | 53 |
| | 54     if not header_passed: |
| | 55       if not line: |
| | 56         header_passed = True |
| | 57       continue |
| | 58 |
| | 59     if line == "--" + boundary + "--": |
| | 60       break |
| | 61 |
| | 62     if line: |
| | 63       line_callback(line) |
| | 64 |
| | 65 def _create_run(): |
| | 66   cursor = get_cursor() |
| | 67   cursor.execute("INSERT INTO crawler_runs () VALUES ()") |
| | 68   return cursor.lastrowid |
| | 69 |
| | 70 def _insert_data(run_id, site, url, filtered): |
| | 71   site_id = _find_site_id(site) |
| | 72   if site_id is None: |
| | 73     print >>sys.stderr, "Unable to find site '%s' in the database" % site |
| | 74     return |
| | 75 |

> Wladimir Palant, 2012/09/27 07:34:17:
> Thinking about that... So we give the client a list […]
>
> Felix Dahlke, 2012/09/27 09:26:24:
> I did that for two reasons:
> 1. The site urls are d[…]

A hypothetical end-to-end client sketch for these endpoints also follows the diff.

| | 76   cursor = get_cursor() |
| | 77   cursor.execute(""" |
| | 78     INSERT INTO crawler_data (run, site, url, filtered) |
| | 79     VALUES (%s, %s, %s, %s)""", |
| | 80     (run_id, site_id, url, filtered)) |
| | 81 |
| | 82 @url_handler("/crawlerData") |
| | 83 @basic_auth |
| | 84 def crawler_data(environ, start_response): |
| | 85   def line_callback(line): |
| | 86     try: |
| | 87       url, site, filtered = simplejson.loads(line) |
| | 88       _insert_data(run_id, site, url, filtered) |
| | 89     except simplejson.JSONDecodeError: |
| | 90       print >>sys.stderr, "Unable to parse JSON from '%s'" % line |
| | 91 |
| | 92   run_id = _create_run() |
| | 93   _read_multipart_lines(environ, line_callback) |
| | 94   start_response("200 OK", [("Content-Type", "text/plain")]) |
| | 95   return "" |
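
The review comment attached to line 42 points out that `_read_multipart_lines` assumes the request always carries a `CONTENT_TYPE` header with a `boundary` parameter; a missing header would currently raise a `KeyError` or `AttributeError`. Below is a minimal sketch of a more defensive variant. It is not part of the patch, and the boolean return value is an assumption so that the caller could answer with `400 Bad Request` instead of crashing.

```python
import re

def _read_multipart_lines(environ, line_callback):
  # Sketch of a defensive variant: don't assume a CONTENT_TYPE header
  # exists, or that it actually specifies a multipart boundary.
  content_type = environ.get("CONTENT_TYPE")
  if not content_type:
    return False

  match = re.search(r"boundary=(.*)", content_type)
  if not match:
    return False
  boundary = match.group(1)

  boundary_passed = False
  header_passed = False
  for line in environ["wsgi.input"]:
    line = line.strip()
    if not boundary_passed:
      # Skip everything up to the opening boundary marker.
      if line == "--" + boundary:
        boundary_passed = True
      continue
    if not header_passed:
      # Skip the part headers up to the first blank line.
      if not line:
        header_passed = True
      continue
    if line == "--" + boundary + "--":
      # Closing boundary marker - the payload is finished.
      break
    if line:
      line_callback(line)
  return True
```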
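
The overall flow, for context: a crawler client authenticates with HTTP basic auth, downloads the site list from `/crawlableSites`, crawls those sites, and uploads one JSON array `[url, site, filtered]` per line to `/crawlerData` as a multipart request body. The sketch below is purely hypothetical and not part of the patch; the host, credentials, helper names and sample result are made up, and only the endpoint paths, the authentication scheme and the line format come from the code above.

```python
import base64
import urllib2

import simplejson

BASE_URL = "http://crawler.example.com"            # assumed server location
AUTH = "Basic " + base64.b64encode("user:secret")  # assumed credentials

def fetch_crawlable_sites():
  # GET the newline-separated site list from /crawlableSites.
  request = urllib2.Request(BASE_URL + "/crawlableSites")
  request.add_header("Authorization", AUTH)
  return urllib2.urlopen(request).read().splitlines()

def send_crawler_data(results):
  # POST one JSON array per line, wrapped in a single multipart part,
  # which is the format _read_multipart_lines() on the server accepts.
  boundary = "----crawlerdataboundary"
  lines = [simplejson.dumps([url, site, filtered])
           for url, site, filtered in results]
  body = ("--%s\r\n"
          "Content-Disposition: form-data; name=\"data\"\r\n"
          "\r\n"
          "%s\r\n"
          "--%s--\r\n") % (boundary, "\r\n".join(lines), boundary)
  request = urllib2.Request(BASE_URL + "/crawlerData", body)
  request.add_header("Authorization", AUTH)
  request.add_header("Content-Type",
                     "multipart/form-data; boundary=" + boundary)
  urllib2.urlopen(request)

if __name__ == "__main__":
  sites = fetch_crawlable_sites()
  # ... crawl the sites, collecting (url, site, filtered) tuples ...
  send_crawler_data([("http://ads.example.com/banner.js", sites[0], True)])
```

Each uploaded line is parsed on the server with `url, site, filtered = simplejson.loads(line)`, so the order of the three values matters.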