OLD | NEW |
---|---|
1 import MySQLdb, os | 1 import MySQLdb, os, re, simplejson, sys |
2 from sitescripts.utils import cached, get_config | 2 from sitescripts.utils import cached, get_config |
3 from sitescripts.web import url_handler | 3 from sitescripts.web import url_handler, basic_auth |
4 | |
5 @url_handler('/crawlableUrls') | |
6 def listUrls(environ, start_response): | |
7 urls = fetch_crawlable_urls() | |
8 start_response('200 OK', [('Content-Type', 'text/plain')]) | |
9 return '\n'.join(urls) | |
10 | |
11 def fetch_crawlable_urls(): | |
12 cursor = get_db().cursor(MySQLdb.cursors.DictCursor) | |
13 executeQuery(cursor, 'SELECT url from crawler_urls') | |
14 results = cursor.fetchall() | |
15 urls = [result['url'] for result in results] | |
16 return urls | |
17 | 4 |
18 @cached(600) | 5 @cached(600) |
19 def get_db(): | 6 def _get_db(): |
20 database = get_config().get('crawler', 'database') | 7 database = get_config().get("crawler", "database") |
21 dbuser = get_config().get('crawler', 'dbuser') | 8 dbuser = get_config().get("crawler", "dbuser") |
22 dbpasswd = get_config().get('crawler', 'dbpassword') | 9 dbpasswd = get_config().get("crawler", "dbpassword") |
23 if os.name == 'nt': | 10 if os.name == "nt": |
24 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset='utf8', named_pipe=True) | 11 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
12 use_unicode=True, charset="utf8", named_pipe=True) | |
25 else: | 13 else: |
26 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset='utf8') | 14 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
15 use_unicode=True, charset="utf8") | |
27 | 16 |
28 def executeQuery(cursor, query, args=None): | 17 def get_cursor(): |
29 cursor.execute(query, args) | 18 return _get_db().cursor(MySQLdb.cursors.DictCursor) |
19 | |
20 def _fetch_crawlable_sites(): | |
21 cursor = get_cursor() | |
22 cursor.execute("SELECT url from crawler_sites") | |
23 results = cursor.fetchall() | |
24 sites = [result["url"] for result in results] | |
25 return sites | |
26 | |
27 @url_handler("/crawlableSites") | |
28 @basic_auth | |
Wladimir Palant
2012/09/27 13:44:51
Shouldn't this be @basic_auth(config_section="crawler")?
Felix Dahlke
2012/09/27 14:15:33
Done.
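For context, the suggested change presumably looks like the sketch below; it assumes basic_auth accepts a config_section keyword naming the config section that holds the credentials (the same "crawler" section the database settings come from):

```python
# Hypothetical sketch of the reviewer's suggestion. Assumes that
# sitescripts.web.basic_auth takes a config_section keyword argument.
@url_handler("/crawlableSites")
@basic_auth(config_section="crawler")
def crawlable_sites(environ, start_response):
    urls = _fetch_crawlable_sites()
    start_response("200 OK", [("Content-Type", "text/plain")])
    return "\n".join(urls)
```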
29 def crawlable_sites(environ, start_response): | |
30 urls = _fetch_crawlable_sites() | |
31 start_response("200 OK", [("Content-Type", "text/plain")]) | |
32 return "\n".join(urls) | |
33 | |
34 def _find_site_id(site_url): | |
35 cursor = get_cursor() | |
36 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) | |
37 result = cursor.fetchone() | |
38 return result["id"] if result else None | |
39 | |
40 def _read_multipart_lines(environ, line_callback): | |
41 data_file = environ["wsgi.input"] | |
42 content_type = environ["CONTENT_TYPE"] | |
43 if not content_type: | |
44 raise ValueError("Content-Type missing from header") | |
Wladimir Palant
2012/09/27 13:44:51
Python doesn't work like that - accessing a non-existent key will raise a KeyError, so this check can never be reached.
Felix Dahlke
2012/09/27 14:15:33
Done.
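To illustrate the point: indexing the WSGI environ with a missing key raises a KeyError before the falsy check can run, so the check is dead code. A minimal sketch of the usual fix, using dict.get:

```python
# environ["CONTENT_TYPE"] raises KeyError when the header is absent,
# which would bypass the check below. dict.get returns None instead,
# so the ValueError path actually triggers:
content_type = environ.get("CONTENT_TYPE")
if not content_type:
    raise ValueError("Content-Type missing from header")
```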
45 | |
46 match = re.search(r"boundary=(.*)", content_type) | |
47 if not match: | |
48 raise ValueError("Multipart form data or boundary declaration missing") | |
49 | |
50 boundary = match.group(1) | |
51 boundary_passed = False | |
52 header_passed = False | |
53 | |
54 for line in data_file: | |
55 line = line.strip() | |
56 | |
57 if not boundary_passed: | |
58 if line == "--" + boundary: | |
59 boundary_passed = True | |
60 continue | |
61 | |
62 if not header_passed: | |
63 if not line: | |
64 header_passed = True | |
65 continue | |
66 | |
67 if line == "--" + boundary + "--": | |
68 break | |
69 | |
70 if line: | |
71 line_callback(line) | |
72 | |
73 return True | |
Wladimir Palant
2012/09/27 13:44:51
That return value is unused, left-over from an earlier version?
Felix Dahlke
2012/09/27 14:15:33
Done.
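Since the side-by-side layout flattens indentation, here is a sketch of how the parsing loop presumably nests once the review comments are addressed (environ.get instead of indexing, the unused return dropped); the nesting is inferred from the control flow, not shown in the diff:

```python
def _read_multipart_lines(environ, line_callback):
    data_file = environ["wsgi.input"]
    content_type = environ.get("CONTENT_TYPE")
    if not content_type:
        raise ValueError("Content-Type missing from header")

    match = re.search(r"boundary=(.*)", content_type)
    if not match:
        raise ValueError("Multipart form data or boundary declaration missing")

    boundary = match.group(1)
    boundary_passed = False
    header_passed = False

    for line in data_file:
        line = line.strip()
        # Skip everything up to and including the opening boundary marker.
        if not boundary_passed:
            if line == "--" + boundary:
                boundary_passed = True
            continue
        # Skip the part headers; a blank line separates them from the body.
        if not header_passed:
            if not line:
                header_passed = True
            continue
        # The closing boundary marker ends the payload.
        if line == "--" + boundary + "--":
            break
        if line:
            line_callback(line)
```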
74 | |
75 def _create_run(): | |
76 cursor = get_cursor() | |
77 cursor.execute("INSERT INTO crawler_runs () VALUES ()") | |
78 return cursor.lastrowid | |
79 | |
80 def _insert_data(run_id, site, url, filtered): | |
81 site_id = _find_site_id(site) | |
82 if site_id is None: | |
83 print >>sys.stderr, "Unable to find site '%s' in the database" % site | |
84 return | |
85 | |
86 cursor = get_cursor() | |
87 cursor.execute(""" | |
88 INSERT INTO crawler_data (run, site, url, filtered) | |
89 VALUES (%s, %s, %s, %s)""", | |
90 (run_id, site_id, url, filtered)) | |
91 | |
92 @url_handler("/crawlerData") | |
93 @basic_auth | |
Wladimir Palant
2012/09/27 13:44:51
Shouldn't this be @basic_auth(config_section="crawler")?
Felix Dahlke
2012/09/27 14:15:33
Done.
94 def crawler_data(environ, start_response): | |
95 def line_callback(line): | |
96 try: | |
97 url, site, filtered = simplejson.loads(line) | |
98 _insert_data(run_id, site, url, filtered) | |
99 except simplejson.JSONDecodeError: | |
100 print >>sys.stderr, "Unable to parse JSON from '%s'" % line | |
101 | |
102 run_id = _create_run() | |
103 try: | |
104 _read_multipart_lines(environ, line_callback) | |
105 start_response("200 OK", [("Content-Type", "text/plain")]) | |
106 return "" | |
107 except ValueError as e: | |
108 start_response("401 Bad Request", [("Content-Type", "text/plain")]) | |
109 return e | |
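For reference, a hypothetical client-side sketch of the payload this handler parses: a multipart upload whose non-empty body lines are JSON arrays of [url, site, filtered]. Host, boundary and data below are made up:

```python
# Hypothetical Python 2 client for the /crawlerData endpoint.
import urllib2
import simplejson

boundary = "----crawlerDataBoundary"
lines = [
    simplejson.dumps(["http://example.com/ad.js", "example.com", True]),
    simplejson.dumps(["http://example.com/", "example.com", False]),
]
# Opening boundary, a blank line ending the (empty) part headers,
# one JSON record per line, then the closing boundary.
body = "--%s\r\n\r\n%s\r\n--%s--\r\n" % (boundary, "\r\n".join(lines), boundary)

request = urllib2.Request("http://server.example/crawlerData", body)
request.add_header("Content-Type", "multipart/form-data; boundary=" + boundary)
# The basic_auth decorator means a suitable Authorization header is needed too.
urllib2.urlopen(request)
```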