Rietveld Code Review Tool

Delta Between Two Patch Sets: sitescripts/crawler/web/crawler.py

Issue 8327353: Crawler backend (Closed)
Left Patch Set: Created Sept. 14, 2012, 2:23 p.m.
Right Patch Set: Created Sept. 27, 2012, 2:15 p.m.

--- sitescripts/crawler/web/crawler.py (Left Patch Set)
+++ sitescripts/crawler/web/crawler.py (Right Patch Set)
-import MySQLdb, os, simplejson
+import MySQLdb, os, re, simplejson, sys
 from sitescripts.utils import cached, get_config
 from sitescripts.web import url_handler, basic_auth

 @cached(600)
 def _get_db():
   database = get_config().get("crawler", "database")
   dbuser = get_config().get("crawler", "dbuser")
   dbpasswd = get_config().get("crawler", "dbpassword")
   if os.name == "nt":
-    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8", named_pipe=True)
+    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
+                           use_unicode=True, charset="utf8", named_pipe=True)
   else:
-    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset="utf8")
+    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
+                           use_unicode=True, charset="utf8")

 def get_cursor():
   return _get_db().cursor(MySQLdb.cursors.DictCursor)

 def _fetch_crawlable_sites():
   cursor = get_cursor()
   cursor.execute("SELECT url from crawler_sites")
   results = cursor.fetchall()
   sites = [result["url"] for result in results]
   return sites

 @url_handler("/crawlableSites")
-@basic_auth
+@basic_auth("crawler")
 def crawlable_sites(environ, start_response):
   urls = _fetch_crawlable_sites()
   start_response("200 OK", [("Content-Type", "text/plain")])
   return "\n".join(urls)

 def _find_site_id(site_url):
   cursor = get_cursor()
   cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url)
-  return cursor.fetchall()[0]["id"]
+  result = cursor.fetchone()
+  return result["id"] if result else None

 def _read_multipart_lines(environ, line_callback):
   data_file = environ["wsgi.input"]
-  current_line = 0
+  content_type = environ.get("CONTENT_TYPE")
+  if not content_type:
+    raise ValueError("Content-Type missing from header")

-  while True:
-    line = data_file.readline().strip()
-    current_line += 1
+  match = re.search(r"boundary=(.*)", content_type)
+  if not match:
+    raise ValueError("Multipart form data or boundary declaration missing")

-    if current_line == 1:
-      boundary = line
+  boundary = match.group(1)
+  boundary_passed = False
+  header_passed = False
+
+  for line in data_file:
+    line = line.strip()
+
+    if not boundary_passed:
+      if line == "--" + boundary:
+        boundary_passed = True
       continue

-    if current_line < 5 or not line:
+    if not header_passed:
+      if not line:
+        header_passed = True
       continue

-    if line.startswith(boundary):
+    if line == "--" + boundary + "--":
       break

-    line_callback(line)
+    if line:
+      line_callback(line)

 def _create_run():
   cursor = get_cursor()
   cursor.execute("INSERT INTO crawler_runs () VALUES ()")
   return cursor.lastrowid

 def _insert_data(run_id, site, url, filtered):
   site_id = _find_site_id(site)
+  if site_id is None:
+    print >>sys.stderr, "Unable to find site '%s' in the database" % site
+    return
+
   cursor = get_cursor()
   cursor.execute("""
     INSERT INTO crawler_data (run, site, url, filtered)
     VALUES (%s, %s, %s, %s)""",
     (run_id, site_id, url, filtered))

 @url_handler("/crawlerData")
-@basic_auth
+@basic_auth("crawler")
 def crawler_data(environ, start_response):
   def line_callback(line):
-    url, site, filtered = simplejson.loads(line)
-    _insert_data(run_id, site, url, filtered)
+    try:
+      url, site, filtered = simplejson.loads(line)
+      _insert_data(run_id, site, url, filtered)
+    except simplejson.JSONDecodeError:
+      print >>sys.stderr, "Unable to parse JSON from '%s'" % line

   run_id = _create_run()
-  _read_multipart_lines(environ, line_callback)
-  start_response("200 OK", [("Content-Type", "text/plain")])
-  return ""
+  try:
+    _read_multipart_lines(environ, line_callback)
+    start_response("200 OK", [("Content-Type", "text/plain")])
+    return ""
+  except ValueError as e:
+    start_response("400 Bad Request", [("Content-Type", "text/plain")])
+    return str(e)
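
_get_db() reads its connection settings via get_config() from the [crawler] section of the sitescripts configuration. A minimal sketch of that section follows; the key names come from the code above, while the values are placeholders for illustration only:

[crawler]
database=crawler
dbuser=crawler
dbpassword=changeme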
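The reworked _read_multipart_lines() expects a multipart/form-data body whose data part carries one JSON-encoded [url, site, filtered] triple per line, terminated by the closing boundary. A minimal Python 2 client sketch for exercising /crawlerData; the host, port, and crawler/secret credentials are hypothetical placeholders:

import base64, httplib

# Hypothetical deployment address and credentials; adjust to the real setup.
HOST = "localhost:5000"
AUTH = base64.b64encode("crawler:secret")

# One JSON-encoded [url, site, filtered] triple per line, wrapped in a
# single multipart part, matching what _read_multipart_lines() parses.
boundary = "----crawlerdataboundary"
lines = [
  '["http://example.com/ad.js", "http://example.com/", true]',
  '["http://example.com/logo.png", "http://example.com/", false]',
]
body = ("--" + boundary + "\r\n"
        'Content-Disposition: form-data; name="data"\r\n'
        "\r\n" +
        "\r\n".join(lines) + "\r\n"
        "--" + boundary + "--\r\n")

conn = httplib.HTTPConnection(HOST)
conn.request("POST", "/crawlerData", body, {
  "Content-Type": "multipart/form-data; boundary=" + boundary,
  "Authorization": "Basic " + AUTH,
})
response = conn.getresponse()
print response.status, response.read()

Lines that fail to parse as JSON are logged to stderr and skipped, so a partially malformed upload still records its valid entries.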
