| OLD | NEW |
|---|---|
| 1 import MySQLdb, os | 1 import MySQLdb, os, re, simplejson, sys |
| 2 from sitescripts.utils import cached, get_config | 2 from sitescripts.utils import cached, get_config |
| 3 from sitescripts.web import url_handler | 3 from sitescripts.web import url_handler, basic_auth |
| 4 | |
| 5 @url_handler('/crawlableUrls') | |
| 6 def listUrls(environ, start_response): | |
| 7 urls = fetch_crawlable_urls() | |
| 8 start_response('200 OK', [('Content-Type', 'text/plain')]) | |
| 9 return '\n'.join(urls) | |
| 10 | |
| 11 def fetch_crawlable_urls(): | |
| 12 cursor = get_db().cursor(MySQLdb.cursors.DictCursor) | |
| 13 executeQuery(cursor, 'SELECT url from crawler_urls') | |
| 14 results = cursor.fetchall() | |
| 15 urls = [result['url'] for result in results] | |
| 16 return urls | |
| 17 | 4 |
| 18 @cached(600) | 5 @cached(600) |
| 19 def get_db(): | 6 def _get_db(): |
| 20 database = get_config().get('crawler', 'database') | 7 database = get_config().get("crawler", "database") |
| 21 dbuser = get_config().get('crawler', 'dbuser') | 8 dbuser = get_config().get("crawler", "dbuser") |
| 22 dbpasswd = get_config().get('crawler', 'dbpassword') | 9 dbpasswd = get_config().get("crawler", "dbpassword") |
| 23 if os.name == 'nt': | 10 if os.name == "nt": |
| 24   return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset='utf8', named_pipe=True) | 11   return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
| | 12                          use_unicode=True, charset="utf8", named_pipe=True) |
| 25 else: | 13 else: |
| 26   return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset='utf8') | 14   return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
| | 15                          use_unicode=True, charset="utf8") |
| 27 | 16 |
| 28 def executeQuery(cursor, query, args=None): | 17 def get_cursor(): |
| 29 cursor.execute(query, args) | 18 return _get_db().cursor(MySQLdb.cursors.DictCursor) |
| | 19 |
| | 20 def _fetch_crawlable_sites(): |
| | 21   cursor = get_cursor() |
| | 22   cursor.execute("SELECT url from crawler_sites") |
| | 23   results = cursor.fetchall() |
| | 24   sites = [result["url"] for result in results] |
| | 25   return sites |
| | 26 |
| | 27 @url_handler("/crawlableSites") |
| | 28 @basic_auth |
| | 29 def crawlable_sites(environ, start_response): |
| | 30   urls = _fetch_crawlable_sites() |
| | 31   start_response("200 OK", [("Content-Type", "text/plain")]) |
| | 32   return "\n".join(urls) |
| | 33 |
| | 34 def _find_site_id(site_url): |
| | 35   cursor = get_cursor() |
| | 36   cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", site_url) |
| | 37   result = cursor.fetchone() |
| | 38   return result["id"] if result else None |
| | 39 |
| | 40 def _read_multipart_lines(environ, line_callback): |
| | 41   data_file = environ["wsgi.input"] |
| | 42   boundary = re.search(r"boundary=(.*)", environ["CONTENT_TYPE"]).group(1) |

> Wladimir Palant, 2012/09/27 07:34:17:
> Assumptions here:
> 1) There is a CONTENT_TYPE header […]
>
> Felix Dahlke, 2012/09/27 09:26:24:
> Done. You're finding a lot of those because my min […]

A more defensive variant of this multipart parsing, which drops these assumptions, is sketched after the diff.

| | 43   boundary_passed = False |
| | 44   header_passed = False |
| | 45 |
| | 46   for line in data_file: |
| | 47     line = line.strip() |
| | 48 |
| | 49     if not boundary_passed: |
| | 50       if line == "--" + boundary: |
| | 51         boundary_passed = True |
| | 52       continue |
| | 53 |
| | 54     if not header_passed: |
| | 55       if not line: |
| | 56         header_passed = True |
| | 57       continue |
| | 58 |
| | 59     if line == "--" + boundary + "--": |
| | 60       break |
| | 61 |
| | 62     if line: |
| | 63       line_callback(line) |
| | 64 |
| | 65 def _create_run(): |
| | 66   cursor = get_cursor() |
| | 67   cursor.execute("INSERT INTO crawler_runs () VALUES ()") |
| | 68   return cursor.lastrowid |
| | 69 |
| | 70 def _insert_data(run_id, site, url, filtered): |
| | 71   site_id = _find_site_id(site) |
| | 72   if site_id is None: |
| | 73     print >>sys.stderr, "Unable to find site '%s' in the database" % site |
| | 74     return |
| | 75 |

> Wladimir Palant, 2012/09/27 07:34:17:
> Thinking about that... So we give the client a list […]
>
> Felix Dahlke, 2012/09/27 09:26:24:
> I did that for two reasons:
> 1. The site urls are d[…]

A hypothetical end-to-end client sketch for these endpoints also follows the diff.

| | 76   cursor = get_cursor() |
| | 77   cursor.execute(""" |
| | 78     INSERT INTO crawler_data (run, site, url, filtered) |
| | 79     VALUES (%s, %s, %s, %s)""", |
| | 80     (run_id, site_id, url, filtered)) |
| | 81 |
| | 82 @url_handler("/crawlerData") |
| | 83 @basic_auth |
| | 84 def crawler_data(environ, start_response): |
| | 85   def line_callback(line): |
| | 86     try: |
| | 87       url, site, filtered = simplejson.loads(line) |
| | 88       _insert_data(run_id, site, url, filtered) |
| | 89     except simplejson.JSONDecodeError: |
| | 90       print >>sys.stderr, "Unable to parse JSON from '%s'" % line |
| | 91 |
| | 92   run_id = _create_run() |
| | 93   _read_multipart_lines(environ, line_callback) |
| | 94   start_response("200 OK", [("Content-Type", "text/plain")]) |
| | 95   return "" |
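
The review comment attached to line 42 points out that `_read_multipart_lines` assumes the request always carries a `CONTENT_TYPE` header with a `boundary` parameter; a missing header would currently raise a `KeyError` or `AttributeError`. Below is a minimal sketch of a more defensive variant. It is not part of the patch, and the boolean return value is an assumption so that the caller could answer with `400 Bad Request` instead of crashing.

```python
import re

def _read_multipart_lines(environ, line_callback):
  # Sketch of a defensive variant: don't assume a CONTENT_TYPE header
  # exists, or that it actually specifies a multipart boundary.
  content_type = environ.get("CONTENT_TYPE")
  if not content_type:
    return False

  match = re.search(r"boundary=(.*)", content_type)
  if not match:
    return False
  boundary = match.group(1)

  boundary_passed = False
  header_passed = False
  for line in environ["wsgi.input"]:
    line = line.strip()
    if not boundary_passed:
      # Skip everything up to the opening boundary marker.
      if line == "--" + boundary:
        boundary_passed = True
      continue
    if not header_passed:
      # Skip the part headers up to the first blank line.
      if not line:
        header_passed = True
      continue
    if line == "--" + boundary + "--":
      # Closing boundary marker - the payload is finished.
      break
    if line:
      line_callback(line)
  return True
```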
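
The overall flow, for context: a crawler client authenticates with HTTP basic auth, downloads the site list from `/crawlableSites`, crawls those sites, and uploads one JSON array `[url, site, filtered]` per line to `/crawlerData` as a multipart request body. The sketch below is purely hypothetical and not part of the patch; the host, credentials, helper names and sample result are made up, and only the endpoint paths, the authentication scheme and the line format come from the code above.

```python
import base64
import urllib2

import simplejson

BASE_URL = "http://crawler.example.com"            # assumed server location
AUTH = "Basic " + base64.b64encode("user:secret")  # assumed credentials

def fetch_crawlable_sites():
  # GET the newline-separated site list from /crawlableSites.
  request = urllib2.Request(BASE_URL + "/crawlableSites")
  request.add_header("Authorization", AUTH)
  return urllib2.urlopen(request).read().splitlines()

def send_crawler_data(results):
  # POST one JSON array per line, wrapped in a single multipart part,
  # which is the format _read_multipart_lines() on the server accepts.
  boundary = "----crawlerdataboundary"
  lines = [simplejson.dumps([url, site, filtered])
           for url, site, filtered in results]
  body = ("--%s\r\n"
          "Content-Disposition: form-data; name=\"data\"\r\n"
          "\r\n"
          "%s\r\n"
          "--%s--\r\n") % (boundary, "\r\n".join(lines), boundary)
  request = urllib2.Request(BASE_URL + "/crawlerData", body)
  request.add_header("Authorization", AUTH)
  request.add_header("Content-Type",
                     "multipart/form-data; boundary=" + boundary)
  urllib2.urlopen(request)

if __name__ == "__main__":
  sites = fetch_crawlable_sites()
  # ... crawl the sites, collecting (url, site, filtered) tuples ...
  send_crawler_data([("http://ads.example.com/banner.js", sites[0], True)])
```

Each uploaded line is parsed on the server with `url, site, filtered = simplejson.loads(line)`, so the order of the three values matters.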