OLD | NEW |
1 import MySQLdb, os | 1 import MySQLdb, os, re, simplejson, sys |
2 from sitescripts.utils import cached, get_config | 2 from sitescripts.utils import cached, get_config |
3 from sitescripts.web import url_handler | 3 from sitescripts.web import url_handler, basic_auth |
4 | |
5 @url_handler('/crawlableUrls') | |
6 def listUrls(environ, start_response): | |
7 urls = fetch_crawlable_urls() | |
8 start_response('200 OK', [('Content-Type', 'text/plain')]) | |
9 return '\n'.join(urls) | |
10 | |
11 def fetch_crawlable_urls(): | |
12 cursor = get_db().cursor(MySQLdb.cursors.DictCursor) | |
13 executeQuery(cursor, 'SELECT url from crawler_urls') | |
14 results = cursor.fetchall() | |
15 urls = [result['url'] for result in results] | |
16 return urls | |
17 | 4 |
18 @cached(600) | 5 @cached(600) |
19 def get_db(): | 6 def _get_db(): |
20 database = get_config().get('crawler', 'database') | 7 database = get_config().get("crawler", "database") |
21 dbuser = get_config().get('crawler', 'dbuser') | 8 dbuser = get_config().get("crawler", "dbuser") |
22 dbpasswd = get_config().get('crawler', 'dbpassword') | 9 dbpasswd = get_config().get("crawler", "dbpassword") |
23 if os.name == 'nt': | 10 if os.name == "nt": |
24 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset='utf8', named_pipe=True) | 11 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
| 12 use_unicode=True, charset="utf8", named_pipe=True) |
25 else: | 13 else: |
26 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, use_unicode=True, charset='utf8') | 14 return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database, |
| 15 use_unicode=True, charset="utf8") |
27 | 16 |
28 def executeQuery(cursor, query, args=None): | 17 def get_cursor(): |
29 cursor.execute(query, args) | 18 return _get_db().cursor(MySQLdb.cursors.DictCursor) |
| 19 |
| 20 def _fetch_crawlable_sites(): |
| 21 cursor = get_cursor() |
| 22 cursor.execute("SELECT url FROM crawler_sites") |
| 23 results = cursor.fetchall() |
| 24 sites = [result["url"] for result in results] |
| 25 return sites |
| 26 |
| 27 @url_handler("/crawlableSites") |
| 28 @basic_auth("crawler") |
| 29 def crawlable_sites(environ, start_response): |
| 30 urls = _fetch_crawlable_sites() |
| 31 start_response("200 OK", [("Content-Type", "text/plain")]) |
| 32 return "\n".join(urls) |
| 33 |
| 34 def _find_site_id(site_url): |
| 35 cursor = get_cursor() |
| 36 cursor.execute("SELECT id FROM crawler_sites WHERE url = %s", (site_url,)) |
| 37 result = cursor.fetchone() |
| 38 return result["id"] if result else None |
| 39 |
| 40 def _read_multipart_lines(environ, line_callback): |
| 41 data_file = environ["wsgi.input"] |
| 42 content_type = environ.get("CONTENT_TYPE") |
| 43 if not content_type: |
| 44 raise ValueError("Content-Type header missing") |
| 45 |
| 46 match = re.search(r"boundary=(.*)", content_type) |
| 47 if not match: |
| 48 raise ValueError("Multipart form data or boundary declaration missing") |
| 49 |
| 50 boundary = match.group(1) |
| 51 boundary_passed = False |
| 52 header_passed = False |
| 53 |
| 54 for line in data_file: |
| 55 line = line.strip() |
| 56 |
| 57 if not boundary_passed: |
| 58 if line == "--" + boundary: |
| 59 boundary_passed = True |
| 60 continue |
| 61 |
| 62 if not header_passed: |
| 63 if not line: |
| 64 header_passed = True |
| 65 continue |
| 66 |
| 67 if line == "--" + boundary + "--": |
| 68 break |
| 69 |
| 70 if line: |
| 71 line_callback(line) |
| 72 |
| 73 def _create_run(): |
| 74 cursor = get_cursor() |
| 75 cursor.execute("INSERT INTO crawler_runs () VALUES ()") |
| 76 return cursor.lastrowid |
| 77 |
| 78 def _insert_data(run_id, site, url, filtered): |
| 79 site_id = _find_site_id(site) |
| 80 if site_id is None: |
| 81 print >>sys.stderr, "Unable to find site '%s' in the database" % site |
| 82 return |
| 83 |
| 84 cursor = get_cursor() |
| 85 cursor.execute(""" |
| 86 INSERT INTO crawler_data (run, site, url, filtered) |
| 87 VALUES (%s, %s, %s, %s)""", |
| 88 (run_id, site_id, url, filtered)) |
| 89 |
| 90 @url_handler("/crawlerData") |
| 91 @basic_auth("crawler") |
| 92 def crawler_data(environ, start_response): |
| 93 def line_callback(line): |
| 94 try: |
| 95 url, site, filtered = simplejson.loads(line) |
| 96 _insert_data(run_id, site, url, filtered) |
| 97 except simplejson.JSONDecodeError: |
| 98 print >>sys.stderr, "Unable to parse JSON from '%s'" % line |
| 99 |
| 100 run_id = _create_run() |
| 101 try: |
| 102 _read_multipart_lines(environ, line_callback) |
| 103 start_response("200 OK", [("Content-Type", "text/plain")]) |
| 104 return "" |
| 105 except ValueError as e: |
| 106 start_response("401 Bad Request", [("Content-Type", "text/plain")]) |
| 107 return e |
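
For reference, a minimal client sketch for the two new endpoints. The base URL and credentials below are illustrative assumptions, not part of this change; the multipart framing matches what _read_multipart_lines() accepts: a boundary declaration in the Content-Type header, header lines up to the first blank line, one JSON [url, site, filtered] triple per body line, and a closing --boundary-- marker.

    # Hypothetical client for the endpoints added above. The deployment
    # address and credentials are assumptions for illustration only.
    import base64
    import urllib2
    import simplejson

    base = "http://localhost:5000"                        # assumed address
    auth = "Basic " + base64.b64encode("crawler:secret")  # assumed credentials

    # Fetch the crawlable sites (served as one URL per line).
    request = urllib2.Request(base + "/crawlableSites")
    request.add_header("Authorization", auth)
    sites = urllib2.urlopen(request).read().splitlines()

    # Report crawl results: each payload line is a JSON [url, site, filtered]
    # triple, wrapped in the multipart framing _read_multipart_lines() expects.
    boundary = "----crawlerclientboundary"
    payload = ["--" + boundary,
               'Content-Disposition: form-data; name="data"',
               ""]
    for site in sites:
        payload.append(simplejson.dumps([site, site, False]))
    payload.append("--" + boundary + "--")

    request = urllib2.Request(base + "/crawlerData", "\r\n".join(payload))
    request.add_header("Authorization", auth)
    request.add_header("Content-Type",
                       "multipart/form-data; boundary=" + boundary)
    urllib2.urlopen(request).read()

Sending one JSON triple per line keeps the upload streamable: crawler_data() inserts each line as it arrives via line_callback rather than buffering and parsing the whole body at once.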