Index: run.py
===================================================================
--- a/run.py
+++ b/run.py
@@ -41,17 +41,20 @@ class CrawlerApp:
             request_body_size = int(environ.get('CONTENT_LENGTH', 0))
         except (ValueError):
             start_response('400 Bad Request', [])
             return ''
         data = json.loads(environ['wsgi.input'].read(request_body_size))
         self.urls.remove(data['url'])
-        parsedurl = urlparse.urlparse(data['url'])
+        fullurl = data['url']
+        if not urlparse.urlparse(fullurl).scheme:
+            fullurl = 'http://' + fullurl
+        parsedurl = urlparse.urlparse(fullurl)
         urlhash = hashlib.new('md5', data['url']).hexdigest()
         timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f')
         basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash)
         datapath = os.path.join(self.parameters.outdir, basename + ".json")
         screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg")
         sourcepath = os.path.join(self.parameters.outdir, basename + ".xml")
         try: