| OLD | NEW | 
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python | 
| 2 # coding: utf-8 | 2 # coding: utf-8 | 
| 3 | 3 | 
| 4 import argparse | 4 import argparse | 
| 5 import datetime | 5 import datetime | 
| 6 import errno | 6 import errno | 
| 7 import hashlib | 7 import hashlib | 
| 8 import io | 8 import io | 
| 9 import json | 9 import json | 
| 10 import os | 10 import os | 
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 77 | 77 | 
| 78             with io.open(datapath, 'w', encoding='utf-8') as handle: | 78             with io.open(datapath, 'w', encoding='utf-8') as handle: | 
| 79                 handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n') | 79                 handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n') | 
| 80             start_response('204 No Content', []) | 80             start_response('204 No Content', []) | 
| 81             return '' | 81             return '' | 
| 82 | 82 | 
| 83         start_response('404 Not Found', []) | 83         start_response('404 Not Found', []) | 
| 84         return '' | 84         return '' | 
| 85 | 85 | 
| 86 | 86 | 
|  | 87 def read_as_json(file_path): | 
|  | 88     with open(file_path, mode='r') as json_file: | 
|  | 89         return json.load(json_file, encoding='UTF-8') | 
|  | 90 | 
|  | 91 | 
|  | 92 class Parameters: | 
|  | 93     """This class loads config file and parses command line parameters. | 
|  | 94        Values are stored in attributes of this class instance. | 
|  | 95     """ | 
|  | 96     def __init__(self): | 
|  | 97         cli_parameters = vars(Parameters._parse_command_line()) | 
|  | 98         config_parameters = Parameters._load_config(cli_parameters["config"]) | 
|  | 99         for field in cli_parameters.keys(): | 
|  | 100             value = cli_parameters[field] | 
|  | 101             if value is None: | 
|  | 102                 value = config_parameters.get(field) | 
|  | 103             setattr(self, field, value) | 
|  | 104 | 
|  | 105     @staticmethod | 
|  | 106     def _parse_command_line(): | 
|  | 107         parser = argparse.ArgumentParser(description='Run crawler') | 
|  | 108         parser.add_argument( | 
|  | 109             '-c', '--config', type=str, | 
|  | 110             help='path to config file, example is config.json.example' | 
|  | 111         ) | 
|  | 112         parser.add_argument( | 
|  | 113             '-b', '--binary', type=str, | 
|  | 114             help='path to the Firefox binary' | 
|  | 115         ) | 
|  | 116         parser.add_argument( | 
|  | 117             '-a', '--abpdir', type=str, | 
|  | 118             help='path to the Adblock Plus repository' | 
|  | 119         ) | 
|  | 120         parser.add_argument( | 
|  | 121             '-f', '--filters', metavar='url', type=str, nargs='+', | 
|  | 122             help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, the data will be read from the specified path then.' | 
|  | 123         ) | 
|  | 124         parser.add_argument( | 
|  | 125             '-t', '--timeout', type=int, | 
|  | 126             help='Load timeout (seconds)' | 
|  | 127         ) | 
|  | 128         parser.add_argument( | 
|  | 129             '-x', '--maxtabs', type=int, | 
|  | 130             help='Maximal number of tabs to open in parallel' | 
|  | 131         ) | 
|  | 132         parser.add_argument( | 
|  | 133             '-l', '--list', type=str, | 
|  | 134             help='URL list to process', | 
|  | 135         ) | 
|  | 136         parser.add_argument( | 
|  | 137             '-o', '--outdir', type=str, | 
|  | 138             help='directory to write data into', | 
|  | 139         ) | 
|  | 140         parameters = parser.parse_args() | 
|  | 141         return parameters | 
|  | 142 | 
|  | 143     @staticmethod | 
|  | 144     def _load_config(config_file_path): | 
|  | 145         config = { | 
|  | 146           "filters": [ | 
|  | 147             "https://easylist-downloads.adblockplus.org/easylist.txt", | 
|  | 148             "https://easylist-downloads.adblockplus.org/exceptionrules.txt" | 
|  | 149           ], | 
|  | 150           "timeout": 300, | 
|  | 151           "maxtabs": 15 | 
|  | 152         } | 
|  | 153         if config_file_path is not None: | 
|  | 154             config.update(read_as_json(config_file_path)) | 
|  | 155         return config | 
|  | 156 | 
|  | 157 | 
| 87 def run(): | 158 def run(): | 
| 88     parser = argparse.ArgumentParser(description='Run crawler') | 159     parameters = Parameters() | 
| 89     parser.add_argument( |  | 
| 90         '-b', '--binary', type=str, |  | 
| 91         help='path to the Firefox binary' |  | 
| 92     ) |  | 
| 93     parser.add_argument( |  | 
| 94         '-a', '--abpdir', type=str, |  | 
| 95         help='path to the Adblock Plus repository' |  | 
| 96     ) |  | 
| 97     parser.add_argument( |  | 
| 98         '-f', '--filters', metavar='url', type=str, nargs='+', |  | 
| 99         default=["https://easylist-downloads.adblockplus.org/easylist.txt", "https://easylist-downloads.adblockplus.org/exceptionrules.txt"], |  | 
| 100         help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, the data will be read from the specified path then.' |  | 
| 101     ) |  | 
| 102     parser.add_argument( |  | 
| 103         '-t', '--timeout', type=int, default=300, |  | 
| 104         help='Load timeout (seconds)' |  | 
| 105     ) |  | 
| 106     parser.add_argument( |  | 
| 107         '-x', '--maxtabs', type=int, default=15, |  | 
| 108         help='Maximal number of tabs to open in parallel' |  | 
| 109     ) |  | 
| 110     parser.add_argument( |  | 
| 111         'list', type=str, |  | 
| 112         help='URL list to process' |  | 
| 113     ) |  | 
| 114     parser.add_argument( |  | 
| 115         'outdir', type=str, |  | 
| 116         help='directory to write data into' |  | 
| 117     ) |  | 
| 118     parameters = parser.parse_args() |  | 
| 119 |  | 
| 120     import buildtools.packagerGecko as packager | 160     import buildtools.packagerGecko as packager | 
| 121     cleanup = [] | 161     cleanup = [] | 
| 122     try: | 162     try: | 
| 123         base_dir = os.path.dirname(os.path.abspath(__file__)) | 163         base_dir = os.path.dirname(os.path.abspath(__file__)) | 
| 124         handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi') | 164         handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi') | 
| 125         os.close(handle) | 165         os.close(handle) | 
| 126         cleanup.append(crawlerxpi) | 166         cleanup.append(crawlerxpi) | 
| 127         packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True) | 167         packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True) | 
| 128 | 168 | 
| 129         abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi' | 169         abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi' | 
| (...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 197     BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | 237     BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | 
| 198     DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py") | 238     DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py") | 
| 199 | 239 | 
| 200     try: | 240     try: | 
| 201         subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR]) | 241         subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR]) | 
| 202     except subprocess.CalledProcessError as e: | 242     except subprocess.CalledProcessError as e: | 
| 203         print >>sys.stderr, e | 243         print >>sys.stderr, e | 
| 204         print >>sys.stderr, "Failed to ensure dependencies being up-to-date!" | 244         print >>sys.stderr, "Failed to ensure dependencies being up-to-date!" | 
| 205 | 245 | 
| 206     run() | 246     run() | 
| OLD | NEW | 
|---|