| OLD | NEW |
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # coding: utf-8 | 2 # coding: utf-8 |
| 3 | 3 |
| 4 import argparse | 4 import argparse |
| 5 import datetime | 5 import datetime |
| 6 import errno | 6 import errno |
| 7 import hashlib | 7 import hashlib |
| 8 import io | 8 import io |
| 9 import json | 9 import json |
| 10 import os | 10 import os |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 77 | 77 |
| 78 with io.open(datapath, 'w', encoding='utf-8') as handle: | 78 with io.open(datapath, 'w', encoding='utf-8') as handle: |
| 79 handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=Fal
se, sort_keys=True)) + u'\n') | 79 handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=Fal
se, sort_keys=True)) + u'\n') |
| 80 start_response('204 No Content', []) | 80 start_response('204 No Content', []) |
| 81 return '' | 81 return '' |
| 82 | 82 |
| 83 start_response('404 Not Found', []) | 83 start_response('404 Not Found', []) |
| 84 return '' | 84 return '' |
| 85 | 85 |
| 86 | 86 |
def read_as_json(file_path):
    """Parse the JSON document stored at *file_path* and return the result.

    The file is decoded as UTF-8 while it is being read. Errors raised by
    opening the file (IOError/OSError) or by the JSON parser (ValueError)
    are not caught here and propagate to the caller.
    """
    # Decode at the file layer instead of passing ``encoding=`` to
    # json.load(): that keyword was removed from json.load() in Python 3.9,
    # while io.open() with an explicit encoding behaves the same on
    # Python 2 and Python 3.
    with io.open(file_path, mode='r', encoding='utf-8') as json_file:
        return json.load(json_file)
| 90 |
| 91 |
class Parameters:
    """Crawler settings merged from the command line and a config file.

    Every command line option becomes an attribute of the instance, named
    after the option's long form (``config``, ``binary``, ``abpdir``,
    ``filters``, ``timeout``, ``maxtabs``, ``list``, ``outdir``).

    For each option the value is taken from the first source providing it:

    1. the command line,
    2. the JSON config file named by ``-c/--config``,
    3. the built-in defaults (``filters``, ``timeout`` and ``maxtabs`` only).

    An option found in none of these ends up as ``None``. Config file keys
    that do not match a command line option are ignored.
    """

    def __init__(self, args=None):
        """Parse *args* and merge them with the config file values.

        *args* is an optional list of command line tokens; when it is None
        the options are read from ``sys.argv``.
        """
        cli_parameters = vars(Parameters._parse_command_line(args))
        config_parameters = Parameters._load_config(cli_parameters["config"])
        for field, value in cli_parameters.items():
            # None means "not given on the command line": fall back to the
            # config file / built-in default for that option.
            if value is None:
                value = config_parameters.get(field)
            setattr(self, field, value)

    @staticmethod
    def _parse_command_line(args=None):
        """Return the argparse namespace for *args* (``sys.argv`` if None).

        No option declares a default, so an option missing from the command
        line is reported as None and can be filled in from the config.
        """
        parser = argparse.ArgumentParser(description='Run crawler')
        parser.add_argument(
            '-c', '--config', type=str,
            help='path to config file, example is config.json.example'
        )
        parser.add_argument(
            '-b', '--binary', type=str,
            help='path to the Firefox binary'
        )
        parser.add_argument(
            '-a', '--abpdir', type=str,
            help='path to the Adblock Plus repository'
        )
        parser.add_argument(
            '-f', '--filters', metavar='url', type=str, nargs='+',
            help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, the data will be read from the specified path then.'
        )
        parser.add_argument(
            '-t', '--timeout', type=int,
            help='Load timeout (seconds)'
        )
        parser.add_argument(
            '-x', '--maxtabs', type=int,
            help='Maximal number of tabs to open in parallel'
        )
        parser.add_argument(
            '-l', '--list', type=str,
            help='URL list to process',
        )
        parser.add_argument(
            '-o', '--outdir', type=str,
            help='directory to write data into',
        )
        return parser.parse_args(args)

    @staticmethod
    def _load_config(config_file_path):
        """Return the built-in defaults updated with the config file content.

        *config_file_path* may be None, in which case only the built-in
        defaults are returned. Otherwise the file is read with
        read_as_json() and its top-level keys override the defaults.
        """
        config = {
            "filters": [
                "https://easylist-downloads.adblockplus.org/easylist.txt",
                "https://easylist-downloads.adblockplus.org/exceptionrules.txt"
            ],
            "timeout": 300,
            "maxtabs": 15
        }
        if config_file_path is not None:
            config.update(read_as_json(config_file_path))
        return config
| 156 |
| 157 |
| 87 def run(): | 158 def run(): |
| 88 parser = argparse.ArgumentParser(description='Run crawler') | 159 parameters = Parameters() |
| 89 parser.add_argument( | |
| 90 '-b', '--binary', type=str, | |
| 91 help='path to the Firefox binary' | |
| 92 ) | |
| 93 parser.add_argument( | |
| 94 '-a', '--abpdir', type=str, | |
| 95 help='path to the Adblock Plus repository' | |
| 96 ) | |
| 97 parser.add_argument( | |
| 98 '-f', '--filters', metavar='url', type=str, nargs='+', | |
| 99 default=["https://easylist-downloads.adblockplus.org/easylist.txt", "htt
ps://easylist-downloads.adblockplus.org/exceptionrules.txt"], | |
| 100 help='filter lists to install in Adblock Plus. The arguments can also ha
ve the format path=url, the data will be read from the specified path then.' | |
| 101 ) | |
| 102 parser.add_argument( | |
| 103 '-t', '--timeout', type=int, default=300, | |
| 104 help='Load timeout (seconds)' | |
| 105 ) | |
| 106 parser.add_argument( | |
| 107 '-x', '--maxtabs', type=int, default=15, | |
| 108 help='Maximal number of tabs to open in parallel' | |
| 109 ) | |
| 110 parser.add_argument( | |
| 111 'list', type=str, | |
| 112 help='URL list to process' | |
| 113 ) | |
| 114 parser.add_argument( | |
| 115 'outdir', type=str, | |
| 116 help='directory to write data into' | |
| 117 ) | |
| 118 parameters = parser.parse_args() | |
| 119 | |
| 120 import buildtools.packagerGecko as packager | 160 import buildtools.packagerGecko as packager |
| 121 cleanup = [] | 161 cleanup = [] |
| 122 try: | 162 try: |
| 123 base_dir = os.path.dirname(os.path.abspath(__file__)) | 163 base_dir = os.path.dirname(os.path.abspath(__file__)) |
| 124 handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi') | 164 handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi') |
| 125 os.close(handle) | 165 os.close(handle) |
| 126 cleanup.append(crawlerxpi) | 166 cleanup.append(crawlerxpi) |
| 127 packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True) | 167 packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True) |
| 128 | 168 |
| 129 abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon
-1865-latest.xpi' | 169 abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon
-1865-latest.xpi' |
| (...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 197 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | 237 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) |
| 198 DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py") | 238 DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py") |
| 199 | 239 |
| 200 try: | 240 try: |
| 201 subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR]) | 241 subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR]) |
| 202 except subprocess.CalledProcessError as e: | 242 except subprocess.CalledProcessError as e: |
| 203 print >>sys.stderr, e | 243 print >>sys.stderr, e |
| 204 print >>sys.stderr, "Failed to ensure dependencies being up-to-date!" | 244 print >>sys.stderr, "Failed to ensure dependencies being up-to-date!" |
| 205 | 245 |
| 206 run() | 246 run() |
| OLD | NEW |