OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # coding: utf-8 | 2 # coding: utf-8 |
3 | 3 |
4 import argparse | 4 import argparse |
5 import datetime | 5 import datetime |
6 import errno | 6 import errno |
7 import hashlib | 7 import hashlib |
8 import io | 8 import io |
9 import json | 9 import json |
10 import os | 10 import os |
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
77 | 77 |
78 with io.open(datapath, 'w', encoding='utf-8') as handle: | 78 with io.open(datapath, 'w', encoding='utf-8') as handle: |
79 handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=Fal
se, sort_keys=True)) + u'\n') | 79 handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=Fal
se, sort_keys=True)) + u'\n') |
80 start_response('204 No Content', []) | 80 start_response('204 No Content', []) |
81 return '' | 81 return '' |
82 | 82 |
83 start_response('404 Not Found', []) | 83 start_response('404 Not Found', []) |
84 return '' | 84 return '' |
85 | 85 |
86 | 86 |
def read_as_json(file_path):
    """Parse the UTF-8 encoded JSON file at *file_path* and return the result.

    The file is decoded while it is read, so nothing has to be passed to
    ``json.load`` about the encoding; the ``encoding`` keyword of
    ``json.load`` was removed in Python 3.9 and raises ``TypeError`` there.

    Raises whatever ``io.open`` raises when the file cannot be opened and
    ``ValueError`` when the content is not valid JSON.
    """
    with io.open(file_path, mode='r', encoding='utf-8') as json_file:
        return json.load(json_file)
| 90 |
| 91 |
class Parameters(object):
    """Crawler settings merged from the command line and a config file.

    Every command line option becomes an attribute of the instance. A value
    given on the command line wins; otherwise the value from the config file
    (or the built-in default for ``filters``, ``timeout`` and ``maxtabs``) is
    used. An attribute is ``None`` when no source supplies a value for it.
    Keys of the config file that do not correspond to a command line option
    are ignored.
    """

    def __init__(self, args=None):
        """Parse *args* (``sys.argv[1:]`` when ``None``) and load the config."""
        cli_parameters = vars(Parameters._parse_command_line(args))
        config_parameters = Parameters._load_config(cli_parameters["config"])
        for field, value in cli_parameters.items():
            # argparse leaves options that were not given at None; only then
            # fall back to the config file / built-in defaults.
            if value is None:
                value = config_parameters.get(field)
            setattr(self, field, value)

    @staticmethod
    def _parse_command_line(args=None):
        """Return the argparse namespace for *args*.

        When *args* is ``None`` argparse reads ``sys.argv[1:]``. No option
        declares a default, so options that were not given are ``None``.
        """
        parser = argparse.ArgumentParser(description='Run crawler')
        parser.add_argument(
            '-c', '--config', type=str,
            help='path to config file, example is config.json.example'
        )
        parser.add_argument(
            '-b', '--binary', type=str,
            help='path to the Firefox binary'
        )
        parser.add_argument(
            '-a', '--abpdir', type=str,
            help='path to the Adblock Plus repository'
        )
        parser.add_argument(
            '-f', '--filters', metavar='url', type=str, nargs='+',
            help='filter lists to install in Adblock Plus. '
                 'The arguments can also have the format path=url, '
                 'the data will be read from the specified path then.'
        )
        parser.add_argument(
            '-t', '--timeout', type=int,
            help='Load timeout (seconds)'
        )
        parser.add_argument(
            '-x', '--maxtabs', type=int,
            help='Maximal number of tabs to open in parallel'
        )
        parser.add_argument(
            '-l', '--list', type=str,
            help='URL list to process',
        )
        parser.add_argument(
            '-o', '--outdir', type=str,
            help='directory to write data into',
        )
        return parser.parse_args(args)

    @staticmethod
    def _load_config(config_file_path):
        """Return the built-in defaults updated with the config file content.

        *config_file_path* may be ``None``, in which case only the built-in
        defaults are returned. A fresh dict is built on every call.
        """
        config = {
            "filters": [
                "https://easylist-downloads.adblockplus.org/easylist.txt",
                "https://easylist-downloads.adblockplus.org/exceptionrules.txt"
            ],
            "timeout": 300,
            "maxtabs": 15
        }
        if config_file_path is not None:
            config.update(read_as_json(config_file_path))
        return config
| 156 |
| 157 |
87 def run(): | 158 def run(): |
88 parser = argparse.ArgumentParser(description='Run crawler') | 159 parameters = Parameters() |
89 parser.add_argument( | |
90 '-b', '--binary', type=str, | |
91 help='path to the Firefox binary' | |
92 ) | |
93 parser.add_argument( | |
94 '-a', '--abpdir', type=str, | |
95 help='path to the Adblock Plus repository' | |
96 ) | |
97 parser.add_argument( | |
98 '-f', '--filters', metavar='url', type=str, nargs='+', | |
99 default=["https://easylist-downloads.adblockplus.org/easylist.txt", "htt
ps://easylist-downloads.adblockplus.org/exceptionrules.txt"], | |
100 help='filter lists to install in Adblock Plus. The arguments can also ha
ve the format path=url, the data will be read from the specified path then.' | |
101 ) | |
102 parser.add_argument( | |
103 '-t', '--timeout', type=int, default=300, | |
104 help='Load timeout (seconds)' | |
105 ) | |
106 parser.add_argument( | |
107 '-x', '--maxtabs', type=int, default=15, | |
108 help='Maximal number of tabs to open in parallel' | |
109 ) | |
110 parser.add_argument( | |
111 'list', type=str, | |
112 help='URL list to process' | |
113 ) | |
114 parser.add_argument( | |
115 'outdir', type=str, | |
116 help='directory to write data into' | |
117 ) | |
118 parameters = parser.parse_args() | |
119 | |
120 import buildtools.packagerGecko as packager | 160 import buildtools.packagerGecko as packager |
121 cleanup = [] | 161 cleanup = [] |
122 try: | 162 try: |
123 base_dir = os.path.dirname(os.path.abspath(__file__)) | 163 base_dir = os.path.dirname(os.path.abspath(__file__)) |
124 handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi') | 164 handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi') |
125 os.close(handle) | 165 os.close(handle) |
126 cleanup.append(crawlerxpi) | 166 cleanup.append(crawlerxpi) |
127 packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True) | 167 packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True) |
128 | 168 |
129 abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon
-1865-latest.xpi' | 169 abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon
-1865-latest.xpi' |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
197 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | 237 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) |
198 DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py") | 238 DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py") |
199 | 239 |
200 try: | 240 try: |
201 subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR]) | 241 subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR]) |
202 except subprocess.CalledProcessError as e: | 242 except subprocess.CalledProcessError as e: |
203 print >>sys.stderr, e | 243 print >>sys.stderr, e |
204 print >>sys.stderr, "Failed to ensure dependencies being up-to-date!" | 244 print >>sys.stderr, "Failed to ensure dependencies being up-to-date!" |
205 | 245 |
206 run() | 246 run() |
OLD | NEW |