LEFT | RIGHT |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 | 2 |
3 import os | 3 import os |
4 import subprocess | 4 import subprocess |
5 import re | 5 import re |
6 import datetime | 6 import datetime |
7 import shutil | 7 import shutil |
8 import urllib.parse | 8 import urllib.parse |
9 import urllib.request | 9 import urllib.request |
10 import html.parser | 10 import html.parser |
(...skipping 95 matching lines...) | |
106 def handle_endtag(self, tag): | 106 def handle_endtag(self, tag): |
107 if tag == 'tr': | 107 if tag == 'tr': |
108 self.recordtr = False | 108 self.recordtr = False |
109 self.cell = 0 | 109 self.cell = 0 |
110 | 110 |
111 def handle_data(self, data): | 111 def handle_data(self, data): |
112 if self.cell == 2 and self.recordtr is True: | 112 if self.cell == 2 and self.recordtr is True: |
113 self.recordtr = False | 113 self.recordtr = False |
114 self.cell = 0 | 114 self.cell = 0 |
115 # Only process the URL if the description is not Deprecated | 115 # Only process the URL if the description is not Deprecated |
116 if ('*DEPRECATED*' not in data and '(Deprecated)' not in data and | 116 if ('*DEPRECATED*' not in data and |
117 len(self.current_url) > 2): | 117 '(Deprecated)' not in data and |
 | 118 len(self.current_url) > 2): |
118 self.result += [self.current_url] | 119 self.result += [self.current_url] |
119 return self.result | 120 return self.result |
120 | 121 |
121 | 122 |
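The `handle_data` hook above only does anything once a start-tag handler has flagged a repository row and counted table cells up to the description column. That counterpart sits in the 95 skipped lines, so the following is only a plausible sketch: the attribute names (`recordtr`, `cell`, `current_url`, `result`) are taken from the visible handlers, everything else is an assumption.

```python
from html.parser import HTMLParser

class Parser(HTMLParser):
    # Hypothetical reconstruction of the setup hidden in the skipped
    # lines; only the attribute names are confirmed by the diff above.
    def __init__(self):
        super().__init__()
        self.recordtr = False
        self.cell = 0
        self.current_url = ''
        self.result = []

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            self.recordtr = True       # start tracking cells in this row
        elif tag == 'td' and self.recordtr:
            self.cell += 1             # handle_data acts at cell == 2
        elif tag == 'a' and self.recordtr:
            # remember the repository link found in this row
            self.current_url = dict(attrs).get('href', '')
```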
122 def extract_urls(hg_page): | 123 def extract_urls(hg_page): |
123 base_url = os.path.dirname(hg_page) + '/' | 124 base_url = os.path.dirname(hg_page) + '/' |
124 parser = Parser() | 125 parser = Parser() |
125 with urllib.request.urlopen(hg_page) as response: | 126 with urllib.request.urlopen(hg_page) as response: |
126 parser.feed(response.read().decode('utf-8')) | 127 parser.feed(response.read().decode('utf-8')) |
127 parser.close() | 128 parser.close() |
128 repo_urls = [] | 129 repo_urls = [] |
129 for url in parser.result: | 130 for url in parser.result: |
130 repo_urls.append(urllib.parse.urljoin(base_url, url)) | 131 repo_urls.append(urllib.parse.urljoin(base_url, url)) |
131 return repo_urls | 132 return repo_urls |
132 | 133 |
133 | 134 |
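`extract_urls` derives the base from the page's directory and resolves every scraped href against it with `urllib.parse.urljoin`, which handles relative and absolute entries alike; a quick illustration of why that matters:

```python
import urllib.parse

base = 'https://hg.adblockplus.org/'

# Relative hrefs from the index table resolve against the base...
print(urllib.parse.urljoin(base, 'adblockplus/'))
# -> https://hg.adblockplus.org/adblockplus/

# ...while absolute hrefs pass through unchanged.
print(urllib.parse.urljoin(base, 'https://example.com/repo/'))
# -> https://example.com/repo/
```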
134 def main(hg_page, hg_upstream): | 135 def main(hg_page, hg_upstream): |
135 for repo in extract_urls(hg_page): | 136 for repo in extract_urls(hg_page): |
136 process_repo(repo, hg_upstream) | 137 process_repo(repo, hg_upstream) |
137 | 138 |
138 | 139 |
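`main` simply fans out to `process_repo`, whose body is also among the skipped lines. Given the imports at the top (`subprocess`, `shutil`), a clone-then-push round trip is one plausible shape; this is purely an assumption, sketched for orientation:

```python
import os
import shutil
import subprocess
import tempfile

def process_repo(repo_url, hg_upstream):
    # Hypothetical stand-in for the real process_repo (hidden in the
    # skipped lines): mirror one repository by cloning it and pushing
    # to the upstream base URL plus the repository's name. The push
    # target layout is an assumption.
    name = os.path.basename(repo_url.rstrip('/'))
    work_dir = tempfile.mkdtemp()
    try:
        subprocess.check_call(['hg', 'clone', repo_url, work_dir])
        subprocess.check_call(['hg', 'push', hg_upstream + name],
                              cwd=work_dir)
    finally:
        shutil.rmtree(work_dir)
```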
139 if __name__ == '__main__': | 140 if __name__ == '__main__': |
140 arg_parser = argparse.ArgumentParser() | 141 arg_parser = argparse.ArgumentParser() |
141 arg_parser.add_argument('-u', '--hg-url', | 142 arg_parser.add_argument('-u', '--hg-url', |
142 help='specify which Mercurial URL site to scrape', | 143 help='specify which Mercurial URL site to scrape') |
143 default='https://hg.adblockplus.org/') | |
144 arg_parser.add_argument('-p', '--push-url', | 144 arg_parser.add_argument('-p', '--push-url', |
145 default='ssh://hg@hg.adblockplus.org/', | |
146 help='specify where to push the repository') | 145 help='specify where to push the repository') |
147 args = arg_parser.parse_args() | 146 args = arg_parser.parse_args() |
148 if (args.hg_url != 'https://hg.adblockplus.org/' and args.push_url == | 147 if args.hg_url and not args.push_url: |
149 'ssh://hg@hg.adblockplus.org/'): | |

> Sebastian Noack (2017/07/07 16:58:31): Perhaps it would be worth to handle the defaults m
> Sebastian Noack (2017/07/17 10:19:54): What is about this comment?
> rosie (2017/07/17 13:23:17): Done.
> rosie (2017/07/17 13:23:17): Yeah, makes sense. I like this way better.

150 arg_parser.error('If -u is provided, -p is mandatory') | 148 arg_parser.error('If -u is provided, -p is mandatory') |
151 main(args.hg_url, args.push_url) | 149 main(args.hg_url or 'https://hg.adblockplus.org/', |
 | 150 args.push_url or 'ssh://hg@hg.adblockplus.org/') |
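The right side implements the suggestion from the thread above: dropping the argparse defaults leaves unset options as `None`, which makes an explicitly passed `-u` distinguishable from "nothing passed" and lets the script enforce that `-u` implies `-p`, applying the hardcoded URLs only at the `main()` call. A standalone sketch of the same pattern (the constant names are mine):

```python
import argparse

DEFAULT_HG_URL = 'https://hg.adblockplus.org/'
DEFAULT_PUSH_URL = 'ssh://hg@hg.adblockplus.org/'

arg_parser = argparse.ArgumentParser()
# No default= here: an unset option stays None, so an explicit -u
# can be told apart from the option being omitted entirely.
arg_parser.add_argument('-u', '--hg-url')
arg_parser.add_argument('-p', '--push-url')
args = arg_parser.parse_args()

if args.hg_url and not args.push_url:
    arg_parser.error('If -u is provided, -p is mandatory')

hg_url = args.hg_url or DEFAULT_HG_URL
push_url = args.push_url or DEFAULT_PUSH_URL
```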