Left: | ||
Right: |
OLD | NEW |
---|---|
(Empty) | |
1 #!/usr/bin/env python3 | |
2 | |
3 import os | |
4 import sys | |
5 import subprocess | |
6 import re | |
7 import datetime | |
8 import shutil | |
9 import urllib.parse | |
10 import urllib.request | |
11 import html.parser | |
12 import argparse | |
13 from posixpath import dirname | |
14 | |
15 | |
16 CURRENT_YEAR = datetime.datetime.now().year | |
17 | |
18 | |
def process_repo(url, hg_upstream):
    """Clone one repository, update its copyright years and commit/push.

    The working copy is cloned into a directory named after the last path
    segment of ``url`` and is always removed again, whether or not the
    processing succeeded.

    url: URL of the Mercurial repository to clone.
    hg_upstream: where to push the result; ``None`` means push back to
        ``url``.
    """
    repo = url.rstrip('/').rsplit('/', 1)[-1]

    skipped_repos = {
        # headers are copied from libadblockplus, no need to update
        # separately
        'libadblockplus-binaries',
        # huge and only contains autogenerated builds
        'downloads',
    }
    if repo in skipped_repos:
        return

    try:
        subprocess.check_call(['hg', 'clone', url, repo])

        # adblockbrowser is a FF fork with its own changes in a separate
        # branch, so that update has to succeed. For every other
        # repository we merely try to switch to the 'master' bookmark and
        # ignore a failure (the bookmark may not exist).
        is_browser_fork = repo == 'adblockbrowser'
        run_update = (subprocess.check_call if is_browser_fork
                      else subprocess.call)
        revision = 'adblockbrowser' if is_browser_fork else 'master'
        run_update(['hg', 'up', '--rev', revision, '--repository', repo])

        for dirpath, dirnames, filenames in os.walk(repo):
            if dirpath == repo:
                # Pruning in place keeps os.walk out of the hg metadata.
                dirnames.remove('.hg')

            for filename in filenames:
                text_replace(dirpath, filename)

        hg_commit(repo, url if hg_upstream is None else hg_upstream)
    finally:
        shutil.rmtree(repo, ignore_errors=True)
53 | |
54 | |
def text_replace(dirpath, filename, year=None):
    """Extend eyeo copyright notices in one file up to ``year``.

    Every case-insensitive match of ``copyright ... YYYY[-YYYY] eyeo gmbh``
    is rewritten so that the range ends in ``year`` and the company name is
    spelled ``eyeo GmbH``. The file is opened with ``newline=''`` so its
    original line endings are preserved, and it is only written back when
    the text actually changed.

    dirpath: directory containing the file.
    filename: name of the file inside ``dirpath``.
    year: last year of the copyright range; defaults to the module-level
        ``CURRENT_YEAR``.

    Files that are not valid UTF-8 are reported on stdout and left
    untouched.
    """
    if year is None:
        year = CURRENT_YEAR

    path = os.path.join(dirpath, filename)
    with open(path, 'r+', encoding='utf-8', newline='') as file:
        try:
            text = file.read()
        except UnicodeDecodeError:
            # Most likely a binary file; report the full path and skip it.
            print("Error: Couldn't read {}".format(path))
            return

        # flags is passed by keyword: positional count/flags arguments to
        # re.sub are deprecated.
        updated = re.sub(
            r'(copyright.*?\d{4})(?:-\d{4})?\s+eyeo gmbh',
            r'\1-{} eyeo GmbH'.format(year),
            text,
            flags=re.I,
        )
        if updated == text:
            # Nothing matched, so there is no reason to rewrite the file.
            return

        file.seek(0)
        file.write(updated)
        file.truncate()
71 | |
72 | |
def hg_commit(repo, hg_upstream):
    """Commit the copyright update in ``repo`` and push it upstream.

    repo: path of the local working copy (also used as the repository name
        when building the push URL and the patch file name).
    hg_upstream: URL to push to.

    Returns early when ``hg commit`` exits with status 1 (no changes). Any
    other commit failure is re-raised as ``subprocess.CalledProcessError``.
    When the push fails, the change is exported to ``<repo>.patch`` instead.
    """
    commit_command = ['hg', 'commit', '-m',
                      'Noissue - Updated copyright year',
                      '--repository', repo]
    try:
        subprocess.check_call(commit_command)
    except subprocess.CalledProcessError as error:
        if error.returncode != 1:
            raise
        return  # exit status 1: no changes

    # Push changes, or save patch if access denied.
    # NOTE(review): the host check below is hard-coded to
    # hg.adblockplus.org, as pointed out in code review.
    if 'ssh://hg@hg.adblockplus.org/' in hg_upstream:
        hg_upstream += repo

    push_status = subprocess.call(
        ['hg', 'push', '--repository', repo, hg_upstream])
    if push_status == 0:
        return

    with open(repo + '.patch', 'wb') as file:
        print('couldnt push, making patch instead')
        subprocess.check_call(['hg', 'export', '--repository', repo],
                              stdout=file)
91 | |
92 | |
class Parser(html.parser.HTMLParser):
    """Collect repository links from the table on a Mercurial index page.

    For every table row, the ``href`` of an ``<a>`` whose ``class``
    contains ``list`` is remembered, and the first text chunk seen after
    the row's second ``<td>`` is treated as the description. The link is
    appended to ``result`` unless that description marks the repository as
    deprecated.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # All state is per instance; a class-level list would be shared by
        # every Parser and leak results from one page into the next.
        self.result = []
        self.recordtr = False
        self.cell = 0
        self.current_url = ''

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            self.recordtr = True
            # Forget the previous row's link so a row without a repository
            # link cannot record a stale URL.
            self.current_url = ''
        if tag == 'td':
            self.cell += 1
        if tag == 'a':
            attrs = dict(attrs)
            # A valueless attribute is reported as None, hence the `or ''`.
            if 'list' in (attrs.get('class') or '').split():
                self.current_url = attrs.get('href') or ''

    def handle_endtag(self, tag):
        if tag == 'tr':
            self.recordtr = False
            self.cell = 0

    def handle_data(self, data):
        if self.cell == 2 and self.recordtr is True:
            # Only the first text chunk of the description cell counts.
            self.recordtr = False
            self.cell = 0
            # Only process the URL if the description is not deprecated.
            # 'Deprecated' is matched as a bare word: the parentheses in
            # the original pattern were a regex group, not literal
            # characters.
            deprecated = '*DEPRECATED*' in data or 'Deprecated' in data
            if not deprecated and len(self.current_url) > 2:
                self.result.append(self.current_url)
            return self.result
124 | |
125 | |
def extract_urls(hg_page):
    """Return the absolute URLs of the repositories listed on ``hg_page``.

    hg_page: URL of the Mercurial index page to download and scrape.

    The page is decoded as UTF-8 and fed to ``Parser``; each link it
    collects is resolved against ``hg_page``.
    """
    parser = Parser()
    with urllib.request.urlopen(hg_page) as response:
        parser.feed(response.read().decode('utf-8'))
    parser.close()

    # Resolve against the page URL itself. urljoin already implements
    # relative-reference resolution, whereas deriving a base with
    # posixpath.dirname() mangles a bare-host URL such as
    # 'https://example.org' into 'https:/'.
    return [urllib.parse.urljoin(hg_page, url) for url in parser.result]
136 | |
137 | |
def main(hg_page, hg_upstream):
    """Process every repository listed on the ``hg_page`` index page.

    hg_page: URL of the Mercurial index page to scrape.
    hg_upstream: push target handed to ``process_repo`` for each
        repository (may be ``None``).
    """
    for repo_url in extract_urls(hg_page):
        process_repo(repo_url, hg_upstream)
141 | |
142 | |
if __name__ == '__main__':
    # Command-line entry point: -u/--hg-url is mandatory, -p/--push-url is
    # optional and defaults to None.
    arg_parser = argparse.ArgumentParser()
    # required=True lets argparse report the missing option and exit with
    # status 2 itself, so no manual check is needed.
    arg_parser.add_argument('-u', '--hg-url',
                            required=True,
                            help='specify which Mercurial URL site to scrape')
    arg_parser.add_argument('-p', '--push-url',
                            default=None,
                            help='specify where to push the repository')
    args = arg_parser.parse_args()
    main(args.hg_url, args.push_url)
OLD | NEW |