update-copyright/update_copyright.py - Issue 29459580: Issue 5250 - Add copyright update script

Delta Between Two Patch Sets: update-copyright/update_copyright.py

Issue 29459580: Issue 5250 - Add copyright update script (Closed) Base URL: https://hg.adblockplus.org/codingtools

Left Patch Set: Minor formatting fixes Created July 4, 2017, 3:13 p.m.

Right Patch Set: Fix indentation and default args Created July 17, 2017, 1:22 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 #!/usr/bin/env python3	1 #!/usr/bin/env python3
Sebastian Noack 2017/07/05 15:46:53 I suppose we should make this script executable (i.e. the mode above should be 755 instead of 644). Note that if the file is not executable, the shebang (i.e. #!) here would be useless. Once you made the script executable, you might also want to adapt the README to instruct calling ./update_copyright.py instead of "python3 update_copyright.py", which as a side effect removes some duplication. rosie 2017/07/07 15:55:48 Done. Show quoted text On 2017/07/05 15:46:53, Sebastian Noack wrote: > I suppose we should make this script executable (i.e. the mode above should be > 755 instead of 644). Note that if the file is not executable, the shebang (i.e. > #!) here would be useless. > > Once you made the script executable, you might also want to adapt the README to > instruct calling ./update_copyright.py instead of "python3 update_copyright.py", > which as a side effect removes some duplication. Done.
2	2

3 import os	3 import os

4 import subprocess	4 import subprocess

5 import re	5 import re

6 import datetime	6 import datetime

7 import shutil	7 import shutil

8 import urllib.parse	8 import urllib.parse

9 import urllib.request	9 import urllib.request

10 import html.parser	10 import html.parser

11 import argparse	11 import argparse

(...skipping 39 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
51 finally:	51 finally:

52 shutil.rmtree(repo, ignore_errors=True)	52 shutil.rmtree(repo, ignore_errors=True)

53	53

54	54

55 def text_replace(dirpath, filename):	55 def text_replace(dirpath, filename):

56 with open(os.path.join(dirpath, filename), 'r+',	56 with open(os.path.join(dirpath, filename), 'r+',

57 encoding='utf-8', newline='') as file:	57 encoding='utf-8', newline='') as file:

58 try:	58 try:

59 text = file.read()	59 text = file.read()

60 except UnicodeDecodeError:	60 except UnicodeDecodeError:

61 print("Error: Couldn't read {}{}".format(dirpath, filename))
Sebastian Noack 2017/07/05 15:46:53 Failing silently (in the original code) was intended here, as this script is expected to encounter many binary files (e.g. images). If we output an error message every time, this will generate a huge amount of noise. rosie 2017/07/07 15:55:49 Done. Show quoted text On 2017/07/05 15:46:53, Sebastian Noack wrote: > Failing silently (in the original code) was intended here, as this script is > expected to encounter many binary files (e.g. images). If we output an error > message every time, this will generate a huge amount of noise. Done.
62 return	61 return

63	62

64 text = re.sub(	63 text = re.sub(

65 r'(copyright.*?\d{4})(?:-\d{4})?\s+eyeo gmbh',	64 r'(copyright.*?\d{4})(?:-\d{4})?\s+eyeo gmbh',

66 r'\1-{} eyeo GmbH'.format(CURRENT_YEAR), text, 0, re.I	65 r'\1-{} eyeo GmbH'.format(CURRENT_YEAR), text, 0, re.I

67 )	66 )

68 file.seek(0)	67 file.seek(0)

69 file.write(text)	68 file.write(text)

70 file.truncate()	69 file.truncate()

71	70

(...skipping 35 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
107 def handle_endtag(self, tag):	106 def handle_endtag(self, tag):

108 if tag == 'tr':	107 if tag == 'tr':

109 self.recordtr = False	108 self.recordtr = False

110 self.cell = 0	109 self.cell = 0

111	110

112 def handle_data(self, data):	111 def handle_data(self, data):

113 if self.cell == 2 and self.recordtr is True:	112 if self.cell == 2 and self.recordtr is True:

114 self.recordtr = False	113 self.recordtr = False

115 self.cell = 0	114 self.cell = 0

116 # Only process the URL if the description is not Deprecated	115 # Only process the URL if the description is not Deprecated

117 deprecated = (re.search(r'\DEPRECATED\', data) or	116 if ('DEPRECATED' not in data and

118 re.search(r'(Deprecated)', data))	117 '(Deprecated)' not in data and
Sebastian Noack 2017/07/05 15:46:53 This regular expression seems incorrect. The parentheses annotate a group. So it won't only match substrings like "(Deprecated)" but any occurrence of "Deprecated", storing the result in the 1st group of the match (which is discarded here). If you want to match parentheses in the input string, you have to escape them in the pattern: re.search(r'\(Deprecated\)', data) Anyway, if all you want to do is to find a fixed substring, it would be much simpler (and less error-prone) to just use the in-operator (instead of regular expressions): 'DEPRECATED' in data or '(Deprecated)' in data rosie 2017/07/07 15:55:48 Done. Show quoted text On 2017/07/05 15:46:53, Sebastian Noack wrote: > This regular expression seems incorrect. The parentheses annotate a group. So it > won't only match substrings like "(Deprecated)" but any occurrence of > "Deprecated", storing the result in the 1st group of the match (which is > discarded here). If you want to match parentheses in the input string, you have > to escape them in the pattern: > > re.search(r'\(Deprecated\)', data) > > Anyway, if all you want to do is to find a fixed substring, it would be much > simpler (and less error-prone) to just use the in-operator (instead of regular > expressions): > > 'DEPRECATED' in data or '(Deprecated)' in data Done.
119 if not deprecated and len(self.current_url) > 2:	118 len(self.current_url) > 2):

120 self.result += [self.current_url]	119 self.result += [self.current_url]

121 return self.result	120 return self.result

122	121

123	122

124 def extract_urls(hg_page):	123 def extract_urls(hg_page):

125 base_url = os.path.dirname(hg_page) + '/'	124 base_url = os.path.dirname(hg_page) + '/'

126 parser = Parser()	125 parser = Parser()

127 with urllib.request.urlopen(hg_page) as response:	126 with urllib.request.urlopen(hg_page) as response:

128 parser.feed(response.read().decode('utf-8'))	127 parser.feed(response.read().decode('utf-8'))

129 parser.close()	128 parser.close()

130 repo_urls = []	129 repo_urls = []

131 for url in parser.result:	130 for url in parser.result:

132 repo_urls.append(urllib.parse.urljoin(base_url, url))	131 repo_urls.append(urllib.parse.urljoin(base_url, url))

133 return repo_urls	132 return repo_urls

134	133

135	134

136 def main(hg_page, hg_upstream):	135 def main(hg_page, hg_upstream):

137 for repo in extract_urls(hg_page):	136 for repo in extract_urls(hg_page):

138 process_repo(repo, hg_upstream)	137 process_repo(repo, hg_upstream)

139	138

140	139

141 if __name__ == '__main__':	140 if __name__ == '__main__':

142 arg_parser = argparse.ArgumentParser()	141 arg_parser = argparse.ArgumentParser()

143 arg_parser.add_argument('-u', '--hg-url',	142 arg_parser.add_argument('-u', '--hg-url',

144 help='specify which Mercurial URL site to scrape',	143 help='specify which Mercurial URL site to scrape')

145 required=True)

146 arg_parser.add_argument('-p', '--push-url',	144 arg_parser.add_argument('-p', '--push-url',

147 default=None,

148 help='specify where to push the repository')	145 help='specify where to push the repository')

149 args = arg_parser.parse_args()	146 args = arg_parser.parse_args()

150 main(args.hg_url, args.push_url)	147 if args.hg_url and not args.push_url:

	148 arg_parser.error('If -u is provided, -p is mandatory')

	149 main(args.hg_url or 'https://hg.adblockplus.org/',

	150 args.push_url or 'ssh://hg@hg.adblockplus.org/')

LEFT	RIGHT