update-copyright/update_copyright.py - Issue 29459580: Issue 5250 - Add copyright update script

Side by Side Diff: update-copyright/update_copyright.py

Issue 29459580: Issue 5250 - Add copyright update script (Closed) Base URL: https://hg.adblockplus.org/codingtools

Patch Set: Minor formatting fixes Created July 4, 2017, 3:13 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 #!/usr/bin/env python3
	Sebastian Noack 2017/07/05 15:46:53 I suppose we should make this script executable (i.e. the mode above should be 755 instead of 644). Note that if the file is not executable, the shebang (i.e. #!) here would be useless. Once you made the script executable, you might also want to adapt the README to instruct calling ./update_copyright.py instead of "python3 update_copyright.py", which as a side effect removes some duplication. rosie 2017/07/07 15:55:48 Done. Show quoted text On 2017/07/05 15:46:53, Sebastian Noack wrote: > I suppose we should make this script executable (i.e. the mode above should be > 755 instead of 644). Note that if the file is not executable, the shebang (i.e. > #!) here would be useless. > > Once you made the script executable, you might also want to adapt the README to > instruct calling ./update_copyright.py instead of "python3 update_copyright.py", > which as a side effect removes some duplication. Done.
	2

	3 import os

	4 import subprocess

	5 import re

	6 import datetime

	7 import shutil

	8 import urllib.parse

	9 import urllib.request

	10 import html.parser

	11 import argparse

	12

	13

	14 CURRENT_YEAR = datetime.datetime.now().year

	15

	16

	17 def process_repo(url, hg_upstream):

	18 repo = url.rstrip('/').split('/')[-1]

	19

	20 if repo in {

	21 # headers are copied from libadblockplus, no need to update seperately

	22 'libadblockplus-binaries',

	23 # huge and only contains autogenerated builds

	24 'downloads',

	25 }:

	26 return

	27

	28 try:

	29 subprocess.check_call(['hg', 'clone', url, repo])

	30 if repo == 'adblockbrowser':

	31 # adblockbrowser is a FF fork with its own changes in a

	32 # seperate branch

	33 subprocess.check_call(['hg', 'up', '--rev', 'adblockbrowser',

	34 '--repository', repo])

	35 else:

	36 # switch to 'master' bookmark if it exists

	37 subprocess.call(['hg', 'up', '--rev', 'master',

	38 '--repository', repo])

	39 for dirpath, dirnames, filenames in os.walk(repo):

	40 if dirpath == repo:

	41 dirnames.remove('.hg')

	42

	43 for filename in filenames:

	44 text_replace(dirpath, filename)

	45 if hg_upstream is None:

	46 hg_upstream = url

	47 else:

	48 hg_upstream += '/' + repo

	49 hg_commit(repo, hg_upstream)

	50

	51 finally:

	52 shutil.rmtree(repo, ignore_errors=True)

	53

	54

	55 def text_replace(dirpath, filename):

	56 with open(os.path.join(dirpath, filename), 'r+',

	57 encoding='utf-8', newline='') as file:

	58 try:

	59 text = file.read()

	60 except UnicodeDecodeError:

	61 print("Error: Couldn't read {}{}".format(dirpath, filename))
	Sebastian Noack 2017/07/05 15:46:53 Failing silently (in the original code) was intend Failing silently (in the original code) was intended here, as this script is expected to encounter many binary files (e.g. images). If we output an error message every time, this will generate a huge amount of noise. rosie 2017/07/07 15:55:49 Done. Show quoted text On 2017/07/05 15:46:53, Sebastian Noack wrote: > Failing silently (in the original code) was intended here, as this script is > expected to encounter many binary files (e.g. images). If we output an error > message every time, this will generate a huge amount of noise. Done.
	62 return

	63

	64 text = re.sub(

	65 r'(copyright.*?\d{4})(?:-\d{4})?\s+eyeo gmbh',

	66 r'\1-{} eyeo GmbH'.format(CURRENT_YEAR), text, 0, re.I

	67 )

	68 file.seek(0)

	69 file.write(text)

	70 file.truncate()

	71

	72

	73 def hg_commit(repo, hg_upstream):

	74 try:

	75 subprocess.check_call(['hg', 'commit', '-m',

	76 'Noissue - Updated copyright year',

	77 '--repository', repo])

	78 except subprocess.CalledProcessError as e:

	79 if e.returncode == 1: # no changes

	80 return

	81 raise

	82

	83 # Push changes, or save patch if access denied

	84 if subprocess.call(['hg', 'push', '--repository', repo, hg_upstream]) != 0:

	85 with open(repo + '.patch', 'wb') as file:

	86 print('couldnt push, making patch instead')

	87 subprocess.check_call(['hg', 'export', '--repository', repo],

	88 stdout=file)

	89

	90

	91 class Parser(html.parser.HTMLParser):

	92 result = []

	93 recordtr = False

	94 cell = 0

	95 current_url = ''

	96

	97 def handle_starttag(self, tag, attrs):

	98 if tag == 'tr':

	99 self.recordtr = True

	100 if tag == 'td':

	101 self.cell += 1

	102 if tag == 'a':

	103 attrs = dict(attrs)

	104 if 'list' in attrs.get('class', '').split():

	105 self.current_url = attrs['href']

	106

	107 def handle_endtag(self, tag):

	108 if tag == 'tr':

	109 self.recordtr = False

	110 self.cell = 0

	111

	112 def handle_data(self, data):

	113 if self.cell == 2 and self.recordtr is True:

	114 self.recordtr = False

	115 self.cell = 0

	116 # Only process the URL if the description is not Deprecated

	117 deprecated = (re.search(r'\DEPRECATED\', data) or

	118 re.search(r'(Deprecated)', data))
	Sebastian Noack 2017/07/05 15:46:53 This regular expression seems incorrect. The paren This regular expression seems incorrect. The parentheses annotate a group. So it won't only match substrings like "(Deprecated)" but any occurrence of "Deprecated", storing the result in the 1st group of the match (which is discarded here). If you want to match parantheses in the input string, you have to escape them in the pattern: re.search(r'\(Deprecated\)', data) Anyway, if all you want to do is to find a fixed substring, it would be much simpler (and less error-prone) to just use the in-operator (instead of regular expressions): 'DEPRECATED' in data or '(Deprecated)' in data rosie 2017/07/07 15:55:48 Done. Show quoted text On 2017/07/05 15:46:53, Sebastian Noack wrote: > This regular expression seems incorrect. The parentheses annotate a group. So it > won't only match substrings like "(Deprecated)" but any occurrence of > "Deprecated", storing the result in the 1st group of the match (which is > discarded here). If you want to match parantheses in the input string, you have > to escape them in the pattern: > > re.search(r'\(Deprecated\)', data) > > Anyway, if all you want to do is to find a fixed substring, it would be much > simpler (and less error-prone) to just use the in-operator (instead of regular > expressions): > > 'DEPRECATED' in data or '(Deprecated)' in data Done.
	119 if not deprecated and len(self.current_url) > 2:

	120 self.result += [self.current_url]

	121 return self.result

	122

	123

	124 def extract_urls(hg_page):

	125 base_url = os.path.dirname(hg_page) + '/'

	126 parser = Parser()

	127 with urllib.request.urlopen(hg_page) as response:

	128 parser.feed(response.read().decode('utf-8'))

	129 parser.close()

	130 repo_urls = []

	131 for url in parser.result:

	132 repo_urls.append(urllib.parse.urljoin(base_url, url))

	133 return repo_urls

	134

	135

	136 def main(hg_page, hg_upstream):

	137 for repo in extract_urls(hg_page):

	138 process_repo(repo, hg_upstream)

	139

	140

	141 if __name__ == '__main__':

	142 arg_parser = argparse.ArgumentParser()

	143 arg_parser.add_argument('-u', '--hg-url',

	144 help='specify which Mercurial URL site to scrape',

	145 required=True)

	146 arg_parser.add_argument('-p', '--push-url',

	147 default=None,

	148 help='specify where to push the repository')

	149 args = arg_parser.parse_args()

	150 main(args.hg_url, args.push_url)

OLD	NEW

« update-copyright/tox.ini ('K') | « update-copyright/tox.ini ('k') | no next file » | no next file with comments »