Index: sitescripts/subscriptions/combineSubscriptions.py |
=================================================================== |
--- a/sitescripts/subscriptions/combineSubscriptions.py |
+++ b/sitescripts/subscriptions/combineSubscriptions.py |
@@ -152,21 +152,20 @@ def resolveIncludes(sourceName, sourceDi |
error = None |
break |
except urllib2.URLError, e: |
error = e |
time.sleep(5) |
if error: |
raise error |
- charset = 'utf-8' |
- contentType = request.headers.get('content-type', '') |
- if contentType.find('charset=') >= 0: |
- charset = contentType.split('charset=', 1)[1] |
- newLines = unicode(request.read(), charset).split('\n') |
+ # We should really take the charset from the Content-Type header instead |
+ # of assuming UTF-8. However, some of the Google Code mirrors are |
+ # misconfigured and report ISO-8859-1 as the charset instead of UTF-8. |
+ newLines = unicode(request.read(), 'utf-8').split('\n') |
newLines = map(lambda l: re.sub(r'[\r\n]', '', l), newLines) |
newLines = filter(lambda l: not re.search(r'^\s*!.*?\bExpires\s*(?::|after)\s*(\d+)\s*(h)?', l, re.M | re.I), newLines) |
newLines = filter(lambda l: not re.search(r'^\s*!\s*(Redirect|Homepage|Title)\s*:', l, re.M | re.I), newLines) |
else: |
result.append('! *** %s ***' % file) |
includeSource = sourceName |
if file.find(':') >= 0: |