Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: anonymized.py

Issue 29817563: #12025 - Add city information to anonymized.py (Closed)
Left Patch Set: #12025 - Add city information to anonymized.py Created July 2, 2018, 2:36 p.m.
Right Patch Set: #12025 - Add city information to anonymized.py Created July 2, 2018, 2:42 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « no previous file | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
LEFT | RIGHT
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """Anonymize data in access log lines. 2 """Anonymize data in access log lines.
3 3
4 Read a line from stdin, write it to stdout with the following changes: 4 Read a line from stdin, write it to stdout with the following changes:
5 1. IP (v4 or v6) replaced with a salted hash of the IP and the date 5 1. IP (v4 or v6) replaced with a salted hash of the IP and the date
6 2. Country information (extracted from IP) added after the salted hash. 6 2. Country and city information (extracted from IP) added after the salted hash.
7 7
8 If the country information is unavailable in the database, '-' is added instead 8 If the country or city information is unavailable in the database,
9 of ISO 3166-1 alpha-2 country code (like 'DE'). 9 '-' is added instead of ISO 3166-1 alpha-2 country code (like DE) &
10 city name (like 'Berlin').
10 11
11 Salt and the country information database are taken as command line options and 12 Salt and the country/city information database are taken
12 default to environment variables. 13 as command line options and default to environment variables.
13 14
14 Malformed lines are passed on as is, based on the assumption that they don't 15 Malformed lines are passed on as is, based on the assumption that they don't
15 contain sensitive information. Malformed here means the line couldn't be split 16 contain sensitive information. Malformed here means the line couldn't be split
16 on space character. If it could be split, and an error occurs afterwards 17 on space character. If it could be split, and an error occurs afterwards
17 (e.g. while trying to parse out the date), the script will fail and exit in 18 (e.g. while trying to parse out the date), the script will fail and exit in
18 order to bring attention to the fact that something might not be getting 19 order to bring attention to the fact that something might not be getting
19 anonymized. 20 anonymized.
20 """ 21 """
21 22
22 from __future__ import print_function 23 from __future__ import print_function
23 from __future__ import unicode_literals 24 from __future__ import unicode_literals
24 25
25 import argparse 26 import argparse
26 import hashlib 27 import hashlib
27 import hmac 28 import hmac
28 import os 29 import os
29 import sys 30 import sys
30 31
31 import geoip2.database 32 import geoip2.database
32 33
33 34 def main(salt, country_db, city_db):
34 def main(salt, country_db): 35 country_reader = geoip2.database.Reader(country_db)
35 reader = geoip2.database.Reader(country_db) 36 city_reader = geoip2.database.Reader(city_db)
36 salt = salt.encode('utf-8') 37 salt = salt.encode('utf-8')
37 38
38 for line in sys.stdin: 39 for line in sys.stdin:
39 try: 40 try:
40 ip, non_sensitive_info = line.split(' ', 1) 41 ip, non_sensitive_info = line.split(' ', 1)
41 except ValueError: 42 except ValueError:
42 print(line, end='') 43 print(line, end='')
43 continue 44 continue
44 45
45 # http://geoip2.readthedocs.io/en/latest/#geoip2.database.Reader.country 46 # http://geoip2.readthedocs.io/en/latest/#geoip2.database.Reader.country
46 try: 47 try:
47 record = reader.country(ip) 48 record = country_reader.country(ip)
48 except geoip2.errors.AddressNotFoundError: 49 except geoip2.errors.AddressNotFoundError:
49 country = '-' 50 country = '-'
50 else: 51 else:
51 country = record.country.iso_code 52 country = record.country.iso_code
53
54 try:
55 record = city_reader.city(ip)
56 except geoip2.errors.AddressNotFoundError:
57 city = '-'
58 else:
59 city = record.city.name
52 60
53 # 218.215.212.209 - - [04/May/2018:05:20:48 +0000] "GET /... 61 # 218.215.212.209 - - [04/May/2018:05:20:48 +0000] "GET /...
54 date_start = line.index('[') + 1 62 date_start = line.index('[') + 1
55 # IP might be v4 or v6 63 # IP might be v4 or v6
56 date_end = line.index(':', date_start) 64 date_end = line.index(':', date_start)
57 date = line[date_start:date_end] 65 date = line[date_start:date_end]
58 66
59 # https://docs.python.org/2/library/hmac.html 67 # https://docs.python.org/2/library/hmac.html
60 to_hash = (ip + date).encode('utf-8') 68 to_hash = (ip + date).encode('utf-8')
61 token = hmac.HMAC(salt, to_hash, hashlib.sha1).hexdigest() 69 token = hmac.HMAC(salt, to_hash, hashlib.sha1).hexdigest()
62 70
63 print(token, country, non_sensitive_info, end='') 71 print(token, country, city, non_sensitive_info, end='')
64 72
65 reader.close() 73 country_reader.close()
74 city_reader.close()
66 75
67 76
68 if __name__ == '__main__': 77 if __name__ == '__main__':
69 parser = argparse.ArgumentParser( 78 parser = argparse.ArgumentParser(
70 description='Filter out sensitive data from access logs', 79 description='Filter out sensitive data from access logs',
71 ) 80 )
72 81
73 parser.add_argument( 82 parser.add_argument(
74 '--salt', 83 '--salt',
75 dest='salt', 84 dest='salt',
76 default=os.getenv('ANONYMIZE_SALT'), 85 default=os.getenv('ANONYMIZE_SALT'),
77 help='Salt for hashing sensitive data, defaults to $ANONYMIZE_SALT' 86 help='Salt for hashing sensitive data, defaults to $ANONYMIZE_SALT'
78 ) 87 )
79 88
80 # https://dev.maxmind.com/geoip/geoip2/geolite2/ 89 # https://dev.maxmind.com/geoip/geoip2/geolite2/
81 parser.add_argument( 90 parser.add_argument(
82 '--geolite2-db', 91 '--geolite2-country-db',
83 dest='country_db', 92 dest='country_db',
84 default=os.getenv('ANONYMIZE_GEOLITE2_DB'), 93 default=os.getenv('ANONYMIZE_GEOLITE2_DB'),
85 help='Path to MaxMind DB file with GeoLite2 Country data, defaults ' 94 help='Path to MaxMind DB file with GeoLite2 Country data, defaults '
86 'to $ANONYMIZE_GEOLITE2_DB' 95 'to $ANONYMIZE_GEOLITE2_DB'
87 ) 96 )
88 97
98 # https://dev.maxmind.com/geoip/geoip2/geolite2/
99 parser.add_argument(
100 '--geolite2-city-db',
101 dest='city_db',
102 default=os.getenv('ANONYMIZE_GEOLITE2_DB'),
103 help='Path to MaxMind DB file with GeoLite2 City data, defaults '
104 'to $ANONYMIZE_GEOLITE2_DB'
105 )
106
89 args = parser.parse_args() 107 args = parser.parse_args()
90 108
91 if args.salt is None or args.country_db is None: 109 if args.salt is None or args.country_db is None or args.city_db is None:
92 parser.print_help() 110 parser.print_help()
93 sys.exit(1) 111 sys.exit(1)
94 112
95 main(args.salt, args.country_db) 113 main(args.salt, args.country_db, args.city_db)
96 114
LEFT | RIGHT
« no previous file | no next file » | Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Toggle Comments ('s')

Powered by Google App Engine
This is Rietveld