LEFT | RIGHT |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """Anonymize data in access log lines. | 2 """Anonymize data in access log lines. |
3 | 3 |
4 Read a line from stdin, write it to stdout with the following changes: | 4 Read a line from stdin, write it to stdout with the following changes: |
5 1. IP (v4 or v6) replaced with a salted hash of the IP and the date | 5 1. IP (v4 or v6) replaced with a salted hash of the IP and the date |
6 2. Country information (extracted from IP) added after the salted hash. | 6 2. Country and city information (extracted from IP) added after the salted hash. |
7 | 7 |
8 If the country information is unavailable in the database, '-' is added instead | 8 If the country or city information is unavailable in the database, |
9 of ISO 3166-1 alpha-2 country code (like 'DE'). | 9 '-' is added in place of the missing ISO 3166-1 alpha-2 country code |
| 10 (like 'DE') or city name (like 'Berlin'). |
10 | 11 |
11 Salt and the country information database are taken as command line options and | 12 The salt and the country/city information databases are taken |
12 default to environment variables. | 13 as command line options and default to environment variables. |
13 | 14 |
14 Malformed lines are passed on as is, based on the assumption that they don't | 15 Malformed lines are passed on as is, based on the assumption that they don't |
15 contain sensitive information. Malformed here means the line couldn't be split | 16 contain sensitive information. Malformed here means the line couldn't be split |
16 on space character. If it could be split, and an error occurs afterwards | 17 on space character. If it could be split, and an error occurs afterwards |
17 (e.g. while trying to parse out the date), the script will fail and exit in | 18 (e.g. while trying to parse out the date), the script will fail and exit in |
18 order to bring attention to the fact that something might not be getting | 19 order to bring attention to the fact that something might not be getting |
19 anonymized. | 20 anonymized. |
20 """ | 21 """ |
21 | 22 |
22 from __future__ import print_function | 23 from __future__ import print_function |
23 from __future__ import unicode_literals | 24 from __future__ import unicode_literals |
24 | 25 |
25 import argparse | 26 import argparse |
26 import hashlib | 27 import hashlib |
27 import hmac | 28 import hmac |
28 import os | 29 import os |
29 import sys | 30 import sys |
30 | 31 |
31 import geoip2.database | 32 import geoip2.database |
32 | 33 |
33 | 34 def main(salt, country_db, city_db): |
34 def main(salt, country_db): | 35 country_reader = geoip2.database.Reader(country_db) |
35 reader = geoip2.database.Reader(country_db) | 36 city_reader = geoip2.database.Reader(city_db) |
36 salt = salt.encode('utf-8') | 37 salt = salt.encode('utf-8') |
37 | 38 |
38 for line in sys.stdin: | 39 for line in sys.stdin: |
39 try: | 40 try: |
40 ip, non_sensitive_info = line.split(' ', 1) | 41 ip, non_sensitive_info = line.split(' ', 1) |
41 except ValueError: | 42 except ValueError: |
42 print(line, end='') | 43 print(line, end='') |
43 continue | 44 continue |
44 | 45 |
45 # http://geoip2.readthedocs.io/en/latest/#geoip2.database.Reader.country | 46 # http://geoip2.readthedocs.io/en/latest/#geoip2.database.Reader.country |
46 try: | 47 try: |
47 record = reader.country(ip) | 48 record = country_reader.country(ip) |
48 except geoip2.errors.AddressNotFoundError: | 49 except geoip2.errors.AddressNotFoundError: |
49 country = '-' | 50 country = '-' |
50 else: | 51 else: |
51 country = record.country.iso_code | 52 country = record.country.iso_code |
| 53 |
| 54 try: |
| 55 record = city_reader.city(ip) |
| 56 except geoip2.errors.AddressNotFoundError: |
| 57 city = '-' |
| 58 else: |
| 59 city = record.city.name |
52 | 60 |
53 # 218.215.212.209 - - [04/May/2018:05:20:48 +0000] "GET /... | 61 # 218.215.212.209 - - [04/May/2018:05:20:48 +0000] "GET /... |
54 date_start = line.index('[') + 1 | 62 date_start = line.index('[') + 1 |
55 # IP might be v4 or v6 | 63 # IP might be v4 or v6 |
56 date_end = line.index(':', date_start) | 64 date_end = line.index(':', date_start) |
57 date = line[date_start:date_end] | 65 date = line[date_start:date_end] |
58 | 66 |
59 # https://docs.python.org/2/library/hmac.html | 67 # https://docs.python.org/2/library/hmac.html |
60 to_hash = (ip + date).encode('utf-8') | 68 to_hash = (ip + date).encode('utf-8') |
61 token = hmac.HMAC(salt, to_hash, hashlib.sha1).hexdigest() | 69 token = hmac.HMAC(salt, to_hash, hashlib.sha1).hexdigest() |
62 | 70 |
63 print(token, country, non_sensitive_info, end='') | 71 print(token, country, city, non_sensitive_info, end='') |
64 | 72 |
65 reader.close() | 73 country_reader.close() |
| 74 city_reader.close() |
66 | 75 |
67 | 76 |
68 if __name__ == '__main__': | 77 if __name__ == '__main__': |
69 parser = argparse.ArgumentParser( | 78 parser = argparse.ArgumentParser( |
70 description='Filter out sensitive data from access logs', | 79 description='Filter out sensitive data from access logs', |
71 ) | 80 ) |
72 | 81 |
73 parser.add_argument( | 82 parser.add_argument( |
74 '--salt', | 83 '--salt', |
75 dest='salt', | 84 dest='salt', |
76 default=os.getenv('ANONYMIZE_SALT'), | 85 default=os.getenv('ANONYMIZE_SALT'), |
77 help='Salt for hashing sensitive data, defaults to $ANONYMIZE_SALT' | 86 help='Salt for hashing sensitive data, defaults to $ANONYMIZE_SALT' |
78 ) | 87 ) |
79 | 88 |
80 # https://dev.maxmind.com/geoip/geoip2/geolite2/ | 89 # https://dev.maxmind.com/geoip/geoip2/geolite2/ |
81 parser.add_argument( | 90 parser.add_argument( |
82 '--geolite2-db', | 91 '--geolite2-country-db', |
83 dest='country_db', | 92 dest='country_db', |
84 default=os.getenv('ANONYMIZE_GEOLITE2_DB'), | 93 default=os.getenv('ANONYMIZE_GEOLITE2_DB'), |
85 help='Path to MaxMind DB file with GeoLite2 Country data, defaults ' | 94 help='Path to MaxMind DB file with GeoLite2 Country data, defaults ' |
86 'to $ANONYMIZE_GEOLITE2_DB' | 95 'to $ANONYMIZE_GEOLITE2_DB' |
87 ) | 96 ) |
88 | 97 |
| 98 # https://dev.maxmind.com/geoip/geoip2/geolite2/ |
| 99 parser.add_argument( |
| 100 '--geolite2-city-db', |
| 101 dest='city_db', |
| 102 default=os.getenv('ANONYMIZE_GEOLITE2_DB'), |
| 103 help='Path to MaxMind DB file with GeoLite2 City data, defaults ' |
| 104 'to $ANONYMIZE_GEOLITE2_DB' |
| 105 ) |
| 106 |
89 args = parser.parse_args() | 107 args = parser.parse_args() |
90 | 108 |
91 if args.salt is None or args.country_db is None: | 109 if args.salt is None or args.country_db is None or args.city_db is None: |
92 parser.print_help() | 110 parser.print_help() |
93 sys.exit(1) | 111 sys.exit(1) |
94 | 112 |
95 main(args.salt, args.country_db) | 113 main(args.salt, args.country_db, args.city_db) |
96 | 114 |
LEFT | RIGHT |