OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env python |
| 2 """Anonymize data in access log lines. |
| 3 |
| 4 Read a line from stdin, write it to stdout with the following changes: |
| 5 1. IP (v4 or v6) replaced with a salted hash of the IP and the date |
| 6 2. Country and city information (extracted from IP) added after the salted hash. |
| 7 |
| 8 If the country or city information is unavailable in the database, '-' is added
instead |
| 9 of the ISO 3166-1 alpha-2 country code (like 'DE') or the city name. |
| 10 |
| 11 Salt and the country/city information database are taken as command line options
and |
| 12 default to environment variables. |
| 13 |
| 14 Malformed lines are passed on as is, based on the assumption that they don't |
| 15 contain sensitive information. Malformed here means the line couldn't be split |
| 16 on space character. If it could be split, and an error occurs afterwards |
| 17 (e.g. while trying to parse out the date), the script will fail and exit in |
| 18 order to bring attention to the fact that something might not be getting |
| 19 anonymized. |
| 20 """ |
| 21 |
| 22 from __future__ import print_function |
| 23 from __future__ import unicode_literals |
| 24 |
| 25 import argparse |
| 26 import hashlib |
| 27 import hmac |
| 28 import os |
| 29 import sys |
| 30 |
| 31 import geoip2.database |
| 32 |
def main(salt, country_db, city_db):
    """Anonymize access log lines read from stdin and write them to stdout.

    The leading IP address of each line is replaced with an HMAC-SHA1 token
    keyed with the salt and computed over the IP plus the request date. The
    token is followed by the country ISO code and the city name looked up
    for that IP, then by the untouched remainder of the line. Country and
    city are written as '-' when the database has no entry for the address
    or the entry carries no value.

    Lines that contain no space are echoed unchanged. Any other failure
    (for example a line without '[' in front of the date) is deliberately
    left to propagate, so the script stops instead of silently passing on
    data that was not anonymized.

    Args:
        salt: Text used as the HMAC key; encoded as UTF-8 before use.
        country_db: Path of the MaxMind DB file used for country lookups.
        city_db: Path of the MaxMind DB file used for city lookups.
    """
    from contextlib import closing

    # Imported explicitly instead of relying on geoip2.database having
    # loaded the errors submodule as a side effect.
    from geoip2.errors import AddressNotFoundError

    key = salt.encode('utf-8')

    # closing() guarantees both readers are released even when a line makes
    # the loop raise, which is the intended reaction to unparsable input.
    with closing(geoip2.database.Reader(country_db)) as country_reader, \
            closing(geoip2.database.Reader(city_db)) as city_reader:
        for line in sys.stdin:
            try:
                ip, non_sensitive_info = line.split(' ', 1)
            except ValueError:
                # No space at all: treated as malformed and passed through.
                print(line, end='')
                continue

            # http://geoip2.readthedocs.io/en/latest/#geoip2.database.Reader.country
            try:
                country = country_reader.country(ip).country.iso_code
            except AddressNotFoundError:
                country = None

            try:
                city = city_reader.city(ip).city.name
            except AddressNotFoundError:
                city = None

            # 218.215.212.209 - - [04/May/2018:05:20:48 +0000] "GET /...
            # The date runs from just after '[' up to the first ':' that
            # follows it; searching from date_start skips the colons of an
            # IPv6 address at the beginning of the line.
            date_start = line.index('[') + 1
            date_end = line.index(':', date_start)
            date = line[date_start:date_end]

            # https://docs.python.org/2/library/hmac.html
            to_hash = (ip + date).encode('utf-8')
            token = hmac.new(key, to_hash, hashlib.sha1).hexdigest()

            # A record can exist without an ISO code or city name; fall back
            # to '-' so the literal text 'None' never reaches the output.
            print(token, country or '-', city or '-', non_sensitive_info,
                  end='')
| 74 |
| 75 |
if __name__ == '__main__':
    # Command-line entry point: collect the salt and the database paths from
    # options (falling back to environment variables) and run the filter.
    parser = argparse.ArgumentParser(
        description='Filter out sensitive data from access logs',
    )

    parser.add_argument(
        '--salt',
        dest='salt',
        default=os.getenv('ANONYMIZE_SALT'),
        help='Salt for hashing sensitive data, defaults to $ANONYMIZE_SALT'
    )

    # https://dev.maxmind.com/geoip/geoip2/geolite2/
    parser.add_argument(
        '--geolite2-country-db',
        dest='country_db',
        default=os.getenv('ANONYMIZE_GEOLITE2_DB'),
        help='Path to MaxMind DB file with GeoLite2 Country data, defaults '
             'to $ANONYMIZE_GEOLITE2_DB'
    )

    # https://dev.maxmind.com/geoip/geoip2/geolite2/
    # NOTE(review): this option shares its default environment variable with
    # --geolite2-country-db; confirm that a single database file is meant to
    # serve both lookups when only the variable is set.
    parser.add_argument(
        '--geolite2-city-db',
        dest='city_db',
        default=os.getenv('ANONYMIZE_GEOLITE2_DB'),
        help='Path to MaxMind DB file with GeoLite2 City data, defaults '
             'to $ANONYMIZE_GEOLITE2_DB'
    )

    args = parser.parse_args()

    # All three settings are mandatory. Empty values are rejected as well as
    # missing ones: an empty salt would silently produce unsalted hashes.
    if not (args.salt and args.country_db and args.city_db):
        # stdout carries the anonymized log stream, so usage goes to stderr.
        parser.print_help(sys.stderr)
        sys.exit(1)

    main(args.salt, args.country_db, args.city_db)
| 113 |
OLD | NEW |