| OLD | NEW | 
| (Empty) |  | 
 |   1 #!/usr/bin/env python | 
 |   2 """Anonymize data in access log lines. | 
 |   3  | 
 |   4 Read lines from stdin and write each one to stdout with the following changes: | 
 |   5 1. IP (v4 or v6) replaced with a salted hash of the IP and the date | 
 |   6 2. Country information (extracted from IP) added after the salted hash. | 
 |   7  | 
 |   8 If the country information is unavailable in the database, '-' is added instead | 
 |   9 of the ISO 3166-1 alpha-2 country code (like 'DE'). | 
 |  10  | 
 |  11 Salt and the country information database are taken as command line options and | 
 |  12 default to environment variables. | 
 |  13  | 
 |  14 Malformed lines are passed on as is, based on the assumption that they don't | 
 |  15 contain sensitive information. Malformed here means the line couldn't be split | 
 |  16 on space character. If it could be split, and an error occurs afterwards | 
 |  17 (e.g. while trying to parse out the date), the script will fail and exit in | 
 |  18 order to bring attention to the fact that something might not be getting | 
 |  19 anonymized. | 
 |  20 """ | 
 |  21  | 
 |  22 from __future__ import print_function | 
 |  23 from __future__ import unicode_literals | 
 |  24  | 
 |  25 import argparse | 
 |  26 import hashlib | 
 |  27 import hmac | 
 |  28 import os | 
 |  29 import sys | 
 |  30  | 
 |  31 import geoip2.database | 
 |  32  | 
 |  33  | 
 |  34 def main(salt, country_db): | 
 |  35     reader = geoip2.database.Reader(country_db) | 
 |  36     salt = salt.encode('utf-8') | 
 |  37  | 
 |  38     for line in sys.stdin: | 
 |  39         try: | 
 |  40             ip, non_sensitive_info = line.split(' ', 1) | 
 |  41         except ValueError: | 
 |  42             print(line, end='') | 
 |  43             continue | 
 |  44  | 
 |  45         # http://geoip2.readthedocs.io/en/latest/#geoip2.database.Reader.country | 
 |  46         try: | 
 |  47             record = reader.country(ip) | 
 |  48         except geoip2.errors.AddressNotFoundError: | 
 |  49             country = '-' | 
 |  50         else: | 
 |  51             country = record.country.iso_code | 
 |  52  | 
 |  53         # 218.215.212.209 - - [04/May/2018:05:20:48 +0000] "GET /... | 
 |  54         date_start = line.index('[') + 1 | 
 |  55         # IP might be v4 or v6 | 
 |  56         date_end = line.index(':', date_start) | 
 |  57         date = line[date_start:date_end] | 
 |  58  | 
 |  59         # https://docs.python.org/2/library/hmac.html | 
 |  60         to_hash = (ip + date).encode('utf-8') | 
 |  61         token = hmac.HMAC(salt, to_hash, hashlib.sha1).hexdigest() | 
 |  62  | 
 |  63         print(token, country, non_sensitive_info, end='') | 
 |  64  | 
 |  65     reader.close() | 
 |  66  | 
 |  67  | 
 |  68 if __name__ == '__main__': | 
 |  69     parser = argparse.ArgumentParser( | 
 |  70         description='Filter out sensitive data from access logs', | 
 |  71     ) | 
 |  72  | 
 |  73     parser.add_argument( | 
 |  74         '--salt', | 
 |  75         dest='salt', | 
 |  76         default=os.getenv('ANONYMIZE_SALT'), | 
 |  77         help='Salt for hashing sensitive data, defaults to $ANONYMIZE_SALT' | 
 |  78     ) | 
 |  79  | 
 |  80     # https://dev.maxmind.com/geoip/geoip2/geolite2/ | 
 |  81     parser.add_argument( | 
 |  82         '--geolite2-db', | 
 |  83         dest='country_db', | 
 |  84         default=os.getenv('ANONYMIZE_GEOLITE2_DB'), | 
 |  85         help='Path to MaxMind DB file with GeoLite2 Country data, defaults ' | 
 |  86              'to $ANONYMIZE_GEOLITE2_DB' | 
 |  87     ) | 
 |  88  | 
 |  89     args = parser.parse_args() | 
 |  90  | 
 |  91     if args.salt is None or args.country_db is None: | 
 |  92         parser.print_help() | 
 |  93         sys.exit(1) | 
 |  94  | 
 |  95     main(args.salt, args.country_db) | 
| OLD | NEW |