| Index: anonymized.py | 
| diff --git a/anonymized.py b/anonymized.py | 
| new file mode 100755 | 
| index 0000000000000000000000000000000000000000..8456beef2d2cd32c5e0477e8dbf68fe4fa75b0b4 | 
| --- /dev/null | 
| +++ b/anonymized.py | 
| @@ -0,0 +1,113 @@ | 
| +#!/usr/bin/env python | 
| +"""Anonymize data in access log lines. | 
| + | 
| +Read a line from stdin, write it to stdout with the following changes: | 
| +1. IP (v4 or v6) replaced with a salted hash of the IP and the date | 
| +2. Country and city information (extracted from IP) added after the salted hash. | 
| + | 
| +If the country or city information is unavailable in the database, '-' is added instead | 
| +of ISO 3166-1 alpha-2 country/city code (like 'DE'). | 
| + | 
| +Salt and the country/city information database are taken as command line options and | 
| +default to environment variables. | 
| + | 
| +Malformed lines are passed on as is, based on the assumption that they don't | 
| +contain sensitive information. Malformed here means the line couldn't be split | 
| +on space character. If it could be split, and an error occurs afterwards | 
| +(e.g. while trying to parse out the date), the script will fail and exit in | 
| +order to bring attention to the fact that something might not be getting | 
| +anonymized. | 
| +""" | 
| + | 
| +from __future__ import print_function | 
| +from __future__ import unicode_literals | 
| + | 
| +import argparse | 
| +import hashlib | 
| +import hmac | 
| +import os | 
| +import sys | 
| + | 
| +import geoip2.database | 
| + | 
| +def main(salt, country_db, city_db): | 
| +    country_reader = geoip2.database.Reader(country_db) | 
| +    city_reader = geoip2.database.Reader(city_db) | 
| +    salt = salt.encode('utf-8') | 
| + | 
| +    for line in sys.stdin: | 
| +        try: | 
| +            ip, non_sensitive_info = line.split(' ', 1) | 
| +        except ValueError: | 
| +            print(line, end='') | 
| +            continue | 
| + | 
| +        # http://geoip2.readthedocs.io/en/latest/#geoip2.database.Reader.country | 
| +        try: | 
| +            record = country_reader.country(ip) | 
| +        except geoip2.errors.AddressNotFoundError: | 
| +            country = '-' | 
| +        else: | 
| +            country = record.country.iso_code | 
| + | 
| +        try: | 
| +            record = city_reader.city(ip) | 
| +        except geoip2.errors.AddressNotFoundError: | 
| +            city = '-' | 
| +        else: | 
| +            city = record.city.name | 
| + | 
| +        # 218.215.212.209 - - [04/May/2018:05:20:48 +0000] "GET /... | 
| +        date_start = line.index('[') + 1 | 
| +        # IP might be v4 or v6 | 
| +        date_end = line.index(':', date_start) | 
| +        date = line[date_start:date_end] | 
| + | 
| +        # https://docs.python.org/2/library/hmac.html | 
| +        to_hash = (ip + date).encode('utf-8') | 
| +        token = hmac.HMAC(salt, to_hash, hashlib.sha1).hexdigest() | 
| + | 
| +        print(token, country, city, non_sensitive_info, end='') | 
| + | 
| +    country_reader.close() | 
| +    city_reader.close() | 
| + | 
| + | 
| +if __name__ == '__main__': | 
| +    parser = argparse.ArgumentParser( | 
| +        description='Filter out sensitive data from access logs', | 
| +    ) | 
| + | 
| +    parser.add_argument( | 
| +        '--salt', | 
| +        dest='salt', | 
| +        default=os.getenv('ANONYMIZE_SALT'), | 
| +        help='Salt for hashing sensitive data, defaults to $ANONYMIZE_SALT' | 
| +    ) | 
| + | 
| +    # https://dev.maxmind.com/geoip/geoip2/geolite2/ | 
| +    parser.add_argument( | 
| +        '--geolite2-country-db', | 
| +        dest='country_db', | 
| +        default=os.getenv('ANONYMIZE_GEOLITE2_DB'), | 
| +        help='Path to MaxMind DB file with GeoLite2 Country data, defaults ' | 
| +             'to $ANONYMIZE_GEOLITE2_DB' | 
| +    ) | 
| + | 
| +    # https://dev.maxmind.com/geoip/geoip2/geolite2/ | 
| +    parser.add_argument( | 
| +        '--geolite2-city-db', | 
| +        dest='city_db', | 
| +        default=os.getenv('ANONYMIZE_GEOLITE2_DB'), | 
| +        help='Path to MaxMind DB file with GeoLite2 City data, defaults ' | 
| +             'to $ANONYMIZE_GEOLITE2_DB' | 
| +    ) | 
| + | 
| +    args = parser.parse_args() | 
| + | 
| +    if args.salt is None or args.country_db is None or args.city_db is None: | 
| +        parser.print_help() | 
| +        sys.exit(1) | 
| + | 
| +    main(args.salt, args.country_db, args.city_db) | 
| + | 
|  |