Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: anonymized.py

Issue 29817563: #12025 - Add city information to anonymized.py (Closed)
Patch Set: Created June 27, 2018, 1:23 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: anonymized.py
diff --git a/anonymized.py b/anonymized.py
new file mode 100755
index 0000000000000000000000000000000000000000..8456beef2d2cd32c5e0477e8dbf68fe4fa75b0b4
--- /dev/null
+++ b/anonymized.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+"""Anonymize data in access log lines.
+
+Read a line from stdin, write it to stdout with the following changes:
+1. IP (v4 or v6) replaced with a salted hash of the IP and the date
+2. Country and city information (extracted from IP) added after the salted hash.
+
+If the country or city information is unavailable in the database, '-' is added instead
+of ISO 3166-1 alpha-2 country/city code (like 'DE').
+
+Salt and the country/city information database are taken as command line options and
+default to environment variables.
+
+Malformed lines are passed on as is, based on the assumption that they don't
+contain sensitive information. Malformed here means the line couldn't be split
+on space character. If it could be split, and an error occurs afterwards
+(e.g. while trying to parse out the date), the script will fail and exit in
+order to bring attention to the fact that something might not be getting
+anonymized.
+"""
+
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import hashlib
+import hmac
+import os
+import sys
+
+import geoip2.database
+
+def main(salt, country_db, city_db):
+ country_reader = geoip2.database.Reader(country_db)
+ city_reader = geoip2.database.Reader(city_db)
+ salt = salt.encode('utf-8')
+
+ for line in sys.stdin:
+ try:
+ ip, non_sensitive_info = line.split(' ', 1)
+ except ValueError:
+ print(line, end='')
+ continue
+
+ # http://geoip2.readthedocs.io/en/latest/#geoip2.database.Reader.country
+ try:
+ record = country_reader.country(ip)
+ except geoip2.errors.AddressNotFoundError:
+ country = '-'
+ else:
+ country = record.country.iso_code
+
+ try:
+ record = city_reader.city(ip)
+ except geoip2.errors.AddressNotFoundError:
+ city = '-'
+ else:
+ city = record.city.name
+
+ # 218.215.212.209 - - [04/May/2018:05:20:48 +0000] "GET /...
+ date_start = line.index('[') + 1
+ # IP might be v4 or v6
+ date_end = line.index(':', date_start)
+ date = line[date_start:date_end]
+
+ # https://docs.python.org/2/library/hmac.html
+ to_hash = (ip + date).encode('utf-8')
+ token = hmac.HMAC(salt, to_hash, hashlib.sha1).hexdigest()
+
+ print(token, country, city, non_sensitive_info, end='')
+
+ country_reader.close()
+ city_reader.close()
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(
+ description='Filter out sensitive data from access logs',
+ )
+
+ parser.add_argument(
+ '--salt',
+ dest='salt',
+ default=os.getenv('ANONYMIZE_SALT'),
+ help='Salt for hashing sensitive data, defaults to $ANONYMIZE_SALT'
+ )
+
+ # https://dev.maxmind.com/geoip/geoip2/geolite2/
+ parser.add_argument(
+ '--geolite2-country-db',
+ dest='country_db',
+ default=os.getenv('ANONYMIZE_GEOLITE2_DB'),
+ help='Path to MaxMind DB file with GeoLite2 Country data, defaults '
+ 'to $ANONYMIZE_GEOLITE2_DB'
+ )
+
+ # https://dev.maxmind.com/geoip/geoip2/geolite2/
+ parser.add_argument(
+ '--geolite2-city-db',
+ dest='city_db',
+ default=os.getenv('ANONYMIZE_GEOLITE2_DB'),
+ help='Path to MaxMind DB file with GeoLite2 City data, defaults '
+ 'to $ANONYMIZE_GEOLITE2_DB'
+ )
+
+ args = parser.parse_args()
+
+ if args.salt is None or args.country_db is None or args.city_db is None:
+ parser.print_help()
+ sys.exit(1)
+
+ main(args.salt, args.country_db, args.city_db)
+
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld