Anonymization

Why?

On production like database we can experiment with queries and migrations on a similar size dataset

How?

Postgresql Anonymizer + Postgres.ai

Heroku Postgres

Remora Backup
offsite, daily

 

Digital Ocean
anonymization

machine

Postgres.ai
Thin clones
Ask Joe

PG Anonymizer


PLAIN SQL/plpgsql
Write your own functions
SLOW

Anon chain

# download db backup and decrypt it
sshpass -f "/root/.remora" scp 9983@backup.remorabackup.com:db_backup.gpg .
gpg --passphrase-file ~/.remora_gpg decrypt --output db_backup db_backup.gpg
gpg --batch --passphrase-file ~/.remora_gpg --decrypt --output db_backup db_backup.gpg
rm db_backup.gpg

# extract
dropdb --if-exists -h localhost -p 5432 -U postgres d678q4cotr1hfd;
createdb -h localhost -p 5432 -U postgres d678q4cotr1hfd;
pg_restore -h localhost -U postgres -d d678q4cotr1hfd ~/anonymizer/db_backup
psql -h localhost -U postgres -d d678q4cotr1hfd \
			-c 'TRUNCATE versions, email_histories, activity_logs, login_sessions, sessions'

# anonymize
psql -h localhost -U postgres -d d678q4cotr1hfd < ~/anonymizer/init.sql
psql -h localhost -U postgres -d d678q4cotr1hfd < ~/anonymizer/addresses.sql
psql -h localhost -U postgres -d d678q4cotr1hfd < ~/anonymizer/authors.sql
psql -h localhost -U postgres -d d678q4cotr1hfd < ~/anonymizer/bulk_invitations.sql
psql -h localhost -U postgres -d d678q4cotr1hfd < ~/anonymizer/email_histories.sql
...

# dump anonymized db
docker exec -it b5a15c7dc4e1 pg_dump_anon -h localhost -U postgres -d d678q4cotr1hfd -f /var/lib/postgresql/data/full_anon.sql

# restore anonymized
dropdb --if-exists -h localhost -p 5433 -U postgres anonymized;
createdb -h localhost -p 5433 -U postgres anonymized;
psql -h localhost -p 5433 -U postgres anonymized < /pg_data/full_anon.sql;

Example

SECURITY LABEL FOR anon ON COLUMN users.email
IS 'MASKED WITH FUNCTION anon.fake_email()';

SECURITY LABEL FOR anon ON COLUMN users.unconfirmed_email
IS 'MASKED WITH FUNCTION anon.fake_email()';

SECURITY LABEL FOR anon ON COLUMN users.name
IS 'MASKED WITH FUNCTION anon.fake_first_name()';

SECURITY LABEL FOR anon ON COLUMN users.note
IS 'MASKED WITH FUNCTION anon.lorem_ipsum(words := 2)';

SECURITY LABEL FOR anon ON COLUMN users.bio
IS 'MASKED WITH FUNCTION anon.lorem_ipsum(words := 10)';

SECURITY LABEL FOR anon ON COLUMN users.phone
IS 'MASKED WITH FUNCTION anon.random_phone()';

SECURITY LABEL FOR anon ON COLUMN users.paypal_user_id
IS 'MASKED WITH FUNCTION anon.random_phone(''1'')';

SECURITY LABEL FOR anon ON COLUMN users.current_sign_in_ip
IS 'MASKED WITH FUNCTION anon.randomize_ip(host(current_sign_in_ip))';

Example

CREATE OR REPLACE FUNCTION anon.randomize_ip(
  source_ip TEXT DEFAULT NULL
)
RETURNS TEXT
AS $func$
BEGIN

  IF source_ip = '' THEN
    RETURN NULL;
  ELSEIF LENGTH(source_ip) < 16 THEN
    RETURN array_to_string(
        ARRAY[
                (string_to_array(source_ip, '.'))[1],
                (string_to_array(source_ip, '.'))[2],
                CAST(anon.random_int_between(1,254) AS TEXT),
                CAST(anon.random_int_between(1,254) AS TEXT)
                ],
        '.'
    );
  ELSE
    RETURN '127.0.0.1';
  END IF;
END;
$func$
  LANGUAGE plpgsql
  VOLATILE
  RETURNS NULL ON NULL INPUT
  SECURITY INVOKER
  SET search_path=''
;

deck

By Mikhail Shirkov