From 889a91ca22d547255c2004a8f708acba6e83c727 Mon Sep 17 00:00:00 2001
From: Blake Irvin
Date: Fri, 10 May 2024 16:35:46 +0200
Subject: [PATCH] Feat: Enable Column Cleanup by Regex Match

Co-authored-by: Daniil Bandarenka
---
 .gitignore                                    |  3 +++
 .../hny-column-cleanup.py                     | 20 ++++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index b6f5e69..29ef5c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,6 @@ stack_parameters.json
 
 # Macs
 .DS_Store
+
+# Python
+**/venv/**
diff --git a/tools/hny_dataset_cleanup_tool/hny-column-cleanup.py b/tools/hny_dataset_cleanup_tool/hny-column-cleanup.py
index 60dbbb2..a390c04 100755
--- a/tools/hny_dataset_cleanup_tool/hny-column-cleanup.py
+++ b/tools/hny_dataset_cleanup_tool/hny-column-cleanup.py
@@ -23,6 +23,8 @@
 import sys
 import signal
 import time
+import re
+import ipdb  # NOTE(review): third-party debugger; unused by the code this patch adds - confirm it is needed
 
 from datetime import date
 from datetime import datetime
@@ -69,6 +71,18 @@ def list_spammy_columns(dataset, api_key):
                 break # end the inner loop in case there's multiple matches in the same string
     return spammy_column_ids
 
+def match_columns(dataset, api_key, regex_pattern):
+    """
+    List columns in a dataset whose name matches a regular expression (anchored at the start of the name, as
+    re.match does) and return them as a dict of column ID -> column name. Raises ValueError on an empty pattern.
+    """
+    if not regex_pattern:  # None would crash re.compile; '' would match every column in the dataset
+        raise ValueError('regex_pattern must be a non-empty regular expression')
+    pattern = re.compile(regex_pattern)  # compiled before the API call so a malformed pattern fails fast
+    return {column['id']: column['key_name']
+            for column in fetch_all_columns(dataset, api_key)
+            if pattern.match(column['key_name'])}
+
 def list_columns_by_date(dataset, api_key, date):
     """
     List columns by date in a dataset and return the list as an array of column IDs. The created date is set in `column_created_date_string` for now.
@@ -132,11 +146,13 @@ def delete_columns(dataset, api_key, is_dry_run, column_ids):
     parser.add_argument('-d', '--dataset',
                         help='Honeycomb Dataset', required=True)
     parser.add_argument('-m', '--mode', default='hidden',
-                        choices=['hidden', 'spammy', 'date', 'last_written_before'], help='Type of columns to clean up')
+                        choices=['hidden', 'spammy', 'date', 'last_written_before', 'regex_pattern'], help='Type of columns to clean up')
     parser.add_argument('--dry-run', default=False, action=argparse.BooleanOptionalAction,
                         help='Will print out the columns it would delete without deleting them')
     parser.add_argument('--date', type=date.fromisoformat, default=None,
                         help='Date filter to use with date and last_written_before modes (YYYY-MM-DD)')
+    parser.add_argument('--regex_pattern',  # no default/required given, so argparse leaves it None when omitted
+                        help='Regular expression to match on column names')
     args = parser.parse_args()
 
     columns_to_delete = {}
@@ -145,6 +161,8 @@ def delete_columns(dataset, api_key, is_dry_run, column_ids):
         columns_to_delete = list_hidden_columns(args.dataset, args.api_key)
     elif args.mode == 'spammy':
         columns_to_delete = list_spammy_columns(args.dataset, args.api_key)
+    elif args.mode == 'regex_pattern':  # NOTE(review): args.regex_pattern is not checked for None, unlike args.date below
+        columns_to_delete = match_columns(args.dataset, args.api_key, args.regex_pattern)
     elif (args.mode == 'date' and args.date is not None):
         columns_to_delete = list_columns_by_date(args.dataset, args.api_key, args.date)
     elif (args.mode == 'last_written_before' and args.date is not None):