Skip to content

Commit

Permalink
Feat: Enable Column Cleanup by Regex Match
Browse files Browse the repository at this point in the history
Co-authored-by: Daniil Bandarenka <dbondarenko.post@gmail.com>
  • Loading branch information
bixu and BandarenkaDaniil committed May 10, 2024
1 parent 8dc07cb commit 889a91c
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,6 @@ stack_parameters.json

# Macs
.DS_Store

# Python
**/venv/**
20 changes: 19 additions & 1 deletion tools/hny_dataset_cleanup_tool/hny-column-cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import sys
import signal
import time
import re
import ipdb
from datetime import date
from datetime import datetime

Expand Down Expand Up @@ -69,6 +71,18 @@ def list_spammy_columns(dataset, api_key):
break # end the inner loop in case there's multiple matches in the same string
return spammy_column_ids

def match_columns(dataset, api_key, regex_pattern):
    """
    List columns in a dataset whose name matches a regular expression.

    The pattern is applied with `Pattern.match`, so it is anchored at the
    start of the column name only (use a trailing `$` to require a full match,
    or a leading `.*` to match anywhere in the name).

    Args:
        dataset: Dataset identifier passed through to `fetch_all_columns`.
        api_key: API key passed through to `fetch_all_columns`.
        regex_pattern: Regular expression (string) tested against each
            column's `key_name`.

    Returns:
        A dict mapping the `id` of every matching column to its `key_name`.

    Raises:
        ValueError: If `regex_pattern` is missing or empty.
        re.error: If `regex_pattern` is not a valid regular expression.
    """
    # An absent pattern would crash inside re.compile with an opaque TypeError,
    # and an empty pattern matches *every* column name -- far too dangerous for
    # a tool whose next step is deletion. Reject both up front.
    if not regex_pattern:
        raise ValueError(
            'A non-empty --regex_pattern is required when using the regex_pattern mode')

    # Compile before fetching so an invalid expression fails without first
    # spending a request on the column list.
    pattern = re.compile(regex_pattern)
    all_columns = fetch_all_columns(dataset, api_key)
    return {
        column['id']: column['key_name']
        for column in all_columns
        if pattern.match(column['key_name'])
    }

def list_columns_by_date(dataset, api_key, date):
"""
List columns by date in a dataset and return the list as an array of column IDs. The created date is set in `column_created_date_string` for now.
Expand Down Expand Up @@ -132,11 +146,13 @@ def delete_columns(dataset, api_key, is_dry_run, column_ids):
parser.add_argument('-d', '--dataset',
help='Honeycomb Dataset', required=True)
parser.add_argument('-m', '--mode', default='hidden',
choices=['hidden', 'spammy', 'date', 'last_written_before'], help='Type of columns to clean up')
choices=['hidden', 'spammy', 'date', 'last_written_before', 'regex_pattern'], help='Type of columns to clean up')
parser.add_argument('--dry-run', default=False,
action=argparse.BooleanOptionalAction, help='Will print out the columns it would delete without deleting them')
parser.add_argument('--date', type=date.fromisoformat, default=None,
help='Date filter to use with date and last_written_before modes (YYYY-MM-DD)')
parser.add_argument('--regex_pattern',
help='Regular expression to match on column names')
args = parser.parse_args()

columns_to_delete = {}
Expand All @@ -145,6 +161,8 @@ def delete_columns(dataset, api_key, is_dry_run, column_ids):
columns_to_delete = list_hidden_columns(args.dataset, args.api_key)
elif args.mode == 'spammy':
columns_to_delete = list_spammy_columns(args.dataset, args.api_key)
elif args.mode == 'regex_pattern':
columns_to_delete = match_columns(args.dataset, args.api_key, args.regex_pattern)
elif (args.mode == 'date' and args.date is not None):
columns_to_delete = list_columns_by_date(args.dataset, args.api_key, args.date)
elif (args.mode == 'last_written_before' and args.date is not None):
Expand Down

0 comments on commit 889a91c

Please sign in to comment.