From 889a91ca22d547255c2004a8f708acba6e83c727 Mon Sep 17 00:00:00 2001
From: Blake Irvin <blakeirvin@me.com>
Date: Fri, 10 May 2024 16:35:46 +0200
Subject: [PATCH] Feat: Enable Column Cleanup by Regex Match

Co-authored-by: Daniil Bandarenka <dbondarenko.post@gmail.com>
---
 .gitignore                                    |  3 +++
 .../hny-column-cleanup.py                     | 20 ++++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index b6f5e69..29ef5c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,6 @@ stack_parameters.json
 
 # Macs
 .DS_Store
+
+# Python
+**/venv/**
diff --git a/tools/hny_dataset_cleanup_tool/hny-column-cleanup.py b/tools/hny_dataset_cleanup_tool/hny-column-cleanup.py
index 60dbbb2..a390c04 100755
--- a/tools/hny_dataset_cleanup_tool/hny-column-cleanup.py
+++ b/tools/hny_dataset_cleanup_tool/hny-column-cleanup.py
@@ -23,6 +23,8 @@
 import sys
 import signal
 import time
+import re
+import ipdb
 from datetime import date
 from datetime import datetime
 
@@ -69,6 +71,18 @@ def list_spammy_columns(dataset, api_key):
                 break  # end the inner loop in case there's multiple matches in the same string
     return spammy_column_ids
 
+def match_columns(dataset, api_key, regex_pattern):
+    """
+    Return a dict mapping column ID to key name for each column in the dataset whose key name matches regex_pattern (tested with re.match, so anchored at the start of the name)
+    """
+    candidates = fetch_all_columns(dataset, api_key)
+    matcher = re.compile(regex_pattern)
+    return {
+        entry['id']: entry['key_name']
+        for entry in candidates
+        if matcher.match(entry['key_name'])
+    }
+
 def list_columns_by_date(dataset, api_key, date):
     """
     List columns by date in a dataset and return the list as an array of column IDs. The created date is set in `column_created_date_string` for now.
@@ -132,11 +146,13 @@ def delete_columns(dataset, api_key, is_dry_run, column_ids):
         parser.add_argument('-d', '--dataset',
                             help='Honeycomb Dataset', required=True)
         parser.add_argument('-m', '--mode', default='hidden',
-                            choices=['hidden', 'spammy', 'date', 'last_written_before'], help='Type of columns to clean up')
+                            choices=['hidden', 'spammy', 'date', 'last_written_before', 'regex_pattern'], help='Type of columns to clean up')
         parser.add_argument('--dry-run', default=False,
                             action=argparse.BooleanOptionalAction, help='Will print out the columns it would delete without deleting them')
         parser.add_argument('--date', type=date.fromisoformat, default=None,
                             help='Date filter to use with date and last_written_before modes (YYYY-MM-DD)')
+        parser.add_argument('--regex_pattern',
+                            help='Regular expression to match on column names')
         args = parser.parse_args()
 
         columns_to_delete = {}
@@ -145,6 +161,8 @@ def delete_columns(dataset, api_key, is_dry_run, column_ids):
             columns_to_delete = list_hidden_columns(args.dataset, args.api_key)
         elif args.mode == 'spammy':
             columns_to_delete = list_spammy_columns(args.dataset, args.api_key)
+        elif args.mode == 'regex_pattern':
+            columns_to_delete = match_columns(args.dataset, args.api_key, args.regex_pattern)
         elif (args.mode == 'date' and args.date is not None):
             columns_to_delete = list_columns_by_date(args.dataset, args.api_key, args.date)
         elif (args.mode == 'last_written_before' and args.date is not None):