From a17220136e98ea6864c1e36d40688a8599ab3678 Mon Sep 17 00:00:00 2001 From: Harsh Bandhey Date: Wed, 6 Dec 2023 12:57:08 -0800 Subject: [PATCH] Added additional known_exclude_options correlation that shuts off correlation matrix ops --- streamline/dataprep/data_process.py | 36 ++++++++++++++++++++---- streamline/runners/dataprocess_runner.py | 6 +++- streamline/tests/test_classification.py | 4 +-- 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/streamline/dataprep/data_process.py b/streamline/dataprep/data_process.py index a5a33aba..e5778e61 100644 --- a/streamline/dataprep/data_process.py +++ b/streamline/dataprep/data_process.py @@ -74,7 +74,7 @@ def __init__(self, dataset, experiment_path, ignore_features=None, self.experiment_path = experiment_path self.random_state = random_state - known_exclude_options = ['describe_csv', 'univariate_plots', 'correlation_plots'] + known_exclude_options = ['describe_csv', 'univariate_plots', 'correlation_plots', 'correlation'] explorations_list = ["Describe", "Univariate Analysis", "Feature Correlation"] plot_list = ["Describe", "Univariate Analysis", "Feature Correlation"] @@ -90,6 +90,9 @@ def __init__(self, dataset, experiment_path, ignore_features=None, plot_list.remove("Univariate Analysis") if 'correlation_plots' in exclude_eda_output: plot_list.remove("Feature Correlation") + if 'correlation' in exclude_eda_output: + explorations_list.remove("Feature Correlation") + plot_list.remove("Feature Correlation") for item in plot_list: if item not in explorations_list: @@ -463,12 +466,15 @@ def run_process(self, top_features=20): # Run initial EDA from the Dataset Class logging.info("Running Initial EDA:") - self.dataset.initial_eda(self.experiment_path) + # self.dataset.initial_eda(self.experiment_path) + self.initial_eda(initial='initial/') # Running all data manipulation steps: cleaning and feature engineering self.data_manipulation() - self.anomaly_detection() + # Removing anomaly detection to help debug big 
memory issue + # if "Anomaly" in self.explorations: + # self.anomaly_detection() # Running EDA after all data manipulation self.second_eda(top_features) @@ -694,7 +700,11 @@ def data_manipulation(self): transition_df.loc["E2"] = self.counts_summary(save=False) # Drop highly correlated features with correlation greater that max_correlation - self.drop_highly_correlated_features() # Completed + if (self.correlation_removal_threshold is None or self.correlation_removal_threshold > 1 + or "Feature Correlation" not in self.explorations): + pass + else: + self.drop_highly_correlated_features() # Completed transition_df.loc["C4"] = self.counts_summary(save=False) # Create features-only version of processed dataset and save as .csv @@ -1138,6 +1148,22 @@ def drop_highly_correlated_features(self): else: logging.info("No Features with correlation higher than parameter") + def initial_eda(self, initial='initial/'): + # Describe and save description if user specified + logging.warning(self.experiment_path) + if "Describe" in self.explorations: + self.dataset.describe_data(self.experiment_path, initial=initial) + total_missing = self.dataset.missingness_counts(self.experiment_path, initial=initial) + self.dataset.missing_count_plot(self.experiment_path, plot=False, initial=initial) + self.dataset.counts_summary(self.experiment_path, total_missing, False, + show_plots=False, initial=initial) + + # Export feature correlation plot if user specified + if "Feature Correlation" in self.explorations: + logging.info("Generating Feature Correlation Heatmap...") + self.dataset.feature_correlation(self.experiment_path, None, plot=False, + show_plots=False, initial=initial) + def second_eda(self, top_features=20): # Running EDA after all the new data processing/manipulation logging.info("Running Basic Exploratory Analysis...") @@ -1159,7 +1185,7 @@ def second_eda(self, top_features=20): plot = True x_data = self.dataset.feature_only_data() 
self.dataset.feature_correlation(self.experiment_path, x_data, plot=plot, show_plots=self.show_plots) - del x_data + del x_data # Conduct uni-variate analyses of association between individual features and class if "Univariate Analysis" in self.explorations: diff --git a/streamline/runners/dataprocess_runner.py b/streamline/runners/dataprocess_runner.py index 582714b5..3bd9e13e 100644 --- a/streamline/runners/dataprocess_runner.py +++ b/streamline/runners/dataprocess_runner.py @@ -107,7 +107,7 @@ def __init__(self, data_path, output_path, experiment_name, exclude_eda_output=N self.top_features = top_features self.exclude_eda_output = exclude_eda_output - known_exclude_options = ['describe_csv', 'univariate_plots', 'correlation_plots'] + known_exclude_options = ['describe_csv', 'univariate_plots', 'correlation_plots', 'correlation'] exploration_list = ["Describe", "Univariate Analysis", "Feature Correlation"] plot_list = ["Describe", "Univariate Analysis", "Feature Correlation"] @@ -123,6 +123,10 @@ def __init__(self, data_path, output_path, experiment_name, exclude_eda_output=N plot_list.remove("Univariate Analysis") if 'correlation_plots' in exclude_eda_output: plot_list.remove("Feature Correlation") + exploration_list.remove("Feature Correlation") + if 'correlation' in exclude_eda_output: + exploration_list.remove("Feature Correlation") + plot_list.remove("Feature Correlation") self.exploration_list = exploration_list self.plot_list = plot_list diff --git a/streamline/tests/test_classification.py b/streamline/tests/test_classification.py index 6b55592a..b87c7223 100644 --- a/streamline/tests/test_classification.py +++ b/streamline/tests/test_classification.py @@ -17,7 +17,7 @@ algorithms, run_parallel, output_path = ["MI", "MS"], False, "./tests/" dataset_path, experiment_name = "./data/DemoData/", "demo" -model_algorithms = ["LR", "DT", "NB", "ANN"] +model_algorithms = ["LR", "DT", "NB"] rep_data_path = "./data/DemoRepData/" @@ -27,7 +27,7 @@ def 
test_classification(): os.mkdir(output_path) eda = DataProcessRunner(dataset_path, output_path, experiment_name, - exclude_eda_output=None, + exclude_eda_output=['correlation'], outcome_label="Class", instance_label="InstanceID", n_splits=3, ignore_features=None, categorical_features=['Gender', 'Symptoms ', 'Alcohol', 'Hepatitis B Surface Antigen', 'Hepatitis B e Antigen', 'Hepatitis B Core Antibody',