From a17220136e98ea6864c1e36d40688a8599ab3678 Mon Sep 17 00:00:00 2001
From: Harsh Bandhey <raptor419heavy@gmail.com>
Date: Wed, 6 Dec 2023 12:57:08 -0800
Subject: [PATCH] Added additional known_exclude_options value 'correlation'
 that shuts off correlation matrix ops

---
 streamline/dataprep/data_process.py      | 36 ++++++++++++++++++++----
 streamline/runners/dataprocess_runner.py |  6 +++-
 streamline/tests/test_classification.py  |  4 +--
 3 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/streamline/dataprep/data_process.py b/streamline/dataprep/data_process.py
index a5a33aba..e5778e61 100644
--- a/streamline/dataprep/data_process.py
+++ b/streamline/dataprep/data_process.py
@@ -74,7 +74,7 @@ def __init__(self, dataset, experiment_path, ignore_features=None,
         self.experiment_path = experiment_path
         self.random_state = random_state
 
-        known_exclude_options = ['describe_csv', 'univariate_plots', 'correlation_plots']
+        known_exclude_options = ['describe_csv', 'univariate_plots', 'correlation_plots', 'correlation']
 
         explorations_list = ["Describe", "Univariate Analysis", "Feature Correlation"]
         plot_list = ["Describe", "Univariate Analysis", "Feature Correlation"]
@@ -90,6 +90,9 @@ def __init__(self, dataset, experiment_path, ignore_features=None,
                 plot_list.remove("Univariate Analysis")
             if 'correlation_plots' in exclude_eda_output:
                 plot_list.remove("Feature Correlation")
+            if 'correlation' in exclude_eda_output:  # switch off correlation analysis and its plot
+                explorations_list.remove("Feature Correlation")
+                plot_list = [p for p in plot_list if p != "Feature Correlation"]  # may already be removed
 
         for item in plot_list:
             if item not in explorations_list:
@@ -463,12 +466,15 @@ def run_process(self, top_features=20):
 
         # Run initial EDA from the Dataset Class
         logging.info("Running Initial EDA:")
-        self.dataset.initial_eda(self.experiment_path)
+        # self.dataset.initial_eda(self.experiment_path)
+        self.initial_eda(initial='initial/')
 
         # Running all data manipulation steps: cleaning and feature engineering
         self.data_manipulation()
 
-        self.anomaly_detection()
+        # Removing anomaly detection to help debug big memory issue
+        # if "Anomaly" in self.explorations:
+        #     self.anomaly_detection()
 
         # Running EDA after all data manipulation
         self.second_eda(top_features)
@@ -694,7 +700,11 @@ def data_manipulation(self):
         transition_df.loc["E2"] = self.counts_summary(save=False)
 
         # Drop highly correlated features with correlation greater that max_correlation
-        self.drop_highly_correlated_features()  # Completed
+        if (self.correlation_removal_threshold is None or self.correlation_removal_threshold > 1
+                or "Feature Correlation" not in self.explorations):
+            pass
+        else:
+            self.drop_highly_correlated_features()  # Completed
         transition_df.loc["C4"] = self.counts_summary(save=False)
 
         # Create features-only version of processed dataset and save as .csv
@@ -1138,6 +1148,22 @@ def drop_highly_correlated_features(self):
         else:
             logging.info("No Features with correlation higher than parameter")
 
+    def initial_eda(self, initial='initial/'):
+        # Run the enabled EDA steps on self.dataset; `initial` is forwarded to every dataset call below
+        logging.warning(self.experiment_path)  # NOTE(review): bare path at WARNING level - debug leftover? confirm
+        if "Describe" in self.explorations:
+            self.dataset.describe_data(self.experiment_path, initial=initial)
+            total_missing = self.dataset.missingness_counts(self.experiment_path, initial=initial)
+            self.dataset.missing_count_plot(self.experiment_path, plot=False, initial=initial)
+            self.dataset.counts_summary(self.experiment_path, total_missing, False,
+                                        show_plots=False, initial=initial)
+
+        # Run the dataset's feature_correlation step (second argument None, plot=False) if user specified
+        if "Feature Correlation" in self.explorations:
+            logging.info("Generating Feature Correlation Heatmap...")
+            self.dataset.feature_correlation(self.experiment_path, None, plot=False,
+                                             show_plots=False, initial=initial)
+
     def second_eda(self, top_features=20):
         # Running EDA after all the new data processing/manipulation
         logging.info("Running Basic Exploratory Analysis...")
@@ -1159,7 +1185,7 @@ def second_eda(self, top_features=20):
                 plot = True
                 x_data = self.dataset.feature_only_data()
                 self.dataset.feature_correlation(self.experiment_path, x_data, plot=plot, show_plots=self.show_plots)
-        del x_data
+                del x_data
 
         # Conduct uni-variate analyses of association between individual features and class
         if "Univariate Analysis" in self.explorations:
diff --git a/streamline/runners/dataprocess_runner.py b/streamline/runners/dataprocess_runner.py
index 582714b5..3bd9e13e 100644
--- a/streamline/runners/dataprocess_runner.py
+++ b/streamline/runners/dataprocess_runner.py
@@ -107,7 +107,7 @@ def __init__(self, data_path, output_path, experiment_name, exclude_eda_output=N
         self.top_features = top_features
         self.exclude_eda_output = exclude_eda_output
 
-        known_exclude_options = ['describe_csv', 'univariate_plots', 'correlation_plots']
+        known_exclude_options = ['describe_csv', 'univariate_plots', 'correlation_plots', 'correlation']
 
         exploration_list = ["Describe", "Univariate Analysis", "Feature Correlation"]
         plot_list = ["Describe", "Univariate Analysis", "Feature Correlation"]
@@ -123,6 +123,10 @@ def __init__(self, data_path, output_path, experiment_name, exclude_eda_output=N
                 plot_list.remove("Univariate Analysis")
             if 'correlation_plots' in exclude_eda_output:
                 plot_list.remove("Feature Correlation")
+                exploration_list.remove("Feature Correlation")
+            if 'correlation' in exclude_eda_output and "Feature Correlation" in exploration_list:
+                exploration_list.remove("Feature Correlation")  # guarded: 'correlation_plots' may have removed it
+                plot_list.remove("Feature Correlation")
 
         self.exploration_list = exploration_list
         self.plot_list = plot_list
diff --git a/streamline/tests/test_classification.py b/streamline/tests/test_classification.py
index 6b55592a..b87c7223 100644
--- a/streamline/tests/test_classification.py
+++ b/streamline/tests/test_classification.py
@@ -17,7 +17,7 @@
 
 algorithms, run_parallel, output_path = ["MI", "MS"], False, "./tests/"
 dataset_path, experiment_name = "./data/DemoData/", "demo"
-model_algorithms = ["LR", "DT", "NB", "ANN"]
+model_algorithms = ["LR", "DT", "NB"]
 rep_data_path = "./data/DemoRepData/"
 
 
@@ -27,7 +27,7 @@ def test_classification():
         os.mkdir(output_path)
 
     eda = DataProcessRunner(dataset_path, output_path, experiment_name,
-                            exclude_eda_output=None,
+                            exclude_eda_output=['correlation'],
                             outcome_label="Class", instance_label="InstanceID", n_splits=3, ignore_features=None,
                             categorical_features=['Gender', 'Symptoms ', 'Alcohol', 'Hepatitis B Surface Antigen',
                                                   'Hepatitis B e Antigen', 'Hepatitis B Core Antibody',