Skip to content

Commit

Permalink
Added additional known_exclude_options correlation that shuts off co…
Browse files Browse the repository at this point in the history
…rrelation matrix ops
  • Loading branch information
raptor419 committed Dec 6, 2023
1 parent 8d66125 commit a172201
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 8 deletions.
36 changes: 31 additions & 5 deletions streamline/dataprep/data_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def __init__(self, dataset, experiment_path, ignore_features=None,
self.experiment_path = experiment_path
self.random_state = random_state

known_exclude_options = ['describe_csv', 'univariate_plots', 'correlation_plots']
known_exclude_options = ['describe_csv', 'univariate_plots', 'correlation_plots', 'correlation']

explorations_list = ["Describe", "Univariate Analysis", "Feature Correlation"]
plot_list = ["Describe", "Univariate Analysis", "Feature Correlation"]
Expand All @@ -90,6 +90,9 @@ def __init__(self, dataset, experiment_path, ignore_features=None,
plot_list.remove("Univariate Analysis")
if 'correlation_plots' in exclude_eda_output:
plot_list.remove("Feature Correlation")
if 'correlation' in exclude_eda_output:
explorations_list.remove("Feature Correlation")
plot_list.remove("Feature Correlation")

for item in plot_list:
if item not in explorations_list:
Expand Down Expand Up @@ -463,12 +466,15 @@ def run_process(self, top_features=20):

# Run initial EDA from the Dataset Class
logging.info("Running Initial EDA:")
self.dataset.initial_eda(self.experiment_path)
# self.dataset.initial_eda(self.experiment_path)
self.initial_eda(initial='initial/')

# Running all data manipulation steps: cleaning and feature engineering
self.data_manipulation()

self.anomaly_detection()
# Removing anomaly detection to help debug big memory issue
# if "Anomaly" in self.explorations:
# self.anomaly_detection()

# Running EDA after all data manipulation
self.second_eda(top_features)
Expand Down Expand Up @@ -694,7 +700,11 @@ def data_manipulation(self):
transition_df.loc["E2"] = self.counts_summary(save=False)

# Drop highly correlated features with correlation greater that max_correlation
self.drop_highly_correlated_features() # Completed
if (self.correlation_removal_threshold is None or self.correlation_removal_threshold > 1
or "Feature Correlation" not in self.explorations):
pass
else:
self.drop_highly_correlated_features() # Completed
transition_df.loc["C4"] = self.counts_summary(save=False)

# Create features-only version of processed dataset and save as .csv
Expand Down Expand Up @@ -1138,6 +1148,22 @@ def drop_highly_correlated_features(self):
else:
logging.info("No Features with correlation higher than parameter")

def initial_eda(self, initial='initial/'):
    """
    Run the exploratory data analysis steps on the dataset before data manipulation.

    Every step is gated on ``self.explorations``: the describe/missingness/counts
    steps only run when "Describe" is listed, and ``feature_correlation`` is only
    called when "Feature Correlation" is listed.

    Args:
        initial: value forwarded as the ``initial`` keyword to each dataset EDA call
            (presumably the output sub-folder for pre-processing results --
            TODO confirm against the Dataset class)
    """
    # Diagnostic only: a bare path is not a warning condition, so log it at DEBUG
    # with a label instead of polluting WARNING output on every run.
    logging.debug("Initial EDA experiment path: %s", self.experiment_path)

    # Describe and save description if user specified.
    # NOTE(review): the missingness and counts steps are assumed to belong to the
    # "Describe" exploration together with describe_data -- confirm.
    if "Describe" in self.explorations:
        self.dataset.describe_data(self.experiment_path, initial=initial)
        total_missing = self.dataset.missingness_counts(self.experiment_path, initial=initial)
        self.dataset.missing_count_plot(self.experiment_path, plot=False, initial=initial)
        self.dataset.counts_summary(self.experiment_path, total_missing, False,
                                    show_plots=False, initial=initial)

    # Export feature correlation output if user specified; no feature subset is
    # passed (None) and plotting is disabled for this initial pass.
    if "Feature Correlation" in self.explorations:
        logging.info("Generating Feature Correlation Heatmap...")
        self.dataset.feature_correlation(self.experiment_path, None, plot=False,
                                         show_plots=False, initial=initial)

def second_eda(self, top_features=20):
# Running EDA after all the new data processing/manipulation
logging.info("Running Basic Exploratory Analysis...")
Expand All @@ -1159,7 +1185,7 @@ def second_eda(self, top_features=20):
plot = True
x_data = self.dataset.feature_only_data()
self.dataset.feature_correlation(self.experiment_path, x_data, plot=plot, show_plots=self.show_plots)
del x_data
del x_data

# Conduct uni-variate analyses of association between individual features and class
if "Univariate Analysis" in self.explorations:
Expand Down
6 changes: 5 additions & 1 deletion streamline/runners/dataprocess_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def __init__(self, data_path, output_path, experiment_name, exclude_eda_output=N
self.top_features = top_features
self.exclude_eda_output = exclude_eda_output

known_exclude_options = ['describe_csv', 'univariate_plots', 'correlation_plots']
known_exclude_options = ['describe_csv', 'univariate_plots', 'correlation_plots', 'correlation']

exploration_list = ["Describe", "Univariate Analysis", "Feature Correlation"]
plot_list = ["Describe", "Univariate Analysis", "Feature Correlation"]
Expand All @@ -123,6 +123,10 @@ def __init__(self, data_path, output_path, experiment_name, exclude_eda_output=N
plot_list.remove("Univariate Analysis")
if 'correlation_plots' in exclude_eda_output:
plot_list.remove("Feature Correlation")
exploration_list.remove("Feature Correlation")
if 'correlation' in exclude_eda_output:
exploration_list.remove("Feature Correlation")
plot_list.remove("Feature Correlation")

self.exploration_list = exploration_list
self.plot_list = plot_list
Expand Down
4 changes: 2 additions & 2 deletions streamline/tests/test_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

algorithms, run_parallel, output_path = ["MI", "MS"], False, "./tests/"
dataset_path, experiment_name = "./data/DemoData/", "demo"
model_algorithms = ["LR", "DT", "NB", "ANN"]
model_algorithms = ["LR", "DT", "NB"]
rep_data_path = "./data/DemoRepData/"


Expand All @@ -27,7 +27,7 @@ def test_classification():
os.mkdir(output_path)

eda = DataProcessRunner(dataset_path, output_path, experiment_name,
exclude_eda_output=None,
exclude_eda_output=['correlation'],
outcome_label="Class", instance_label="InstanceID", n_splits=3, ignore_features=None,
categorical_features=['Gender', 'Symptoms ', 'Alcohol', 'Hepatitis B Surface Antigen',
'Hepatitis B e Antigen', 'Hepatitis B Core Antibody',
Expand Down

0 comments on commit a172201

Please sign in to comment.