-
Notifications
You must be signed in to change notification settings - Fork 0
/
auto-efficiency.py
120 lines (84 loc) · 3.69 KB
/
auto-efficiency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
# Seed NumPy's global random number generator with a fixed value so that any
# code drawing from it produces the same sequence on every run.
np.random.seed(42)
# Reading the data
from ucimlrepo import fetch_ucirepo
# Fetch dataset number 9 through the ucimlrepo client
auto_mpg = fetch_ucirepo(id=9)
# Feature and target tables (pandas dataframes)
X, y = auto_mpg.data.features, auto_mpg.data.targets
# Print the dataset metadata first, then the per-variable information
for dataset_info in (auto_mpg.metadata, auto_mpg.variables):
    print(dataset_info)
# Place features and targets side by side so null checks cover both at once
data = pd.concat((X, y), axis="columns")
print("Shape of extracted data: ", data.shape)
# url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
# data = pd.read_csv(url, delim_whitespace=True, header=None,
# names=["mpg", "cylinders", "displacement", "horsepower", "weight",
# "acceleration", "model year", "origin", "car name"])
# Clean the above data by removing redundant columns and rows with junk values
# Compare the performance of your model with the decision tree module from scikit-learn
# Data Cleaning/Preprocessing
# Treat the literal string '?' as a missing-value marker so that it is counted
# and dropped together with genuine NaN/None entries.
data = data.replace('?', np.nan)

# Count once and reuse the result for both the report and the guard.
missing_count = data.isnull().sum().sum()
print("\nNumber of NaN/Null values in training data:", missing_count)
if missing_count > 0:
    # Remove every row that has at least one missing value.
    data = data.dropna()

# Duplicates are counted after the null rows are gone, as before.
duplicate_count = data.duplicated().sum()
print("Number of duplicated samples in training data: ", duplicate_count)
if duplicate_count > 0:
    # Keep the first occurrence of each fully identical row.
    data = data.drop_duplicates()

print("Shape of data after cleaning: ", data.shape)
# Separate the feature columns (X) from the 'mpg' target column (y)
X, y = data.drop(columns='mpg'), data['mpg']
for label, table in (("\nX shape:", X), ("y shape:", y)):
    print(label, table.shape)

# Split the data into training and testing sets: 30% of the rows are held out,
# and the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
for label, part in (
    ("\nX_train size: ", X_train),
    ("y_train size: ", y_train),
    ("X_test size: ", X_test),
    ("y_test size: ", y_test),
):
    print(label, part.shape)
# Our custom decision tree implementation
my_dt = DecisionTree(criterion="information_gain", max_depth=5)
my_dt.fit(X_train, y_train)

# Predictions of the fitted tree on the training and the test features
y_train_pred, y_test_pred = my_dt.predict(X_train), my_dt.predict(X_test)

# Errors on the training split
train_rmse, train_mae = rmse(y_train_pred, y_train), mae(y_train_pred, y_train)
print(
    "\nTrain Metrics (Custom):",
    f" Root Mean Squared Error: {train_rmse:.4f}",
    f" Mean Absolute Error: {train_mae:.4f}",
    sep="\n",
)

# Errors on the test split
test_rmse, test_mae = rmse(y_test_pred, y_test), mae(y_test_pred, y_test)
print(
    "\nTest Metrics (Custom):",
    f" Root Mean Squared Error: {test_rmse:.4f}",
    f" Mean Absolute Error: {test_mae:.4f}",
    sep="\n",
)
# Scikit-learn's Decision Tree
# Reference regressor limited to depth 5, fitted on the same training split.
sklearn_model = DecisionTreeRegressor(max_depth=5)
sklearn_model.fit(X_train, y_train)
y_train_pred_sklearn = sklearn_model.predict(X_train)
y_test_pred_sklearn = sklearn_model.predict(X_test)

# predict() returns a plain NumPy array. A bare pd.Series(array) would get a
# fresh 0..n-1 index, while y_train / y_test keep the shuffled row labels left
# by train_test_split. Give each prediction Series the index of its target so
# the pairs stay matched even if the metric helpers use index-aligned pandas
# arithmetic (NOTE(review): the metrics module is not visible here — confirm).
train_pred_series = pd.Series(y_train_pred_sklearn, index=y_train.index)
test_pred_series = pd.Series(y_test_pred_sklearn, index=y_test.index)

# Errors on the training split
train_rmse_sklearn = rmse(train_pred_series, y_train)
train_mae_sklearn = mae(train_pred_series, y_train)
print("\nTrain Metrics (Sklearn):")
print(f" Root Mean Squared Error: {train_rmse_sklearn:.4f}")
print(f" Mean Absolute Error: {train_mae_sklearn:.4f}")

# Errors on the test split
test_rmse_sklearn = rmse(test_pred_series, y_test)
test_mae_sklearn = mae(test_pred_series, y_test)
print("\nTest Metrics (Sklearn):")
print(f" Root Mean Squared Error: {test_rmse_sklearn:.4f}")
print(f" Mean Absolute Error: {test_mae_sklearn:.4f}")
# Performance Comparison
print("\nPerformance Comparison:\n")
# RMSE of the custom tree and the scikit-learn tree on both splits
comparison_lines = (
    f"Our Decision Tree - Train RMSE: {train_rmse:.4f}",
    f"Our Decision Tree - Test RMSE: {test_rmse:.4f}",
    f"Scikit-Learn Decision Tree - Train RMSE: {train_rmse_sklearn:.4f}",
    f"Scikit-Learn Decision Tree - Test RMSE: {test_rmse_sklearn:.4f}",
)
print(*comparison_lines, sep="\n")