-
Notifications
You must be signed in to change notification settings - Fork 0
/
house pricing.py
145 lines (73 loc) · 2.48 KB
/
house pricing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python
# coding: utf-8
# ## Project: Predicting Boston Housing Prices
# ### Data
#
# The modified Boston housing dataset consists of 489 data points, each with 3 features. It is a modified version of the Boston Housing dataset from the <a href="https://archive.ics.uci.edu/ml/index.php">UCI Machine Learning Repository</a>; the original dataset is also available on
# <a href="https://www.kaggle.com/c/boston-housing">Kaggle</a>.
#
#
# ### Features
#
# RM: average number of rooms per dwelling (Total number of rooms in home)
# LSTAT: percentage of population considered lower status (Neighborhood poverty level)
# PTRATIO: pupil-teacher ratio by town (Student-teacher ratio of nearby schools)
# Target Variable: MEDV: median value of owner-occupied homes (house price)
# In[13]:
# Import libraries:
import numpy as np
import pandas as pd
# Enable inline matplotlib rendering when running inside IPython/Jupyter.
# `get_ipython` only exists there; in a plain `python` run the name is
# undefined, so the magic is skipped instead of crashing with NameError.
try:
    ipython_shell = get_ipython()
except NameError:
    ipython_shell = None
if ipython_shell is not None:
    ipython_shell.run_line_magic('matplotlib', 'inline')
# In[16]:
# Load the Boston housing dataset from the CSV in the working directory.
data = pd.read_csv('housing.csv')
# Summarise the columns, then preview the first 10 rows.
# `info()` prints by itself; `head()` only returns a frame, so it is printed
# explicitly -- a bare expression shows nothing outside a notebook.
data.info()
print(data.head(10))
# In[17]:
# Data exploration: scatter MEDV against each feature in turn,
# using a different colour for every plot.
for column, colour in (('RM', 'r'), ('LSTAT', 'c'), ('PTRATIO', 'g')):
    data.plot.scatter(column, 'MEDV', c=colour)
# In[18]:
# Separate the target column (prices) from the predictor columns (features).
prices = data['MEDV']
features = data.drop('MEDV', axis=1)
# Preview the first 5 feature rows; printed explicitly because a bare
# expression produces no output when this file runs as a script.
print(features.head(5))
# In[19]:
# Split the data into two sets: training (75%) and testing (25%).
from sklearn.model_selection import train_test_split
# A fixed random_state makes the shuffle deterministic, so the fitted model
# and its scores are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features, prices, test_size=0.25, random_state=42
)
# In[20]:
# Report the shapes of both splits (features first, then targets), using the
# same `.shape` form on both lines so the output is directly comparable.
print("training set:", X_train.shape, y_train.shape)
print("testing set:", X_test.shape, y_test.shape)
# In[21]:
# Create the linear regression model and fit/train it on the training split.
# `fit` returns the estimator itself, so construction and training chain.
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X_train, y_train)
# In[23]:
# Predict prices for the held-out test features with the fitted model.
y_pred = model.predict(X_test)
# In[24]:
# Model accuracy on the test set, measured as R^2. The score is printed
# explicitly; as a bare expression it would be computed and then discarded
# when this file runs as a script.
from sklearn.metrics import r2_score
print("test R^2:", r2_score(y_test, y_pred))
# In[25]:
# Extra: R^2 on the training data, to compare with the test score and check
# for underfitting / overfitting. Printed explicitly so the value is visible
# when this file runs as a script.
y_train_pre = model.predict(X_train)
print("train R^2:", r2_score(y_train, y_train_pre))
# In[ ]:
# ## Extra:
# Predict the price for a house with:
# Total number of rooms in home = 7 rooms
# Neighborhood poverty level = 20%
# Student-teacher ratio of nearby schools = 19-to-1
#
# In[28]:
# Build the query as a one-row DataFrame keyed by feature name, then order its
# columns exactly like the training data. This keeps each value attached to
# the right feature regardless of the CSV's column order, and gives the model
# the same column names it was fitted with (a bare array has none).
new_house = pd.DataFrame({'RM': [7], 'LSTAT': [20], 'PTRATIO': [19]})
new_house = new_house[list(X_train.columns)]
print(new_house.shape)
# In[29]:
# Print the prediction; a bare expression shows nothing outside a notebook.
print("predicted price:", model.predict(new_house))
# In[ ]:
# In[ ]: