import pandas as pd
import numpy as np
housedat = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv"
#!wget $housedat
hd = pd.read_csv(housedat)
hd.head()
## longitude latitude ... median_house_value ocean_proximity
## 0 -122.23 37.88 ... 452600.0 NEAR BAY
## 1 -122.22 37.86 ... 358500.0 NEAR BAY
## 2 -122.24 37.85 ... 352100.0 NEAR BAY
## 3 -122.25 37.85 ... 341300.0 NEAR BAY
## 4 -122.25 37.85 ... 342200.0 NEAR BAY
##
## [5 rows x 10 columns]
For the rest of the homework, use only these columns:
'latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity',
col_h = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity']
hd = hd[col_h]
hd.T
## 0 1 2 ... 20637 20638 20639
## latitude 37.88 37.86 37.85 ... 39.43 39.43 39.37
## longitude -122.23 -122.22 -122.24 ... -121.22 -121.32 -121.24
## housing_median_age 41.0 21.0 52.0 ... 17.0 18.0 16.0
## total_rooms 880.0 7099.0 1467.0 ... 2254.0 1860.0 2785.0
## total_bedrooms 129.0 1106.0 190.0 ... 485.0 409.0 616.0
## population 322.0 2401.0 496.0 ... 1007.0 741.0 1387.0
## households 126.0 1138.0 177.0 ... 433.0 349.0 530.0
## median_income 8.3252 8.3014 7.2574 ... 1.7 1.8672 2.3886
## median_house_value 452600.0 358500.0 352100.0 ... 92300.0 84700.0 89400.0
## ocean_proximity NEAR BAY NEAR BAY NEAR BAY ... INLAND INLAND INLAND
##
## [10 rows x 20640 columns]
Select only the features from above and fill in the missing values with 0.
hd = hd.fillna(0)
hd.isnull().sum()
## latitude 0
## longitude 0
## housing_median_age 0
## total_rooms 0
## total_bedrooms 0
## population 0
## households 0
## median_income 0
## median_house_value 0
## ocean_proximity 0
## dtype: int64
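For reference (a sketch re-reading the raw file, not part of the run above), the same check before the fill shows where the NaNs were; in this dataset total_bedrooms is the column with missing entries.
# NaN count per selected column on the raw data, before fillna(0)
pd.read_csv(housedat)[col_h].isnull().sum()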
Create a new column rooms_per_household by dividing the column total_rooms by the column households from the dataframe.
hd["rooms_per_household"] = hd.total_rooms / hd.households
Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from the dataframe.
hd["bedrooms_per_room"] = hd.total_bedrooms / hd.total_rooms
Create a new column population_per_household by dividing the column population by the column households from the dataframe.
hd["population_per_household"] = hd.population / hd.households
What is the most frequent observation (mode) for the column ocean_proximity?
hd.ocean_proximity.value_counts()
## <1H OCEAN 9136
## INLAND 6551
## NEAR OCEAN 2658
## NEAR BAY 2290
## ISLAND 5
## Name: ocean_proximity, dtype: int64
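The counts above already answer this; as a cross-check (a small sketch), pandas can return the mode directly:
hd.ocean_proximity.mode()[0]
## '<1H OCEAN'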
Split the data
from sklearn.model_selection import train_test_split
ftrain_set, test_set = train_test_split(hd, test_size = 0.2, random_state = 42)
0.2 * len(hd)
## 4128.0
train_set, val_set = train_test_split(ftrain_set, test_size = 4128, random_state = 42)
len(train_set), len(val_set), len(test_set)
## (12384, 4128, 4128)
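As a sanity check (a sketch, not what was run above; train_set_alt and val_set_alt are names introduced here), asking for a 0.25 fraction of the remaining 80% yields the same 20% validation share:
# 0.25 of the 16512 remaining rows is 4128, the same size as test_size=4128 above
train_set_alt, val_set_alt = train_test_split(ftrain_set, test_size = 0.25, random_state = 42)
len(train_set_alt), len(val_set_alt)
## (12384, 4128)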
Extract y
y_train = train_set.median_house_value
del train_set["median_house_value"]
y_val = val_set.median_house_value
del val_set["median_house_value"]
y_test = test_set.median_house_value
del test_set["median_house_value"]
num_col = ftrain_set.dtypes[ftrain_set.dtypes=="float64"].index
ftrain_set[num_col]
## latitude longitude ... bedrooms_per_room population_per_household
## 14196 32.71 -117.03 ... 0.200576 3.691814
## 8267 33.77 -118.16 ... 0.232703 1.738095
## 17445 34.66 -120.48 ... 0.174486 2.723214
## 14265 32.69 -117.11 ... 0.258269 3.994366
## 2271 36.78 -119.80 ... 0.180940 2.300000
## ... ... ... ... ... ...
## 11284 33.78 -117.96 ... 0.151128 3.032258
## 11964 34.02 -117.43 ... 0.184825 3.904232
## 5390 34.03 -118.38 ... 0.270823 3.332068
## 860 37.58 -121.96 ... 0.166993 3.178891
## 15795 37.77 -122.42 ... 0.311169 2.108696
##
## [16512 rows x 12 columns]
corr_matrix = ftrain_set[num_col].corr()
corr_matrix
## latitude ... population_per_household
## latitude 1.000000 ... 0.005837
## longitude -0.924485 ... -0.000598
## housing_median_age 0.005296 ... 0.016245
## total_rooms -0.029224 ... -0.024991
## total_bedrooms -0.059998 ... -0.028536
## population -0.102499 ... 0.072330
## households -0.064061 ... -0.027656
## median_income -0.076571 ... 0.022061
## median_house_value -0.142983 ... -0.022030
## rooms_per_household 0.110695 ... -0.004922
## bedrooms_per_room -0.118938 ... 0.003938
## population_per_household 0.005837 ... 1.000000
##
## [12 rows x 12 columns]
corr_matrix.unstack().drop_duplicates().abs().sort_values(ascending = False)
## latitude latitude 1.000000
## total_bedrooms households 0.980255
## total_rooms total_bedrooms 0.930489
## latitude longitude 0.924485
## total_rooms households 0.920482
## ...
## rooms_per_household population_per_household 0.004922
## population median_income 0.004122
## bedrooms_per_room population_per_household 0.003938
## total_bedrooms rooms_per_household 0.001659
## longitude population_per_household 0.000598
## Length: 67, dtype: float64
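The diagonal self-correlations of 1.0 top that listing; a small sketch (corr_pairs is a name introduced here) that masks them and keeps only the strongest remaining pair:
corr_pairs = corr_matrix.abs().unstack()
corr_pairs = corr_pairs[corr_pairs < 1.0]            # drop the diagonal self-correlations
corr_pairs.drop_duplicates().sort_values(ascending = False).head(1)
## total_bedrooms  households    0.980255
## dtype: float64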
total_bedrooms and households have the biggest correlation (0.98).
Make median_house_value binary by comparing it with its mean over the full training set.
y_mean = ftrain_set.median_house_value.mean()
y_mean
## 207194.6937378876
y_train.reset_index(drop = True, inplace = True)
for i in range(0, len(y_train)):
    if y_train.iat[i] > y_mean:
        y_train.iat[i] = 1
    elif y_train.iat[i] <= y_mean:
        y_train.iat[i] = 0
## <string>:5: SettingWithCopyWarning:
## A value is trying to be set on a copy of a slice from a DataFrame
##
## See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
y_train
## 0 1.0
## 1 1.0
## 2 0.0
## 3 1.0
## 4 1.0
## ...
## 12379 0.0
## 12380 0.0
## 12381 1.0
## 12382 0.0
## 12383 0.0
## Name: median_house_value, Length: 12384, dtype: float64
y_val.reset_index(drop = True, inplace = True)
for i in range(0, len(y_val)):
    if y_val.iat[i] > y_mean:
        y_val.iat[i] = 1
    elif y_val.iat[i] <= y_mean:
        y_val.iat[i] = 0
y_val
## 0 0.0
## 1 0.0
## 2 1.0
## 3 1.0
## 4 1.0
## ...
## 4123 0.0
## 4124 1.0
## 4125 1.0
## 4126 1.0
## 4127 0.0
## Name: median_house_value, Length: 4128, dtype: float64
y_test.reset_index(drop = True, inplace = True)
for i in range(0, len(y_test)):
    if y_test.iat[i] > y_mean:
        y_test.iat[i] = 1
    elif y_test.iat[i] <= y_mean:
        y_test.iat[i] = 0
y_test
## 0 0.0
## 1 0.0
## 2 1.0
## 3 1.0
## 4 1.0
## ...
## 4123 1.0
## 4124 1.0
## 4125 1.0
## 4126 0.0
## 4127 0.0
## Name: median_house_value, Length: 4128, dtype: float64
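Each of the three loops above can be collapsed into a one-line vectorized comparison (a sketch; y_raw stands for the not-yet-binarized Series and is a name introduced here), which also sidesteps the SettingWithCopyWarning:
# Vectorized equivalent of one binarization loop above (would replace it, not follow it);
# yields int labels rather than floats
y_binary = (y_raw > y_mean).astype(int)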
# init above average
ftrain_set["above_average"] = 0
ftrain_set.loc[ftrain_set["median_house_value"] > y_mean, "above_average"] = 1
ftrain_set["above_average"]
## 14196 0
## 8267 1
## 17445 0
## 14265 0
## 2271 0
## ..
## 11284 1
## 11964 0
## 5390 1
## 860 1
## 15795 1
## Name: above_average, Length: 16512, dtype: int64
from sklearn.metrics import mutual_info_score
ftrain_set.dtypes
## latitude float64
## longitude float64
## housing_median_age float64
## total_rooms float64
## total_bedrooms float64
## population float64
## households float64
## median_income float64
## median_house_value float64
## ocean_proximity object
## rooms_per_household float64
## bedrooms_per_room float64
## population_per_household float64
## above_average int64
## dtype: object
mutual_info_score(ftrain_set.above_average, ftrain_set.ocean_proximity)
## 0.1014306752368672
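The question asks for this value rounded to two decimals; a small sketch (mi_above_average is a helper name introduced here) that computes it for every object-dtype column at once, which in this frame is only ocean_proximity:
def mi_above_average(series):
    # mutual information between a categorical column and the binary target
    return mutual_info_score(series, ftrain_set.above_average)

# rounds to 0.10, consistent with the score above
ftrain_set.select_dtypes(include="object").apply(mi_above_average).round(2)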
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
train_dict = train_set.to_dict(orient = "records")
X_train = dv.fit_transform(train_dict)
val_dict = val_set.to_dict(orient = "records")
X_val = dv.transform(val_dict)
test_dict = test_set.to_dict(orient = "records")
X_test = dv.transform(test_dict)
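To check what the vectorizer did (a sketch; get_feature_names_out requires scikit-learn ≥ 1.0, older releases expose get_feature_names instead):
# Numeric columns pass through unchanged; ocean_proximity is expanded into
# one "ocean_proximity=<category>" indicator column per category
dv.get_feature_names_out()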
Fit
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
## LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')
Pred on Val
y_pred = model.predict(X_val)
round((y_val == y_pred).mean(), 2)
## 0.84
q5_col = ['total_rooms', 'total_bedrooms', 'population', 'households']
train_dict_q5 = train_set[q5_col].to_dict(orient = "records")
X_train_q5 = dv.fit_transform(train_dict_q5)
val_dict_q5 = val_set[q5_col].to_dict(orient = "records")
X_val_q5 = dv.transform(val_dict_q5)
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_q5, y_train)
## LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')
Pred q5 on Val
y_pred_q5 = model.predict(X_val_q5)
q5_acc = round((y_val == y_pred_q5).mean(), 2)
q5_acc
## 0.71
Iterate, removing one of the selected features at a time and retraining the model.
for c in q5_col:
    print(f"{c} is removed")
    q4 = q5_col.copy()
    q4.remove(c)
    train_dict_q4 = train_set[q4].to_dict(orient = "records")
    X_train_q4 = dv.fit_transform(train_dict_q4)
    val_dict_q4 = val_set[q4].to_dict(orient = "records")
    X_val_q4 = dv.transform(val_dict_q4)
    model.fit(X_train_q4, y_train)
    y_pred_q4 = model.predict(X_val_q4)
    acc = round((y_val == y_pred_q4).mean(), 2)
    diff = round((q5_acc - acc), 2)
    print(f"Accuracy is {acc}")
    print(f"Difference from the original is {diff}")
    print()
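A variant of the same loop (a sketch; diffs, X_tr and X_va are names introduced here) that stores the accuracy differences and picks the feature whose removal matters least:
diffs = {}
for c in q5_col:
    q4 = [f for f in q5_col if f != c]
    X_tr = dv.fit_transform(train_set[q4].to_dict(orient = "records"))
    X_va = dv.transform(val_set[q4].to_dict(orient = "records"))
    model.fit(X_tr, y_train)
    diffs[c] = round(q5_acc - (y_val == model.predict(X_va)).mean(), 2)

# Feature whose removal moves the accuracy the least
min(diffs, key=lambda c: abs(diffs[c]))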
Train a ridge regression on the log-transformed target for several values of alpha; if there are multiple options with the same RMSE, select the smallest alpha.
ftrain_setl, test_setl = train_test_split(hd, test_size = 0.2, random_state = 42)
Apply a log transform to median_house_value.
ftrain_setl.median_house_value = np.log(ftrain_setl.median_house_value)
ftrain_setl.median_house_value
## 14196 11.542484
## 8267 12.853438
## 17445 12.058732
## 14265 11.444647
## 2271 11.477298
## ...
## 11284 12.342350
## 11964 11.490680
## 5390 12.310883
## 860 12.554967
## 15795 12.691580
## Name: median_house_value, Length: 16512, dtype: float64
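A note on the transform: np.log was used above; np.log1p (log(1 + x)) is a common alternative (a sketch, it would replace the np.log call rather than follow it) that tolerates zero values and is inverted with np.expm1:
# Hypothetical alternative to the np.log call above
ftrain_setl["median_house_value"] = np.log1p(ftrain_setl["median_house_value"])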
train_setl, val_setl = train_test_split(ftrain_setl, test_size = 4128, random_state = 42)
len(train_setl), len(val_setl), len(test_setl)
## (12384, 4128, 4128)
Extract y
y_trainl = train_setl.median_house_value
del train_setl["median_house_value"]
y_vall = val_setl.median_house_value
del val_setl["median_house_value"]
y_testl = test_setl.median_house_value
del test_setl["median_house_value"]
from sklearn.linear_model import Ridge
a = [0, 0.01, 0.1, 1, 10]
# alpha must be a single value, so the Ridge model is created per alpha inside the loop below
from sklearn.metrics import mean_squared_error
train_dictl = train_setl.to_dict(orient = "records")
X_trainl = dv.fit_transform(train_dictl)
val_dictl= val_setl.to_dict(orient = "records")
X_vall = dv.transform(val_dictl)
for i in a:
    print(f"alpha = {i}")
    model = Ridge(alpha=i, solver="sag", random_state=42)
    model.fit(X_trainl, y_trainl)
    y_predl = model.predict(X_vall)
    rmse = mean_squared_error(y_vall, y_predl, squared = False)
    rounded_rmse = round(rmse, 3)
    print(f"rmse = {rmse}")
    print(f"rounded rmse = {rounded_rmse}")
    print()
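A sketch of the same sweep that keeps the rounded scores and applies the "smallest alpha on ties" rule from the question (scores and best_alpha are names introduced here):
scores = {}
for i in a:
    model = Ridge(alpha=i, solver="sag", random_state=42)
    model.fit(X_trainl, y_trainl)
    scores[i] = round(mean_squared_error(y_vall, model.predict(X_vall), squared = False), 3)

# Lowest rounded RMSE wins; ties go to the smaller alpha
best_alpha = min(scores, key=lambda i: (scores[i], i))
best_alpha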