import pandas as pd
import numpy as np
housedat = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv"
#!wget $housedat
hd = pd.read_csv(housedat)
hd.head()
##    longitude  latitude  ...  median_house_value  ocean_proximity
## 0    -122.23     37.88  ...            452600.0         NEAR BAY
## 1    -122.22     37.86  ...            358500.0         NEAR BAY
## 2    -122.24     37.85  ...            352100.0         NEAR BAY
## 3    -122.25     37.85  ...            341300.0         NEAR BAY
## 4    -122.25     37.85  ...            342200.0         NEAR BAY
## 
## [5 rows x 10 columns]

For the rest of the homework, use only these columns:

  • latitude
  • longitude
  • housing_median_age
  • total_rooms
  • total_bedrooms
  • population
  • households
  • median_income
  • median_house_value
  • ocean_proximity
col_h = ['latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'ocean_proximity']
hd = hd[col_h]
hd.T
##                        0         1         2      ...    20637    20638    20639
## latitude               37.88     37.86     37.85  ...    39.43    39.43    39.37
## longitude            -122.23   -122.22   -122.24  ...  -121.22  -121.32  -121.24
## housing_median_age      41.0      21.0      52.0  ...     17.0     18.0     16.0
## total_rooms            880.0    7099.0    1467.0  ...   2254.0   1860.0   2785.0
## total_bedrooms         129.0    1106.0     190.0  ...    485.0    409.0    616.0
## population             322.0    2401.0     496.0  ...   1007.0    741.0   1387.0
## households             126.0    1138.0     177.0  ...    433.0    349.0    530.0
## median_income         8.3252    8.3014    7.2574  ...      1.7   1.8672   2.3886
## median_house_value  452600.0  358500.0  352100.0  ...  92300.0  84700.0  89400.0
## ocean_proximity     NEAR BAY  NEAR BAY  NEAR BAY  ...   INLAND   INLAND   INLAND
## 
## [10 rows x 20640 columns]

Data Prep

Select only the features from above and fill in the missing values with 0.

hd = hd.fillna(0)
hd.isnull().sum()
## latitude              0
## longitude             0
## housing_median_age    0
## total_rooms           0
## total_bedrooms        0
## population            0
## households            0
## median_income         0
## median_house_value    0
## ocean_proximity       0
## dtype: int64
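
Note that the check above runs after the fill, so it shows zeros by construction. Running it before fillna reveals which column actually had gaps; in this dataset only total_bedrooms should report missing values:

hd.isnull().sum()  # run before the fillna above; expect NaNs only in total_bedrooms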

Create a new column rooms_per_household by dividing the column total_rooms by the column households from the dataframe.

hd["rooms_per_household"] = hd.total_rooms / hd.households

Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from the dataframe.

hd["bedrooms_per_room"] = hd.total_bedrooms / hd.total_rooms

Create a new column population_per_household by dividing the column population by the column households from the dataframe.

hd["population_per_household"] = hd.population / hd.households

Question 1

What is the most frequent observation (mode) for the column ocean_proximity?

hd.ocean_proximity.value_counts()
## <1H OCEAN     9136
## INLAND        6551
## NEAR OCEAN    2658
## NEAR BAY      2290
## ISLAND           5
## Name: ocean_proximity, dtype: int64
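
Equivalently, pandas can return the mode directly:

hd.ocean_proximity.mode()[0]  # '<1H OCEAN'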

Answer: <1H OCEAN

Split the data

  • Split your data in train/val/test sets, with 60%/20%/20% distribution.
  • Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
  • Make sure that the target value (median_house_value) is not in your dataframe.
from sklearn.model_selection import train_test_split
ftrain_set, test_set = train_test_split(hd, test_size = 0.2, random_state = 42)
0.2 * len(hd)
## 4128.0
# 20% of the full data is 4128 rows, so splitting the remaining 80% with an
# absolute test_size of 4128 gives the 60%/20%/20% distribution
train_set, val_set = train_test_split(ftrain_set, test_size = 4128, random_state = 42)
len(train_set), len(val_set), len(test_set)
## (12384, 4128, 4128)
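
Equivalently, the second split can be expressed as a fraction: 25% of the remaining 80% is 20% of the full data, and with the same random_state it should select the same rows:

train_set, val_set = train_test_split(ftrain_set, test_size = 0.25, random_state = 42)  # 0.25 * 0.8 = 0.2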

Extract y

y_train = train_set.median_house_value
del train_set["median_house_value"]

y_val = val_set.median_house_value
del val_set["median_house_value"]

y_test = test_set.median_house_value
del test_set["median_house_value"]
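
As an aside, DataFrame.pop extracts a column and removes it from the frame in one step:

# equivalent one-liner for each split
y_train = train_set.pop("median_house_value")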

Question 2

  • Create the correlation matrix for the numerical features of your train dataset.
  • In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
  • What are the two features that have the biggest correlation in this dataset?
num_col = ftrain_set.dtypes[ftrain_set.dtypes=="float64"].index
ftrain_set[num_col]
##        latitude  longitude  ...  bedrooms_per_room  population_per_household
## 14196     32.71    -117.03  ...           0.200576                  3.691814
## 8267      33.77    -118.16  ...           0.232703                  1.738095
## 17445     34.66    -120.48  ...           0.174486                  2.723214
## 14265     32.69    -117.11  ...           0.258269                  3.994366
## 2271      36.78    -119.80  ...           0.180940                  2.300000
## ...         ...        ...  ...                ...                       ...
## 11284     33.78    -117.96  ...           0.151128                  3.032258
## 11964     34.02    -117.43  ...           0.184825                  3.904232
## 5390      34.03    -118.38  ...           0.270823                  3.332068
## 860       37.58    -121.96  ...           0.166993                  3.178891
## 15795     37.77    -122.42  ...           0.311169                  2.108696
## 
## [16512 rows x 12 columns]
corr_matrix = ftrain_set[num_col].corr()
corr_matrix
##                           latitude  ...  population_per_household
## latitude                  1.000000  ...                  0.005837
## longitude                -0.924485  ...                 -0.000598
## housing_median_age        0.005296  ...                  0.016245
## total_rooms              -0.029224  ...                 -0.024991
## total_bedrooms           -0.059998  ...                 -0.028536
## population               -0.102499  ...                  0.072330
## households               -0.064061  ...                 -0.027656
## median_income            -0.076571  ...                  0.022061
## median_house_value       -0.142983  ...                 -0.022030
## rooms_per_household       0.110695  ...                 -0.004922
## bedrooms_per_room        -0.118938  ...                  0.003938
## population_per_household  0.005837  ...                  1.000000
## 
## [12 rows x 12 columns]

corr_matrix.unstack().drop_duplicates().abs().sort_values(ascending = False)
## latitude             latitude                    1.000000
## total_bedrooms       households                  0.980255
## total_rooms          total_bedrooms              0.930489
## latitude             longitude                   0.924485
## total_rooms          households                  0.920482
##                                                    ...   
## rooms_per_household  population_per_household    0.004922
## population           median_income               0.004122
## bedrooms_per_room    population_per_household    0.003938
## total_bedrooms       rooms_per_household         0.001659
## longitude            population_per_household    0.000598
## Length: 67, dtype: float64

Answer: total_bedrooms and households
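
One caveat about the drop_duplicates trick above: it keeps only the first of any feature pairs that happen to share the same correlation value, so it can silently drop a pair. A safer sketch masks the strict upper triangle of the matrix so every pair appears exactly once:

# keep only the strict upper triangle, then rank pairs by absolute correlation
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr_matrix.where(mask).stack().abs().sort_values(ascending = False).head()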

Make median_house_value binary

  • We need to turn the median_house_value variable from numeric into binary.
  • Let’s create a variable above_average which is 1 if the median_house_value is above its mean value and 0 otherwise.
y_mean = ftrain_set.median_house_value.mean()
y_mean
## 207194.6937378876
y_train.reset_index(drop = True, inplace = True)

for i in range(len(y_train)):
  if y_train.iat[i] > y_mean:
    y_train.iat[i] = 1
  else:
    y_train.iat[i] = 0
y_train
## 0        1.0
## 1        1.0
## 2        0.0
## 3        1.0
## 4        1.0
##         ... 
## 12379    0.0
## 12380    0.0
## 12381    1.0
## 12382    0.0
## 12383    0.0
## Name: median_house_value, Length: 12384, dtype: float64
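
For reference, this loop (and the two identical ones below for y_val and y_test) can be replaced by a vectorized comparison; a minimal sketch (note it yields int64 rather than float64):

y_train = (y_train > y_mean).astype(int)
y_val = (y_val > y_mean).astype(int)
y_test = (y_test > y_mean).astype(int)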
y_val.reset_index(drop = True, inplace = True)

for i in range(len(y_val)):
  if y_val.iat[i] > y_mean:
    y_val.iat[i] = 1
  else:
    y_val.iat[i] = 0

y_val
## 0       0.0
## 1       0.0
## 2       1.0
## 3       1.0
## 4       1.0
##        ... 
## 4123    0.0
## 4124    1.0
## 4125    1.0
## 4126    1.0
## 4127    0.0
## Name: median_house_value, Length: 4128, dtype: float64
y_test.reset_index(drop = True, inplace = True)

for i in range(len(y_test)):
  if y_test.iat[i] > y_mean:
    y_test.iat[i] = 1
  else:
    y_test.iat[i] = 0

y_test
## 0       0.0
## 1       0.0
## 2       1.0
## 3       1.0
## 4       1.0
##        ... 
## 4123    1.0
## 4124    1.0
## 4125    1.0
## 4126    0.0
## 4127    0.0
## Name: median_house_value, Length: 4128, dtype: float64
# initialize above_average to 0, then flag rows above the mean
ftrain_set["above_average"] = 0
ftrain_set.loc[ftrain_set["median_house_value"] > y_mean, "above_average"] = 1
ftrain_set["above_average"]
## 14196    0
## 8267     1
## 17445    0
## 14265    0
## 2271     0
##         ..
## 11284    1
## 11964    0
## 5390     1
## 860      1
## 15795    1
## Name: above_average, Length: 16512, dtype: int64
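
The init-then-overwrite pattern above also has a one-line vectorized equivalent:

ftrain_set["above_average"] = (ftrain_set.median_house_value > y_mean).astype(int)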

Question 3

  • Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
  • What is the value of mutual information?
  • Round it to 2 decimal digits using round(score, 2)
from sklearn.metrics import mutual_info_score
ftrain_set.dtypes
## latitude                    float64
## longitude                   float64
## housing_median_age          float64
## total_rooms                 float64
## total_bedrooms              float64
## population                  float64
## households                  float64
## median_income               float64
## median_house_value          float64
## ocean_proximity              object
## rooms_per_household         float64
## bedrooms_per_room           float64
## population_per_household    float64
## above_average                 int64
## dtype: object
mutual_info_score(ftrain_set.above_average, ftrain_set.ocean_proximity)
## 0.1014306752368672
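
Rounded to 2 decimal digits as the question asks:

round(mutual_info_score(ftrain_set.above_average, ftrain_set.ocean_proximity), 2)
## 0.1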

Answer: 0.1

Question 4

  • Now let’s train a logistic regression
  • Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
  • Fit the model on the training dataset.
    • To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    • model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
  • Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
train_dict = train_set.to_dict(orient = "records")
X_train = dv.fit_transform(train_dict)

val_dict = val_set.to_dict(orient = "records")
X_val = dv.transform(val_dict)

test_dict = test_set.to_dict(orient = "records")
X_test = dv.transform(test_dict)
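
To see what the vectorizer produced, list the encoded feature names: the string column ocean_proximity becomes one 0/1 indicator per category, while numeric columns pass through unchanged (get_feature_names_out needs scikit-learn >= 1.0; older versions expose get_feature_names instead):

dv.get_feature_names_out()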

Fit

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
## LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

Pred on Val

y_pred = model.predict(X_val)
round((y_val == y_pred).mean(), 2)
## 0.84

Answer: 0.84
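
The same accuracy via scikit-learn's built-in metric:

from sklearn.metrics import accuracy_score
round(accuracy_score(y_val, y_pred), 2)
## 0.84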

Question 5

  • Let’s find the least useful feature using the feature elimination technique.
  • Train a model with all these features (using the same parameters as in Q4).
  • Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
  • For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
  • Which of the following features has the smallest difference?
    • total_rooms
    • total_bedrooms
    • population
    • households
  • note: the difference doesn’t have to be positive
q5_col = ['total_rooms', 'total_bedrooms', 'population', 'households']
train_dict_q5 = train_set[q5_col].to_dict(orient = "records")
X_train_q5 = dv.fit_transform(train_dict_q5)

val_dict_q5 = val_set[q5_col].to_dict(orient = "records")
X_val_q5 = dv.transform(val_dict_q5)
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_q5, y_train)
## LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

Pred q5 on Val

y_pred_q5 = model.predict(X_val_q5)
q5_acc = round((y_val == y_pred_q5).mean(), 2)
q5_acc
## 0.71

Iterate, dropping one of the 4 selected features at a time

for c in q5_col:
  print(f"{c} is removed")
  # drop the current feature and retrain on the remaining three
  q4 = q5_col.copy()
  q4.remove(c)
  train_dict_q4 = train_set[q4].to_dict(orient = "records")
  X_train_q4 = dv.fit_transform(train_dict_q4)
  val_dict_q4 = val_set[q4].to_dict(orient = "records")
  X_val_q4 = dv.transform(val_dict_q4)
  model.fit(X_train_q4, y_train)
  y_pred_q4 = model.predict(X_val_q4)
  acc = round((y_val == y_pred_q4).mean(), 2)
  diff = round((q5_acc - acc), 2)
  print(f"Accuracy is {acc}")
  print(f"Difference from the original is {diff}")
  print()

Answer: the smallest difference is for households, at 0.04

Question 6

  • For this question, we’ll see how to use a linear regression model from Scikit-Learn
  • We’ll need to use the original column ‘median_house_value’. Apply the logarithmic transformation to this column.
  • Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
  • This model has a parameter alpha. Let’s try the following values: [0, 0.01, 0.1, 1, 10]
  • Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

If there are multiple options, select the smallest alpha.

ftrain_setl, test_setl = train_test_split(hd, test_size = 0.2, random_state = 42)

Apply the log transform to median_house_value

ftrain_setl.median_house_value = np.log(ftrain_setl.median_house_value)
ftrain_setl.median_house_value
## 14196    11.542484
## 8267     12.853438
## 17445    12.058732
## 14265    11.444647
## 2271     11.477298
##            ...    
## 11284    12.342350
## 11964    11.490680
## 5390     12.310883
## 860      12.554967
## 15795    12.691580
## Name: median_house_value, Length: 16512, dtype: float64
train_setl, val_setl = train_test_split(ftrain_setl, test_size = 4128, random_state = 42)
len(train_setl), len(val_setl), len(test_setl)
## (12384, 4128, 4128)

Extract y

y_trainl = train_setl.median_house_value
del train_setl["median_house_value"]

y_vall = val_setl.median_house_value
del val_setl["median_house_value"]

y_testl = test_setl.median_house_value
del test_setl["median_house_value"]
from sklearn.linear_model import Ridge
a = [0, 0.01, 0.1, 1, 10]  # alphas to try; a model is created per value in the loop below
from sklearn.metrics import mean_squared_error
train_dictl = train_setl.to_dict(orient = "records")
X_trainl = dv.fit_transform(train_dictl)
val_dictl= val_setl.to_dict(orient = "records")
X_vall = dv.transform(val_dictl)
for i in a: 
    print(f"alpha = {i}")
    model = Ridge(alpha=i, solver="sag", random_state=42)
    model.fit(X_trainl, y_trainl)
    y_predl = model.predict(X_vall)
    rmse = mean_squared_error(y_vall, y_predl, squared = False)  # squared=False returns RMSE
    rounded_rmse = round(rmse, 3)
    print(f"rmse = {rmse}")
    print(f"rounded rmse = {rounded_rmse}")
    print()
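
Because the target was log-transformed, the model predicts log-prices; to report predictions in dollars, invert the transform (a sketch using the last fitted model):

np.exp(y_predl)  # inverts the np.log applied to median_house_value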

Answer: alpha = 0