Load the library function

library(tidyverse)

Read the file and explore the data

setwd("C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Statistical BootCamp/WK4")
housing = read.csv('housing.csv')
head(housing)
##   longitude latitude housing_median_age total_rooms total_bedrooms
## 1   -122.23    37.88                 41         880            129
## 2   -122.22    37.86                 21        7099           1106
## 3   -122.24    37.85                 52        1467            190
## 4   -122.25    37.85                 52        1274            235
## 5   -122.25    37.85                 52        1627            280
## 6   -122.25    37.85                 52         919            213
##   population households median_income median_house_value ocean_proximity
## 1        322        126        8.3252             452600        NEAR BAY
## 2       2401       1138        8.3014             358500        NEAR BAY
## 3        496        177        7.2574             352100        NEAR BAY
## 4        558        219        5.6431             341300        NEAR BAY
## 5        565        259        3.8462             342200        NEAR BAY
## 6        413        193        4.0368             269700        NEAR BAY
summary(housing)
##    longitude         latitude     housing_median_age  total_rooms   
##  Min.   :-124.3   Min.   :32.54   Min.   : 1.00      Min.   :    2  
##  1st Qu.:-121.8   1st Qu.:33.93   1st Qu.:18.00      1st Qu.: 1448  
##  Median :-118.5   Median :34.26   Median :29.00      Median : 2127  
##  Mean   :-119.6   Mean   :35.63   Mean   :28.64      Mean   : 2636  
##  3rd Qu.:-118.0   3rd Qu.:37.71   3rd Qu.:37.00      3rd Qu.: 3148  
##  Max.   :-114.3   Max.   :41.95   Max.   :52.00      Max.   :39320  
##                                                                     
##  total_bedrooms     population      households     median_income    
##  Min.   :   1.0   Min.   :    3   Min.   :   1.0   Min.   : 0.4999  
##  1st Qu.: 296.0   1st Qu.:  787   1st Qu.: 280.0   1st Qu.: 2.5634  
##  Median : 435.0   Median : 1166   Median : 409.0   Median : 3.5348  
##  Mean   : 537.9   Mean   : 1425   Mean   : 499.5   Mean   : 3.8707  
##  3rd Qu.: 647.0   3rd Qu.: 1725   3rd Qu.: 605.0   3rd Qu.: 4.7432  
##  Max.   :6445.0   Max.   :35682   Max.   :6082.0   Max.   :15.0001  
##  NA's   :207                                                        
##  median_house_value   ocean_proximity
##  Min.   : 14999     <1H OCEAN :9136  
##  1st Qu.:119600     INLAND    :6551  
##  Median :179700     ISLAND    :   5  
##  Mean   :206856     NEAR BAY  :2290  
##  3rd Qu.:264725     NEAR OCEAN:2658  
##  Max.   :500001                      
## 
dim(housing)
## [1] 20640    10

check whether each columns got missing value

apply(housing, 2, function(col)sum(is.na(col)))
##          longitude           latitude housing_median_age 
##                  0                  0                  0 
##        total_rooms     total_bedrooms         population 
##                  0                207                  0 
##         households      median_income median_house_value 
##                  0                  0                  0 
##    ocean_proximity 
##                  0

Get all the row which contains missing value

missing <- housing[rowSums(is.na(housing))>0,]

remove the missing value from the data

house_no_missing <- housing[rowSums(is.na(housing)) == 0, ]

Preparation for Geospatial analysis by using GOOGLE MAP

library(ggplot2)
# if(!requireNamespace("devtools")) install.packaged("devtools")
# devtools:: install_github("dkahle/ggmap", ref = "tidyup")
library(ggmap)

Google’s Terms of Service: https://cloud.google.com/maps-platform/terms/.

Please cite ggmap if you use it! See citation(“ggmap”) for details.

register_google(key= "AIzaSyBmXB5S5_NIqo6lAGH-_U-TbhrQjhOsplU")
house_loc = c(-130.25,30,-110.25,45)
our_map=get_map(location = house_loc,maptype="roadmap", source="google")
ggmap(our_map) + geom_point(aes(housing$longitude, housing$latitude), data = housing)

ggmap(our_map) + geom_point(aes(housing$longitude, housing$latitude, color = median_house_value), data = housing)+scale_color_gradient(low="black", high="red")

ggmap(our_map)+geom_point(aes(housing$longitude, housing$latitude,color=median_house_value, size=housing$population), data = housing)+scale_color_gradient(low="black", high="red")

ggmap(our_map)+geom_point(aes(housing$longitude, housing$latitude, color=median_house_value, size = housing$population), data=housing)+scale_color_gradient(low = 'black', high='red')+scale_size(range=c(0.5,3))

Calculate the correlation between all variables

correlation = cor(house_no_missing[ , 1:9])
correlation
##                      longitude    latitude housing_median_age total_rooms
## longitude           1.00000000 -0.92461611        -0.10935655  0.04548017
## latitude           -0.92461611  1.00000000         0.01189907 -0.03666681
## housing_median_age -0.10935655  0.01189907         1.00000000 -0.36062830
## total_rooms         0.04548017 -0.03666681        -0.36062830  1.00000000
## total_bedrooms      0.06960802 -0.06698283        -0.32045104  0.93037950
## population          0.10027030 -0.10899734        -0.29578730  0.85728125
## households          0.05651277 -0.07177419        -0.30276797  0.91899153
## median_income      -0.01555015 -0.07962632        -0.11827772  0.19788152
## median_house_value -0.04539822 -0.14463821         0.10643205  0.13329413
##                    total_bedrooms   population  households median_income
## longitude              0.06960802  0.100270301  0.05651277  -0.015550150
## latitude              -0.06698283 -0.108997344 -0.07177419  -0.079626319
## housing_median_age    -0.32045104 -0.295787297 -0.30276797  -0.118277723
## total_rooms            0.93037950  0.857281251  0.91899153   0.197881519
## total_bedrooms         1.00000000  0.877746743  0.97972827  -0.007722850
## population             0.87774674  1.000000000  0.90718590   0.005086624
## households             0.97972827  0.907185900  1.00000000   0.013433892
## median_income         -0.00772285  0.005086624  0.01343389   1.000000000
## median_house_value     0.04968618 -0.025299732  0.06489355   0.688355475
##                    median_house_value
## longitude                 -0.04539822
## latitude                  -0.14463821
## housing_median_age         0.10643205
## total_rooms                0.13329413
## total_bedrooms             0.04968618
## population                -0.02529973
## households                 0.06489355
## median_income              0.68835548
## median_house_value         1.00000000
correlation[,9]
##          longitude           latitude housing_median_age 
##        -0.04539822        -0.14463821         0.10643205 
##        total_rooms     total_bedrooms         population 
##         0.13329413         0.04968618        -0.02529973 
##         households      median_income median_house_value 
##         0.06489355         0.68835548         1.00000000

Create the multiple regression model by eliminating ocean_proximity

fit=lm(median_house_value~.-ocean_proximity, data=house_no_missing)
summary(fit)
## 
## Call:
## lm(formula = median_house_value ~ . - ocean_proximity, data = house_no_missing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -563557  -43632  -11367   30340  801274 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -3.585e+06  6.290e+04 -57.001  < 2e-16 ***
## longitude          -4.273e+04  7.171e+02 -59.588  < 2e-16 ***
## latitude           -4.251e+04  6.770e+02 -62.796  < 2e-16 ***
## housing_median_age  1.158e+03  4.339e+01  26.687  < 2e-16 ***
## total_rooms        -8.250e+00  7.943e-01 -10.387  < 2e-16 ***
## total_bedrooms      1.138e+02  6.931e+00  16.423  < 2e-16 ***
## population         -3.839e+01  1.084e+00 -35.407  < 2e-16 ***
## households          4.770e+01  7.547e+00   6.321 2.65e-10 ***
## median_income       4.030e+04  3.372e+02 119.504  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69570 on 20424 degrees of freedom
## Multiple R-squared:  0.6369, Adjusted R-squared:  0.6368 
## F-statistic:  4478 on 8 and 20424 DF,  p-value: < 2.2e-16

Install the car package

library(car)

Check the VIF Value and remove the vriable which higher VIF value

VIF is the vaerify the multicollinearity between the variables

vif(fit)
##          longitude           latitude housing_median_age 
##           8.713740           8.828919           1.260015 
##        total_rooms     total_bedrooms         population 
##          12.717000          36.003726           6.371238 
##         households      median_income 
##          35.136045           1.731511

Recreate the model by eliminate the variable have higher VIF value

fit2=lm(median_house_value~.-ocean_proximity-total_bedrooms, data=house_no_missing)
summary(fit2)
## 
## Call:
## lm(formula = median_house_value ~ . - ocean_proximity - total_bedrooms, 
##     data = house_no_missing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -540873  -44582  -11839   30811  867934 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -3.497e+06  6.308e+04 -55.434   <2e-16 ***
## longitude          -4.197e+04  7.203e+02 -58.273   <2e-16 ***
## latitude           -4.222e+04  6.812e+02 -61.983   <2e-16 ***
## housing_median_age  1.126e+03  4.363e+01  25.819   <2e-16 ***
## total_rooms        -1.775e+00  6.940e-01  -2.558   0.0105 *  
## population         -4.310e+01  1.052e+00 -40.952   <2e-16 ***
## households          1.490e+02  4.378e+00  34.025   <2e-16 ***
## median_income       3.838e+04  3.185e+02 120.518   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70030 on 20425 degrees of freedom
## Multiple R-squared:  0.6321, Adjusted R-squared:  0.632 
## F-statistic:  5014 on 7 and 20425 DF,  p-value: < 2.2e-16

Check the VIF value on new model again

vif(fit2)
##          longitude           latitude housing_median_age 
##           8.677840           8.822928           1.257568 
##        total_rooms         population         households 
##           9.583814           5.925295          11.673860 
##      median_income 
##           1.524467

Seems like the model almost stabilise with household VIF value >10 only

Remove the households variable and create the model again

fit3=lm(median_house_value~.-ocean_proximity-total_bedrooms-households, data = house_no_missing)
summary(fit3)
## 
## Call:
## lm(formula = median_house_value ~ . - ocean_proximity - total_bedrooms - 
##     households, data = house_no_missing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -503984  -47000  -13174   32448  514087 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -3.968e+06  6.326e+04  -62.72   <2e-16 ***
## longitude          -4.774e+04  7.196e+02  -66.34   <2e-16 ***
## latitude           -4.777e+04  6.798e+02  -70.27   <2e-16 ***
## housing_median_age  1.118e+03  4.485e+01   24.93   <2e-16 ***
## total_rooms         1.504e+01  5.009e-01   30.02   <2e-16 ***
## population         -2.541e+01  9.405e-01  -27.01   <2e-16 ***
## median_income       3.431e+04  3.033e+02  113.10   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 71980 on 20426 degrees of freedom
## Multiple R-squared:  0.6113, Adjusted R-squared:  0.6112 
## F-statistic:  5353 on 6 and 20426 DF,  p-value: < 2.2e-16

Check the VIF value on the new model again

vif(fit3)
##          longitude           latitude housing_median_age 
##           8.197084           8.316903           1.257530 
##        total_rooms         population      median_income 
##           4.725402           4.479289           1.308668

The VIF values of all the variables are stabilise with the VIF value <10. We can use the model 3 as the final model for prediction.