Load the library function
library(tidyverse)
Read the file and explore the data
setwd("C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Statistical BootCamp/WK4")
housing = read.csv('housing.csv')
head(housing)
## longitude latitude housing_median_age total_rooms total_bedrooms
## 1 -122.23 37.88 41 880 129
## 2 -122.22 37.86 21 7099 1106
## 3 -122.24 37.85 52 1467 190
## 4 -122.25 37.85 52 1274 235
## 5 -122.25 37.85 52 1627 280
## 6 -122.25 37.85 52 919 213
## population households median_income median_house_value ocean_proximity
## 1 322 126 8.3252 452600 NEAR BAY
## 2 2401 1138 8.3014 358500 NEAR BAY
## 3 496 177 7.2574 352100 NEAR BAY
## 4 558 219 5.6431 341300 NEAR BAY
## 5 565 259 3.8462 342200 NEAR BAY
## 6 413 193 4.0368 269700 NEAR BAY
summary(housing)
## longitude latitude housing_median_age total_rooms
## Min. :-124.3 Min. :32.54 Min. : 1.00 Min. : 2
## 1st Qu.:-121.8 1st Qu.:33.93 1st Qu.:18.00 1st Qu.: 1448
## Median :-118.5 Median :34.26 Median :29.00 Median : 2127
## Mean :-119.6 Mean :35.63 Mean :28.64 Mean : 2636
## 3rd Qu.:-118.0 3rd Qu.:37.71 3rd Qu.:37.00 3rd Qu.: 3148
## Max. :-114.3 Max. :41.95 Max. :52.00 Max. :39320
##
## total_bedrooms population households median_income
## Min. : 1.0 Min. : 3 Min. : 1.0 Min. : 0.4999
## 1st Qu.: 296.0 1st Qu.: 787 1st Qu.: 280.0 1st Qu.: 2.5634
## Median : 435.0 Median : 1166 Median : 409.0 Median : 3.5348
## Mean : 537.9 Mean : 1425 Mean : 499.5 Mean : 3.8707
## 3rd Qu.: 647.0 3rd Qu.: 1725 3rd Qu.: 605.0 3rd Qu.: 4.7432
## Max. :6445.0 Max. :35682 Max. :6082.0 Max. :15.0001
## NA's :207
## median_house_value ocean_proximity
## Min. : 14999 <1H OCEAN :9136
## 1st Qu.:119600 INLAND :6551
## Median :179700 ISLAND : 5
## Mean :206856 NEAR BAY :2290
## 3rd Qu.:264725 NEAR OCEAN:2658
## Max. :500001
##
dim(housing)
## [1] 20640 10
check whether each columns got missing value
apply(housing, 2, function(col)sum(is.na(col)))
## longitude latitude housing_median_age
## 0 0 0
## total_rooms total_bedrooms population
## 0 207 0
## households median_income median_house_value
## 0 0 0
## ocean_proximity
## 0
Get all the row which contains missing value
missing <- housing[rowSums(is.na(housing))>0,]
remove the missing value from the data
house_no_missing <- housing[rowSums(is.na(housing)) == 0, ]
Preparation for Geospatial analysis by using GOOGLE MAP
library(ggplot2)
# if(!requireNamespace("devtools")) install.packaged("devtools")
# devtools:: install_github("dkahle/ggmap", ref = "tidyup")
library(ggmap)
Please cite ggmap if you use it! See citation(“ggmap”) for details.
register_google(key= "AIzaSyBmXB5S5_NIqo6lAGH-_U-TbhrQjhOsplU")
house_loc = c(-130.25,30,-110.25,45)
our_map=get_map(location = house_loc,maptype="roadmap", source="google")
ggmap(our_map) + geom_point(aes(housing$longitude, housing$latitude), data = housing)

ggmap(our_map) + geom_point(aes(housing$longitude, housing$latitude, color = median_house_value), data = housing)+scale_color_gradient(low="black", high="red")

ggmap(our_map)+geom_point(aes(housing$longitude, housing$latitude,color=median_house_value, size=housing$population), data = housing)+scale_color_gradient(low="black", high="red")

ggmap(our_map)+geom_point(aes(housing$longitude, housing$latitude, color=median_house_value, size = housing$population), data=housing)+scale_color_gradient(low = 'black', high='red')+scale_size(range=c(0.5,3))

Calculate the correlation between all variables
correlation = cor(house_no_missing[ , 1:9])
correlation
## longitude latitude housing_median_age total_rooms
## longitude 1.00000000 -0.92461611 -0.10935655 0.04548017
## latitude -0.92461611 1.00000000 0.01189907 -0.03666681
## housing_median_age -0.10935655 0.01189907 1.00000000 -0.36062830
## total_rooms 0.04548017 -0.03666681 -0.36062830 1.00000000
## total_bedrooms 0.06960802 -0.06698283 -0.32045104 0.93037950
## population 0.10027030 -0.10899734 -0.29578730 0.85728125
## households 0.05651277 -0.07177419 -0.30276797 0.91899153
## median_income -0.01555015 -0.07962632 -0.11827772 0.19788152
## median_house_value -0.04539822 -0.14463821 0.10643205 0.13329413
## total_bedrooms population households median_income
## longitude 0.06960802 0.100270301 0.05651277 -0.015550150
## latitude -0.06698283 -0.108997344 -0.07177419 -0.079626319
## housing_median_age -0.32045104 -0.295787297 -0.30276797 -0.118277723
## total_rooms 0.93037950 0.857281251 0.91899153 0.197881519
## total_bedrooms 1.00000000 0.877746743 0.97972827 -0.007722850
## population 0.87774674 1.000000000 0.90718590 0.005086624
## households 0.97972827 0.907185900 1.00000000 0.013433892
## median_income -0.00772285 0.005086624 0.01343389 1.000000000
## median_house_value 0.04968618 -0.025299732 0.06489355 0.688355475
## median_house_value
## longitude -0.04539822
## latitude -0.14463821
## housing_median_age 0.10643205
## total_rooms 0.13329413
## total_bedrooms 0.04968618
## population -0.02529973
## households 0.06489355
## median_income 0.68835548
## median_house_value 1.00000000
correlation[,9]
## longitude latitude housing_median_age
## -0.04539822 -0.14463821 0.10643205
## total_rooms total_bedrooms population
## 0.13329413 0.04968618 -0.02529973
## households median_income median_house_value
## 0.06489355 0.68835548 1.00000000
Create the multiple regression model by eliminating ocean_proximity
fit=lm(median_house_value~.-ocean_proximity, data=house_no_missing)
summary(fit)
##
## Call:
## lm(formula = median_house_value ~ . - ocean_proximity, data = house_no_missing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -563557 -43632 -11367 30340 801274
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.585e+06 6.290e+04 -57.001 < 2e-16 ***
## longitude -4.273e+04 7.171e+02 -59.588 < 2e-16 ***
## latitude -4.251e+04 6.770e+02 -62.796 < 2e-16 ***
## housing_median_age 1.158e+03 4.339e+01 26.687 < 2e-16 ***
## total_rooms -8.250e+00 7.943e-01 -10.387 < 2e-16 ***
## total_bedrooms 1.138e+02 6.931e+00 16.423 < 2e-16 ***
## population -3.839e+01 1.084e+00 -35.407 < 2e-16 ***
## households 4.770e+01 7.547e+00 6.321 2.65e-10 ***
## median_income 4.030e+04 3.372e+02 119.504 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69570 on 20424 degrees of freedom
## Multiple R-squared: 0.6369, Adjusted R-squared: 0.6368
## F-statistic: 4478 on 8 and 20424 DF, p-value: < 2.2e-16
Install the car package
library(car)
Check the VIF Value and remove the vriable which higher VIF value
VIF is the vaerify the multicollinearity between the variables
vif(fit)
## longitude latitude housing_median_age
## 8.713740 8.828919 1.260015
## total_rooms total_bedrooms population
## 12.717000 36.003726 6.371238
## households median_income
## 35.136045 1.731511
Recreate the model by eliminate the variable have higher VIF value
fit2=lm(median_house_value~.-ocean_proximity-total_bedrooms, data=house_no_missing)
summary(fit2)
##
## Call:
## lm(formula = median_house_value ~ . - ocean_proximity - total_bedrooms,
## data = house_no_missing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -540873 -44582 -11839 30811 867934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.497e+06 6.308e+04 -55.434 <2e-16 ***
## longitude -4.197e+04 7.203e+02 -58.273 <2e-16 ***
## latitude -4.222e+04 6.812e+02 -61.983 <2e-16 ***
## housing_median_age 1.126e+03 4.363e+01 25.819 <2e-16 ***
## total_rooms -1.775e+00 6.940e-01 -2.558 0.0105 *
## population -4.310e+01 1.052e+00 -40.952 <2e-16 ***
## households 1.490e+02 4.378e+00 34.025 <2e-16 ***
## median_income 3.838e+04 3.185e+02 120.518 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70030 on 20425 degrees of freedom
## Multiple R-squared: 0.6321, Adjusted R-squared: 0.632
## F-statistic: 5014 on 7 and 20425 DF, p-value: < 2.2e-16
Check the VIF value on new model again
vif(fit2)
## longitude latitude housing_median_age
## 8.677840 8.822928 1.257568
## total_rooms population households
## 9.583814 5.925295 11.673860
## median_income
## 1.524467
Seems like the model almost stabilise with household VIF value >10 only
Remove the households variable and create the model again
fit3=lm(median_house_value~.-ocean_proximity-total_bedrooms-households, data = house_no_missing)
summary(fit3)
##
## Call:
## lm(formula = median_house_value ~ . - ocean_proximity - total_bedrooms -
## households, data = house_no_missing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -503984 -47000 -13174 32448 514087
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.968e+06 6.326e+04 -62.72 <2e-16 ***
## longitude -4.774e+04 7.196e+02 -66.34 <2e-16 ***
## latitude -4.777e+04 6.798e+02 -70.27 <2e-16 ***
## housing_median_age 1.118e+03 4.485e+01 24.93 <2e-16 ***
## total_rooms 1.504e+01 5.009e-01 30.02 <2e-16 ***
## population -2.541e+01 9.405e-01 -27.01 <2e-16 ***
## median_income 3.431e+04 3.033e+02 113.10 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 71980 on 20426 degrees of freedom
## Multiple R-squared: 0.6113, Adjusted R-squared: 0.6112
## F-statistic: 5353 on 6 and 20426 DF, p-value: < 2.2e-16
Check the VIF value on the new model again
vif(fit3)
## longitude latitude housing_median_age
## 8.197084 8.316903 1.257530
## total_rooms population median_income
## 4.725402 4.479289 1.308668
The VIF values of all the variables are stabilise with the VIF value <10. We can use the model 3 as the final model for prediction.