Dealing with highly correlated variables,
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.4
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.4
## Loaded glmnet 2.0-16
library(useful)
## Warning: package 'useful' was built under R version 3.4.4
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.4
library(coefplot)
## Warning: package 'coefplot' was built under R version 3.4.4
library(magrittr)
library(animation)
## Warning: package 'animation' was built under R version 3.4.4
land_train <- readr::read_csv('data/manhattan_Train.csv')
## Parsed with column specification:
## cols(
## .default = col_integer(),
## TotalValue = col_double(),
## Borough = col_character(),
## SchoolDistrict = col_character(),
## FireService = col_character(),
## ZoneDist1 = col_character(),
## ZoneDist2 = col_character(),
## ZoneDist3 = col_character(),
## ZoneDist4 = col_character(),
## Class = col_character(),
## LandUse = col_character(),
## OwnerType = col_character(),
## NumFloors = col_double(),
## LotFront = col_double(),
## LotDepth = col_double(),
## BldgFront = col_double(),
## BldgDepth = col_double(),
## Extension = col_character(),
## Proximity = col_character(),
## IrregularLot = col_character(),
## LotType = col_character()
## # ... with 8 more columns
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 504 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 3509 CommFAR no trailing characters .4 'data/manhattan_Train.csv' file 2 3510 CommFAR no trailing characters .4 'data/manhattan_Train.csv' row 3 3511 CommFAR no trailing characters .4 'data/manhattan_Train.csv' col 4 3537 CommFAR no trailing characters .4 'data/manhattan_Train.csv' expected 5 3538 CommFAR no trailing characters .4 'data/manhattan_Train.csv'
## ... ................. ... ........................................................................ ........ ........................................................................ ...... ........................................................................ .... ........................................................................ ... ........................................................................ ... ........................................................................ ........ ........................................................................
## See problems(...) for more details.
land_test <- readRDS('data/manhattan_Test.rds')
valueFormula <- TotalValue ~ FireService +
ZoneDist1 + ZoneDist2 + Class + LandUse +
OwnerType + LotArea + BldgArea + ComArea +
ResArea + OfficeArea + RetailArea +
GarageArea + FactryArea + NumBldgs +
NumFloors + UnitsRes + UnitsTotal +
LotFront + LotDepth + BldgFront +
BldgDepth + LotType + Landmark + BuiltFAR +
Built + HistoricDistrict - 1
valueFormula
## TotalValue ~ FireService + ZoneDist1 + ZoneDist2 + Class + LandUse +
## OwnerType + LotArea + BldgArea + ComArea + ResArea + OfficeArea +
## RetailArea + GarageArea + FactryArea + NumBldgs + NumFloors +
## UnitsRes + UnitsTotal + LotFront + LotDepth + BldgFront +
## BldgDepth + LotType + Landmark + BuiltFAR + Built + HistoricDistrict -
## 1
class(valueFormula)
## [1] "formula"
value1 <- lm(valueFormula, data = land_train)
coefplot(value1, sort='magnitude')
summary(value1)
##
## Call:
## lm(formula = valueFormula, data = land_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40513446 -618695 -63257 386739 16044900
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error
## FireServiceEngine 1.168e+06 6.615e+05
## FireServiceLadder 1.193e+06 6.612e+05
## FireServiceSquad 1.176e+06 6.668e+05
## ZoneDist1Manufacturing 1.027e+05 4.883e+04
## ZoneDist1Mixed Residential -6.827e+05 3.021e+05
## ZoneDist1Park 7.731e+05 2.139e+05
## ZoneDist1Residential -5.469e+05 2.641e+04
## ZoneDist2Manufacturing 4.269e+05 1.913e+05
## ZoneDist2Missing -9.439e+05 6.326e+04
## ZoneDist2Mixed Residential -2.566e+06 1.751e+06
## ZoneDist2Residential -9.953e+04 7.755e+04
## ClassAutomotive -8.012e+05 3.737e+05
## ClassCondo 7.915e+05 2.931e+05
## ClassEducation 1.544e+06 1.837e+05
## ClassElevator Apartment -6.724e+05 2.929e+05
## ClassGovernment 5.359e+05 2.776e+05
## ClassHealth 8.570e+05 2.074e+05
## ClassHotel 2.570e+05 2.979e+05
## ClassIndustrial -2.225e+06 4.398e+05
## ClassLoft -1.479e+06 2.931e+05
## ClassMisc 1.401e+06 2.553e+05
## ClassMixed -9.462e+05 2.914e+05
## ClassOffice -5.843e+05 2.863e+05
## ClassOutdoor 2.111e+06 7.098e+05
## ClassPublic 6.433e+05 2.155e+05
## ClassReligious 4.263e+05 1.749e+05
## ClassRetail -2.230e+05 2.861e+05
## ClassSingle Family -1.541e+06 2.856e+05
## ClassTheatre -1.146e+06 3.376e+05
## ClassTransport 3.460e+05 4.352e+05
## ClassTwo Family -1.430e+06 2.860e+05
## ClassUtility 1.556e+05 1.293e+06
## ClassVacant 3.864e+05 3.058e+05
## ClassWalk Up Apartment -7.045e+05 2.899e+05
## ClassWarehouse -1.728e+06 3.219e+05
## LandUseIndustrial & Manufacturing NA NA
## LandUseMixed Residential & Commercial Buildings -7.231e+05 7.375e+04
## LandUseMulti-Family Elevator Buildings -6.454e+05 9.191e+04
## LandUseMulti-Family Walk-Up Buildings -8.682e+05 8.092e+04
## LandUseOne &Two Family Buildings NA NA
## LandUseOpen Space & Outdoor Recreation -2.658e+06 7.125e+05
## LandUseParking Facilities 5.025e+05 3.582e+05
## LandUsePublic Facilities & Institutions -1.367e+06 2.323e+05
## LandUseTransportation & Utility -6.635e+05 3.482e+05
## LandUseVacant Land) NA NA
## OwnerTypeMixed 3.506e+04 9.292e+04
## OwnerTypePrivate -1.505e+05 7.640e+04
## OwnerTypePublic -4.370e+05 1.334e+05
## OwnerTypeUnknown -1.478e+04 7.707e+04
## LotArea 9.298e+00 6.853e-01
## BldgArea 3.966e+01 1.932e+00
## ComArea -9.647e+00 2.046e+00
## ResArea -1.385e+01 1.992e+00
## OfficeArea 1.571e+00 9.769e-01
## RetailArea -5.040e+00 1.558e+00
## GarageArea -1.747e+01 2.203e+00
## FactryArea -5.201e+00 5.075e+00
## NumBldgs 4.867e+04 1.783e+04
## NumFloors 2.737e+05 4.795e+03
## UnitsRes -5.386e+03 8.854e+02
## UnitsTotal 4.300e+03 6.185e+02
## LotFront 3.709e+03 2.040e+02
## LotDepth 3.162e+03 2.649e+02
## BldgFront 3.492e+03 2.974e+02
## BldgDepth -9.153e+02 2.620e+02
## LotTypeBlock Assemblage 6.795e+05 6.083e+05
## LotTypeConnecting 1.421e+06 5.914e+05
## LotTypeCorner 1.145e+06 5.866e+05
## LotTypeInside 6.693e+05 5.861e+05
## LotTypeInterior Lot 5.771e+05 6.413e+05
## LotTypeIsland Lot -2.166e+05 1.847e+06
## LotTypeMixed or Unknown 5.230e+05 5.907e+05
## LotTypeSubmerged Land Lot -3.247e+05 6.364e+05
## LotTypeWaterfront 2.883e+05 6.195e+05
## LandmarkYes 4.925e+05 7.888e+04
## BuiltFAR 3.656e+04 5.312e+03
## Built10s -6.708e+05 1.248e+05
## Built18th Century -5.711e+05 6.422e+04
## Built60s 3.510e+05 9.470e+04
## Built70s -6.276e+05 1.259e+05
## Built80s 2.931e+05 1.113e+05
## Built90s 5.114e+05 1.195e+05
## BuiltPostwar -1.178e+05 8.125e+04
## BuiltPrewar -3.201e+05 6.830e+04
## BuiltUnknown -7.726e+05 1.219e+05
## HistoricDistrictYes 3.715e+05 2.433e+04
## t value Pr(>|t|)
## FireServiceEngine 1.765 0.077564 .
## FireServiceLadder 1.805 0.071123 .
## FireServiceSquad 1.763 0.077926 .
## ZoneDist1Manufacturing 2.103 0.035495 *
## ZoneDist1Mixed Residential -2.260 0.023815 *
## ZoneDist1Park 3.615 0.000301 ***
## ZoneDist1Residential -20.703 < 2e-16 ***
## ZoneDist2Manufacturing 2.231 0.025671 *
## ZoneDist2Missing -14.921 < 2e-16 ***
## ZoneDist2Mixed Residential -1.465 0.142868
## ZoneDist2Residential -1.283 0.199342
## ClassAutomotive -2.144 0.032047 *
## ClassCondo 2.700 0.006927 **
## ClassEducation 8.407 < 2e-16 ***
## ClassElevator Apartment -2.296 0.021700 *
## ClassGovernment 1.930 0.053581 .
## ClassHealth 4.132 3.61e-05 ***
## ClassHotel 0.863 0.388362
## ClassIndustrial -5.059 4.23e-07 ***
## ClassLoft -5.044 4.58e-07 ***
## ClassMisc 5.486 4.14e-08 ***
## ClassMixed -3.247 0.001166 **
## ClassOffice -2.041 0.041258 *
## ClassOutdoor 2.974 0.002942 **
## ClassPublic 2.985 0.002841 **
## ClassReligious 2.437 0.014830 *
## ClassRetail -0.780 0.435649
## ClassSingle Family -5.397 6.84e-08 ***
## ClassTheatre -3.394 0.000690 ***
## ClassTransport 0.795 0.426574
## ClassTwo Family -5.000 5.77e-07 ***
## ClassUtility 0.120 0.904221
## ClassVacant 1.264 0.206308
## ClassWalk Up Apartment -2.430 0.015091 *
## ClassWarehouse -5.370 7.93e-08 ***
## LandUseIndustrial & Manufacturing NA NA
## LandUseMixed Residential & Commercial Buildings -9.804 < 2e-16 ***
## LandUseMulti-Family Elevator Buildings -7.022 2.23e-12 ***
## LandUseMulti-Family Walk-Up Buildings -10.729 < 2e-16 ***
## LandUseOne &Two Family Buildings NA NA
## LandUseOpen Space & Outdoor Recreation -3.731 0.000191 ***
## LandUseParking Facilities 1.403 0.160610
## LandUsePublic Facilities & Institutions -5.886 4.01e-09 ***
## LandUseTransportation & Utility -1.905 0.056743 .
## LandUseVacant Land) NA NA
## OwnerTypeMixed 0.377 0.705956
## OwnerTypePrivate -1.969 0.048934 *
## OwnerTypePublic -3.276 0.001055 **
## OwnerTypeUnknown -0.192 0.847914
## LotArea 13.566 < 2e-16 ***
## BldgArea 20.528 < 2e-16 ***
## ComArea -4.715 2.43e-06 ***
## ResArea -6.952 3.68e-12 ***
## OfficeArea 1.608 0.107832
## RetailArea -3.235 0.001217 **
## GarageArea -7.931 2.25e-15 ***
## FactryArea -1.025 0.305473
## NumBldgs 2.730 0.006343 **
## NumFloors 57.085 < 2e-16 ***
## UnitsRes -6.083 1.19e-09 ***
## UnitsTotal 6.953 3.65e-12 ***
## LotFront 18.179 < 2e-16 ***
## LotDepth 11.936 < 2e-16 ***
## BldgFront 11.742 < 2e-16 ***
## BldgDepth -3.493 0.000478 ***
## LotTypeBlock Assemblage 1.117 0.263970
## LotTypeConnecting 2.403 0.016263 *
## LotTypeCorner 1.952 0.050920 .
## LotTypeInside 1.142 0.253428
## LotTypeInterior Lot 0.900 0.368153
## LotTypeIsland Lot -0.117 0.906653
## LotTypeMixed or Unknown 0.885 0.375922
## LotTypeSubmerged Land Lot -0.510 0.609915
## LotTypeWaterfront 0.465 0.641629
## LandmarkYes 6.244 4.31e-10 ***
## BuiltFAR 6.882 6.00e-12 ***
## Built10s -5.373 7.79e-08 ***
## Built18th Century -8.892 < 2e-16 ***
## Built60s 3.706 0.000211 ***
## Built70s -4.983 6.28e-07 ***
## Built80s 2.633 0.008467 **
## Built90s 4.278 1.89e-05 ***
## BuiltPostwar -1.449 0.147299
## BuiltPrewar -4.687 2.78e-06 ***
## BuiltUnknown -6.338 2.35e-10 ***
## HistoricDistrictYes 15.268 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1747000 on 32167 degrees of freedom
## Multiple R-squared: 0.7772, Adjusted R-squared: 0.7766
## F-statistic: 1352 on 83 and 32167 DF, p-value: < 2.2e-16
## This is the x matrice
landx_train <- build.x(valueFormula, data = land_train, contrasts = FALSE, sparse = TRUE)
## Warning: package 'bindrcpp' was built under R version 3.4.4
## This is the y matrice. No Contrast and no sparce argument for the y matrice
landy_train <- build.y(valueFormula, data = land_train)
## Fitting the model
value2 <- glmnet(x=landx_train, y=landy_train, family = 'gaussian')
coefpath(value2)
summary(value2)
## Length Class Mode
## a0 94 -none- numeric
## beta 8930 dgCMatrix S4
## df 94 -none- numeric
## dim 2 -none- numeric
## lambda 94 -none- numeric
## dev.ratio 94 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 4 -none- call
## nobs 1 -none- numeric
GLMNET is one of the faster model fitting function out there. GLMNET fits a model for each value of lambda. We ended up with a lot of possible models depending of the value of lambda selected. The question now is how do we select the optimal lambda? This will be answered below
How to choose the optimal value of lambda which will lead to the optimal model?
## The function animation below shows how to split the dataset into 10 block of 10 test sets which will each be used for for testing the model while the rest of the 9th will be used for training the model. This will be done in loop until each 10th of the data set is used as a test dataset.
animation::cv.ani(k=10)
This animation splits the dataset into 10th/90th for test/train and is very useful to perform cross validation. For each value of lambda, the performance is tested 10 times for each 10th of the dataset and so on until the 10 10th of the dataset are used to test a value of lambda. Then we move to the next value of lambda and repeat the same and so on. This is called cross validation.
cv.ani = Cross validation animation
value3 <- cv.glmnet(x=landx_train, y=landy_train, family='gaussian', nfolds = 5)
coefpath(value3)
nfolds=5 tells to split the model into 5 and uses 4/5th for training and 1/5th for testing and repeat this 5 times for the model to get the MSE. According to the creators of the glmnet algorithm, nfolds 5 is good enough for computing and for getting a good fitting. It can be considered to use nfolds = length(dataset) which will be ressource intensive and will not necessarily improve the model fitting or parameter tuning.
On the plot, we have a
Lambda 1se is the maximum value of lambda within one standard error of the lambda min which means that value of lambda is within the confidence interval of lambda min. Lambda 1se is the maximum value of lambda which is still within acceptable MSE value and all models selected between lambda min and lambda 1se can still be considered as reasonable.
plot(value3)
The lower the Mean squared error the better.
coefplot(value3, lambda='lambda.min', sort = 'magnitude')
## Warning: Removed 38 rows containing missing values (geom_errorbarh).
## Warning: Removed 38 rows containing missing values (geom_errorbarh).
coefplot(value3, lambda='lambda.1se', sort = 'magnitude')
## Warning: Removed 18 rows containing missing values (geom_errorbarh).
## Warning: Removed 18 rows containing missing values (geom_errorbarh).
Some of the values seems to be having a coef of zero on the plot. This is simply a scalling issue.
coefplot(value3, lambda='lambda.1se', sort = 'magnitude') + xlim(-100, 100)
## Warning: Removed 18 rows containing missing values (geom_errorbarh).
## Warning: Removed 18 rows containing missing values (geom_errorbarh).
## Warning: Removed 15 rows containing missing values (geom_point).
By zooming in we can clearly see that they are not zero(0) they are somehow too small compared to other values plotted.
value4 <- cv.glmnet(x=landx_train, y=landy_train, family='gaussian', nfolds = 5, alpha=1)
coefpath(value4)
Alpha=1 means we get the lasso with a penalty 1.
value5 <- cv.glmnet(x=landx_train, y=landy_train, family='gaussian', nfolds = 5, alpha=0)
coefpath(value5)
Combination of lasso and ridge (alpha=0.6)
value6 <- cv.glmnet(x=landx_train, y=landy_train, family='gaussian', nfolds = 5, alpha=0.6)
coefpath(value6)
landx_test <- build.x(valueFormula, data = land_test, contrasts = FALSE, sparse = TRUE)
landPredictions_6 <- predict(value6, newx = landx_test, s='lambda.1se')
head(landPredictions_6)
## 1
## 1 1588829
## 2 1289088
## 3 1363508
## 4 1138912
## 5 2022969
## 6 1229006
real_to_fitted <- data.frame(land_test$TotalValue, landPredictions_6)
tail(real_to_fitted)
## land_test.TotalValue X1
## 4033 3075300 5098116
## 4034 1223100 2624644
## 4035 2080800 2992224
## 4036 1406250 2348231
## 4037 2266200 5392510
## 4038 1971000 3541060
## Using Lambda min
landPredictions_7 <- predict(value6, newx = landx_test, s='lambda.min')
real_to_fitted2 <- data.frame(land_test$TotalValue, landPredictions_7)
head(real_to_fitted2)
## land_test.TotalValue X1
## 1 711450 1878845
## 2 413550 1626028
## 3 660150 1703070
## 4 634500 1121083
## 5 616500 2153370
## 6 866700 1418300
value6$lambda
## [1] 3886125.030 3540892.393 3226329.272 2939711.072 2678555.243
## [6] 2440599.778 2223783.620 2026228.813 1846224.230 1682210.758
## [11] 1532767.790 1396600.925 1272530.748 1159482.623 1056477.383
## [16] 962622.844 877106.084 799186.397 728188.880 663498.587
## [21] 604555.201 550848.183 501912.349 457323.839 416696.450
## [26] 379678.286 345948.713 315215.583 287212.700 261697.515
## [31] 238449.029 217265.874 197964.573 180377.946 164353.666
## [36] 149752.940 136449.301 124327.521 113282.607 103218.892
## [41] 94049.211 85694.138 78081.307 71144.779 64824.473
## [46] 59065.645 53818.416 49037.336 44680.995 40711.658
## [51] 37094.947 33799.534 30796.876 28060.967 25568.108
## [56] 23296.708 21227.092 19341.336 17623.105 16057.517
## [61] 14631.011 13331.233 12146.923 11067.824 10084.589
## [66] 9188.702 8372.403 7628.621 6950.916 6333.416
## [71] 5770.772 5258.113 4790.997 4365.378 3977.570
## [76] 3624.213 3302.248 3008.885 2741.584 2498.030
## [81] 2276.112 2073.908 1889.668 1721.795 1568.835
## [86] 1429.464