Dimensionality and Elastic Net

Preparing and loading packages and datasets

Dealing with highly correlated variables,

library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.4
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.4
## Loaded glmnet 2.0-16
library(useful)
## Warning: package 'useful' was built under R version 3.4.4
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.4
library(coefplot)
## Warning: package 'coefplot' was built under R version 3.4.4
library(magrittr)
library(animation)
## Warning: package 'animation' was built under R version 3.4.4
land_train <- readr::read_csv('data/manhattan_Train.csv')
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   TotalValue = col_double(),
##   Borough = col_character(),
##   SchoolDistrict = col_character(),
##   FireService = col_character(),
##   ZoneDist1 = col_character(),
##   ZoneDist2 = col_character(),
##   ZoneDist3 = col_character(),
##   ZoneDist4 = col_character(),
##   Class = col_character(),
##   LandUse = col_character(),
##   OwnerType = col_character(),
##   NumFloors = col_double(),
##   LotFront = col_double(),
##   LotDepth = col_double(),
##   BldgFront = col_double(),
##   BldgDepth = col_double(),
##   Extension = col_character(),
##   Proximity = col_character(),
##   IrregularLot = col_character(),
##   LotType = col_character()
##   # ... with 8 more columns
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 504 parsing failures.
## row # A tibble: 5 x 5 col     row col     expected               actual file                       expected   <int> <chr>   <chr>                  <chr>  <chr>                      actual 1  3509 CommFAR no trailing characters .4     'data/manhattan_Train.csv' file 2  3510 CommFAR no trailing characters .4     'data/manhattan_Train.csv' row 3  3511 CommFAR no trailing characters .4     'data/manhattan_Train.csv' col 4  3537 CommFAR no trailing characters .4     'data/manhattan_Train.csv' expected 5  3538 CommFAR no trailing characters .4     'data/manhattan_Train.csv'
## ... ................. ... ........................................................................ ........ ........................................................................ ...... ........................................................................ .... ........................................................................ ... ........................................................................ ... ........................................................................ ........ ........................................................................
## See problems(...) for more details.
land_test <- readRDS('data/manhattan_Test.rds')

valueFormula <- TotalValue ~ FireService + 
    ZoneDist1 + ZoneDist2 + Class + LandUse + 
    OwnerType + LotArea + BldgArea + ComArea + 
    ResArea + OfficeArea + RetailArea + 
    GarageArea + FactryArea + NumBldgs + 
    NumFloors + UnitsRes + UnitsTotal + 
    LotFront + LotDepth + BldgFront + 
    BldgDepth + LotType + Landmark + BuiltFAR +
    Built + HistoricDistrict - 1

valueFormula
## TotalValue ~ FireService + ZoneDist1 + ZoneDist2 + Class + LandUse + 
##     OwnerType + LotArea + BldgArea + ComArea + ResArea + OfficeArea + 
##     RetailArea + GarageArea + FactryArea + NumBldgs + NumFloors + 
##     UnitsRes + UnitsTotal + LotFront + LotDepth + BldgFront + 
##     BldgDepth + LotType + Landmark + BuiltFAR + Built + HistoricDistrict - 
##     1
class(valueFormula)
## [1] "formula"
value1 <- lm(valueFormula, data = land_train)

coefplot(value1, sort='magnitude')

summary(value1)
## 
## Call:
## lm(formula = valueFormula, data = land_train)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -40513446   -618695    -63257    386739  16044900 
## 
## Coefficients: (3 not defined because of singularities)
##                                                   Estimate Std. Error
## FireServiceEngine                                1.168e+06  6.615e+05
## FireServiceLadder                                1.193e+06  6.612e+05
## FireServiceSquad                                 1.176e+06  6.668e+05
## ZoneDist1Manufacturing                           1.027e+05  4.883e+04
## ZoneDist1Mixed Residential                      -6.827e+05  3.021e+05
## ZoneDist1Park                                    7.731e+05  2.139e+05
## ZoneDist1Residential                            -5.469e+05  2.641e+04
## ZoneDist2Manufacturing                           4.269e+05  1.913e+05
## ZoneDist2Missing                                -9.439e+05  6.326e+04
## ZoneDist2Mixed Residential                      -2.566e+06  1.751e+06
## ZoneDist2Residential                            -9.953e+04  7.755e+04
## ClassAutomotive                                 -8.012e+05  3.737e+05
## ClassCondo                                       7.915e+05  2.931e+05
## ClassEducation                                   1.544e+06  1.837e+05
## ClassElevator Apartment                         -6.724e+05  2.929e+05
## ClassGovernment                                  5.359e+05  2.776e+05
## ClassHealth                                      8.570e+05  2.074e+05
## ClassHotel                                       2.570e+05  2.979e+05
## ClassIndustrial                                 -2.225e+06  4.398e+05
## ClassLoft                                       -1.479e+06  2.931e+05
## ClassMisc                                        1.401e+06  2.553e+05
## ClassMixed                                      -9.462e+05  2.914e+05
## ClassOffice                                     -5.843e+05  2.863e+05
## ClassOutdoor                                     2.111e+06  7.098e+05
## ClassPublic                                      6.433e+05  2.155e+05
## ClassReligious                                   4.263e+05  1.749e+05
## ClassRetail                                     -2.230e+05  2.861e+05
## ClassSingle Family                              -1.541e+06  2.856e+05
## ClassTheatre                                    -1.146e+06  3.376e+05
## ClassTransport                                   3.460e+05  4.352e+05
## ClassTwo Family                                 -1.430e+06  2.860e+05
## ClassUtility                                     1.556e+05  1.293e+06
## ClassVacant                                      3.864e+05  3.058e+05
## ClassWalk Up Apartment                          -7.045e+05  2.899e+05
## ClassWarehouse                                  -1.728e+06  3.219e+05
## LandUseIndustrial & Manufacturing                       NA         NA
## LandUseMixed Residential & Commercial Buildings -7.231e+05  7.375e+04
## LandUseMulti-Family Elevator Buildings          -6.454e+05  9.191e+04
## LandUseMulti-Family Walk-Up Buildings           -8.682e+05  8.092e+04
## LandUseOne &Two Family Buildings                        NA         NA
## LandUseOpen Space & Outdoor Recreation          -2.658e+06  7.125e+05
## LandUseParking Facilities                        5.025e+05  3.582e+05
## LandUsePublic Facilities & Institutions         -1.367e+06  2.323e+05
## LandUseTransportation & Utility                 -6.635e+05  3.482e+05
## LandUseVacant Land)                                     NA         NA
## OwnerTypeMixed                                   3.506e+04  9.292e+04
## OwnerTypePrivate                                -1.505e+05  7.640e+04
## OwnerTypePublic                                 -4.370e+05  1.334e+05
## OwnerTypeUnknown                                -1.478e+04  7.707e+04
## LotArea                                          9.298e+00  6.853e-01
## BldgArea                                         3.966e+01  1.932e+00
## ComArea                                         -9.647e+00  2.046e+00
## ResArea                                         -1.385e+01  1.992e+00
## OfficeArea                                       1.571e+00  9.769e-01
## RetailArea                                      -5.040e+00  1.558e+00
## GarageArea                                      -1.747e+01  2.203e+00
## FactryArea                                      -5.201e+00  5.075e+00
## NumBldgs                                         4.867e+04  1.783e+04
## NumFloors                                        2.737e+05  4.795e+03
## UnitsRes                                        -5.386e+03  8.854e+02
## UnitsTotal                                       4.300e+03  6.185e+02
## LotFront                                         3.709e+03  2.040e+02
## LotDepth                                         3.162e+03  2.649e+02
## BldgFront                                        3.492e+03  2.974e+02
## BldgDepth                                       -9.153e+02  2.620e+02
## LotTypeBlock Assemblage                          6.795e+05  6.083e+05
## LotTypeConnecting                                1.421e+06  5.914e+05
## LotTypeCorner                                    1.145e+06  5.866e+05
## LotTypeInside                                    6.693e+05  5.861e+05
## LotTypeInterior Lot                              5.771e+05  6.413e+05
## LotTypeIsland Lot                               -2.166e+05  1.847e+06
## LotTypeMixed or Unknown                          5.230e+05  5.907e+05
## LotTypeSubmerged Land Lot                       -3.247e+05  6.364e+05
## LotTypeWaterfront                                2.883e+05  6.195e+05
## LandmarkYes                                      4.925e+05  7.888e+04
## BuiltFAR                                         3.656e+04  5.312e+03
## Built10s                                        -6.708e+05  1.248e+05
## Built18th Century                               -5.711e+05  6.422e+04
## Built60s                                         3.510e+05  9.470e+04
## Built70s                                        -6.276e+05  1.259e+05
## Built80s                                         2.931e+05  1.113e+05
## Built90s                                         5.114e+05  1.195e+05
## BuiltPostwar                                    -1.178e+05  8.125e+04
## BuiltPrewar                                     -3.201e+05  6.830e+04
## BuiltUnknown                                    -7.726e+05  1.219e+05
## HistoricDistrictYes                              3.715e+05  2.433e+04
##                                                 t value Pr(>|t|)    
## FireServiceEngine                                 1.765 0.077564 .  
## FireServiceLadder                                 1.805 0.071123 .  
## FireServiceSquad                                  1.763 0.077926 .  
## ZoneDist1Manufacturing                            2.103 0.035495 *  
## ZoneDist1Mixed Residential                       -2.260 0.023815 *  
## ZoneDist1Park                                     3.615 0.000301 ***
## ZoneDist1Residential                            -20.703  < 2e-16 ***
## ZoneDist2Manufacturing                            2.231 0.025671 *  
## ZoneDist2Missing                                -14.921  < 2e-16 ***
## ZoneDist2Mixed Residential                       -1.465 0.142868    
## ZoneDist2Residential                             -1.283 0.199342    
## ClassAutomotive                                  -2.144 0.032047 *  
## ClassCondo                                        2.700 0.006927 ** 
## ClassEducation                                    8.407  < 2e-16 ***
## ClassElevator Apartment                          -2.296 0.021700 *  
## ClassGovernment                                   1.930 0.053581 .  
## ClassHealth                                       4.132 3.61e-05 ***
## ClassHotel                                        0.863 0.388362    
## ClassIndustrial                                  -5.059 4.23e-07 ***
## ClassLoft                                        -5.044 4.58e-07 ***
## ClassMisc                                         5.486 4.14e-08 ***
## ClassMixed                                       -3.247 0.001166 ** 
## ClassOffice                                      -2.041 0.041258 *  
## ClassOutdoor                                      2.974 0.002942 ** 
## ClassPublic                                       2.985 0.002841 ** 
## ClassReligious                                    2.437 0.014830 *  
## ClassRetail                                      -0.780 0.435649    
## ClassSingle Family                               -5.397 6.84e-08 ***
## ClassTheatre                                     -3.394 0.000690 ***
## ClassTransport                                    0.795 0.426574    
## ClassTwo Family                                  -5.000 5.77e-07 ***
## ClassUtility                                      0.120 0.904221    
## ClassVacant                                       1.264 0.206308    
## ClassWalk Up Apartment                           -2.430 0.015091 *  
## ClassWarehouse                                   -5.370 7.93e-08 ***
## LandUseIndustrial & Manufacturing                    NA       NA    
## LandUseMixed Residential & Commercial Buildings  -9.804  < 2e-16 ***
## LandUseMulti-Family Elevator Buildings           -7.022 2.23e-12 ***
## LandUseMulti-Family Walk-Up Buildings           -10.729  < 2e-16 ***
## LandUseOne &Two Family Buildings                     NA       NA    
## LandUseOpen Space & Outdoor Recreation           -3.731 0.000191 ***
## LandUseParking Facilities                         1.403 0.160610    
## LandUsePublic Facilities & Institutions          -5.886 4.01e-09 ***
## LandUseTransportation & Utility                  -1.905 0.056743 .  
## LandUseVacant Land)                                  NA       NA    
## OwnerTypeMixed                                    0.377 0.705956    
## OwnerTypePrivate                                 -1.969 0.048934 *  
## OwnerTypePublic                                  -3.276 0.001055 ** 
## OwnerTypeUnknown                                 -0.192 0.847914    
## LotArea                                          13.566  < 2e-16 ***
## BldgArea                                         20.528  < 2e-16 ***
## ComArea                                          -4.715 2.43e-06 ***
## ResArea                                          -6.952 3.68e-12 ***
## OfficeArea                                        1.608 0.107832    
## RetailArea                                       -3.235 0.001217 ** 
## GarageArea                                       -7.931 2.25e-15 ***
## FactryArea                                       -1.025 0.305473    
## NumBldgs                                          2.730 0.006343 ** 
## NumFloors                                        57.085  < 2e-16 ***
## UnitsRes                                         -6.083 1.19e-09 ***
## UnitsTotal                                        6.953 3.65e-12 ***
## LotFront                                         18.179  < 2e-16 ***
## LotDepth                                         11.936  < 2e-16 ***
## BldgFront                                        11.742  < 2e-16 ***
## BldgDepth                                        -3.493 0.000478 ***
## LotTypeBlock Assemblage                           1.117 0.263970    
## LotTypeConnecting                                 2.403 0.016263 *  
## LotTypeCorner                                     1.952 0.050920 .  
## LotTypeInside                                     1.142 0.253428    
## LotTypeInterior Lot                               0.900 0.368153    
## LotTypeIsland Lot                                -0.117 0.906653    
## LotTypeMixed or Unknown                           0.885 0.375922    
## LotTypeSubmerged Land Lot                        -0.510 0.609915    
## LotTypeWaterfront                                 0.465 0.641629    
## LandmarkYes                                       6.244 4.31e-10 ***
## BuiltFAR                                          6.882 6.00e-12 ***
## Built10s                                         -5.373 7.79e-08 ***
## Built18th Century                                -8.892  < 2e-16 ***
## Built60s                                          3.706 0.000211 ***
## Built70s                                         -4.983 6.28e-07 ***
## Built80s                                          2.633 0.008467 ** 
## Built90s                                          4.278 1.89e-05 ***
## BuiltPostwar                                     -1.449 0.147299    
## BuiltPrewar                                      -4.687 2.78e-06 ***
## BuiltUnknown                                     -6.338 2.35e-10 ***
## HistoricDistrictYes                              15.268  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1747000 on 32167 degrees of freedom
## Multiple R-squared:  0.7772, Adjusted R-squared:  0.7766 
## F-statistic:  1352 on 83 and 32167 DF,  p-value: < 2.2e-16

Let’s build the matrices using the train data and the formula

## This is the x matrice
landx_train <- build.x(valueFormula, data = land_train, contrasts = FALSE, sparse = TRUE)
## Warning: package 'bindrcpp' was built under R version 3.4.4
## This is the y matrice. No Contrast and no sparce argument for the y matrice
landy_train <- build.y(valueFormula, data = land_train)

## Fitting the model

value2 <- glmnet(x=landx_train, y=landy_train, family = 'gaussian')

coefpath(value2)
summary(value2)
##           Length Class     Mode   
## a0          94   -none-    numeric
## beta      8930   dgCMatrix S4     
## df          94   -none-    numeric
## dim          2   -none-    numeric
## lambda      94   -none-    numeric
## dev.ratio   94   -none-    numeric
## nulldev      1   -none-    numeric
## npasses      1   -none-    numeric
## jerr         1   -none-    numeric
## offset       1   -none-    logical
## call         4   -none-    call   
## nobs         1   -none-    numeric

GLMNET is one of the faster model fitting function out there. GLMNET fits a model for each value of lambda. We ended up with a lot of possible models depending of the value of lambda selected. The question now is how do we select the optimal lambda? This will be answered below

Optimal model selection

How to choose the optimal value of lambda which will lead to the optimal model?

## The function animation below shows how to split the dataset into 10 block of 10 test sets which will each be used for for testing the model while the rest of the 9th will be used for training the model. This will be done in loop until each 10th of the data set is used as a test dataset.

animation::cv.ani(k=10)

This animation splits the dataset into 10th/90th for test/train and is very useful to perform cross validation. For each value of lambda, the performance is tested 10 times for each 10th of the dataset and so on until the 10 10th of the dataset are used to test a value of lambda. Then we move to the next value of lambda and repeat the same and so on. This is called cross validation.

cv.ani = Cross validation animation

Cross validation and tuning parameters

value3 <- cv.glmnet(x=landx_train, y=landy_train, family='gaussian', nfolds = 5)

coefpath(value3)

nfolds=5 tells to split the model into 5 and uses 4/5th for training and 1/5th for testing and repeat this 5 times for the model to get the MSE. According to the creators of the glmnet algorithm, nfolds 5 is good enough for computing and for getting a good fitting. It can be considered to use nfolds = length(dataset) which will be ressource intensive and will not necessarily improve the model fitting or parameter tuning.

On the plot, we have a

  • lambda min which corresponds to the best value of lambda fitted. That best value of lambda corresponds to a model selected and we can see the coefficient values displayed on the screen and the variables with a coefficient of zero were those not selected in the model selection.
  • Lambda 1se is the maximum value of lambda within one standard error of the lambda min which means that value of lambda is within the confidence interval of lambda min. Lambda 1se is the maximum value of lambda which is still within acceptable MSE value and all models selected between lambda min and lambda 1se can still be considered as reasonable.

  • plotting the coef against the MSE (Mean squared error)

    plot(value3)

    The lower the Mean squared error the better.

    Plotting the coef from the lambda min

    coefplot(value3, lambda='lambda.min', sort = 'magnitude')
    ## Warning: Removed 38 rows containing missing values (geom_errorbarh).
    
    ## Warning: Removed 38 rows containing missing values (geom_errorbarh).

    Plotting the coef from the lambda 1se

    coefplot(value3, lambda='lambda.1se', sort = 'magnitude')
    ## Warning: Removed 18 rows containing missing values (geom_errorbarh).
    
    ## Warning: Removed 18 rows containing missing values (geom_errorbarh).

    Some of the values seems to be having a coef of zero on the plot. This is simply a scalling issue.

    coefplot(value3, lambda='lambda.1se', sort = 'magnitude') + xlim(-100, 100)
    ## Warning: Removed 18 rows containing missing values (geom_errorbarh).
    
    ## Warning: Removed 18 rows containing missing values (geom_errorbarh).
    ## Warning: Removed 15 rows containing missing values (geom_point).

    By zooming in we can clearly see that they are not zero(0) they are somehow too small compared to other values plotted.

    Lasso penalty 1

    value4 <- cv.glmnet(x=landx_train, y=landy_train, family='gaussian', nfolds = 5, alpha=1)
    coefpath(value4)

    Alpha=1 means we get the lasso with a penalty 1.

    Ridge (alpha = 0)

    value5 <- cv.glmnet(x=landx_train, y=landy_train, family='gaussian', nfolds = 5, alpha=0)
    coefpath(value5)

    Combination of lasso and ridge (alpha=0.6)

    value6 <- cv.glmnet(x=landx_train, y=landy_train, family='gaussian', nfolds = 5, alpha=0.6)
    coefpath(value6)

    Testing the result using our test dataset

    landx_test <- build.x(valueFormula, data = land_test, contrasts = FALSE, sparse = TRUE)
    
    landPredictions_6 <- predict(value6, newx = landx_test, s='lambda.1se')
    
    head(landPredictions_6)
    ##         1
    ## 1 1588829
    ## 2 1289088
    ## 3 1363508
    ## 4 1138912
    ## 5 2022969
    ## 6 1229006
    real_to_fitted  <- data.frame(land_test$TotalValue, landPredictions_6)
    
    tail(real_to_fitted)
    ##      land_test.TotalValue      X1
    ## 4033              3075300 5098116
    ## 4034              1223100 2624644
    ## 4035              2080800 2992224
    ## 4036              1406250 2348231
    ## 4037              2266200 5392510
    ## 4038              1971000 3541060
    ## Using Lambda min
    landPredictions_7 <- predict(value6, newx = landx_test, s='lambda.min')
    real_to_fitted2  <- data.frame(land_test$TotalValue, landPredictions_7)
    
    head(real_to_fitted2)
    ##   land_test.TotalValue      X1
    ## 1               711450 1878845
    ## 2               413550 1626028
    ## 3               660150 1703070
    ## 4               634500 1121083
    ## 5               616500 2153370
    ## 6               866700 1418300
    value6$lambda
    ##  [1] 3886125.030 3540892.393 3226329.272 2939711.072 2678555.243
    ##  [6] 2440599.778 2223783.620 2026228.813 1846224.230 1682210.758
    ## [11] 1532767.790 1396600.925 1272530.748 1159482.623 1056477.383
    ## [16]  962622.844  877106.084  799186.397  728188.880  663498.587
    ## [21]  604555.201  550848.183  501912.349  457323.839  416696.450
    ## [26]  379678.286  345948.713  315215.583  287212.700  261697.515
    ## [31]  238449.029  217265.874  197964.573  180377.946  164353.666
    ## [36]  149752.940  136449.301  124327.521  113282.607  103218.892
    ## [41]   94049.211   85694.138   78081.307   71144.779   64824.473
    ## [46]   59065.645   53818.416   49037.336   44680.995   40711.658
    ## [51]   37094.947   33799.534   30796.876   28060.967   25568.108
    ## [56]   23296.708   21227.092   19341.336   17623.105   16057.517
    ## [61]   14631.011   13331.233   12146.923   11067.824   10084.589
    ## [66]    9188.702    8372.403    7628.621    6950.916    6333.416
    ## [71]    5770.772    5258.113    4790.997    4365.378    3977.570
    ## [76]    3624.213    3302.248    3008.885    2741.584    2498.030
    ## [81]    2276.112    2073.908    1889.668    1721.795    1568.835
    ## [86]    1429.464