## 'data.frame': 506 obs. of 12 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : int 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## crim zn indus nox
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.3850
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.4490
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.5380
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.5547
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :0.8710
## rm age dis rad
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000
## Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
## tax ptratio lstat medv
## Min. :187.0 Min. :12.60 Min. : 1.73 Min. : 5.00
## 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95 1st Qu.:17.02
## Median :330.0 Median :19.05 Median :11.36 Median :21.20
## Mean :408.2 Mean :18.46 Mean :12.65 Mean :22.53
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :711.0 Max. :22.00 Max. :37.97 Max. :50.00
## crim zn indus nox rm age dis rad tax
## 0 0 0 0 0 0 0 0 0
## ptratio lstat medv
## 0 0 0
## Subset selection object
## Call: regsubsets.formula(medv ~ ., House, nvmax = 12)
## 11 Variables (and intercept)
## Forced in Forced out
## crim FALSE FALSE
## zn FALSE FALSE
## indus FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## lstat FALSE FALSE
## 1 subsets of each size up to 11
## Selection Algorithm: exhaustive
## crim zn indus nox rm age dis rad tax ptratio lstat
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " "*" " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " "*" " " " " " " " " "*" "*"
## 4 ( 1 ) " " " " " " " " "*" " " "*" " " " " "*" "*"
## 5 ( 1 ) " " " " " " "*" "*" " " "*" " " " " "*" "*"
## 6 ( 1 ) " " "*" " " "*" "*" " " "*" " " " " "*" "*"
## 7 ( 1 ) "*" "*" " " "*" "*" " " "*" " " " " "*" "*"
## 8 ( 1 ) "*" " " " " "*" "*" " " "*" "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## [1] 0.5432418 0.6371245 0.6767036 0.6878351 0.7051702 0.7081737 0.7123280
## [8] 0.7177158 0.7233609 0.7229701 0.7225202
## [1] 9
## (Intercept) crim zn nox rm
## 42.00325750 -0.12830421 0.04609990 -17.34628361 3.71212599
## dis rad tax ptratio lstat
## -1.55247645 0.30001213 -0.01326654 -0.96398758 -0.55358655
## Subset selection object
## Call: regsubsets.formula(medv ~ ., House, nvmax = 11, method = "forward")
## 11 Variables (and intercept)
## Forced in Forced out
## crim FALSE FALSE
## zn FALSE FALSE
## indus FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## lstat FALSE FALSE
## 1 subsets of each size up to 11
## Selection Algorithm: forward
## crim zn indus nox rm age dis rad tax ptratio lstat
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " "*" " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " "*" " " " " " " " " "*" "*"
## 4 ( 1 ) " " " " " " " " "*" " " "*" " " " " "*" "*"
## 5 ( 1 ) " " " " " " "*" "*" " " "*" " " " " "*" "*"
## 6 ( 1 ) " " "*" " " "*" "*" " " "*" " " " " "*" "*"
## 7 ( 1 ) "*" "*" " " "*" "*" " " "*" " " " " "*" "*"
## 8 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" " " "*" "*"
## 9 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## [1] 0.5432418 0.6371245 0.6767036 0.6878351 0.7051702 0.7081737 0.7123280
## [8] 0.7155823 0.7233609 0.7229701 0.7225202
## [1] 9
## (Intercept) crim zn indus nox
## 42.327911314 -0.127477498 0.047659839 0.034055546 -18.382995035
## rm age dis rad tax
## 3.693753664 0.005970144 -1.501890356 0.312183845 -0.014205246
## ptratio lstat
## -0.977285768 -0.563314193
The backward stepwise selection commences with the model containing all of the predictors, then removes the least useful predictors, one by one. At each step, the algorithm evaluates the prediction accuracy of the new model and continues to remove predictors until no improvement in prediction accuracy can be obtained.
## Subset selection object
## Call: regsubsets.formula(medv ~ ., House, nvmax = 12)
## 11 Variables (and intercept)
## Forced in Forced out
## crim FALSE FALSE
## zn FALSE FALSE
## indus FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## lstat FALSE FALSE
## 1 subsets of each size up to 11
## Selection Algorithm: exhaustive
## crim zn indus nox rm age dis rad tax ptratio lstat
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " "*" " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " "*" " " " " " " " " "*" "*"
## 4 ( 1 ) " " " " " " " " "*" " " "*" " " " " "*" "*"
## 5 ( 1 ) " " " " " " "*" "*" " " "*" " " " " "*" "*"
## 6 ( 1 ) " " "*" " " "*" "*" " " "*" " " " " "*" "*"
## 7 ( 1 ) "*" "*" " " "*" "*" " " "*" " " " " "*" "*"
## 8 ( 1 ) "*" " " " " "*" "*" " " "*" "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## [1] 0.5432418 0.6371245 0.6767036 0.6878351 0.7051702 0.7081737 0.7123280
## [8] 0.7177158 0.7233609 0.7229701 0.7225202
## [1] 9
## (Intercept) crim zn nox rm
## 42.00325750 -0.12830421 0.04609990 -17.34628361 3.71212599
## dis rad tax ptratio lstat
## -1.55247645 0.30001213 -0.01326654 -0.96398758 -0.55358655
First I will create a matrix and a vector for the model…
x <- model.matrix(medv~., House)[,-12]
y <- House$medv
## [1] 1.000000e+10 7.390722e+09 5.462277e+09 4.037017e+09 2.983647e+09
## [6] 2.205131e+09 1.629751e+09 1.204504e+09 8.902151e+08 6.579332e+08
## [11] 4.862602e+08 3.593814e+08 2.656088e+08 1.963041e+08 1.450829e+08
## [16] 1.072267e+08 7.924829e+07 5.857021e+07 4.328761e+07 3.199267e+07
## [21] 2.364489e+07 1.747528e+07 1.291550e+07 9.545485e+06 7.054802e+06
## [26] 5.214008e+06 3.853529e+06 2.848036e+06 2.104904e+06 1.555676e+06
## [31] 1.149757e+06 8.497534e+05 6.280291e+05 4.641589e+05 3.430469e+05
## [36] 2.535364e+05 1.873817e+05 1.384886e+05 1.023531e+05 7.564633e+04
## [41] 5.590810e+04 4.132012e+04 3.053856e+04 2.257020e+04 1.668101e+04
## [46] 1.232847e+04 9.111628e+03 6.734151e+03 4.977024e+03 3.678380e+03
## [51] 2.718588e+03 2.009233e+03 1.484968e+03 1.097499e+03 8.111308e+02
## [56] 5.994843e+02 4.430621e+02 3.274549e+02 2.420128e+02 1.788650e+02
## [61] 1.321941e+02 9.770100e+01 7.220809e+01 5.336699e+01 3.944206e+01
## [66] 2.915053e+01 2.154435e+01 1.592283e+01 1.176812e+01 8.697490e+00
## [71] 6.428073e+00 4.750810e+00 3.511192e+00 2.595024e+00 1.917910e+00
## [76] 1.417474e+00 1.047616e+00 7.742637e-01 5.722368e-01 4.229243e-01
## [81] 3.125716e-01 2.310130e-01 1.707353e-01 1.261857e-01 9.326033e-02
## [86] 6.892612e-02 5.094138e-02 3.764936e-02 2.782559e-02 2.056512e-02
## [91] 1.519911e-02 1.123324e-02 8.302176e-03 6.135907e-03 4.534879e-03
## [96] 3.351603e-03 2.477076e-03 1.830738e-03 1.353048e-03 1.000000e-03
## [1] 75646.33
## (Intercept) (Intercept) crim zn indus
## 2.253610e+01 0.000000e+00 -5.040559e-05 1.725516e-05 -7.872320e-05
## nox rm age dis rad
## -4.116964e-03 1.105232e-03 -1.494948e-05 1.324493e-04 -4.892548e-05
## tax ptratio
## -3.103500e-06 -2.618964e-04
predict(rr_fit, s=1200, type="coefficients")
## 12 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 22.7301236283
## (Intercept) .
## crim -0.0031172471
## zn 0.0010616509
## indus -0.0048472079
## nox -0.2527791958
## rm 0.0695738320
## age -0.0009166809
## dis 0.0079181297
## rad -0.0029924671
## tax -0.0001910427
## ptratio -0.0163521164
plot(rr_fit)
## [1] 0.7474429
## 1
## 1 28.30784
## 2 24.62849
## 4 28.10659
## 5 28.65952
## 7 23.65600
## 9 19.08708
## [1] 31.61026
## [1] 1.000000e+10 7.390722e+09 5.462277e+09 4.037017e+09 2.983647e+09
## [6] 2.205131e+09 1.629751e+09 1.204504e+09 8.902151e+08 6.579332e+08
## [11] 4.862602e+08 3.593814e+08 2.656088e+08 1.963041e+08 1.450829e+08
## [16] 1.072267e+08 7.924829e+07 5.857021e+07 4.328761e+07 3.199267e+07
## [21] 2.364489e+07 1.747528e+07 1.291550e+07 9.545485e+06 7.054802e+06
## [26] 5.214008e+06 3.853529e+06 2.848036e+06 2.104904e+06 1.555676e+06
## [31] 1.149757e+06 8.497534e+05 6.280291e+05 4.641589e+05 3.430469e+05
## [36] 2.535364e+05 1.873817e+05 1.384886e+05 1.023531e+05 7.564633e+04
## [41] 5.590810e+04 4.132012e+04 3.053856e+04 2.257020e+04 1.668101e+04
## [46] 1.232847e+04 9.111628e+03 6.734151e+03 4.977024e+03 3.678380e+03
## [51] 2.718588e+03 2.009233e+03 1.484968e+03 1.097499e+03 8.111308e+02
## [56] 5.994843e+02 4.430621e+02 3.274549e+02 2.420128e+02 1.788650e+02
## [61] 1.321941e+02 9.770100e+01 7.220809e+01 5.336699e+01 3.944206e+01
## [66] 2.915053e+01 2.154435e+01 1.592283e+01 1.176812e+01 8.697490e+00
## [71] 6.428073e+00 4.750810e+00 3.511192e+00 2.595024e+00 1.917910e+00
## [76] 1.417474e+00 1.047616e+00 7.742637e-01 5.722368e-01 4.229243e-01
## [81] 3.125716e-01 2.310130e-01 1.707353e-01 1.261857e-01 9.326033e-02
## [86] 6.892612e-02 5.094138e-02 3.764936e-02 2.782559e-02 2.056512e-02
## [91] 1.519911e-02 1.123324e-02 8.302176e-03 6.135907e-03 4.534879e-03
## [96] 3.351603e-03 2.477076e-03 1.830738e-03 1.353048e-03 1.000000e-03
lr_fit$lambda[3]
## [1] 5462277218
coef(lr_fit)[,3]
## (Intercept) (Intercept) crim zn indus nox
## 22.53281 0.00000 0.00000 0.00000 0.00000 0.00000
## rm age dis rad tax ptratio
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## lstat
## 0.00000
## [1] 0.001353048
## (Intercept) (Intercept) crim zn indus
## 42.141651307 0.000000000 -0.126868372 0.047265379 0.031536593
## nox rm age dis rad
## -18.231216287 3.699802108 0.005697395 -1.497918406 0.307433154
## tax ptratio lstat
## -0.013971646 -0.975026105 -0.562814717
lr_fit$lambda[70]
## [1] 8.69749
coef(lr_fit)[,70]
## (Intercept) (Intercept) crim zn indus nox
## 22.53281 0.00000 0.00000 0.00000 0.00000 0.00000
## rm age dis rad tax ptratio
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## lstat
## 0.00000
## (Intercept)
## 22.53281
## [1] 0.007662987
## 1
## 1 30.121452
## 2 25.333570
## 5 28.019669
## 7 22.874511
## 8 18.868194
## 9 9.796525
## [1] 26.00348
This is the Mean Squared Error
Coefficients of the best model are now printed below.
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 41.634644661
## (Intercept) .
## crim -0.124841753
## zn 0.045910788
## indus 0.023830593
## nox -17.776888249
## rm 3.715300985
## age 0.004762168
## dis -1.484557929
## rad 0.295399737
## tax -0.013357477
## ptratio -0.969269290
## lstat -0.561008977
## [1] 26.00348
they properly explain the initial predictors they are related to the response variable as well.
plsr_fit <-plsr(medv~., data =House, scale=T, validation="CV")
## Data: X dimension: 506 11
## Y dimension: 506 1
## Fit method: kernelpls
## Number of components considered: 11
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 9.206 6.634 5.164 5.094 5.026 5.003 4.985
## adjCV 9.206 6.631 5.160 5.086 5.016 4.993 4.975
## 7 comps 8 comps 9 comps 10 comps 11 comps
## CV 4.980 4.975 4.974 4.974 4.974
## adjCV 4.969 4.965 4.964 4.964 4.964
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 52.36 64.16 70.45 76.62 84.21 87.66 89.42
## medv 48.85 69.56 71.29 72.28 72.55 72.72 72.84
## 8 comps 9 comps 10 comps 11 comps
## X 93.21 95.84 98.17 100.00
## medv 72.85 72.86 72.86 72.86
## [1] 24.39372
## , , 7 comps
##
## medv
## crim -1.1837911
## zn 1.0847533
## indus 0.2459288
## nox -2.2364267
## rm 2.6363328
## age 0.1348533
## dis -3.1805166
## rad 2.6584263
## tax -2.2096019
## ptratio -2.1684118
## lstat -3.9348413
## Data: X dimension: 506 11
## Y dimension: 506 1
## Fit method: kernelpls
## Number of components considered: 7
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 52.36 64.16 70.45 76.62 84.21 87.66 89.42
## medv 48.85 69.56 71.29 72.28 72.55 72.72 72.84
## , , 7 comps
##
## medv
## crim -1.1837911
## zn 1.0847533
## indus 0.2459288
## nox -2.2364267
## rm 2.6363328
## age 0.1348533
## dis -3.1805166
## rad 2.6584263
## tax -2.2096019
## ptratio -2.1684118
## lstat -3.9348413
## [1] 25.652950 32.442835 29.412962 25.569401 21.935131 9.857613
## [1] 29.40936
## Partial least squares regression , fitted with the kernel algorithm.
## Call:
## plsr(formula = medv ~ ., ncomp = 7, data = house_train, scale = T)