Statement iii is correct: the LASSO constrains (shrinks) the regression coefficients, accepting some additional bias in order to lower the overall variance. How strong that constraint is depends on the λ chosen.
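As a quick, self-contained sketch of that λ-driven shrinkage (simulated data only; it assumes the glmnet package is available and is separate from the College analysis below):
library(glmnet)
set.seed(123)
x.sim = matrix(rnorm(100 * 10), 100, 10)           # 10 predictors, 100 observations
y.sim = x.sim[, 1] - 2 * x.sim[, 2] + rnorm(100)   # only two predictors truly matter
lasso.path = glmnet(x.sim, y.sim, alpha = 1)
plot(lasso.path, xvar = "lambda", label = TRUE)    # coefficients shrink toward zero as lambda grows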
set.seed(1)
# Assumes the ISLR (College data), caret, glmnet, and pls packages are loaded
# earlier in the document.
data("College")
trainIndex = createDataPartition(College$Private, p = .8, list = FALSE, times = 1)
traindata = College[trainIndex,]
testdata = College[-trainIndex,]
ols.fit = lm(Apps ~ . , data = traindata)
ols.preds = predict(ols.fit, testdata)
ols.results = sqrt(mean((testdata$Apps - ols.preds)^2))
ols.results
## [1] 998.9924
The OLS model gives a test RMSE (root mean squared error) of 998.9924.
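Since sqrt() is applied in the code above, the reported value is the root of the test MSE; the raw MSE is simply its square:
ols.mse = mean((testdata$Apps - ols.preds)^2)   # test MSE
c(MSE = ols.mse, RMSE = sqrt(ols.mse))          # the RMSE is the value reported above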
set.seed(1)
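# [,-2] below drops the PrivateYes dummy but keeps model.matrix's intercept column,
# which is why the coefficient table further down shows two (Intercept) rows;
# [,-1] (dropping the intercept column) would be the more conventional choice.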
xtrain = model.matrix(Apps~.,traindata)[,-2]
ytrain = traindata$Apps
xtest = model.matrix(Apps~.,testdata)[,-2]
ytest = testdata$Apps
ridge.mod=cv.glmnet(xtrain, ytrain, alpha=0)
plot(ridge.mod)
bestlam=ridge.mod$lambda.min
bestlam
## [1] 383.7957
ridge.preds = predict(ridge.mod, s=bestlam, newx = xtest)
ridge.results = sqrt(mean((ytest - ridge.preds)^2))
ridge.results
## [1] 982.1588
Using the cross-validated ridge model, the best λ is 383.7957 and the test RMSE is 982.1588.
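cv.glmnet also records lambda.1se, the largest λ whose CV error is within one standard error of the minimum; a small sketch (no results claimed here) of using that more heavily regularized choice instead of lambda.min:
ridge.mod$lambda.1se                                       # the 1-SE lambda
ridge.preds.1se = predict(ridge.mod, s = "lambda.1se", newx = xtest)
sqrt(mean((ytest - ridge.preds.1se)^2))                    # test RMSE under the 1-SE rule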
set.seed(1)
lasso.mod = cv.glmnet(xtrain, ytrain,alpha=1)
plot(lasso.mod)
bestlam = lasso.mod$lambda.min
bestlam
## [1] 2.048202
lasso.preds = predict(lasso.mod, s=bestlam, newx = xtest)
lasso.results = sqrt(mean((ytest - lasso.preds)^2))
lasso.results
## [1] 984.5824
lasso.preds = predict(lasso.mod, s=bestlam, newx = xtest, type = "coefficients")
lasso.preds
## 18 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -783.46149358
## (Intercept) .
## Accept 1.62255874
## Enroll -0.91230250
## Top10perc 49.56165734
## Top25perc -11.83787235
## F.Undergrad 0.06579904
## P.Undergrad 0.06514468
## Outstate -0.10827200
## Room.Board 0.13622918
## Books -0.06499497
## Personal 0.01644014
## PhD -5.59529900
## Terminal -1.12203775
## S.F.Ratio 19.44189122
## perc.alumni -0.19164224
## Expend 0.06098425
## Grad.Rate 6.48207516
Using the lasso, the test RMSE is 984.5824 with a best λ of 2.048202.
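A quick way to see how sparse the lasso solution actually is at this λ (the coefficient table above suggests almost nothing is dropped):
lasso.coef = as.vector(coef(lasso.mod, s = bestlam))
sum(lasso.coef[-1] != 0)   # number of non-zero coefficients, intercept excluded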
# Note: this fit uses the full College data rather than traindata, so the
# held-out rows are also part of the training set and the test error below is optimistic.
pcr.fit = pcr(Apps~., data=College, scale=TRUE, validation ="CV")
validationplot(pcr.fit,val.type="MSEP")
pcr.preds = predict(pcr.fit, testdata, ncomp = 10)
pcr.results = sqrt(mean((ytest - pcr.preds)^2))
pcr.results
## [1] 1165.594
The test RMSE (computed with M = 10 components) is 1165.594. Judging from the validation plot, M = 5 components would already be a reasonable choice, since components beyond that point add little explained variance for the extra complexity.
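The visual choice of M can be backed with a number; a hedged sketch, assuming the pls package's MSEP() accessor returns the cross-validated errors as documented:
pcr.msep = MSEP(pcr.fit)$val["CV", 1, ]   # CV MSEP for 0, 1, ..., 17 components
which.min(pcr.msep) - 1                   # component count with the lowest CV error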
# As with PCR above, this fit uses the full College data rather than traindata.
pls.fit = plsr(Apps~., data=College, scale=TRUE, validation ="CV")
validationplot(pls.fit,val.type="MSEP")
pls.preds = predict(pls.fit, testdata, ncomp = 10)
pls.results = sqrt(mean((ytest - pls.preds)^2))
pls.results
## [1] 946.0582
The test RMSE (again with M = 10 components) is 946.0582. As with PCR, the validation plot flattens out around 5 components, so there is very little gain in explained variance beyond M = 5.
results = data.frame(
tst.rmse = c(
ols.results,
ridge.results,
lasso.results,
pcr.results,
pls.results
)
)
results
##    tst.rmse
## 1 998.9924
## 2 982.1588
## 3 984.5824
## 4 1165.5943
## 5 946.0582
Comparing the test RMSEs of the five methods side by side, the PCR model scored the worst (highest test error) and the PLS model the best. The gap between those two is fairly large, while the other three methods have error rates only slightly above the PLS model.
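To make this ranking explicit, the same numbers can be labeled and sorted (the row names here are added only for readability):
rownames(results) = c("OLS", "Ridge", "Lasso", "PCR", "PLS")
results[order(results$tst.rmse), , drop = FALSE]   # smallest test RMSE first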
# Boston data from the MASS package; regsubsets() below comes from leaps
data(Boston)
set.seed(654)
trainIndex = createDataPartition(Boston$crim, p = .8, list = FALSE, times = 1)
traindata = Boston[trainIndex,]
testdata = Boston[-trainIndex,]
regfit.full = regsubsets(crim~., data = traindata, nvmax = 13)
regfit.summary = summary(regfit.full)
regfit.summary
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = traindata, nvmax = 13)
## 13 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
## zn indus chas nox rm age dis rad tax ptratio black lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " "*" " "
## 3 ( 1 ) "*" " " " " " " " " " " " " "*" " " " " " " "*" " "
## 4 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " " " " " "*"
## 5 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " " " " " " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" " " " " "*"
## 7 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" " " "*" "*"
## 8 ( 1 ) "*" " " " " "*" " " " " "*" "*" "*" "*" " " "*" "*"
## 9 ( 1 ) "*" "*" " " "*" " " " " "*" "*" " " "*" "*" "*" "*"
## 10 ( 1 ) "*" " " " " "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
regfit.summary$rsq
## [1] 0.3804489 0.4120104 0.4154127 0.4226819 0.4272780 0.4328064 0.4370670
## [8] 0.4386613 0.4400320 0.4407432 0.4413101 0.4417514 0.4418438
par(mfrow=c(2,2))
plot(regfit.summary$rss, xlab="Number of Variables ", ylab="RSS", type="l")
which.min(regfit.summary$rss)
## [1] 13
plot(regfit.summary$adjr2, xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
which.max(regfit.summary$adjr2)
## [1] 8
points(8,regfit.summary$adjr2[8], col="red", cex=2, pch =20)
plot(regfit.summary$cp,xlab="Number of Variables ",ylab="Cp",type='l')
which.min(regfit.summary$cp)
## [1] 7
points(7,regfit.summary$cp[7], col="red", cex=2, pch =20)
plot(regfit.summary$bic ,xlab="Number of Variables ", ylab="BIC", type='l')
which.min(regfit.summary$bic)
## [1] 2
points(2,regfit.summary$bic[2], col="red", cex=2, pch =20)
# Validation-set approach: test MSE of the best model at each subset size
test.mat = model.matrix(crim~., data = Boston[-trainIndex,])
val.errors = rep(NA, 13)
for(i in 1:13){
  coefi = coef(regfit.full, id = i)
  pred = test.mat[, names(coefi)] %*% coefi
  val.errors[i] = mean((Boston$crim[-trainIndex] - pred)^2)
}
which.min(val.errors)
## [1] 9
coef(regfit.full,9)
## (Intercept) zn indus nox dis
## 20.704188450 0.044408784 -0.091908825 -12.858961262 -1.061866823
## rad ptratio black lstat medv
## 0.592280830 -0.380260955 -0.004310983 0.152521590 -0.190166376
# predict() method for regsubsets objects (used inside the cross-validation loop below)
predict.regsubsets = function(object, newdata, id, ...){
  form = as.formula(object$call[[2]])
  mat = model.matrix(form, newdata)
  coefi = coef(object, id = id)
  xvars = names(coefi)
  mat[, xvars] %*% coefi
}
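As a quick check, the helper can be called directly; for example, on the held-out rows with the 9-variable model identified above:
head(predict.regsubsets(regfit.full, Boston[-trainIndex, ], id = 9))   # first few predictions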
regfit.best = regsubsets(crim~., data = Boston, nvmax=13)
coef(regfit.best, 9)
## (Intercept) zn indus nox dis
## 19.124636156 0.042788127 -0.099385948 -10.466490364 -1.002597606
## rad ptratio black lstat medv
## 0.539503547 -0.270835584 -0.008003761 0.117805932 -0.180593877
k = 10
set.seed(1)
folds = sample(rep(1:k, length = nrow(Boston)))
cv.errors = matrix(NA,k,13)
for(j in 1:k){
  best.fit = regsubsets(crim~., data = Boston[folds != j,], nvmax = 13)
  for(i in 1:13){
    pred = predict(best.fit, Boston[folds == j,], id = i)
    cv.errors[j,i] = mean((Boston$crim[folds == j] - pred)^2)
  }
}
cv.rmse = sqrt(apply(cv.errors, 2, mean))
plot(cv.rmse, type = "b")
which.min(cv.rmse)
## [1] 9
# Carry forward the CV RMSE of the 12-variable model for the comparison table
# (note: the 9-variable model has the lowest CV RMSE above)
subset.results = cv.rmse[12]
set.seed(654)
xtrain = model.matrix(crim~.,traindata)[,-1]
ytrain = traindata$crim
xtest = model.matrix(crim~.,testdata)[,-1]
ytest = testdata$crim
ridge.mod=cv.glmnet(xtrain, ytrain, alpha=0)
plot(ridge.mod)
bestlam=ridge.mod$lambda.min
bestlam
## [1] 0.5668146
ridge.preds = predict(ridge.mod, s=bestlam, newx = xtest)
ridge.results = sqrt(mean((ytest - ridge.preds)^2))
ridge.results
## [1] 3.526942
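# Note: coef() on a cv.glmnet fit defaults to s = "lambda.1se", so the values
# below are not at the bestlam (lambda.min) used for the predictions above.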
coef(ridge.mod)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.180199047
## zn -0.002985378
## indus 0.032458045
## chas -0.196174947
## nox 2.334853515
## rm -0.170437048
## age 0.007406539
## dis -0.113696130
## rad 0.055762793
## tax 0.002458256
## ptratio 0.083035661
## black -0.002827627
## lstat 0.043604682
## medv -0.027503362
set.seed(654)
lasso.mod = cv.glmnet(xtrain, ytrain,alpha=1)
plot(lasso.mod)
bestlam = lasso.mod$lambda.min
bestlam
## [1] 0.09455037
lasso.preds = predict(lasso.mod, s=bestlam, newx = xtest)
lasso.results = sqrt(mean((ytest - lasso.preds)^2))
lasso.results
## [1] 3.61382
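# coef() again defaults to s = "lambda.1se"; at that larger lambda the lasso
# zeroes out nearly every coefficient, as seen below.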
coef(lasso.mod)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.4776734
## zn .
## indus .
## chas .
## nox .
## rm .
## age .
## dis .
## rad 0.2420251
## tax .
## ptratio .
## black .
## lstat .
## medv .
set.seed(654)
pcr.fit = pcr(crim~., data=traindata, scale=TRUE, validation ="CV")
validationplot(pcr.fit,val.type="MSEP")
pcr.preds = predict(pcr.fit, testdata, ncomp = 10)
pcr.results = sqrt(mean((ytest - pcr.preds)^2))
pcr.results
## [1] 3.716394
coef(pcr.fit)
## , , 13 comps
##
## crim
## zn 1.0865303
## indus -0.3718479
## chas -0.2085649
## nox -1.5021515
## rm 0.3126469
## age 0.1568212
## dis -2.1447694
## rad 5.6760660
## tax -0.7287607
## ptratio -0.8238037
## black -0.3490060
## lstat 1.1020150
## medv -2.0119239
set.seed(654)
pls.fit = plsr(crim~., data=traindata, scale=TRUE, validation ="CV")
validationplot(pls.fit,val.type="MSEP")
pls.preds = predict(pls.fit, testdata, ncomp = 10)
pls.results = sqrt(mean((ytest - pls.preds)^2))
pls.results
## [1] 3.720899
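# coef() defaults to all 13 components, where both PLS and PCR collapse to the
# full least-squares fit on the standardized predictors, so this table matches
# the PCR coefficients above.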
coef(pls.fit)
## , , 13 comps
##
## crim
## zn 1.0865303
## indus -0.3718479
## chas -0.2085649
## nox -1.5021515
## rm 0.3126469
## age 0.1568212
## dis -2.1447694
## rad 5.6760660
## tax -0.7287607
## ptratio -0.8238037
## black -0.3490060
## lstat 1.1020150
## medv -2.0119239
results = data.frame(
tst.rmse = c(
subset.results,
ridge.results,
lasso.results,
pcr.results,
pls.results
)
)
results
##   tst.rmse
## 1 6.548839
## 2 3.526942
## 3 3.613820
## 4 3.716394
## 5 3.720899
With the five different models for predicting the crime rate (best subset selection, ridge, LASSO, PCR, and PLS), the results point toward ridge being the best method, with the smallest test RMSE on the held-out data. The lasso coefficients reported above (at the default lambda.1se) remove every variable except 'rad', which suggests that model is probably too sparse to be very useful on its own.
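Since coef() on the cv.glmnet object reports the lambda.1se solution, the coefficients at the lambda.min actually used for the test predictions can be requested explicitly (no output claimed here):
coef(lasso.mod, s = "lambda.min")   # coefficients at the lambda used for the predictions above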
In this case, the best model is the ridge model. The other shrinkage and dimension-reduction models are close behind, while the best-subset number is noticeably higher (it is a 10-fold CV RMSE computed on the full Boston data rather than an error on this particular test split, so it is not directly comparable). The separation between the ridge model and the lasso, PCR, and PLS models is less than 0.2 and could simply reflect the particular split and seed chosen.
While the ridge model keeps every predictor, some carry far more weight than others: age, tax, and black have coefficients only in the thousandths. That is because ridge regression does not eliminate variables the way the LASSO does; it shrinks their coefficients so that every variable still plays some (possibly tiny) role in the model.
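One way to visualize that shrinkage, reusing the Boston xtrain and ytrain defined above, is the ridge coefficient path: every coefficient is pulled toward zero as λ grows, but none is set exactly to zero.
ridge.path = glmnet(xtrain, ytrain, alpha = 0)
plot(ridge.path, xvar = "lambda", label = TRUE)   # ridge coefficient paths as a function of log(lambda)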