library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.2
library(MASS)
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.2
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.1.3
## Loading required package: Matrix
## Loaded glmnet 4.1-3
library(pcr)
## Warning: package 'pcr' was built under R version 4.1.3
library(pls)
## Warning: package 'pls' was built under R version 4.1.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
Answer: (iii) Less flexible and will give improved prediction accuracy when its increase in bias is less than its decrease in variance. As lambda increases, flexibility of fit decreases, and so the estimated coefficients decrease with some being zero. This leads to a big decrease in the variance of the predictions for a small increase in bias. LASSO has a big advantage over Ridge Regression, since it allows to have coefficients equal to 0, it allows for dimensional reduction when needed.
Answer: (iii) just like (a) with lasso, however since the ridge regression will not allow any coefficient to be equal to zero., this is not a problem when it comes to prediction accuracy, but it is when it comes to interpretability, which came make our model difficult to interpret.
Answer: ii. More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias. Non-linear models are more flexible, they tend to have higher variance and lower bias. Predictions will improve if the variance goes up.
library(ISLR)
attach(College)
head(College)
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Agnes Scott College 510 63 12960 5450 450
## Alaska Pacific University 249 869 7560 4120 800
## Albertson College 678 41 13500 3335 500
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Agnes Scott College 875 92 97 7.7 37 19016
## Alaska Pacific University 1500 76 72 11.9 2 10922
## Albertson College 675 67 73 9.4 11 9727
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
set.seed(1)
sample_data = sample.split(College, SplitRatio = 0.8 )
College_train = subset(College, sample_data==TRUE)
College_test = subset(College, sample_data==FALSE)
lm_fit = lm(Apps~., data=College_train)
summary(lm_fit)
##
## Call:
## lm(formula = Apps ~ ., data = College_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5263.2 -488.6 -44.8 344.4 7262.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -571.34216 473.11428 -1.208 0.227680
## PrivateYes -448.71859 157.35594 -2.852 0.004503 **
## Accept 1.61208 0.04533 35.565 < 2e-16 ***
## Enroll -0.97719 0.22367 -4.369 1.48e-05 ***
## Top10perc 56.23440 6.38232 8.811 < 2e-16 ***
## Top25perc -17.63194 5.14680 -3.426 0.000656 ***
## F.Undergrad 0.07075 0.03783 1.871 0.061904 .
## P.Undergrad 0.07217 0.03528 2.046 0.041247 *
## Outstate -0.08662 0.02179 -3.975 7.92e-05 ***
## Room.Board 0.14704 0.05606 2.623 0.008940 **
## Books 0.08428 0.28391 0.297 0.766691
## Personal 0.01505 0.07195 0.209 0.834404
## PhD -7.94030 5.35009 -1.484 0.138308
## Terminal -5.73813 5.89438 -0.973 0.330710
## S.F.Ratio 26.68115 16.12664 1.654 0.098565 .
## perc.alumni 4.15144 4.83079 0.859 0.390487
## Expend 0.07483 0.01330 5.625 2.87e-08 ***
## Grad.Rate 8.99128 3.35424 2.681 0.007556 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1061 on 587 degrees of freedom
## Multiple R-squared: 0.928, Adjusted R-squared: 0.9259
## F-statistic: 445 on 17 and 587 DF, p-value: < 2.2e-16
preds = predict(lm_fit, newdata = College_test, type=c("response"))
mse.lm = mean((preds-College_test$Apps)^2)
mse.lm
## [1] 985897.4
Test Error is 985897.4
X = model.matrix(Apps~.,College_train)
Y = model.matrix(Apps~.,College_test)
grid=10^seq(10,-2,length=100)
ridge.mod = glmnet(X,College_train$Apps,alpha=0, lambda = grid, thresh=1e-12)
cv.out=cv.glmnet(X,College_train$Apps,alpha=0, lambda = grid, thresh=1e-12)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
## [1] 0.01
ridge_preds = predict(ridge.mod, s=bestlam, newx=Y)
ridge_mse = mean((ridge_preds - College_test$Apps)^2)
ridge_mse
## [1] 985884.4
The RMSE (Root mean square error) with the Ridge Regression Model is 985884.4
X = model.matrix(Apps~.,College_train)
Y = model.matrix(Apps~.,College_test)
grid=10^seq(10,-2,length=100)
lasso.mod = glmnet(X,College_train$Apps,alpha=1, lambda = grid, thresh=1e-12)
cv.out=cv.glmnet(X,College_train$Apps,alpha=1, lambda = grid, thresh=1e-12)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
## [1] 0.01
set.seed(1)
lasso_preds = predict(lasso.mod, s=bestlam, newx=Y)
MSE_lasso = mean((lasso_preds - College_test$Apps)^2)
MSE_lasso
## [1] 985846.2
sum(coef(lasso.mod)[,1]==0)
## [1] 18
lasso_coef = predict(lasso.mod, s=bestlam, type = "coefficients")
lasso_coef
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -571.37349624
## (Intercept) .
## PrivateYes -448.67960912
## Accept 1.61200915
## Enroll -0.97662944
## Top10perc 56.22661428
## Top25perc -17.62596080
## F.Undergrad 0.07068298
## P.Undergrad 0.07216568
## Outstate -0.08660653
## Room.Board 0.14702390
## Books 0.08423451
## Personal 0.01503276
## PhD -7.93884537
## Terminal -5.73772053
## S.F.Ratio 26.67424398
## perc.alumni 4.14758411
## Expend 0.07483143
## Grad.Rate 8.98999983
Test Error from the Lasso model is 985846.2, which is actually a little less than both from the Ridge Regression Model and Least Squares Regression Model. In total there are 17 variables with non-zero values.
set.seed(1)
pcr_fit= pcr(Apps~., data = College_train, scale=TRUE, validation="CV")
validationplot(pcr_fit,val.type="MSEP")
pcr_fit$ncomp
## [1] 17
We can see that the model stops when the number of components (M) reaches 17, that means that the model actually used all the variables for the final model, and there was no reduction on the dimensionality.
pcr_preds = predict(pcr_fit, College_test, ncomp = 17)
MSE_pcr = mean((pcr_preds - College_test$Apps)^2)
MSE_pcr
## [1] 985897.4
The MSE for the PCR model is 985897.4. Still our lowest MSE comes from the LASSO model that is 985846.2.
pls_model=plsr(Apps~., data=College_train,scale=TRUE, validation="CV")
validationplot(pls_model,val.type="MSEP")
pls_model$ncomp
## [1] 17
Same as PCR, the number of components (variables) stays the same, the 17 original ones. No dimensionality reduction.
pls_preds = predict(pls_model, College_test, ncomp = 17)
MSE_pls = mean((pls_preds - College_test$Apps)^2)
MSE_pls
## [1] 985897.4
It gives us the exact same MSE as the PCR model = 985897.4
models_mse = c(mse.lm, ridge_mse, MSE_lasso, MSE_pcr, MSE_pls)
models_columns = c("OLS", "Ridge", "LASSO", "PCR", "PLS")
models_summary = data.frame(models_columns, models_mse)
models_summary
## models_columns models_mse
## 1 OLS 985897.4
## 2 Ridge 985884.4
## 3 LASSO 985846.2
## 4 PCR 985897.4
## 5 PLS 985897.4
Comments: We can see that the lowest MSE comes from the LASSO model with 985846.2, then comes the Ridge MSE with 985884.4, and all 3 of OLS, PCR and PLS have exactly the same MSE 985897.4, which in this case is because all 3 models include the 17 variables (predictors) of the full model.
detach(College)
attach(Boston)
set.seed(1)
sample_data2 = sample.split(Boston, SplitRatio = 0.8 )
Boston.train = subset(Boston, sample_data==TRUE)
Boston.test = subset(Boston, sample_data==FALSE)
Boston.full = lm(crim~.,data=Boston.train)
Boston.predict = predict(Boston.full,Boston.test)
Boston.MSE = mean((Boston.predict-Boston.test$crim)^2)
Boston.MSE
## [1] 14.30321
set.seed(1)
library(leaps)
## Warning: package 'leaps' was built under R version 4.1.3
Boston.bsm=regsubsets(crim~.,data=Boston.train,nbest=1,nvmax=13)
summary(Boston.bsm)
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = Boston.train, nbest = 1,
## nvmax = 13)
## 13 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
## zn indus chas nox rm age dis rad tax ptratio black lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" " " "*"
## 4 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " " " " " "*"
## 5 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " "*" " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " " " "*" " " "*"
## 7 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" " " "*"
## 8 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" "*" "*"
## 9 ( 1 ) "*" " " " " "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
Boston.test.matrix = model.matrix(crim ~., data = Boston.test, nvmax = 13)
nvmax = 13
model.errors = rep(NA, nvmax)
for (i in 1:nvmax) {
coefi = coef(Boston.bsm, id = i)
pred = Boston.test.matrix[, names(coefi)] %*% coefi
model.errors[i] = mean((pred - Boston.test$crim)^2)
}
plot(model.errors, xlab = "Num of predictors", ylab = "Test MSE", pch = 19, type = "b")
which.min(model.errors)
## [1] 8
coef(Boston.bsm, which.min(model.errors))
## (Intercept) zn nox dis rad
## 21.913194458 0.051966244 -13.778765242 -1.005071050 0.579865593
## ptratio black lstat medv
## -0.343146453 -0.007591664 0.092386821 -0.216649543
MSE.Boston.bsm = model.errors[8]
MSE.Boston.bsm
## [1] 13.90846
Boston.train.matrix=model.matrix(crim~.,Boston.train)
Boston.test.matrix=model.matrix(crim~.,Boston.test)
grid=10^seq(10,-2,length=100)
Boston_lasso_initial = glmnet(Boston.train.matrix, Boston.train$crim, alpha = 0, lambda = grid)
Boston_cv_lasso = cv.glmnet(Boston.train.matrix, Boston.train$crim, alpha = 0, lambda = grid)
plot(Boston_cv_lasso)
It still needed to find the lambda that will reduce the model Error.
Boston.bestL.lasso = Boston_cv_lasso$lambda.min
Boston.bestL.lasso
## [1] 0.09326033
Boston_lasso=glmnet(Boston.train.matrix,Boston.train$crim,alpha=0,lambda=Boston.bestL.lasso)
coef(Boston_lasso)
## 15 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 14.503889794
## (Intercept) .
## zn 0.050128027
## indus -0.083488177
## chas -0.330331902
## nox -10.080306226
## rm 0.783264290
## age 0.002469111
## dis -0.997854542
## rad 0.589696467
## tax -0.001570836
## ptratio -0.256575559
## black -0.007305088
## lstat 0.132131938
## medv -0.236462704
Boston.preds.lasso = predict(Boston_lasso,s=Boston.bestL.lasso,newx=Boston.test.matrix)
Boston.lasso.MSE = mean((Boston.test$crim-Boston.preds.lasso)^2)
Boston.lasso.MSE
## [1] 13.92824
Results: For the Lasso Model we have a MSE of 13.92824 with a Lambda of 0.09326033
set.seed(1)
bridge.mod = glmnet(Boston.train.matrix,Boston.train$crim,alpha=0, lambda = grid, thresh=1e-12)
cv.out=cv.glmnet(Boston.train.matrix,Boston.train$crim,alpha=0, lambda = grid, thresh=1e-12)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
## [1] 0.09326033
Boston.ridge.model=glmnet(Boston.train.matrix,Boston.train$crim,alpha=0,lambda=bestlam)
coef(Boston.ridge.model)
## 15 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 14.503889794
## (Intercept) .
## zn 0.050128027
## indus -0.083488177
## chas -0.330331902
## nox -10.080306226
## rm 0.783264290
## age 0.002469111
## dis -0.997854542
## rad 0.589696467
## tax -0.001570836
## ptratio -0.256575559
## black -0.007305088
## lstat 0.132131938
## medv -0.236462704
Boston.ridge.pred=predict(Boston.ridge.model,s=bestlam,newx=Boston.test.matrix)
Boston.ridge.MSE = mean((Boston.ridge.pred-Boston.test$crim)^2)
Boston.ridge.MSE
## [1] 13.92824
Results: For the Ridge Reg Model we have a MSE of 13.92824 with a Lambda of 0.09326033 (just like LASSO model)
boston.pcr.model = pcr(crim~.,data=Boston.train,scale=TRUE,validation="CV")
summary(boston.pcr.model)
## Data: X dimension: 394 13
## Y dimension: 394 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 9.33 7.873 7.879 7.391 7.404 7.413 7.465
## adjCV 9.33 7.870 7.875 7.386 7.397 7.408 7.455
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 7.464 7.319 7.358 7.359 7.387 7.344 7.263
## adjCV 7.454 7.298 7.347 7.346 7.371 7.327 7.245
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.02 60.29 69.81 76.60 83.04 88.09 91.18 93.43
## crim 29.60 29.71 38.05 38.42 38.42 38.88 39.05 41.21
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.43 97.09 98.51 99.56 100.0
## crim 41.23 41.33 41.95 43.18 44.4
validationplot(boston.pcr.model,val.type="MSEP")
boston.pcr.model$ncomp
## [1] 13
boston.pcr.new.model= pcr(crim~.,data=Boston.train,scale=TRUE,ncomp=13)
summary(boston.pcr.new.model)
## Data: X dimension: 394 13
## Y dimension: 394 1
## Fit method: svdpc
## Number of components considered: 13
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.02 60.29 69.81 76.60 83.04 88.09 91.18 93.43
## crim 29.60 29.71 38.05 38.42 38.42 38.88 39.05 41.21
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.43 97.09 98.51 99.56 100.0
## crim 41.23 41.33 41.95 43.18 44.4
boston.predict.pcr = predict(boston.pcr.model,Boston.test,ncomp=13)
pcr.MSE = mean((Boston.test$crim-boston.predict.pcr)^2)
pcr.MSE
## [1] 14.30321
model_columns = c("Best Subset selection", "Lasso", "Ridge regression", "PCR")
model_errors = c(MSE.Boston.bsm, Boston.lasso.MSE, Boston.ridge.MSE, pcr.MSE)
data.frame(model_columns, model_errors)
## model_columns model_errors
## 1 Best Subset selection 13.90846
## 2 Lasso 13.92824
## 3 Ridge regression 13.92824
## 4 PCR 14.30321
Answer: From the past table we can see that the model with the lower test MSE is the Best Subset Selection with an MSE of 13.90846.
Answer: No, it involves only 8 of them. The final model would be the following: Crime ~ zn + nox + dis + rad + ptratio + black + lstat + medv
which.min(model.errors)
## [1] 8
names(coef(Boston.bsm, which.min(model.errors)))
## [1] "(Intercept)" "zn" "nox" "dis" "rad"
## [6] "ptratio" "black" "lstat" "medv"