#QUESTION 2
#The correct answer is iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance. Relative to least squares, the lasso shrinks the coefficient estimates, which reduces variance at the cost of a small increase in bias, whereas the least squares estimates retain higher variance. The lasso can also shrink some coefficients exactly to zero, removing non-essential variables.
#The correct answer is iii. Ridge regression behaves like the lasso relative to least squares: it shrinks the coefficient estimates, decreasing variance while increasing bias slightly, so it is less flexible than least squares (though, unlike the lasso, it does not set coefficients exactly to zero).
#The correct answer is ii. More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias. Non-linear methods are more flexible than least squares, and therefore have lower bias but higher variance.
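#A minimal simulation sketch (not part of the question) of the trade-off
#described above: across repeated data sets, ridge coefficient estimates vary
#less than least squares but are biased toward zero. The sample size, lambda,
#and true coefficients below are illustrative assumptions.
library(glmnet)
set.seed(123)
b.ols <- b.ridge <- numeric(100)
for (r in 1:100) {
  xs <- matrix(rnorm(50 * 5), 50, 5)                 # 50 obs, 5 predictors
  ys <- drop(xs %*% rep(1, 5)) + rnorm(50, sd = 3)   # true coefficients all 1
  b.ols[r] <- coef(lm(ys ~ xs))[2]                   # OLS estimate of beta_1
  b.ridge[r] <- coef(glmnet(xs, ys, alpha = 0, lambda = 1))[2]
}
c(var.ols = var(b.ols), var.ridge = var(b.ridge))    # ridge variance is smaller
c(bias.ols = mean(b.ols) - 1, bias.ridge = mean(b.ridge) - 1)  # ridge bias is larger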
#QUESTION 9 PART A)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.2
data("College")
attach(College)
set.seed(2)
train <- sample(1:nrow(College), nrow(College)/2)
test <- -train
y.test <- Apps[test]
#QUESTION 9 PART B)
lm.fit <- lm(Apps ~ ., data = College, subset = train)
summary(lm.fit)
##
## Call:
## lm(formula = Apps ~ ., data = College, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4828.9 -415.2 -37.2 304.7 7403.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -973.47840 636.69721 -1.529 0.127131
## PrivateYes -433.46171 200.44873 -2.162 0.031223 *
## Accept 1.64082 0.05013 32.730 < 2e-16 ***
## Enroll -1.51619 0.25788 -5.879 9.20e-09 ***
## Top10perc 38.41423 8.38030 4.584 6.25e-06 ***
## Top25perc -9.01858 6.81347 -1.324 0.186440
## F.Undergrad 0.15574 0.04627 3.366 0.000844 ***
## P.Undergrad 0.05234 0.03858 1.357 0.175748
## Outstate -0.08629 0.02811 -3.069 0.002303 **
## Room.Board 0.15152 0.06932 2.186 0.029461 *
## Books 0.13939 0.44432 0.314 0.753913
## Personal 0.02124 0.08432 0.252 0.801260
## PhD -9.95235 7.51318 -1.325 0.186104
## Terminal -3.45012 8.22565 -0.419 0.675142
## S.F.Ratio 45.97788 21.38242 2.150 0.032182 *
## perc.alumni 3.93984 5.87940 0.670 0.503206
## Expend 0.09754 0.02203 4.427 1.26e-05 ***
## Grad.Rate 6.33697 4.25658 1.489 0.137406
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1081 on 370 degrees of freedom
## Multiple R-squared: 0.9432, Adjusted R-squared: 0.9406
## F-statistic: 361.3 on 17 and 370 DF, p-value: < 2.2e-16
lm.pred <- predict(lm.fit, College[test,])
lm.error <- mean((lm.pred - y.test)^2)
lm.error
## [1] 1093608
#QUESTION 9 PART C)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
x <- model.matrix(Apps~., data=College)[, -1]
x.train <- x[train,]
y <- College$Apps
y.train <- y[train]
set.seed(7)
ridge.fit <- glmnet(x.train,y.train,alpha=0)
cv.ridge.fit <- cv.glmnet(x.train,y.train,alpha=0)
plot(cv.ridge.fit)

bestlam <- cv.ridge.fit$lambda.min
bestlam
## [1] 424.2704
#The best lambda chosen by cross-validation is 424.2704.
ridge.pred <- predict(ridge.fit, s=bestlam, newx=x[test,])
ridge.error <- mean((ridge.pred-y.test)^2)
ridge.error
## [1] 1138030
#The test MSE for ridge is 1138030, slightly higher than the least squares MSE of 1093608.
#QUESTION 9 PART D)
set.seed(2)
lasso.fit <- glmnet(x.train,y.train,alpha=1)
cv.lasso.fit<- cv.glmnet(x.train,y.train,alpha=1)
bestlam.lasso <- cv.lasso.fit$lambda.min
bestlam.lasso
## [1] 15.97351
#The best lambda chosen by cross-validation for the lasso model is 15.97351.
lasso.pred <- predict(lasso.fit, s=bestlam.lasso, newx=x[test,])
lasso.error <- mean((lasso.pred-y.test)^2)
lasso.error
## [1] 1045922
#The test MSE for the lasso is 1045922, lower than both least squares and ridge.
lasso.c <- predict(lasso.fit, type = "coefficients", s = bestlam.lasso)[1:18, ]
length(lasso.c[lasso.c != 0])
#Counts how many of the 18 coefficient estimates (intercept included) remain
#non-zero at the lasso's chosen lambda; the rest are shrunk exactly to zero.
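#A quick sketch to list which predictors survive the lasso's shrinkage:
names(lasso.c)[lasso.c != 0]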
# install.packages("pls")  # run once if the pls package is not installed
library(pls)
## Warning: package 'pls' was built under R version 4.4.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
#QUESTION 9 PART E)
set.seed(1)
pcr.fit <- pcr(Apps~., data=College, subset=train, scale=TRUE, validation ="CV")
summary(pcr.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4440 4463 2362 2385 2092 1835 1832
## adjCV 4440 4465 2359 2384 2066 1824 1823
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1829 1821 1753 1764 1771 1772 1784
## adjCV 1818 1813 1746 1757 1764 1764 1777
## 14 comps 15 comps 16 comps 17 comps
## CV 1778 1747 1401 1338
## adjCV 1773 1701 1385 1322
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 31.4816 57.40 64.84 70.54 75.80 80.23 84.04 87.64
## Apps 0.1398 72.45 72.50 80.45 84.74 84.77 85.06 85.16
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.87 93.29 95.33 96.98 98.05 98.75 99.37
## Apps 85.93 86.16 86.21 86.32 86.40 86.61 92.49
## 16 comps 17 comps
## X 99.85 100.00
## Apps 93.65 94.32
validationplot(pcr.fit, val.type = "MSEP")

#The lowest cross-validation error occurs with all 17 components included, in which case PCR is equivalent to least squares.
pcr.pred <- predict(pcr.fit,x[test,], ncomp=17)
pcr.error <- mean((pcr.pred-y.test)^2)
pcr.error
## [1] 1093608
#The test MSE is 1093608, identical to least squares, as expected when all 17 components are retained.
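#A quick sanity check (a sketch): with all 17 components retained, PCR spans
#the same column space as least squares, so the two sets of predictions
#should agree up to numerical error.
all.equal(as.numeric(pcr.pred), as.numeric(lm.pred))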
#QUESTION 9 PART F)
set.seed(1)
pls.fit <- plsr(Apps~., data=College, subset=train, scale=TRUE, validation="CV")
summary(pls.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4440 2153 1701 1711 1650 1498 1413
## adjCV 4440 2147 1677 1704 1615 1470 1392
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1378 1358 1356 1342 1338 1337 1337
## adjCV 1361 1342 1339 1326 1322 1321 1321
## 14 comps 15 comps 16 comps 17 comps
## CV 1338 1338 1338 1338
## adjCV 1322 1322 1322 1322
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 25.76 31.99 61.88 65.00 68.68 73.82 78.33 81.38
## Apps 77.92 87.80 88.38 92.06 93.59 93.92 94.01 94.09
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 83.24 85.59 87.76 90.49 92.69 94.27 97.03
## Apps 94.20 94.27 94.30 94.31 94.32 94.32 94.32
## 16 comps 17 comps
## X 99.21 100.00
## Apps 94.32 94.32
validationplot(pls.fit, val.type = "MSEP")

#The lowest cross-validation error happens with 12 components.
pls.pred <- predict(pls.fit,x[test,],ncomp=12)
pls.error <- mean((pls.pred-y.test)^2)
pls.error
## [1] 1085346
#The test MSE for PLS is 1085346, slightly lower than PCR's 1093608.
#QUESTION 9 PART G)
errors <- c(lm.error, ridge.error, lasso.error, pcr.error, pls.error)
names(errors) <- c("lm", "ridge", "lasso", "pcr", "pls")
barplot(sort(errors))

print(sort(errors))
## lasso pls pcr lm ridge
## 1045922 1085346 1093608 1093608 1138030
#From the sorted results, the lasso has the lowest test error (1045922), followed by PLS (1085346); least squares and PCR are identical (1093608, since PCR kept all 17 components), and ridge is the highest (1138030).
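#As a supplementary check (a sketch, not required by the question), the test
#R^2 confirms that all five methods predict Apps with similar, high accuracy:
test.r2 <- 1 - errors / mean((y.test - mean(Apps[train]))^2)
round(sort(test.r2, decreasing = TRUE), 3)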
#QUESTION 11
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
data(Boston)
attach(Boston)
set.seed(4)
train <- sample(1:nrow(Boston), nrow(Boston)*0.70)
test <- -train
y.test <- crim[test]
#QUESTION 11 PART A)
lm.fit <- lm(crim~., data=Boston, subset=train)
summary(lm.fit)
##
## Call:
## lm(formula = crim ~ ., data = Boston, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.231 -2.307 -0.450 1.065 73.806
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.469057 8.580239 2.153 0.032058 *
## zn 0.052939 0.024593 2.153 0.032055 *
## indus -0.049318 0.104361 -0.473 0.636824
## chas -0.504106 1.524087 -0.331 0.741029
## nox -11.086886 6.713267 -1.651 0.099559 .
## rm 0.392079 0.699486 0.561 0.575490
## age 0.004511 0.021219 0.213 0.831773
## dis -1.138266 0.342130 -3.327 0.000974 ***
## rad 0.633764 0.110299 5.746 2.03e-08 ***
## tax -0.004751 0.006545 -0.726 0.468409
## ptratio -0.343971 0.236630 -1.454 0.146970
## lstat 0.059951 0.093464 0.641 0.521672
## medv -0.252998 0.072429 -3.493 0.000540 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.651 on 341 degrees of freedom
## Multiple R-squared: 0.4509, Adjusted R-squared: 0.4316
## F-statistic: 23.34 on 12 and 341 DF, p-value: < 2.2e-16
#From the linear regression, the predictors significant at the 5% level are zn, dis, rad, and medv; nox is marginally significant (p = 0.0996).
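#A programmatic way to pull out the same list (a sketch; the 5% cutoff is a
#convention):
coefs <- summary(lm.fit)$coefficients
setdiff(rownames(coefs)[coefs[, "Pr(>|t|)"] < 0.05], "(Intercept)")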
lm.pred <- predict(lm.fit, Boston[test,])
lm.error <- mean((lm.pred - y.test)^2)
lm.error
## [1] 36.50002
#The test MSE is 36.50002.
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.3
# Prediction helper for regsubsets objects: rebuild the model matrix from the
# original call and multiply by the coefficients of the id-th best model
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}
k <- 10
p <- ncol(Boston) - 1
# Assign each row to one of k folds, then run best subset selection on the
# training folds and record the held-out MSE for every model size
folds <- sample(rep(1:k, length = nrow(Boston)))
cv.errors <- matrix(NA, k, p)
for (i in 1:k) {
  best.fit <- regsubsets(crim ~ ., data = Boston[folds != i, ], nvmax = p)
  for (j in 1:p) {
    pred <- predict(best.fit, Boston[folds == i, ], id = j)
    cv.errors[i, j] <- mean((Boston$crim[folds == i] - pred)^2)
  }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
plot(mean.cv.errors, type = "b", xlab = "Number of variables", ylab = "CV error")

which.min(mean.cv.errors)
## [1] 11
mean.cv.errors[which.min(mean.cv.errors)]
## [1] 43.27333
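#To see which predictors the best 11-variable model uses, refit best subset
#selection on the full data (a sketch; the selected set can differ slightly
#across folds):
reg.full <- regsubsets(crim ~ ., data = Boston, nvmax = p)
names(coef(reg.full, id = which.min(mean.cv.errors)))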
x = model.matrix(crim ~ . - 1, data = Boston)
y = Boston$crim
cv.lasso = cv.glmnet(x, y, type.measure = "mse")
plot(cv.lasso)

coef(cv.lasso)
## 13 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.7799525
## zn .
## indus .
## chas .
## nox .
## rm .
## age .
## dis .
## rad 0.1920089
## tax .
## ptratio .
## lstat .
## medv .
sqrt(cv.lasso$cvm[cv.lasso$lambda == cv.lasso$lambda.1se])
## [1] 7.74365
cv.ridge = cv.glmnet(x, y, type.measure = "mse", alpha = 0)
plot(cv.ridge)

coef(cv.ridge)
## 13 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.524520701
## zn -0.003033435
## indus 0.030059812
## chas -0.170770364
## nox 1.926593977
## rm -0.144481578
## age 0.006340719
## dis -0.096538533
## rad 0.046822404
## tax 0.002130833
## ptratio 0.072300494
## lstat 0.036566920
## medv -0.024081652
sqrt(cv.ridge$cvm[cv.ridge$lambda == cv.ridge$lambda.1se])
## [1] 7.714514
pcr.crime = pcr(crim ~ ., data = Boston, scale = TRUE, validation = "CV")
summary(pcr.crime)
## Data: X dimension: 506 12
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 12
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.61 7.291 7.288 6.893 6.867 6.813 6.794
## adjCV 8.61 7.287 7.284 6.886 6.863 6.809 6.789
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps
## CV 6.649 6.664 6.653 6.646 6.608 6.513
## adjCV 6.642 6.658 6.647 6.640 6.600 6.506
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 49.93 63.64 72.94 80.21 86.83 90.26 92.79 94.99
## crim 29.39 29.55 37.39 37.85 38.85 39.23 41.73 41.82
## 9 comps 10 comps 11 comps 12 comps
## X 96.78 98.33 99.48 100.00
## crim 42.12 42.43 43.58 44.93
validationplot(pcr.crime, val.type = "MSEP")

#Compare the cross-validated MSE estimates computed above for Boston (the
#least-squares figure of 36.5 is a single train/test split MSE, so it is
#reported separately rather than mixed in with the CV estimates).
cv.mse <- c(best.subset = min(mean.cv.errors),
            lasso = cv.lasso$cvm[cv.lasso$lambda == cv.lasso$lambda.1se],
            ridge = cv.ridge$cvm[cv.ridge$lambda == cv.ridge$lambda.1se],
            pcr = min(MSEP(pcr.crime)$val["CV", , ]))
print(sort(cv.mse))
#QUESTION 11 PART B)
#Comparing the cross-validated MSEs, PCR does best (about 6.513^2 = 42.4, from
#the RMSEP table above), just ahead of best subset selection (43.27); the
#lasso and ridge CV MSEs are both near 60. I therefore propose the PCR model.
#(The least-squares test MSE of 36.5 comes from a single train/test split, so
#it is not directly comparable to the 10-fold CV estimates.)
#QUESTION 11 PART C)
#Yes, the chosen model (PCR) involves all of the features in the data set,
#because each principal component is a linear combination of every predictor.
#Even features that appear individually unimportant still contribute to the
#components, and therefore to the final model.
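#The loadings make this concrete (a sketch): every predictor receives a
#non-zero weight in each component.
round(unclass(pcr.crime$loadings)[, 1:3], 2)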