#QUESTION 2
#The correct answer is iii. Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance. The lasso shrinks the coefficient estimates toward zero, setting some exactly to zero and thereby removing non-essential variables; this reduces variance at the cost of a small increase in bias, while the least squares estimates retain their higher variance.

#The correct answer is iii. Ridge regression relative to least squares behaves like the lasso: it is less flexible and shrinks the coefficient estimates, which decreases variance and increases bias slightly (though, unlike the lasso, it does not set any coefficients exactly to zero).

#The correct answer is ii. More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias. Non-linear methods are more flexible than least squares, so they have lower bias but higher variance.
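#As an aside, a minimal simulation sketch (not part of the assignment; all
#names and settings below are illustrative) of the tradeoff in i-iii: with
#many irrelevant predictors and a modest sample size, the lasso's small
#increase in bias is outweighed by its reduction in variance, so its test MSE
#beats least squares.
library(glmnet)
set.seed(1)
n <- 60; p <- 40
X <- matrix(rnorm(n * p), n, p)
beta <- c(rep(1, 5), rep(0, p - 5))               #only 5 predictors matter
y <- drop(X %*% beta + rnorm(n, sd = 2))
X.new <- matrix(rnorm(1000 * p), 1000, p)         #large test set
y.new <- drop(X.new %*% beta + rnorm(1000, sd = 2))
ols <- lm(y ~ X)
mean((cbind(1, X.new) %*% coef(ols) - y.new)^2)   #least squares test MSE
cv.l <- cv.glmnet(X, y, alpha = 1)
mean((predict(cv.l, X.new, s = "lambda.min") - y.new)^2)  #lasso test MSE
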
#QUESTION 9 PART A)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.2
data("College")
attach(College)
set.seed(2)
train <- sample(1:nrow(College), nrow(College)/2)
test <- -train
y.test <- Apps[test]
#QUESTION 9 PART B)
lm.fit <- lm(Apps~., data=College, subset=train)
summary(lm.fit)
## 
## Call:
## lm(formula = Apps ~ ., data = College, subset = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4828.9  -415.2   -37.2   304.7  7403.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -973.47840  636.69721  -1.529 0.127131    
## PrivateYes  -433.46171  200.44873  -2.162 0.031223 *  
## Accept         1.64082    0.05013  32.730  < 2e-16 ***
## Enroll        -1.51619    0.25788  -5.879 9.20e-09 ***
## Top10perc     38.41423    8.38030   4.584 6.25e-06 ***
## Top25perc     -9.01858    6.81347  -1.324 0.186440    
## F.Undergrad    0.15574    0.04627   3.366 0.000844 ***
## P.Undergrad    0.05234    0.03858   1.357 0.175748    
## Outstate      -0.08629    0.02811  -3.069 0.002303 ** 
## Room.Board     0.15152    0.06932   2.186 0.029461 *  
## Books          0.13939    0.44432   0.314 0.753913    
## Personal       0.02124    0.08432   0.252 0.801260    
## PhD           -9.95235    7.51318  -1.325 0.186104    
## Terminal      -3.45012    8.22565  -0.419 0.675142    
## S.F.Ratio     45.97788   21.38242   2.150 0.032182 *  
## perc.alumni    3.93984    5.87940   0.670 0.503206    
## Expend         0.09754    0.02203   4.427 1.26e-05 ***
## Grad.Rate      6.33697    4.25658   1.489 0.137406    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1081 on 370 degrees of freedom
## Multiple R-squared:  0.9432, Adjusted R-squared:  0.9406 
## F-statistic: 361.3 on 17 and 370 DF,  p-value: < 2.2e-16
lm.pred <- predict(lm.fit, College[test,])
lm.error <- mean((lm.pred - y.test)^2)
lm.error
## [1] 1093608
#QUESTION 9 PART C)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
x <- model.matrix(Apps~., data=College)[, -1]   #drop the intercept column; glmnet fits its own
x.train <- x[train,]
y <- College$Apps
y.train <- y[train]
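#Note that glmnet standardizes the predictors internally by default
#(standardize = TRUE), so the ridge and lasso penalties treat all columns on a
#common scale.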

set.seed(7)
ridge.fit <- glmnet(x.train,y.train,alpha=0)
cv.ridge.fit <- cv.glmnet(x.train,y.train,alpha=0)
plot(cv.ridge.fit)

bestlam <- cv.ridge.fit$lambda.min
bestlam
## [1] 424.2704
#The best lambda chosen by cross-validation is 424.2704.
ridge.pred <- predict(ridge.fit, s=bestlam, newx=x[test,]) 
ridge.error <- mean((ridge.pred-y.test)^2)
ridge.error
## [1] 1138030
#The ridge test MSE is 1138030, slightly higher than the least squares test MSE of 1093608.
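#Equivalently, predictions can be taken straight from the cv.glmnet object;
#this should essentially reproduce ridge.error above (tiny differences can
#arise from interpolation along the lambda path):
ridge.pred2 <- predict(cv.ridge.fit, newx = x[test,], s = "lambda.min")
mean((ridge.pred2 - y.test)^2)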
#QUESTION 9 PART D)
set.seed(2)
lasso.fit <- glmnet(x.train,y.train,alpha=1)
cv.lasso.fit<- cv.glmnet(x.train,y.train,alpha=1)
bestlam.lasso <- cv.lasso.fit$lambda.min
bestlam.lasso
## [1] 15.97351
#The best lambda chosen by cross-validation for the lasso model is 15.97351.
lasso.pred <- predict(lasso.fit, s=bestlam.lasso, newx=x[test,]) 
lasso.error <- mean((lasso.pred-y.test)^2)
lasso.error
## [1] 1045922
#The test MSE for the lasso is 1045922, lower than both least squares and ridge.
lasso.c <- predict(lasso.fit, type="coefficients", s=bestlam.lasso)[1:18,]
length(lasso.c[lasso.c != 0])
#Counts the non-zero coefficient estimates (including the intercept) at the
#lambda chosen by cross-validation for the lasso.
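#To see which predictors survive the shrinkage, list the non-zero entries by
#name:
names(lasso.c[lasso.c != 0])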
library(pls)
## Warning: package 'pls' was built under R version 4.4.3
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
#QUESTION 9 PART E)
set.seed(1)
pcr.fit <- pcr(Apps~., data=College, subset=train, scale=TRUE, validation ="CV")
summary(pcr.fit)
## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4440     4463     2362     2385     2092     1835     1832
## adjCV         4440     4465     2359     2384     2066     1824     1823
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1829     1821     1753      1764      1771      1772      1784
## adjCV     1818     1813     1746      1757      1764      1764      1777
##        14 comps  15 comps  16 comps  17 comps
## CV         1778      1747      1401      1338
## adjCV      1773      1701      1385      1322
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X     31.4816    57.40    64.84    70.54    75.80    80.23    84.04    87.64
## Apps   0.1398    72.45    72.50    80.45    84.74    84.77    85.06    85.16
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       90.87     93.29     95.33     96.98     98.05     98.75     99.37
## Apps    85.93     86.16     86.21     86.32     86.40     86.61     92.49
##       16 comps  17 comps
## X        99.85    100.00
## Apps     93.65     94.32
validationplot(pcr.fit, val.type = "MSEP")

#The lowest cross-validation error occurs with all 17 components included, at which point PCR is equivalent to ordinary least squares.
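#The same conclusion can be read off programmatically from pls::MSEP (the
#first entry of the array is the intercept-only model, hence the subtraction):
msep.cv <- MSEP(pcr.fit, estimate = "CV")
which.min(msep.cv$val[1, 1, ]) - 1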
pcr.pred <- predict(pcr.fit,x[test,], ncomp=17) 
pcr.error <- mean((pcr.pred-y.test)^2)
pcr.error
## [1] 1093608
#The test MSE is 1093608, identical to least squares, as expected when all 17 components are kept.
#QUESTION 9 PART F)
set.seed(1)
pls.fit <- plsr(Apps~., data=College, subset=train, scale=TRUE, validation="CV")
summary(pls.fit)
## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4440     2153     1701     1711     1650     1498     1413
## adjCV         4440     2147     1677     1704     1615     1470     1392
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1378     1358     1356      1342      1338      1337      1337
## adjCV     1361     1342     1339      1326      1322      1321      1321
##        14 comps  15 comps  16 comps  17 comps
## CV         1338      1338      1338      1338
## adjCV      1322      1322      1322      1322
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       25.76    31.99    61.88    65.00    68.68    73.82    78.33    81.38
## Apps    77.92    87.80    88.38    92.06    93.59    93.92    94.01    94.09
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       83.24     85.59     87.76     90.49     92.69     94.27     97.03
## Apps    94.20     94.27     94.30     94.31     94.32     94.32     94.32
##       16 comps  17 comps
## X        99.21    100.00
## Apps     94.32     94.32
validationplot(pls.fit, val.type = "MSEP")

#The lowest cross-validation error happens with 12 components (13 components ties it, so the smaller model is preferred).
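#pls also offers an automated chooser; the one-sigma heuristic typically
#selects fewer components than the raw CV minimum (a sketch using
#pls::selectNcomp):
selectNcomp(pls.fit, method = "onesigma", plot = FALSE)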
pls.pred <- predict(pls.fit,x[test,],ncomp=12) 
pls.error <- mean((pls.pred-y.test)^2)
pls.error
## [1] 1085346
#The test MSE for PLS is 1085346, slightly lower than that of PCR and least squares.
#QUESTION 9 PART G)
errors <- c(lm.error, ridge.error, lasso.error, pcr.error, pls.error)
names(errors) <- c("lm", "ridge", "lasso", "pcr", "pls")
barplot(sort(errors))

print(sort(errors))
##   lasso     pls     pcr      lm   ridge 
## 1045922 1085346 1093608 1093608 1138030
#From the results, the lasso has the lowest test error, followed by PLS, then PCR and least squares (which tie, since PCR kept all 17 components), with ridge highest. All five errors are fairly close.
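#For scale, each test MSE can be converted to a test-set R^2; since the five
#MSEs are within about 10% of one another, the R^2 values come out very close:
1 - errors / mean((y.test - mean(y.test))^2)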
#PROBLEM 11
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
## 
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit
data(Boston)
attach(Boston)
set.seed(4)
train <- sample(1:nrow(Boston), nrow(Boston)*0.70)
test <- -train
y.test <- crim[test]
#PROBLEM 11 PART A)
lm.fit <- lm(crim~., data=Boston, subset=train)
summary(lm.fit)
## 
## Call:
## lm(formula = crim ~ ., data = Boston, subset = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.231 -2.307 -0.450  1.065 73.806 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  18.469057   8.580239   2.153 0.032058 *  
## zn            0.052939   0.024593   2.153 0.032055 *  
## indus        -0.049318   0.104361  -0.473 0.636824    
## chas         -0.504106   1.524087  -0.331 0.741029    
## nox         -11.086886   6.713267  -1.651 0.099559 .  
## rm            0.392079   0.699486   0.561 0.575490    
## age           0.004511   0.021219   0.213 0.831773    
## dis          -1.138266   0.342130  -3.327 0.000974 ***
## rad           0.633764   0.110299   5.746 2.03e-08 ***
## tax          -0.004751   0.006545  -0.726 0.468409    
## ptratio      -0.343971   0.236630  -1.454 0.146970    
## lstat         0.059951   0.093464   0.641 0.521672    
## medv         -0.252998   0.072429  -3.493 0.000540 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.651 on 341 degrees of freedom
## Multiple R-squared:  0.4509, Adjusted R-squared:  0.4316 
## F-statistic: 23.34 on 12 and 341 DF,  p-value: < 2.2e-16
#From the linear regression, the clearly significant variables are zn, dis, rad, and medv; nox is marginal (p = 0.0996) and the remaining predictors are not significant.
lm.pred <- predict(lm.fit, Boston[test,])
lm.error <- mean((lm.pred - y.test)^2)
lm.error
## [1] 36.50002
#The test MSE is 36.50002.
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.3
#Helper so predict() works on regsubsets objects: rebuild the model matrix
#from the formula stored in the call, then multiply by the coefficients of
#the id-th model.
predict.regsubsets = function(object, newdata, id, ...) {
    form = as.formula(object$call[[2]])
    mat = model.matrix(form, newdata)
    coefi = coef(object, id = id)
    mat[, names(coefi)] %*% coefi
}

k = 10
p = ncol(Boston) - 1
folds = sample(rep(1:k, length = nrow(Boston)))  #assign each row to one of k folds
cv.errors = matrix(NA, k, p)
for (i in 1:k) {
    #fit best subsets on all folds except i, then score fold i at every model size
    best.fit = regsubsets(crim ~ ., data = Boston[folds != i, ], nvmax = p)
    for (j in 1:p) {
        pred = predict(best.fit, Boston[folds == i, ], id = j)
        cv.errors[i, j] = mean((Boston$crim[folds == i] - pred)^2)
    }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
plot(mean.cv.errors, type = "b", xlab = "Number of variables", ylab = "CV error")

which.min(mean.cv.errors)
## [1] 11
mean.cv.errors[which.min(mean.cv.errors)]
## [1] 43.27333
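#To inspect the selected model, refit best subset selection on the full data
#and pull the coefficients at the CV-chosen size (a sketch; the object name
#best.full is illustrative):
best.full = regsubsets(crim ~ ., data = Boston, nvmax = p)
coef(best.full, which.min(mean.cv.errors))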
x = model.matrix(crim ~ . - 1, data = Boston)
y = Boston$crim
cv.lasso = cv.glmnet(x, y, type.measure = "mse")  #alpha defaults to 1, i.e., the lasso
plot(cv.lasso)

coef(cv.lasso)  #coefficients at the default s = "lambda.1se"
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                    s1
## (Intercept) 1.7799525
## zn          .        
## indus       .        
## chas        .        
## nox         .        
## rm          .        
## age         .        
## dis         .        
## rad         0.1920089
## tax         .        
## ptratio     .        
## lstat       .        
## medv        .
sqrt(cv.lasso$cvm[cv.lasso$lambda == cv.lasso$lambda.1se])  #CV RMSE for the lasso at lambda.1se
## [1] 7.74365
cv.ridge = cv.glmnet(x, y, type.measure = "mse", alpha = 0)
plot(cv.ridge)

coef(cv.ridge)
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept)  0.524520701
## zn          -0.003033435
## indus        0.030059812
## chas        -0.170770364
## nox          1.926593977
## rm          -0.144481578
## age          0.006340719
## dis         -0.096538533
## rad          0.046822404
## tax          0.002130833
## ptratio      0.072300494
## lstat        0.036566920
## medv        -0.024081652
sqrt(cv.ridge$cvm[cv.ridge$lambda == cv.ridge$lambda.1se])  #CV RMSE for ridge at lambda.1se
## [1] 7.714514
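#For reference, the figures above use the conservative lambda.1se; the CV
#error at lambda.min is lower by construction (a quick check, not in the
#original analysis):
sqrt(cv.ridge$cvm[cv.ridge$lambda == cv.ridge$lambda.min])
sqrt(cv.lasso$cvm[cv.lasso$lambda == cv.lasso$lambda.min])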
pcr.crime = pcr(crim ~ ., data = Boston, scale = TRUE, validation = "CV")  #10-fold CV on the full data, matching the glmnet fits above
summary(pcr.crime)
## Data:    X dimension: 506 12 
##  Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 12
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            8.61    7.291    7.288    6.893    6.867    6.813    6.794
## adjCV         8.61    7.287    7.284    6.886    6.863    6.809    6.789
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps
## CV       6.649    6.664    6.653     6.646     6.608     6.513
## adjCV    6.642    6.658    6.647     6.640     6.600     6.506
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       49.93    63.64    72.94    80.21    86.83    90.26    92.79    94.99
## crim    29.39    29.55    37.39    37.85    38.85    39.23    41.73    41.82
##       9 comps  10 comps  11 comps  12 comps
## X       96.78     98.33     99.48    100.00
## crim    42.12     42.43     43.58     44.93
validationplot(pcr.crime, val.type = "MSEP")

#Compare cross-validated error estimates for the Boston models. (The objects
#ridge.error, lasso.error, and pcr.error hold College test errors from
#Problem 9 and are not comparable, so the Boston CV estimates are taken from
#the fits above instead.)
ridge.cv <- cv.ridge$cvm[cv.ridge$lambda == cv.ridge$lambda.1se]
lasso.cv <- cv.lasso$cvm[cv.lasso$lambda == cv.lasso$lambda.1se]
pcr.cv <- min(pcr.crime$validation$PRESS) / nrow(Boston)
boston.errors <- c(best.subset = min(mean.cv.errors), ridge = ridge.cv,
                   lasso = lasso.cv, pcr = pcr.cv)
sort(boston.errors)
#From the printed values above: pcr is about 6.513^2 = 42.4, best.subset is
#43.3, ridge about 7.71^2 = 59.5, and lasso about 7.74^2 = 60.0.
#PROBLEM 11 PART B)
#Among the cross-validated estimates, PCR has the lowest error (about 42.4), narrowly ahead of best subset selection (about 43.3); ridge and the lasso, evaluated at the conservative lambda.1se, sit near 60. So PCR is selected as the best model. (The least squares test MSE of 36.5 comes from a single validation split and is not directly comparable to the CV figures.)

#PROBLEM 11 PART C)
#The chosen model (PCR) does involve all of the features in the data set, because each principal component is a linear combination of all twelve predictors. Even features that appear individually unimportant still contribute weight to the components and hence to the final model.
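#A quick illustration: even the first principal component places non-zero
#weight (loadings) on every one of the twelve predictors:
round(pcr.crime$loadings[, 1], 3)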