knitr::opts_chunk$set(echo = TRUE)

2. For parts (a) through (c), indicate which of i. through iv. is correct. Justify your answer.

##A) Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance. The lasso is a more restrictive model, so it can reduce overfitting and the variance of its predictions. As long as its added constraint does not introduce too much bias, it will outperform least squares, which may be fitting spurious parameters to noise.

##B) Less flexible and hence will give improved prediction accuracy when its increase in bias is less than its decrease in variance. Ridge regression is less restrictive than the lasso (it shrinks coefficients toward zero without setting any exactly to zero), but it is still more restrictive than least squares, so the same reasoning as in (a) applies.

##C) More flexible and hence will give improved prediction accuracy when its increase in variance is less than its decrease in bias. Non-linear methods are generally more flexible than least squares and perform better when the linearity assumption is badly violated. Their fits are more sensitive to the underlying data and therefore have higher variance, so to perform well they need a substantial reduction in bias in return.
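All three answers rest on the bias-variance decomposition of the expected test error. For a fit $\hat{f}$ evaluated at a test point $x_0$,

$$E\left[\left(y_0 - \hat{f}(x_0)\right)^2\right] = \mathrm{Var}\big(\hat{f}(x_0)\big) + \left[\mathrm{Bias}\big(\hat{f}(x_0)\big)\right]^2 + \mathrm{Var}(\epsilon),$$

so a method improves on least squares exactly when its change in the first two terms is a net reduction; the irreducible error $\mathrm{Var}(\epsilon)$ is unaffected by the choice of method.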

9. In this exercise, we will predict the number of applications received using the other variables in the College data set.

library(ISLR2)
##(a) Split the data set into a training set and a test set.

# Assign each row to the training or test set with a coin flip (roughly a
# 50/50 split). No seed is set, so the split and all results below change
# from run to run.
train=sample(c(TRUE,FALSE),nrow(College),rep=TRUE)
test=(!train)
College.train=College[train,,drop=F]
College.test=College[test,,drop=F]
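For reference, a reproducible alternative split (a sketch; the seed, the fixed 50% proportion, and the College.train2/College.test2 names are arbitrary illustrations, and all results below use the coin-flip split above):

set.seed(1)                                         # fix the RNG so the split is repeatable
train.idx=sample(nrow(College), nrow(College)/2)    # exactly half the rows for training
College.train2=College[train.idx, ]
College.test2=College[-train.idx, ]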

##(b) Fit a linear model using least squares on the training set, and report the test error obtained.

lm.fit=lm(Apps~.,data=College.train)
summary(lm.fit)
## 
## Call:
## lm(formula = Apps ~ ., data = College.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3649.9  -374.8     4.2   303.2  7814.9 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -581.71789  549.84960  -1.058  0.29078    
## PrivateYes  -407.58384  191.08269  -2.133  0.03359 *  
## Accept         1.75321    0.04888  35.866  < 2e-16 ***
## Enroll        -1.45165    0.25488  -5.695 2.55e-08 ***
## Top10perc     35.62531    8.16562   4.363 1.68e-05 ***
## Top25perc     -6.32476    6.25467  -1.011  0.31259    
## F.Undergrad    0.10884    0.04442   2.450  0.01475 *  
## P.Undergrad    0.04614    0.03953   1.167  0.24399    
## Outstate      -0.09430    0.02689  -3.507  0.00051 ***
## Room.Board     0.16669    0.06663   2.502  0.01280 *  
## Books         -0.02771    0.34164  -0.081  0.93540    
## Personal       0.03516    0.08060   0.436  0.66291    
## PhD           -6.11693    6.30790  -0.970  0.33283    
## Terminal      -6.62848    6.97225  -0.951  0.34239    
## S.F.Ratio     23.33220   16.88668   1.382  0.16792    
## perc.alumni    7.19484    5.89253   1.221  0.22287    
## Expend         0.07974    0.01786   4.464 1.07e-05 ***
## Grad.Rate      5.23686    4.01360   1.305  0.19280    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1007 on 363 degrees of freedom
## Multiple R-squared:  0.9431, Adjusted R-squared:  0.9405 
## F-statistic: 354.1 on 17 and 363 DF,  p-value: < 2.2e-16
pred=predict(lm.fit,College.test)
rss=sum((pred-College.test$Apps)^2)
tss=sum((College.test$Apps-mean(College.test$Apps))^2)
test.rsq=1-(rss/tss)
test.rsq
## [1] 0.9033761
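The exercise asks for the test error itself; test $R^2$ is reported throughout for interpretability, but the test MSE follows from the same predictions (a sketch; output omitted since it depends on the unseeded split):

test.mse=mean((pred-College.test$Apps)^2)  # mean squared prediction error on the test set
sqrt(test.mse)                             # RMSE, in units of applications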
##(c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## Loaded glmnet 4.1-7
# Standardize the training predictors, then apply the training means and
# SDs to the test predictors so both are on the same scale. (glmnet can
# also do this internally via its default standardize=TRUE.)
College.train.X=scale(model.matrix(Apps~.,data=College.train)[,-1],scale=T,center=T)
College.train.Y=College.train$Apps
College.test.X=scale(model.matrix(Apps~.,data=College.test)[,-1],
      attr(College.train.X,"scaled:center"),
      attr(College.train.X,"scaled:scale"))
College.test.Y=College.test$Apps
cv.out=cv.glmnet(College.train.X,College.train.Y,alpha=0)
bestlam=cv.out$lambda.min
bestlam
## [1] 394.015
ridge.mod=glmnet(College.train.X,College.train.Y,alpha=0,lambda=bestlam) # alpha=0 fits ridge
pred=predict(ridge.mod,College.test.X,s=bestlam)
rss=sum((pred-College.test$Apps)^2)
tss=sum((College.test$Apps-mean(College.test$Apps))^2)
test.rsq=1-(rss/tss)
test.rsq
## [1] 0.9093068
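To see how strongly the CV-chosen $\lambda$ shrank the fit, the ridge coefficients (on the standardized-predictor scale) can be printed directly (a sketch; output depends on the split):

round(coef(ridge.mod)[,1],3)  # intercept plus shrunken coefficients at lambda=bestlam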
##(d) Fit a lasso model on the training set, with λ chosen by crossvalidation. Report the test error obtained, along with the number of non-zero coefficient estimates.

cv.out=cv.glmnet(College.train.X,College.train.Y,alpha=1)
bestlam=cv.out$lambda.min
bestlam
## [1] 2.10274
lasso.mod=glmnet(College.train.X,College.train.Y,alpha=1,lambda=bestlam)
pred=predict(lasso.mod,College.test.X,s=bestlam)
rss=sum((pred-College.test$Apps)^2)
tss=sum((College.test$Apps-mean(College.test$Apps))^2)
test.rsq=1-(rss/tss)
test.rsq
## [1] 0.9038741
#Number of coefficients equal to 0
sum(coef(lasso.mod)[,1]==0)
## [1] 0
names(coef(lasso.mod)[, 1][coef(lasso.mod)[, 1] == 0])
## character(0)
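At lambda.min the lasso drove no coefficient exactly to zero here. The one-standard-error rule usually selects a larger $\lambda$ and hence a sparser model; lambda.1se is a standard component of a cv.glmnet object (a sketch; output depends on the split):

lam.1se=cv.out$lambda.1se
lasso.1se=glmnet(College.train.X,College.train.Y,alpha=1,lambda=lam.1se)
sum(coef(lasso.1se)[,1]==0)  # number of coefficients set exactly to zero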
##(e) Fit a PCR model on the training set, with M chosen by crossvalidation. Report the test error obtained, along with the value of M selected by cross-validation.

library(pls)
## Warning: package 'pls' was built under R version 4.2.3
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
set.seed(1)
pcr.fit=pcr(Apps~.,data=College.train, scale=TRUE, validation="CV")
summary(pcr.fit) # CV RMSEP is lowest at M=17, i.e. all components, which reproduces least squares
## Data:    X dimension: 381 17 
##  Y dimension: 381 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4132     4155     2375     2382     2211     1935     1943
## adjCV         4132     4159     2368     2379     2167     1923     1933
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1917     1809     1793      1791      1785      1787      1805
## adjCV     1920     1790     1781      1782      1775      1778      1798
##        14 comps  15 comps  16 comps  17 comps
## CV         1804      1587      1256      1150
## adjCV      1819      1552      1241      1140
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X     30.9351    55.65    63.48    69.41    75.02    79.58    83.49    87.11
## Apps   0.3736    69.05    69.44    77.05    81.26    81.30    81.58    84.01
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       90.32     92.93     94.99     96.78     97.83     98.64     99.36
## Apps    84.02     84.52     84.57     84.57     84.58     85.24     92.55
##       16 comps  17 comps
## X        99.83    100.00
## Apps     93.65     94.31
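The RMSEP table can also be read graphically; validationplot from the pls package plots the cross-validated error against the number of components:

validationplot(pcr.fit, val.type="MSEP") # CV error as a function of M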
pred=predict(pcr.fit,College.test,ncomp=17)
rss=sum((pred-College.test$Apps)^2)
tss=sum((College.test$Apps-mean(College.test$Apps))^2)
test.rsq=1-(rss/tss)
test.rsq
## [1] 0.9033761
##(f) Fit a PLS model on the training set, with M chosen by crossvalidation. Report the test error obtained, along with the value of M selected by cross-validation.

library(pls)
set.seed(1)
pls.fit=plsr(Apps~.,data=College.train, scale=TRUE, validation="CV")
summary(pls.fit) # CV RMSEP essentially flattens out at 9 components (the minimum at 12 is within noise of 9)
## Data:    X dimension: 381 17 
##  Y dimension: 381 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4132     2160     1729     1691     1564     1359     1214
## adjCV         4132     2150     1699     1696     1536     1338     1201
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1185     1171     1159      1156      1152      1149      1151
## adjCV     1175     1162     1149      1146      1142      1139      1141
##        14 comps  15 comps  16 comps  17 comps
## CV         1150      1150      1150      1150
## adjCV      1140      1140      1140      1140
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       24.56    32.26    60.59    64.34    68.95    73.52    77.26    80.86
## Apps    75.63    86.27    87.39    91.30    93.25    93.91    94.01    94.10
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       82.48     85.75     87.78     89.92     93.06     94.23     97.13
## Apps    94.24     94.27     94.29     94.31     94.31     94.31     94.31
##       16 comps  17 comps
## X        98.40    100.00
## Apps     94.31     94.31
pred=predict(pls.fit,College.test,ncomp=9)
rss=sum((pred-College.test$Apps)^2)
tss=sum((College.test$Apps-mean(College.test$Apps))^2)
test.rsq=1-(rss/tss)
test.rsq
## [1] 0.9060727
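Rather than eyeballing the table, pls::selectNcomp implements formal selection rules; the one-sigma rule returns the smallest M whose CV error is within one standard error of the minimum (a sketch; output depends on the split):

selectNcomp(pls.fit, method="onesigma", plot=FALSE) # suggested number of components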
##(g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?

##All five approaches predict the number of applications quite accurately, and their test errors are very close: least squares and PCR with M=17 (which reproduces the least squares fit) both give a test R^2 of 0.9034, the lasso gives 0.9039, PLS with 9 components gives 0.9061, and ridge regression is marginally best at 0.9093, a spread of only about 0.006. PLS is notable for achieving essentially full least-squares accuracy while using only 9 of the 17 possible components. At the cross-validated $\lambda$ the lasso shrank the coefficients but set none of them exactly to zero, so it offered no variable selection here. In short, there is little practical difference among the five approaches on this split.

11. We will now try to predict per capita crime rate in the Boston data set.

##(a) Try out some of the regression methods explored in this chapter, such as best subset selection, the lasso, ridge regression, and PCR. Present and discuss results for the approaches that you consider.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
data("Boston")
# 70/30 train/test split (again no seed is set, so the split and the
# errors below vary from run to run)
train <- sample(1:nrow(Boston), nrow(Boston)*0.70)
test <- -train
y.test <- Boston$crim[test]

##Least Squares
lm.fit <- lm(crim~., data=Boston, subset=train)
summary(lm.fit)
## 
## Call:
## lm(formula = crim ~ ., data = Boston, subset = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.464 -2.385 -0.483  0.974 74.451 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  20.572687   9.296673   2.213 0.027568 *  
## zn            0.050093   0.023354   2.145 0.032665 *  
## indus        -0.047622   0.108818  -0.438 0.661932    
## chas         -0.652652   1.594414  -0.409 0.682550    
## nox         -13.622079   7.071091  -1.926 0.054882 .  
## rm            0.470542   0.752745   0.625 0.532324    
## age           0.003977   0.022766   0.175 0.861444    
## dis          -1.218832   0.365848  -3.332 0.000959 ***
## rad           0.610610   0.109344   5.584 4.81e-08 ***
## tax          -0.005257   0.006584  -0.798 0.425204    
## ptratio      -0.319125   0.237077  -1.346 0.179173    
## black        -0.004493   0.004680  -0.960 0.337716    
## lstat         0.096664   0.096140   1.005 0.315395    
## medv         -0.245278   0.078829  -3.112 0.002019 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.795 on 340 degrees of freedom
## Multiple R-squared:  0.4378, Adjusted R-squared:  0.4163 
## F-statistic: 20.37 on 13 and 340 DF,  p-value: < 2.2e-16
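The least squares validation error is needed for the comparison in part (b); a sketch (output omitted since it depends on the unseeded split):

lm.pred <- predict(lm.fit, Boston[test, ])
lm.mse <- mean((lm.pred - y.test)^2) # validation-set MSE for least squares
lm.mse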
##Best Subset Selection

library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
best_subset <- regsubsets(crim ~ ., data=Boston, subset=train, nvmax=13)
best_subset_summary <- summary(best_subset)

# Pick the best model using validation set approach
test_matrix <- model.matrix(crim~., data=Boston[test,])

val.errors <- rep(NA,13)
for(i in 1:13){
 coefi <- coef(best_subset,id=i)
 pred <- test_matrix[,names(coefi)]%*%coefi
 val.errors[i] <- mean((Boston$crim[test]-pred)^2) 
}

which.min(val.errors)
## [1] 12
coef(best_subset, which.min(val.errors))
##   (Intercept)            zn         indus          chas           nox 
##  20.472011282   0.049521661  -0.046296575  -0.639677973 -13.343568597 
##            rm           dis           rad           tax       ptratio 
##   0.500239621  -1.233831693   0.609954300  -0.005259068  -0.317914057 
##         black         lstat          medv 
##  -0.004444226   0.101162410  -0.245665204
# Refit best subset selection on the entire data set and extract the
# 12-variable model chosen by the validation set approach, for more
# accurate coefficient estimates.

best_subset_full <- regsubsets(crim ~., data=Boston, nvmax = 13)
coef(best_subset_full, which.min(val.errors))
##   (Intercept)            zn         indus          chas           nox 
##  16.985713928   0.044673247  -0.063848469  -0.744367726 -10.202169211 
##            rm           dis           rad           tax       ptratio 
##   0.439588002  -0.993556631   0.587660185  -0.003767546  -0.269948860 
##         black         lstat          medv 
##  -0.007518904   0.128120290  -0.198877768
# Choose the model using cross-validation and the best subset method

predict.regsubsets <- function (object, newdata, id,...){
 form<-as.formula(object$call [[2]])
 mat<-model.matrix(form,newdata)
 coefi<-coef(object, id=id)
 xvars<-names(coefi)
 mat[,xvars]%*%coefi 
}
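Because regsubsets has no built-in predict method, this helper rebuilds the model matrix from the stored formula; naming it predict.regsubsets lets S3 dispatch route the plain predict() calls in the CV loop below to it. For example (the 5-variable model here is an arbitrary illustration; output depends on the split):

head(predict(best_subset, Boston[test, ], id=5)) # test-set predictions from the 5-variable model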

k <- 10
set.seed(2)
folds <- sample(1:k,nrow(Boston), replace=TRUE)
cv.errors <- matrix(NA,k,13, dimnames=list(NULL, paste(1:13)))

for(j in 1:k){
  best.fit=regsubsets(crim~., data=Boston[folds!=j, ], nvmax=13)
  for(i in 1:13) {
    pred=predict(best.fit,Boston[folds==j,],id=i)
    cv.errors[j,i]=mean((Boston$crim[folds==j]-pred)^2)
  }
}

mean.cv.errors <- apply(cv.errors,2,mean) 
mean.cv.errors
##        1        2        3        4        5        6        7        8 
## 44.45496 42.50923 44.07822 44.55039 44.07080 43.85521 43.25223 43.11403 
##        9       10       11       12       13 
## 42.69129 42.56528 42.83967 42.88256 42.92730
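Reading the table off directly, the CV estimate is minimized by the 2-variable model, although the errors are nearly flat from about 9 variables onward:

which.min(mean.cv.errors)
## 2 
## 2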
# CV favors the 2-variable model, but the error curve is nearly flat; for
# comparison with the validation-set choice, refit on the full data and
# examine the 12-variable model.
best_subset_cv <- regsubsets(crim~., data=Boston, nvmax=13)
coef(best_subset_cv, 12)
##   (Intercept)            zn         indus          chas           nox 
##  16.985713928   0.044673247  -0.063848469  -0.744367726 -10.202169211 
##            rm           dis           rad           tax       ptratio 
##   0.439588002  -0.993556631   0.587660185  -0.003767546  -0.269948860 
##         black         lstat          medv 
##  -0.007518904   0.128120290  -0.198877768
##Backward Stepwise
bwd <- regsubsets(crim~., data=Boston, subset=train, nvmax=13, method="backward")

test_matrix <- model.matrix(crim~., data=Boston[test,])

val.errors <- rep(NA,13)
for(i in 1:13){
 coefi <- coef(bwd,id=i)
 pred <- test_matrix[,names(coefi)]%*%coefi
 val.errors[i] <- mean((Boston$crim[test]-pred)^2) 
}

which.min(val.errors)
## [1] 12
# which.min picks the 12-variable model, but the more parsimonious
# 8-variable model (the one used in the discussion below) is reported here.
bwd_mse <- val.errors[8]
bwd_mse
## [1] 32.0661
# Refit on the full data set for final coefficient estimates of the 8-variable model
bwd_full <- regsubsets(crim ~., data=Boston, nvmax = 13)
coef(bwd_full, 8)
##   (Intercept)            zn           nox           dis           rad 
##  19.683127801   0.043293393 -12.753707757  -0.918318253   0.532616533 
##       ptratio         black         lstat          medv 
##  -0.310540942  -0.007922426   0.110173124  -0.174207166
##Lasso Regression


x <- model.matrix(crim~., Boston)[, -1]
y <- Boston$crim

lasso.fit <- glmnet(x[train, ], y[train], alpha=1)
cv.lasso.fit <- cv.glmnet(x[train, ], y[train], alpha=1)
plot(cv.lasso.fit)

bestlam.lasso <- cv.lasso.fit$lambda.min
bestlam.lasso
## [1] 0.02254785
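The lasso's validation MSE was not computed above; completing the comparison (a sketch; output omitted since it depends on the unseeded split):

lasso.pred <- predict(lasso.fit, s=bestlam.lasso, newx=x[test, ])
lasso.error <- mean((lasso.pred - y.test)^2) # validation-set MSE for the lasso
lasso.error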
##Ridge Regression

ridge.fit <- glmnet(x[train, ], y[train], alpha=0)
cv.ridge.fit <- cv.glmnet(x[train, ], y[train], alpha=0)
plot(cv.ridge.fit)

bestlam.ridge <- cv.ridge.fit$lambda.min
bestlam.ridge
## [1] 0.5456868
ridge.pred <- predict(ridge.fit, s=bestlam.ridge, newx=x[test,]) 
ridge.error <- mean((ridge.pred-y[test])^2)
ridge.error
## [1] 31.01025
##(b) Propose a model (or set of models) that seem to perform well on this data set, and justify your answer. Make sure that you are evaluating model performance using validation set error, crossvalidation, or some other reasonable alternative, as opposed to using training error.

##Among the models whose validation errors were computed, ridge regression (validation MSE of about 31.0 at the cross-validated $\lambda$) and the 8-variable backward stepwise model (validation MSE of about 32.1) perform best. The 8-variable subset model is a reasonable choice: its error is close to the best observed while it is considerably simpler than the full 13-predictor fit.

##(c) Does your chosen model involve all of the features in the data set? Why or why not?

##No, it contains 8 of the 13 predictors: zn, nox, dis, rad, ptratio, black, lstat, and medv. The excluded variables (indus, chas, rm, age, and tax) add little explanatory power once the others are included, so dropping them reduces variance without a meaningful increase in bias.