This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
###2
###a
Answer iii. The Lasso solution is a trade off between a small increase in bias and a reduction in variance. The solution can aslo reduce coefficient estimates.
###b
Answer iii. Similar to Lasso, the redge regression can reduce coefficient estimates, therefore reducing variance with higher bias.
###c
Answer ii. Unlike least squares, non-linear models are more flexible and have less bias.
###9
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.1.3
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.1.3
## Loading required package: Matrix
## Loaded glmnet 4.1-7
head(College)
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Agnes Scott College 510 63 12960 5450 450
## Alaska Pacific University 249 869 7560 4120 800
## Albertson College 678 41 13500 3335 500
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Agnes Scott College 875 92 97 7.7 37 19016
## Alaska Pacific University 1500 76 72 11.9 2 10922
## Albertson College 675 67 73 9.4 11 9727
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
###a
set.seed(1)
index<-sample(1:nrow(College), 0.5*nrow(College))
college_training <- College[index, ]
college_test <- College[-index, ]
###b
ols_fit<-lm(Apps ~., data=college_training)
summary(ols_fit)
##
## Call:
## lm(formula = Apps ~ ., data = college_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5741.2 -479.5 15.3 359.6 7258.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.902e+02 6.381e+02 -1.238 0.216410
## PrivateYes -3.070e+02 2.006e+02 -1.531 0.126736
## Accept 1.779e+00 5.420e-02 32.830 < 2e-16 ***
## Enroll -1.470e+00 3.115e-01 -4.720 3.35e-06 ***
## Top10perc 6.673e+01 8.310e+00 8.030 1.31e-14 ***
## Top25perc -2.231e+01 6.533e+00 -3.415 0.000708 ***
## F.Undergrad 9.269e-02 5.529e-02 1.676 0.094538 .
## P.Undergrad 9.397e-03 5.493e-02 0.171 0.864275
## Outstate -1.084e-01 2.700e-02 -4.014 7.22e-05 ***
## Room.Board 2.115e-01 7.224e-02 2.928 0.003622 **
## Books 2.912e-01 3.985e-01 0.731 0.465399
## Personal 6.133e-03 8.803e-02 0.070 0.944497
## PhD -1.548e+01 6.681e+00 -2.316 0.021082 *
## Terminal 6.415e+00 7.290e+00 0.880 0.379470
## S.F.Ratio 2.283e+01 2.047e+01 1.115 0.265526
## perc.alumni 1.134e+00 6.083e+00 0.186 0.852274
## Expend 4.857e-02 1.619e-02 2.999 0.002890 **
## Grad.Rate 7.490e+00 4.397e+00 1.703 0.089324 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1083 on 370 degrees of freedom
## Multiple R-squared: 0.9389, Adjusted R-squared: 0.9361
## F-statistic: 334.3 on 17 and 370 DF, p-value: < 2.2e-16
ols_pred<-predict(ols_fit, college_test)
ols_err<-mean((college_test$Apps-ols_pred)^2)
ols_err
## [1] 1135758
###c
train_mat <- model.matrix(Apps ~., data = college_training)[, -1]
test_mat <- model.matrix(Apps ~., data = college_test)[, -1]
grid <- 10^seq(10, -5, length = 1000)
ridge_fit <- cv.glmnet(train_mat, college_train$Apps, alpha = 0, lambda = grid, thresh = 1e-12)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'drop': object 'college_train' not found
ridge_bestlam <- ridge_fit$lambda.min
## Error in eval(expr, envir, enclos): object 'ridge_fit' not found
ridge_bestlam
## Error in eval(expr, envir, enclos): object 'ridge_bestlam' not found
ridge_pred <- predict(ridge_fit, newx = test_mat, s = ridge_bestlam)
## Error in predict(ridge_fit, newx = test_mat, s = ridge_bestlam): object 'ridge_fit' not found
ridge_err <- mean((college_test$Apps - ridge_pred)^2)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'mean': object 'ridge_pred' not found
ridge_err
## Error in eval(expr, envir, enclos): object 'ridge_err' not found
###d
lasso_fit <- cv.glmnet(train_mat, college_train$Apps, alpha = 1, lambda = grid, thresh = 1e-12)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'drop': object 'college_train' not found
lasso_bestlam <- lasso_fit$lambda.min
## Error in eval(expr, envir, enclos): object 'lasso_fit' not found
lasso_bestlam
## Error in eval(expr, envir, enclos): object 'lasso_bestlam' not found
lasso_pred <- predict(lasso_fit, newx = test_mat, s = lasso_bestlam)
## Error in predict(lasso_fit, newx = test_mat, s = lasso_bestlam): object 'lasso_fit' not found
lasso_err <- mean((college_test$Apps - lasso_pred)^2)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'mean': object 'lasso_pred' not found
lasso_err
## Error in eval(expr, envir, enclos): object 'lasso_err' not found
###e
Error=FALSE
library(pls)
## Warning: package 'pls' was built under R version 4.1.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
pcr_fit <- pcr(Apps ~., data = college_training, scale = TRUE, validation = "CV")
validationplot(pcr_fit, val.type = "MSEP")
pcr_pred <- predict(pcr_fit, college_test, ncomp = 9)
pcr_err <- mean((college_test$Apps - pcr_pred)^2)
pcr_err
## [1] 1583520
###f
pls_fit <- plsr(Apps ~., data = college_train, scale = TRUE, validation = "CV")
## Error in is.data.frame(data): object 'college_train' not found
validationplot(pls_fit, val.type = "MSEP")
## Error in MSEP(object = pls_fit): object 'pls_fit' not found
pls_pred <- predict(pls_fit, college_test, ncomp = 5)
## Error in predict(pls_fit, college_test, ncomp = 5): object 'pls_fit' not found
pls_err <- mean((college_test$Apps - pls_pred)^2)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'mean': object 'pls_pred' not found
pls_err
## Error in eval(expr, envir, enclos): object 'pls_err' not found
###g
###a
library(leaps)
## Warning: package 'leaps' was built under R version 4.1.3
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
set.seed(1)
attach(Boston)
predict.regsubsets = function(object, newdata, id, ...)
{form = as.formula(object$call[[2]])
mat = model.matrix(form, newdata)
coefi = coef(object, id = id)
mat[, names(coefi)] %*% coefi}
k = 10
p = ncol(Boston) - 1
folds = sample(rep(1:k, length = nrow(Boston)))
cv.errors = matrix(NA, k, p)
for (i in 1:k) {
best.fit = regsubsets(crim ~ ., data = Boston[folds != i, ], nvmax = p)
for (j in 1:p) {
pred = predict(best.fit, Boston[folds == i, ], id = j)
cv.errors[i, j] = mean((Boston$crim[folds == i] - pred)^2)}}
mean.cv.errors <- apply(cv.errors, 2, mean)
plot(mean.cv.errors, type = "b", xlab = "Number of variables", ylab = "CV error")
which.min(mean.cv.errors)
## [1] 9
mean.cv.errors[which.min(mean.cv.errors)]
## [1] 42.81453
x = model.matrix(crim ~ . - 1, data = Boston)
y = Boston$crim
cv.lasso = cv.glmnet(x, y, type.measure = "mse")
plot(cv.lasso)
coef(cv.lasso)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 2.176491
## zn .
## indus .
## chas .
## nox .
## rm .
## age .
## dis .
## rad 0.150484
## tax .
## ptratio .
## black .
## lstat .
## medv .
sqrt(cv.lasso$cvm[cv.lasso$lambda == cv.lasso$lambda.1se])
## [1] 7.921353
x = model.matrix(crim ~ . - 1, data = Boston)
y = Boston$crim
cv.ridge = cv.glmnet(x, y, type.measure = "mse", alpha = 0)
plot(cv.ridge)
coef(cv.ridge)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 1.523899542
## zn -0.002949852
## indus 0.029276741
## chas -0.166526007
## nox 1.874769665
## rm -0.142852604
## age 0.006207995
## dis -0.094547258
## rad 0.045932737
## tax 0.002086668
## ptratio 0.071258052
## black -0.002605281
## lstat 0.035745604
## medv -0.023480540
sqrt(cv.ridge$cvm[cv.ridge$lambda == cv.ridge$lambda.1se])
## [1] 7.669133
pcr.crime = pcr(crim ~ ., data = Boston, scale = TRUE, validation = "CV")
summary(pcr.crime)
## Data: X dimension: 506 13
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.61 7.175 7.180 6.724 6.731 6.727 6.727
## adjCV 8.61 7.174 7.179 6.721 6.725 6.724 6.724
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.722 6.614 6.618 6.607 6.598 6.553 6.488
## adjCV 6.718 6.609 6.613 6.602 6.592 6.546 6.481
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.70 60.36 69.67 76.45 82.99 88.00 91.14 93.45
## crim 30.69 30.87 39.27 39.61 39.61 39.86 40.14 42.47
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.40 97.04 98.46 99.52 100.0
## crim 42.55 42.78 43.04 44.13 45.4
###b
The cross-validation error with the reported MSE of 42.81 had the best selection model that performed well on this data.
###c
The Model I’ve chosen has the lowest MSE and a limited number of predictors. Through having a low number of predictors and and the lowest MSE, this model is able to have a reduced variance and good accuracy.