set.seed(1)
library(ISLR)
trainid <- sample(c(TRUE, FALSE), nrow(College), replace = T, prob = c(.7, .3))
testid <- !trainid
train <- College[trainid,]
test <- College[testid,]
dim(College)
## [1] 777 18
dim(train)
## [1] 545 18
dim(test)
## [1] 232 18
lm <- lm(Apps ~ ., data = train)
summary(lm)
##
## Call:
## lm(formula = Apps ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2913.6 -410.4 -57.3 295.6 6913.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.850e+02 4.541e+02 -1.288 0.19818
## PrivateYes -6.669e+02 1.580e+02 -4.222 2.85e-05 ***
## Accept 1.237e+00 5.892e-02 20.994 < 2e-16 ***
## Enroll -1.943e-01 2.089e-01 -0.930 0.35260
## Top10perc 4.691e+01 6.474e+00 7.247 1.53e-12 ***
## Top25perc -1.447e+01 5.029e+00 -2.876 0.00418 **
## F.Undergrad 7.494e-02 3.610e-02 2.076 0.03839 *
## P.Undergrad -2.268e-03 4.153e-02 -0.055 0.95648
## Outstate -2.849e-02 2.165e-02 -1.316 0.18876
## Room.Board 1.523e-01 5.618e-02 2.711 0.00692 **
## Books 4.950e-02 2.629e-01 0.188 0.85072
## Personal 4.800e-02 6.914e-02 0.694 0.48784
## PhD -8.823e+00 5.500e+00 -1.604 0.10929
## Terminal -2.546e-01 5.981e+00 -0.043 0.96606
## S.F.Ratio 5.011e+00 1.439e+01 0.348 0.72790
## perc.alumni -5.429e+00 4.679e+00 -1.160 0.24645
## Expend 6.563e-02 1.381e-02 4.751 2.62e-06 ***
## Grad.Rate 8.967e+00 3.279e+00 2.735 0.00646 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 973.4 on 527 degrees of freedom
## Multiple R-squared: 0.9272, Adjusted R-squared: 0.9248
## F-statistic: 394.7 on 17 and 527 DF, p-value: < 2.2e-16
preds <- predict(lm, test[, -2])
mean((test$Apps - preds)^2)
## [1] 1799366
The test MSE for the least squares fit is 1,799,366.
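Since the same hold-out calculation is repeated for every model below, a small helper can make the comparisons less error-prone. This is only a convenience sketch; the name test_mse is not part of the original code.
# Hypothetical helper: mean squared error on a hold-out set.
test_mse <- function(actual, predicted) mean((actual - predicted)^2)
# Equivalent to the calculation above: test_mse(test$Apps, preds)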
set.seed(1)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.0-2
lambdas <- 10^seq(10, -2, length = 100)
x <- model.matrix(Apps ~., train)[,-1]
y <- train$Apps
cvridge <- cv.glmnet(x, y, alpha = 0, lambda = lambdas)
plot(cvridge)
lambdamin <- cvridge$lambda.min
ridge <- glmnet(x, y, alpha = 0, lambda = lambdamin)
testx <- model.matrix(Apps ~., test)[,-1]
pred <- predict(ridge, s = lambdamin, newx = testx)
mean((test$Apps-pred)^2)
## [1] 1843563
The test MSE for ridge regression is 1,843,563, slightly higher than the OLS test MSE.
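As a side note, predictions can also be taken directly from the cross-validation object instead of refitting at a single lambda; a sketch, assuming cvridge and testx from above are still in the workspace (values not shown here).
# Sketch: predict straight from the cv.glmnet object.
# lambda.1se gives the most regularized model within one standard error of the minimum.
predmin <- predict(cvridge, s = "lambda.min", newx = testx)
pred1se <- predict(cvridge, s = "lambda.1se", newx = testx)
mean((test$Apps - predmin)^2)
mean((test$Apps - pred1se)^2)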
set.seed(1)
lambdas <- 10^seq(10, -2, length = 100)
x <- model.matrix(Apps ~., train)[,-1]
y <- train$Apps
cvlasso <- cv.glmnet(x, y, alpha = 1, lambda = lambdas)
plot(cvlasso)
lambdamin <- cvlasso$lambda.min
lasso <- glmnet(x, y, alpha = 1, lambda = lambdamin)
predict(lasso, type = "coef", s = lambdamin)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -629.31754546
## PrivateYes -658.40809124
## Accept 1.20461877
## Enroll .
## Top10perc 43.20175107
## Top25perc -11.75159264
## F.Undergrad 0.05295635
## P.Undergrad .
## Outstate -0.02244043
## Room.Board 0.14549717
## Books 0.03425734
## Personal 0.04176773
## PhD -8.21769961
## Terminal -0.13983134
## S.F.Ratio 2.69567690
## perc.alumni -5.69358140
## Expend 0.06437940
## Grad.Rate 8.37907539
testx <- model.matrix(Apps ~., test)[,-1]
pred <- predict(lasso, s = lambdamin, newx = testx)
mean((test$Apps-pred)^2)
## [1] 1867426
The test MSE is 1,867,426, and 15 of the 17 predictors have non-zero coefficients (excluding the intercept).
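The number of retained predictors can also be read off programmatically; a small sketch using the lasso fit from above (value not shown here).
# Count how many predictors the lasso keeps (non-zero coefficients, intercept excluded).
lassocoefs <- as.vector(predict(lasso, type = "coef", s = lambdamin))
sum(lassocoefs[-1] != 0)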
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1)
pcr <- pcr(Apps ~., data = train, scale = TRUE, validation = "CV")
summary(pcr)
## Data: X dimension: 545 17
## Y dimension: 545 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3554 3507 1657 1648 1358 1236 1237
## adjCV 3554 3508 1655 1647 1338 1232 1235
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1232 1213 1155 1146 1151 1152 1151
## adjCV 1230 1234 1153 1144 1149 1150 1149
## 14 comps 15 comps 16 comps 17 comps
## CV 1151 1162 1030 1024
## adjCV 1149 1160 1027 1020
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 31.45 57.40 64.77 70.61 76.09 80.87 84.57 87.76
## Apps 4.66 78.56 78.91 86.10 88.35 88.35 88.49 88.49
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.87 93.28 95.38 97.10 98.18 98.94 99.45
## Apps 90.03 90.17 90.17 90.19 90.24 90.29 90.30
## 16 comps 17 comps
## X 99.82 100.00
## Apps 92.38 92.72
validationplot(pcr, val.type = "MSEP")
pred <- predict(pcr, test[, -2], ncomp = 17)
mean((test$Apps - pred)^2)
## [1] 1799366
Cross-validation selects M = 17, i.e. all of the components, so PCR is equivalent to the full least squares fit and the test MSE is identical to the OLS test MSE: 1,799,366.
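For comparison, the CV curve is already fairly flat after about 9 or 10 components, so a more parsimonious PCR fit could be evaluated as well; ncomp = 10 below is an illustrative choice, not a value selected in the original analysis (result not shown here).
# Sketch: test MSE of a lower-dimensional PCR fit, for comparison only.
pred10 <- predict(pcr, test[, -2], ncomp = 10)
mean((test$Apps - pred10)^2)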
set.seed(1)
pls <- plsr(Apps ~., data = train, scale = TRUE, validation = "CV")
summary(pls)
## Data: X dimension: 545 17
## Y dimension: 545 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3554 1498 1258 1134 1105 1077 1037
## adjCV 3554 1496 1262 1132 1102 1075 1033
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1031 1025 1025 1026 1026 1025 1024
## adjCV 1028 1022 1021 1022 1022 1021 1020
## 14 comps 15 comps 16 comps 17 comps
## CV 1024 1024 1024 1024
## adjCV 1020 1020 1020 1020
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 26.28 43.81 62.98 66.74 71.70 74.19 78.60 81.45
## Apps 82.73 87.96 90.38 91.15 91.74 92.54 92.62 92.66
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 83.77 86.09 87.81 91.28 93.22 94.18 96.23
## Apps 92.68 92.70 92.71 92.71 92.72 92.72 92.72
## 16 comps 17 comps
## X 97.90 100.00
## Apps 92.72 92.72
validationplot(pls, val.type = "MSEP")
pred <- predict(pls, test[, -2], ncomp = 17)
mean((test$Apps - pred)^2)
## [1] 1799366
Based on the adjusted CV error, M = 17 is chosen, so PLS also uses all of the components and the test MSE is again 1,799,366.
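The PLS cross-validation error is essentially flat after about 8 components, so a much smaller model could be used with little loss; ncomp = 8 below is an illustrative choice (result not shown here).
# Sketch: test MSE of a lower-dimensional PLS fit, for comparison only.
pred8 <- predict(pls, test[, -2], ncomp = 8)
mean((test$Apps - pred8)^2)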
Every model performed either worse than or equal to OLS, and the models did not differ widely from one another. We can predict the number of applications with a moderate level of accuracy: using the RMSE, we can expect predictions to miss the true values by about 1,341 applications on average, which is not bad considering that the number of applications ranges from 81 to 48,094.
sqrt(1799366)
## [1] 1341.404
summary(College$Apps)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 81 776 1558 3002 3624 48094
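Another way to quantify the accuracy is the fraction of variance in the test-set Apps explained by the predictions; a sketch using the OLS fit from above (value not shown here).
# Sketch: test-set R-squared for the OLS fit, i.e. 1 - RSS/TSS on the hold-out data.
olspred <- predict(lm, test)
1 - sum((test$Apps - olspred)^2) / sum((test$Apps - mean(test$Apps))^2)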
set.seed(1)
beta <- c(rep(0, 5), sample(c(1:100), size = 15, replace = T))
X <- matrix(runif(min = -100, max = 100, n = 20000), ncol = 20, nrow = 1000)
eps <- rnorm(1000, mean = 0, sd = 20)
Y <- X %*% beta
Y <- Y + eps
df <- as.data.frame(cbind(Y, X))
colnames(df)[1] <- "y"
set.seed(1)
trainid <- sample(1:nrow(df), 900)
train <- df[trainid,]
test <- df[-trainid,]
library(leaps)
regfit <- regsubsets(y ~., data = train, nvmax = 20)
regsummary <- summary(regfit)
trainmse <- regsummary$rss / nrow(train)
plot(1:20, trainmse, type = "l", col = "red", cex = 2, xlab = "# of Predictors", ylab = "Training MSE", main = "")
testmatrix <- model.matrix(y ~., data = test)
testmse <- rep(NA, 20)
for(i in 1:20){
newfit <- coef(regfit, id = i)
pred <- testmatrix[, names(newfit)] %*% newfit
testmse[i] <- mean((pred - test$y)^2)
}
plot(1:20, testmse, type = "l", col = "red", cex = 2, xlab = "# of Predictors", ylab = "Test MSE", main = "")
which.min(testmse)
## [1] 15
The test set MSE is minimized when the model contains 15 of the 20 predictors. This makes sense given how the data were generated: 15 features have a true (non-zero) relationship with y, and the other 5 do not.
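As a quick check, the best model should keep exactly the predictors with non-zero entries in beta; a sketch (values not shown here).
# Sketch: compare the selected variable names with the truly non-zero coefficients.
# Column j of X is named V(j+1) in df, since column 1 is y.
names(coef(regfit, id = which.min(testmse)))[-1]
paste0("V", which(beta != 0) + 1)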
display <- matrix(NA, nrow = 2, ncol = 20)
display[1,] <- c(rep(0, 5), coef(regfit, id = 15)[-1])
display[2,] <- beta
display
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 0 0 0 0 0 67.98894 39.01081 1.002864 34.00291 86.99526
## [2,] 0 0 0 0 0 68.00000 39.00000 1.000000 34.00000 87.00000
## [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18]
## [1,] 43.03866 13.99744 81.98576 58.9952 50.99791 97.01231 84.99103 21.02882
## [2,] 43.00000 14.00000 82.00000 59.0000 51.00000 97.00000 85.00000 21.00000
## [,19] [,20]
## [1,] 54.01 74.00394
## [2,] 54.00 74.00000
This model very accurately estimates the true values of the coefficients used when generating the data.
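The agreement can be summarized in a single number; a short sketch using the display matrix constructed above (value not shown here).
# Sketch: largest absolute deviation between estimated and true coefficients.
max(abs(display[1, ] - display[2, ]))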
sqcoef <- rep(NA, 20)
for(i in 1:20){
newfit <- coef(regfit, id = i)[-1]
nms <- gsub("V", "", names(newfit))
nms <- as.numeric(nms)
nms <- nms - 1
betas <- beta[nms]
sqdiff <- c()
for(j in 1:length(betas)){
sqdiff[j] <- (newfit[[j]] - betas[j])^2
}
sqcoef[i] <- sqrt(sum(sqdiff))
}
plot(1:20, sqcoef, type = "l", col = "red", cex = 2, xlab = "# of Predictors", ylab = "Coefficient error")
The coefficient error increases as the first few predictors are added, peaks at around 7 predictors, and then decreases until it levels out at 15 predictors. This is similar to the test MSE plot in that both reach a minimum at 15 predictors and level out afterwards; they differ in that the test MSE decreases monotonically before reaching 15 predictors, whereas the coefficient error first increases before reaching its minimum.
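For reference, the coefficient error can also be computed with the coefficients of excluded predictors counted as zero, which penalizes a model for dropping a truly relevant variable; a sketch of that variant (the resulting curve may differ from the one above).
# Sketch: sqrt(sum_j (beta_j - betahat_j)^2) over all 20 predictors,
# treating coefficients of excluded predictors as zero.
coeferr <- rep(NA, 20)
for(i in 1:20){
est <- rep(0, 20)
fit <- coef(regfit, id = i)[-1]
est[as.numeric(gsub("V", "", names(fit))) - 1] <- fit
coeferr[i] <- sqrt(sum((beta - est)^2))
}
plot(1:20, coeferr, type = "l", col = "blue", xlab = "# of Predictors", ylab = "Coefficient error (excluded terms as zero)")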
beta <- c(rep(1,5), rep(0,5))
trainX <- matrix(runif(1000, -100, 100), ncol = 10, nrow = 100)
trainy <- trainX %*% beta
traineps <- rnorm(100, 0, 10)
trainy <- trainy + traineps
traindf <- as.data.frame(cbind(trainy, trainX))
colnames(traindf)[1] <- "y"
beta <- c(rep(1,5), rep(0,5))
testX <- matrix(runif(100000, -100, 100), ncol = 10, nrow = 10000)
testy <- testX %*% beta
testeps <- rnorm(10000, 0, 10)
testy <- testy + testeps
testdf <- as.data.frame(cbind(testy, testX))
colnames(testdf)[1] <-"y"
set.seed(1)
lasso.fit <- cv.glmnet(trainX, trainy, alpha = 1)
bestlam <- lasso.fit$lambda.min
lasso.pred <- predict(lasso.fit, s = bestlam, newx = testX)
errlasso1 <- mean((testdf$y - lasso.pred)^2)
errlasso1
## [1] 116.9449
lasso.coef <- predict(lasso.fit, type = "coef", s = bestlam)
lasso.coef
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.716677441
## V1 0.961476035
## V2 0.962240044
## V3 0.962569792
## V4 0.986769237
## V5 0.972646167
## V6 -0.006209438
## V7 .
## V8 .
## V9 .
## V10 .
id <- which(lasso.coef != 0)[-1]
newtraindf <- traindf[,c(1,id)]
lmfit <- lm(y ~., data = newtraindf)
preds <- predict(lmfit, newdata = testdf[,-1])
ErrOLS1 <- mean((testdf$y - preds)^2)
ErrOLS1
## [1] 106.4691
set.seed(1)
errLasso <- rep(NA, 1000)
errOLS <- rep(NA, 1000)
for(i in 1:1000){
#newtrainset
beta <- c(rep(1,5), rep(0,5))
trainX <- matrix(runif(1000, -100, 100), ncol = 10, nrow = 100)
trainy <- trainX %*% beta
traineps <- rnorm(100, 0, 10)
trainy <- trainy + traineps
traindf <- as.data.frame(cbind(trainy, trainX))
colnames(traindf)[1] <- "y"
#lasso model
lasso.fit <- cv.glmnet(trainX, trainy, alpha = 1)
bestlam <- lasso.fit$lambda.min
lasso.pred <- predict(lasso.fit, s = bestlam, newx = testX)
errLasso[i] <- mean((testdf$y - lasso.pred)^2)
#OLS after lasso
lasso.coef <- predict(lasso.fit, type = "coef", s = bestlam)
id <- which(lasso.coef != 0)[-1]
newtraindf <- traindf[,c(1,id)]
lmfit <- lm(y ~., data = newtraindf)
preds <- predict(lmfit, newdata = testdf[,-1])
errOLS[i] <- mean((testdf$y - preds)^2)
}
mean(errLasso)
## [1] 108.2129
mean(errOLS)
## [1] 108.7736
Over the 1,000 simulated training sets, the average test error for the lasso is 108.2129 and the average test error for OLS after the lasso is 108.7736, so the two approaches perform essentially the same here.
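The two averages are very close; a quick paired comparison across the 1,000 simulated training sets gives a fuller picture (values not shown here; errLasso and errOLS from the loop above are assumed to still be in the workspace).
# Sketch: paired comparison of the per-run test errors.
mean(errLasso < errOLS) # fraction of runs where the lasso wins
summary(errLasso - errOLS) # distribution of the per-run difference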
set.seed(1)
stdev <- 2^seq(-2, 13, length = 10)
final <- data.frame(stdev = stdev, errLasso = rep(NA, 10), errOLS = rep(NA, 10))
for(j in 1:10){
errLasso <- rep(NA, 1000)
errOLS <- rep(NA, 1000)
for(i in 1:1000){
#newtrainset
beta <- c(rep(1,5), rep(0,5))
trainX <- matrix(runif(1000, -100, 100), ncol = 10, nrow = 100)
trainy <- trainX %*% beta
traineps <- rnorm(100, 0, stdev[j])
trainy <- trainy + traineps
traindf <- as.data.frame(cbind(trainy, trainX))
colnames(traindf)[1] <- "y"
#lasso model
lasso.fit <- cv.glmnet(trainX, trainy, alpha = 1)
bestlam <- lasso.fit$lambda.min
lasso.pred <- predict(lasso.fit, s = bestlam, newx = testX)
errLasso[i] <- mean((testdf$y - lasso.pred)^2)
#OLS after lasso
lasso.coef <- predict(lasso.fit, type = "coef", s = bestlam)
if(all(lasso.coef[2:11] == 0)){
preds <- lasso.coef[1]
}
else{
id <- which(lasso.coef != 0)[-1]
newtraindf <- traindf[,c(1,id)]
lmfit <- lm(y ~., data = newtraindf)
preds <- predict(lmfit, newdata = testdf[,-1])
}
errOLS[i] <- mean((testdf$y - preds)^2)
}
final[j, 2] <- mean(errLasso)
final[j, 3] <- mean(errOLS)
}
plot(stdev, final$errLasso, type = "l", col = "red", cex = 2, xlab = "Sigma", ylab = "Average Test MSE", main = "Variation in Test MSE as Noise Increases")
lines(stdev, final$errOLS, col = "blue", cex = 2)
legend(0, 1200000, legend = c("OLS", "Lasso"), col = c("blue", "red"), lty = c(1,1))
final
## stdev errLasso errOLS
## 1 0.2500000 115.1028 9.704953e+01
## 2 0.7937005 114.6176 9.708378e+01
## 3 2.5198421 109.1741 9.744753e+01
## 4 8.0000000 104.2308 1.046079e+02
## 5 25.3984168 169.0754 1.730754e+02
## 6 80.6349472 848.5173 8.852332e+02
## 7 256.0000000 7734.8472 8.723062e+03
## 8 812.7493386 30334.8163 5.171328e+04
## 9 2580.3183102 151720.3626 3.202814e+05
## 10 8192.0000000 1405369.9149 3.092121e+06
For small values of sigma, both methods have low test error, with OLS after the lasso performing slightly better. As sigma increases, the test errors of both models grow, and the lasso performs much better than OLS for large values of sigma.
This makes sense if we think of OLS as the more data-dependent method: it follows the training data more closely than the lasso, so it is effectively more flexible and has more degrees of freedom. As sigma increases, the signal-to-noise ratio falls, and the more flexible OLS fit is “led astray” by the noise in the training data. The lasso, on the other hand, applies shrinkage that decreases variance at the cost of some additional bias relative to OLS. The results above show that as the noise grows, the reduction in variance from shrinkage outweighs the increase in bias, so the lasso is better equipped to ignore the noise added to the training data.
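To put “much better” in numbers, the ratio of the two error columns in final can be inspected; a quick sketch (values not shown here).
# Sketch: how many times larger the OLS-after-lasso error is than the lasso error at each noise level.
round(final$errOLS / final$errLasso, 2)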