library(ISLR)
c <- College  # shadowing the name 'c' is legal but easy to misread
set.seed(1)
train <- sample(1:nrow(c), nrow(c) * 0.8)  # 80/20 train/test split
c.train <- c[train, ]
c.test <- c[-train, ]
library(dplyr)
library(glmnet)
library(pls)
c.lm <- lm(Apps ~ ., data = c.train)  # least-squares baseline
c.lm.pred <- predict(c.lm, c.test, type = 'response')
mean((c.lm.pred - c.test$Apps)^2)  # test MSE
## [1] 1075064
# Design matrices for glmnet. model.matrix() keeps the '(Intercept)' column;
# dropping it with [, -1] is the more common glmnet idiom, since glmnet fits
# its own intercept.
c.train.x <- model.matrix(Apps ~ ., data = c.train)
c.train.y <- c.train$Apps
c.test.x <- model.matrix(Apps ~ ., data = c.test)
c.test.y <- c.test$Apps
c.ridge.fit <- cv.glmnet(c.train.x, c.train.y, alpha = 0)  # alpha = 0: ridge penalty
bestlam.ridge <- c.ridge.fit$lambda.min
c.ridge.pred <- predict(c.ridge.fit, s = bestlam.ridge, newx = c.test.x)
mean((c.ridge.pred - c.test.y)^2)  # ridge test MSE
## [1] 1192843
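Besides lambda.min, cv.glmnet() also reports lambda.1se, the largest lambda within one standard error of the CV minimum; as a sketch (not run here), the more regularized choice can be scored the same way:

c.ridge.fit$lambda.1se  # one-standard-error choice of lambda
mean((predict(c.ridge.fit, s = "lambda.1se", newx = c.test.x) - c.test.y)^2)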
c.lasso.fit <- cv.glmnet(c.train.x, c.train.y, alpha = 1)  # alpha = 1: lasso penalty
bestlam.lasso <- c.lasso.fit$lambda.min
c.lasso.pred <- predict(c.lasso.fit, s = bestlam.lasso, newx = c.test.x)
mean((c.lasso.pred - c.test.y)^2)  # lasso test MSE
## [1] 1106108
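The lasso's advantage over ridge is sparsity, which the MSE alone doesn't show; the coefficients surviving at lambda.min can be inspected directly (not run here):

predict(c.lasso.fit, s = bestlam.lasso, type = "coefficients")  # '.' rows were zeroed out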
c.pcr.fit <- pcr(Apps ~ ., data = c.train, scale = TRUE, validation = "CV")  # PCR with 10-fold CV
summary(c.pcr.fit)
## Data: X dimension: 621 17
## Y dimension: 621 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4029 4022 2141 2153 2134 1690 1649
## adjCV 4029 4024 2139 2153 2234 1677 1644
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1652 1653 1573 1554 1560 1563 1571
## adjCV 1649 1651 1569 1550 1556 1559 1567
## 14 comps 15 comps 16 comps 17 comps
## CV 1565 1506 1172 1140
## adjCV 1561 1485 1165 1133
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 31.020 57.28 64.37 70.12 75.68 80.67 84.31
## Apps 1.269 72.40 72.40 72.40 83.85 84.35 84.35
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 87.64 90.60 92.98 95.08 96.88 97.97 98.76
## Apps 84.41 85.88 86.17 86.23 86.24 86.26 86.48
## 15 comps 16 comps 17 comps
## X 99.37 99.85 100.00
## Apps 91.09 93.12 93.44
# test MSE for every number of PCR components
c.xcol <- 1:17
mse_pcr_comps <- numeric(length(c.xcol))
for (i in c.xcol) {
  mse_pcr_comps[i] <- mean(
    (predict(c.pcr.fit, c.test, ncomp = i) - c.test$Apps)^2)
}
mse_pcr_comps
## [1] 9687842 3165942 3165726 3160487 1909429 1872626 1862757 1849413
## [9] 1541736 1553814 1580343 1595819 1624861 1670516 1519173 1111324
## [17] 1075064
# With all 17 components PCR reproduces the least-squares fit (test MSE
# 1075064, matching the lm model above), so 16 components is the largest
# choice that still reduces dimension.
c.pcr.pred <- predict(c.pcr.fit, c.test, ncomp = 16)
mean((c.pcr.pred - c.test$Apps)^2)
## [1] 1111324
c.pls.fit <- plsr(Apps ~ ., data = c.train, scale = TRUE, validation = "CV")  # PLS with 10-fold CV
summary(c.pls.fit)
## Data: X dimension: 621 17
## Y dimension: 621 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4029 1977 1574 1519 1418 1310 1224
## adjCV 4029 1972 1565 1512 1397 1278 1211
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1212 1208 1202 1203 1202 1203 1201
## adjCV 1201 1198 1192 1192 1191 1193 1191
## 14 comps 15 comps 16 comps 17 comps
## CV 1201 1201 1201 1201
## adjCV 1191 1191 1191 1191
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 26.22 35.30 62.44 64.74 66.69 71.46 75.92
## Apps 77.61 86.65 87.94 91.31 93.13 93.25 93.29
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 79.77 81.40 83.88 85.83 90.04 92.52 94.52
## Apps 93.34 93.39 93.41 93.43 93.43 93.44 93.44
## 15 comps 16 comps 17 comps
## X 97.07 98.41 100.00
## Apps 93.44 93.44 93.44
# test MSE for every number of PLS components
c.xcol <- 1:17
mse_pls_comps <- numeric(length(c.xcol))
for (i in c.xcol) {
  mse_pls_comps[i] <- mean(
    (predict(c.pls.fit, c.test, ncomp = i) - c.test$Apps)^2)
}
mse_pls_comps
## [1] 2534388 1607468 1439043 1699635 1159671 1110004 1089798 1087373
## [9] 1067816 1075376 1079022 1075884 1073874 1075599 1074828 1075177
## [17] 1075064
c.pls.pred <- predict(c.pls.fit, c.test, ncomp = 9)  # 9 components minimizes test MSE
mean((c.pls.pred - c.test$Apps)^2)
## [1] 1067816
I'm going with 9 components for PLS: its CV RMSEP (1202) is essentially tied with the minimum (1201), and it yields the lowest test MSE of any component count.

Across the five approaches, PLS gave the lowest test MSE (1067816) and ridge the worst (1192843).
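As a compact check on that ranking, the prediction vectors computed above can be collected into one table (a sketch, with a small mse() helper defined for the purpose; not run here):

mse <- function(pred, y) mean((pred - y)^2)  # test-MSE helper
data.frame(
  model = c("OLS", "Ridge", "Lasso", "PCR (16 comps)", "PLS (9 comps)"),
  test.mse = c(mse(c.lm.pred, c.test$Apps),
               mse(c.ridge.pred, c.test$Apps),
               mse(c.lasso.pred, c.test$Apps),
               mse(c.pcr.pred, c.test$Apps),
               mse(c.pls.pred, c.test$Apps)))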
library(ISLR)
library(MASS)
library(leaps)
library(glmnet)
library(pls)
library(data.table)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
b <- Boston
set.seed(1)
train <- sample(1:nrow(b), nrow(b) * 0.8)  # 80/20 train/test split
b.train <- b[train, ]
b.test <- b[-train, ]
b.lm <- lm(crim ~ ., data = b.train)  # least-squares baseline
b.lm.pred <- predict(b.lm, b.test, type = 'response')
mean((b.lm.pred - b.test$crim)^2)  # test MSE
## [1] 46.16186
b.fwd <- regsubsets(crim ~ ., data = b.train, nvmax = 13, method = 'forward')
b.fwd.sum <- summary(b.fwd)
# selection criteria for each model size
data.table(vars = 1:13, bic = b.fwd.sum$bic, adjr2 = b.fwd.sum$adjr2, cp = b.fwd.sum$cp)
## vars bic adjr2 cp
## 1: 1 -192.1275 0.3951584 37.346862
## 2: 2 -212.2210 0.4315769 11.991005
## 3: 3 -207.9455 0.4325851 12.243245
## 4: 4 -205.5555 0.4362253 10.610122
## 5: 5 -204.1189 0.4411589 8.064166
## 6: 6 -202.5955 0.4459269 5.654263
## 7: 7 -199.5954 0.4486391 4.725811
## 8: 8 -194.8321 0.4489346 5.524099
## 9: 9 -189.6212 0.4486159 6.758753
## 10: 10 -184.1034 0.4478743 8.291242
## 11: 11 -178.3383 0.4467894 10.063083
## 12: 12 -172.3763 0.4454286 12.025021
## 13: 13 -166.4008 0.4440423 14.000000
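BIC favors the 2-variable model and Cp roughly the 7-variable one, but none of these subsets is scored on the test set above. regsubsets() has no predict() method, so the usual helper from the ISLR labs can fill the gap (a sketch, not run here):

predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])  # recover the formula from the original call
  mat <- model.matrix(form, newdata)    # build the test design matrix
  coefi <- coef(object, id = id)        # coefficients of the id-variable model
  mat[, names(coefi)] %*% coefi
}
mean((predict(b.fwd, b.test, id = 2) - b.test$crim)^2)  # test MSE for BIC's pick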
# Design matrices for glmnet (intercept column kept, as above)
b.train.x <- model.matrix(crim ~ ., data = b.train)
b.train.y <- b.train$crim
b.test.x <- model.matrix(crim ~ ., data = b.test)
b.test.y <- b.test$crim
b.ridge.fit <- cv.glmnet(b.train.x, b.train.y, alpha = 0)  # ridge
bestlam.ridge <- b.ridge.fit$lambda.min
bestlam.ridge
## [1] 0.5904968
b.ridge.pred <- predict(b.ridge.fit, s = bestlam.ridge, newx = b.test.x)
mean((b.ridge.pred - b.test.y)^2)
## [1] 46.59933
b.lasso.fit <- cv.glmnet(b.train.x, b.train.y, alpha = 1)  # lasso
bestlam.lasso <- b.lasso.fit$lambda.min
bestlam.lasso
## [1] 0.2073347
b.lasso.pred <- predict(b.lasso.fit, s = bestlam.lasso, newx = b.test.x)
mean((b.lasso.pred - b.test.y)^2)
## [1] 47.75546
b.pcr.fit <- pcr(crim ~ ., data = b.train, scale = TRUE, validation = "CV")
summary(b.pcr.fit)
## Data: X dimension: 404 13
## Y dimension: 404 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.564 7.079 7.058 6.734 6.752 6.752 6.734
## adjCV 8.564 7.077 7.055 6.728 6.748 6.748 6.728
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.716 6.651 6.595 6.592 6.594 6.568 6.494
## adjCV 6.709 6.615 6.585 6.581 6.584 6.556 6.482
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 47.51 60.66 69.39 76.25 82.93 87.93 91.14
## crim 32.02 32.76 38.82 38.83 38.94 39.93 40.59
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## X 93.35 95.46 97.14 98.56 99.54 100.0
## crim 43.17 43.50 43.94 44.01 45.02 46.2
# test MSE for every number of PCR components
b.xcol <- 1:13
mse_pcr_comps <- numeric(length(b.xcol))
for (i in b.xcol) {
  mse_pcr_comps[i] <- mean(
    (predict(b.pcr.fit, b.test, ncomp = i) - b.test$crim)^2)
}
mse_pcr <- data.frame(ncomps = 1:13, mse = mse_pcr_comps)
mse_pcr[mse_pcr$mse == min(mse_pcr$mse), ]  # component count with the lowest test MSE
## ncomps mse
## 4 4 45.80178
b.pls.fit <- plsr(crim ~ ., data = b.train, scale = TRUE, validation = "CV")
summary(b.pls.fit)
## Data: X dimension: 404 13
## Y dimension: 404 1
## Fit method: kernelpls
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.564 6.963 6.661 6.608 6.585 6.570 6.563
## adjCV 8.564 6.959 6.653 6.598 6.571 6.555 6.547
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.546 6.541 6.537 6.537 6.538 6.538 6.538
## adjCV 6.531 6.527 6.523 6.523 6.524 6.524 6.524
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 47.18 56.43 64.42 71.71 75.87 78.58 83.30
## crim 35.13 42.23 44.20 45.28 45.71 46.03 46.12
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## X 86.48 88.16 93.37 96.45 98.21 100.0
## crim 46.17 46.20 46.20 46.20 46.20 46.2
# test MSE for every number of PLS components
b.xcol <- 1:13
mse_pls_comps <- numeric(length(b.xcol))
for (i in b.xcol) {
  mse_pls_comps[i] <- mean(
    (predict(b.pls.fit, b.test, ncomp = i) - b.test$crim)^2)
}
mse_pls <- data.frame(ncomps = 1:13, mse = mse_pls_comps)
mse_pls[mse_pls$mse == min(mse_pls$mse), ]  # component count with the lowest test MSE
## ncomps mse
## 10 10 46.12139
# test-set R^2: 1 - SSE/SST
rsq <- function(y, y_hat){
  sst <- sum((y - mean(y))^2)  # total sum of squares
  sse <- sum((y_hat - y)^2)    # residual sum of squares
  1 - sse / sst
}
rsq(b.test$crim, b.lm.pred)
## [1] 0.3976486
rsq(b.test$crim, b.ridge.pred)
## [1] 0.3919402
rsq(b.test$crim, b.lasso.pred)
## [1] 0.3768543
b.pcr.pred <- predict(b.pcr.fit, b.test, ncomp = 4)
rsq(b.test$crim, b.pcr.pred)
## [1] 0.4023472
b.pls.pred <- predict(b.pls.fit, b.test, ncomp = 10)
rsq(b.test$crim, b.pls.pred)
## [1] 0.3981767
PCR with ncomp = 4 gives the lowest test MSE (45.80178) and the highest test R^2 (0.4023472) of the models compared.
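For reference, the whole comparison can be collapsed into one table (a sketch reusing the prediction objects above; not run here):

b.results <- data.frame(
  model = c("OLS", "Ridge", "Lasso", "PCR (4 comps)", "PLS (10 comps)"),
  test.mse = c(mean((b.lm.pred - b.test$crim)^2),
               mean((b.ridge.pred - b.test.y)^2),
               mean((b.lasso.pred - b.test.y)^2),
               mean((b.pcr.pred - b.test$crim)^2),
               mean((b.pls.pred - b.test$crim)^2)),
  test.rsq = c(rsq(b.test$crim, b.lm.pred),
               rsq(b.test$crim, b.ridge.pred),
               rsq(b.test$crim, b.lasso.pred),
               rsq(b.test$crim, b.pcr.pred),
               rsq(b.test$crim, b.pls.pred)))
b.results[order(b.results$test.mse), ]  # best model first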
coefplot(b.pcr.fit, ncomp = 4)  # plot of the 4-component PCR coefficient estimates
coefficients(b.pcr.fit, ncomp = 4)
## , , 4 comps
##
## crim
## zn 0.3895991
## indus 0.6398953
## chas -0.5294830
## nox 0.5358185
## rm 0.1019216
## age 0.1285738
## dis -0.1370783
## rad 1.5246551
## tax 1.4667659
## ptratio 0.6460145
## black -1.2514183
## lstat 0.4646801
## medv -0.4148010