Question 2, 9, 11,

Q2)

A)

III is correct option. Increase lambda, variance is reduced significantly and bias is reduced marginally. MSE get reduced and accuracy also increases. When we compare ridge with Lasso we allow some coefficients to go to zero when lambda is large. And it also uses variables selection for better interpreting results.

B)

III is correct option. If lambda becomes zero the ridge accumulates OLS. Also when the shrinkage parameter for ridge regression increases the flexibility of the model decreases. In turn this results in significant reduction in the variance with a small increase in bias. Which in turn will decrease total MSE.

C)

II is correct option.

Q9)

A)

library(ISLR)

Test and training data set

college = College

set.seed(42)
train = sample(nrow(college), .8*(nrow(college)) )

college.train = college[train, ]
college.test = college[-train, ]

B)

college.lm = lm(Apps~ ., college.train)

college.lm.pred = predict(college.lm, newdata = college.test)

college.error = mean((college.lm.pred - college.test$Apps)^2)
college.error
## [1] 1941715

C)

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1
set.seed(42)
xtrain = model.matrix(Apps ~., college.train)[,-1]
ytrain = college.train$Apps
xtest = model.matrix(Apps~., college.test)[,-1]
ytest = college.test$Apps

college.ridge = cv.glmnet(xtrain, ytrain, alpha = 0)
plot(college.ridge)

bestlam=college.ridge$lambda.min
bestlam
## [1] 337.0816
ridge.pred=predict(college.ridge,s=bestlam,newx=xtest)
mean((ridge.pred-ytest)^2)
## [1] 3844468

D)

set.seed(42)

college.lasso = cv.glmnet(xtrain, ytrain, alpha = 1)
plot(college.lasso)

bestlam= college.lasso$lambda.min
bestlam
## [1] 8.74735
lasso.pred=predict(college.lasso,s=bestlam,newx=xtest)
mean((lasso.pred-ytest)^2)
## [1] 2053837
lasso.coef=predict(college.lasso,type="coefficients",s=bestlam)[1:18,]
lasso.coef[lasso.coef!=0]
##   (Intercept)    PrivateYes        Accept     Top10perc     Top25perc 
## -8.726150e+02 -5.462323e+02  1.227041e+00  4.201700e+01 -1.039634e+01 
##   F.Undergrad   P.Undergrad      Outstate    Room.Board      Personal 
##  5.288884e-02  2.995941e-02 -3.894791e-02  1.973311e-01 -2.143431e-04 
##           PhD      Terminal     S.F.Ratio   perc.alumni        Expend 
## -6.177096e+00 -5.522741e+00  1.635132e+01 -4.051310e+00  7.855491e-02 
##     Grad.Rate 
##  8.035061e+00

E)

library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
set.seed(42)
college.pcr = pcr(Apps~., data = college.train, scale = TRUE, validation = "CV")
summary(college.pcr)
## Data:    X dimension: 621 17 
##  Y dimension: 621 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3606     3497     1705     1715     1480     1264     1257
## adjCV         3606     3498     1704     1716     1466     1253     1255
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1229     1212     1187      1179      1182      1181      1183
## adjCV     1226     1204     1186      1176      1180      1178      1180
##        14 comps  15 comps  16 comps  17 comps
## CV         1183      1190      1055      1035
## adjCV      1180      1187      1052      1032
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X      31.872    57.63    64.85    70.51    75.83    80.81    84.41    87.82
## Apps    6.168    77.92    77.92    84.76    88.39    88.39    88.95    89.44
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       90.84     93.24     95.34     97.11     98.14     98.92     99.44
## Apps    89.66     89.91     89.97     90.07     90.08     90.12     90.13
##       16 comps  17 comps
## X        99.82    100.00
## Apps     92.34     92.71
validationplot(college.pcr,val.type="MSEP")

pcr.pred=predict(college.pcr,xtest,ncomp=9)
mean((pcr.pred-ytest)^2)
## [1] 5261564

F)

set.seed(42)
college.pls = plsr(Apps~., data = college.train, scale = TRUE, validation ="CV")
summary(college.pls)
## Data:    X dimension: 621 17 
##  Y dimension: 621 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3606     1546     1324     1154     1130     1096     1058
## adjCV         3606     1544     1324     1152     1125     1083     1051
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1041     1039     1037      1034      1036      1035      1035
## adjCV     1037     1036     1034      1031      1032      1031      1032
##        14 comps  15 comps  16 comps  17 comps
## CV         1035      1035      1035      1035
## adjCV      1032      1032      1032      1032
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       26.34    46.91    62.99    66.01    68.24    71.98    75.40    81.19
## Apps    82.05    87.06    90.33    91.21    92.20    92.54    92.63    92.64
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       83.51     85.23     87.18     88.80     91.34     93.31     97.11
## Apps    92.67     92.69     92.70     92.71     92.71     92.71     92.71
##       16 comps  17 comps
## X        99.34    100.00
## Apps     92.71     92.71
validationplot(college.pls,val.type="MSEP")

pls.pred=predict(college.pls,xtest,ncomp=9)
mean((pls.pred-ytest)^2)
## [1] 1960015

G)

There is a great variation in test errors. The best model we found is OLS and the worst is PCR.

Q11)

A)

library(MASS)
boston = Boston
set.seed(42)
train = sample(nrow(boston), .8*(nrow(boston)) )
boston.train = boston[train,]
boston.test = boston[-train,]

Ridge

set.seed(42)

xtrain = model.matrix(crim ~., boston.train)[,-1]
ytrain = boston.train$crim
xtest = model.matrix(crim~., boston.test)[,-1]
ytest = boston.test$crim

boston.ridge = cv.glmnet(xtrain, ytrain, alpha = 0)
plot(boston.ridge)

bestlam=boston.ridge$lambda.min
bestlam
## [1] 0.567925
ridge.pred=predict(boston.ridge,s=bestlam,newx=xtest)
mean((ridge.pred-ytest)^2)
## [1] 10.98377

Lasso

set.seed(42)
boston.lasso = cv.glmnet(xtrain, ytrain, alpha = 1)
plot(boston.lasso)

bestlam= boston.lasso$lambda.min
bestlam
## [1] 0.03102164
lasso.pred=predict(boston.lasso,s=bestlam,newx=xtest)
mean((lasso.pred-ytest)^2)
## [1] 12.76214
lasso.coef=predict(boston.lasso,type="coefficients",s=bestlam)[1:14,]
lasso.coef[lasso.coef!=0]
##   (Intercept)            zn         indus          chas           nox 
##  20.682539741   0.044316097  -0.076586514  -0.927983848 -12.483375461 
##            rm           dis           rad           tax       ptratio 
##   0.523101614  -1.105489998   0.614518358  -0.003255396  -0.299417050 
##         black         lstat          medv 
##  -0.010513309   0.110123592  -0.233892553

Pcr

set.seed(42)

boston.pcr = pcr(crim~., data = boston.train, scale = TRUE, validation = "CV")
summary(boston.pcr)
## Data:    X dimension: 404 13 
##  Y dimension: 404 1
## Fit method: svdpc
## Number of components considered: 13
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           9.315    7.846    7.831    7.382    7.388    7.401    7.431
## adjCV        9.315    7.844    7.829    7.376    7.379    7.396    7.424
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       7.419    7.297     7.35     7.352     7.358     7.279     7.174
## adjCV    7.411    7.288     7.34     7.340     7.346     7.265     7.159
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       46.62    59.73    68.81    75.84    82.71    87.78    90.98    93.28
## crim    29.46    29.80    37.86    38.12    38.12    38.21    38.88    40.94
##       9 comps  10 comps  11 comps  12 comps  13 comps
## X       95.31     97.02     98.44     99.53    100.00
## crim    40.94     41.14     41.30     43.29     44.89
validationplot(boston.pcr,val.type="MSEP")

pcr.pred=predict(boston.pcr,xtest,ncomp=8)
mean((pcr.pred-ytest)^2)
## [1] 10.49009

B)

To evaluate the model we have used validation set. PCR model gives the best score and error of 10.49. ### C) We observe that PCR model has 8 out of 13 predictors. We used validation plot to select the number of features, also observed that after 8 variables the graph increases and then finally decreases with all the variables.