require(faraway)
data(debt)
set.seed(190)
fulldf <- na.omit(debt)
testRows <- sample(nrow(fulldf),0.1*nrow(fulldf))
testdf <- fulldf[testRows,]
traindf <- fulldf[-testRows,]

Question 1

q1a: Fit a standard linear regression with the variable prodebt as the response and the other variables as predictors. Display the model summary.

ans: see model summary displayed below for q1a.

q1b: Which variables are significant at the 0.05 level? At the 0.01 level?

ans: manage, ccarduse, and locintrn are significant at both the 0.05 and 0.01 alpha levels (each p-value is below 0.01); no additional variables reach significance at 0.05, with agegp only marginal (p ≈ 0.064).

q1c: What are the 10-fold and leave one out cross-validation scores for this model?

ans: 10-fold MSE = \(0.4615591\). LOOCV MSE = \(0.4638267\)

q1d. What are the Mallow’s Cp, AIC, and BIC criterion values for this model?

ans: Cp = \(13.0000\). AIC = \(570.7717\). BIC = \(621.3555\)
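
For reference, the criteria being computed are the standard ones (\(n\) is the number of training rows, \(p\) the number of regression coefficients, \(\hat{L}\) the maximized likelihood, and \(k\) the number of estimated parameters):

\[
C_p = \frac{\mathrm{RSS}}{\hat{\sigma}^2} - n + 2p, \qquad
\mathrm{AIC} = -2\log\hat{L} + 2k, \qquad
\mathrm{BIC} = -2\log\hat{L} + k\log n.
\]

Because \(\hat{\sigma}^2\) is estimated from the full model itself, the full model's Cp reduces to exactly \(p = 13\) (12 predictors plus the intercept), which is the value in the output.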

set.seed(190)
#q1a and q1b
model1 <- glm(prodebt~.,data=traindf)
summary(model1)

Call:
glm(formula = prodebt ~ ., data = traindf)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.90782  -0.45705  -0.03132   0.38732   1.85735  

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.95258    0.34557  11.438  < 2e-16 ***
incomegp     0.05744    0.03648   1.574  0.11659    
house       -0.03541    0.07317  -0.484  0.62877    
children     0.03815    0.04227   0.902  0.36766    
singpar      0.03894    0.18318   0.213  0.83181    
agegp       -0.09529    0.05132  -1.857  0.06446 .  
bankacc      0.08293    0.13348   0.621  0.53493    
bsocacc     -0.10808    0.08977  -1.204  0.22968    
manage      -0.13626    0.04823  -2.825  0.00509 ** 
ccarduse     0.17313    0.05541   3.125  0.00198 ** 
cigbuy      -0.15338    0.09420  -1.628  0.10468    
xmasbuy      0.22044    0.12897   1.709  0.08861 .  
locintrn    -0.12527    0.04739  -2.643  0.00871 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.4455924)

    Null deviance: 143.88  on 273  degrees of freedom
Residual deviance: 116.30  on 261  degrees of freedom
AIC: 570.77

Number of Fisher Scoring iterations: 2
#q1c
library(boot)
# Run 10-fold cross-validation of the model1 on the data set
cv.model = cv.glm(data=traindf, glmfit=model1, K=10)
# Extract the MSE
cat('10-fold cross-validation MSE:', cv.model$delta[1])
10-fold cross-validation MSE: 0.4615591
# Run LOOCV
loocv.model = cv.glm(data=traindf, glmfit=model1, K=nrow(traindf))
# Extract the MSE
cat('\nLeave-one-out-cross-validation MSE:', loocv.model$delta[1])

Leave-one-out-cross-validation MSE: 0.4638267
#q1d
library(CombMSC)
s2 <- sigma(lm(prodebt~.,data=traindf)) #extracts residual standard error from lm
n = nrow(traindf)
c(Cp(model1,S2=(s2^2)), AIC(model1,k=2),AIC(model1,k=log(n)))
[1]  13.0000 570.7717 621.3555

Question 2

q2a. Compare all models using Mallow’s Cp. Display a table indicating the variables included in each model and the corresponding Mallow’s Cp value.

ans: see table below in output for q2a.

q2b. How many variables are in the model with the lowest Mallow’s Cp value? Which variables are they?

ans: \(7\): incomegp,agegp,manage,ccarduse,cigbuy,xmasbuy,locintrn.

set.seed(190)
#q2a
library(leaps)
out = leaps(traindf[,-c(13)], traindf$prodebt, method = "Cp")
cbind(as.matrix(out$which),out$Cp)
   1 2 3 4 5 6 7 8 9 A B C          
1  0 0 0 0 0 0 0 0 1 0 0 0 34.441083
1  1 0 0 0 0 0 0 0 0 0 0 0 37.145995
1  0 0 0 0 0 0 0 1 0 0 0 0 38.731359
1  0 0 0 0 1 0 0 0 0 0 0 0 42.660981
1  0 0 1 0 0 0 0 0 0 0 0 0 45.607165
1  0 0 0 0 0 0 0 0 0 0 0 1 46.957440
1  0 0 0 0 0 1 0 0 0 0 0 0 48.278428
1  0 0 0 0 0 0 0 0 0 0 1 0 48.354061
1  0 0 0 0 0 0 0 0 0 1 0 0 49.810097
1  0 0 0 0 0 0 1 0 0 0 0 0 51.455025
2  0 0 0 0 0 0 0 1 1 0 0 0 19.779750
2  1 0 0 0 0 0 0 1 0 0 0 0 24.960262
2  0 0 0 0 1 0 0 0 1 0 0 0 25.172463
2  0 0 1 0 0 0 0 0 1 0 0 0 28.251079
2  0 0 0 0 0 0 0 0 1 0 0 1 28.771002
2  1 0 0 0 0 0 0 0 1 0 0 0 30.180958
2  1 0 0 0 0 0 0 0 0 0 0 1 30.485454
2  1 0 0 0 1 0 0 0 0 0 0 0 32.499082
2  0 0 0 0 0 0 0 0 1 0 1 0 33.043153
2  0 0 0 0 0 0 1 0 1 0 0 0 33.207888
3  0 0 0 0 1 0 0 1 1 0 0 0 14.568827
3  1 0 0 0 0 0 0 1 1 0 0 0 16.095549
3  0 0 1 0 0 0 0 1 1 0 0 0 16.126847
3  0 0 0 0 0 0 0 1 1 0 0 1 16.879762
3  0 0 0 0 1 0 0 0 1 0 0 1 18.039589
3  0 0 0 0 0 0 0 1 1 0 1 0 18.291258
3  0 0 0 0 0 1 0 1 1 0 0 0 20.186256
3  0 0 0 0 0 0 0 1 1 1 0 0 20.355828
3  0 0 0 0 0 0 1 1 1 0 0 0 20.788126
3  0 1 0 0 0 0 0 1 1 0 0 0 20.913896
4  0 0 0 0 1 0 0 1 1 0 0 1 10.343902
4  1 0 0 0 0 0 0 1 1 0 0 1 11.938331
4  0 0 0 0 1 0 0 1 1 0 1 0 12.841669
4  1 0 0 0 1 0 0 1 1 0 0 0 12.947177
4  0 0 1 0 0 0 0 1 1 0 0 1 13.285237
4  1 0 1 0 0 0 0 1 1 0 0 0 13.469674
4  0 0 0 0 1 0 0 1 1 1 0 0 14.238375
4  0 0 1 0 1 0 0 1 1 0 0 0 14.457802
4  0 0 0 0 1 1 0 1 1 0 0 0 15.047039
4  0 0 0 0 0 0 0 1 1 0 1 1 15.122201
5  1 0 0 0 1 0 0 1 1 0 0 1  7.788808
5  0 0 0 0 1 0 0 1 1 0 1 1  8.277262
5  1 0 1 0 0 0 0 1 1 0 0 1  9.484143
5  0 0 0 0 1 0 0 1 1 1 0 1  9.826121
5  0 0 0 0 1 1 0 1 1 0 0 1 10.273730
5  0 0 1 0 1 0 0 1 1 0 0 1 10.563902
5  1 0 0 0 0 0 0 1 1 0 1 1 11.096022
5  0 0 0 0 1 0 1 1 1 0 0 1 11.591348
5  1 0 0 0 1 0 0 1 1 0 1 0 11.937732
5  1 1 0 0 0 0 0 1 1 0 0 1 12.078086
6  1 0 0 0 1 0 0 1 1 0 1 1  6.544859
6  0 0 0 0 1 0 0 1 1 1 1 1  7.302879
6  0 0 0 0 1 1 0 1 1 0 1 1  7.879094
6  1 0 0 0 1 0 0 1 1 1 0 1  8.124421
6  1 0 1 0 1 0 0 1 1 0 0 1  8.186881
6  1 0 0 0 1 0 1 1 1 0 0 1  8.388379
6  1 0 0 0 1 1 0 1 1 0 0 1  9.152069
6  0 0 1 0 1 0 0 1 1 0 1 1  9.445733
6  0 0 0 0 1 0 1 1 1 0 1 1  9.464968
6  1 1 0 0 1 0 0 1 1 0 0 1  9.635989
7  1 0 0 0 1 0 0 1 1 1 1 1  6.456914
7  1 0 0 0 1 0 1 1 1 0 1 1  7.137318
7  0 0 0 0 1 1 0 1 1 1 1 1  7.448470
7  1 0 0 0 1 1 0 1 1 0 1 1  7.606385
7  1 0 1 0 1 0 0 1 1 0 1 1  7.745966
7  0 0 0 0 1 0 1 1 1 1 1 1  7.905208
7  1 0 0 0 1 0 1 1 1 1 0 1  8.246065
7  1 0 1 0 1 0 0 1 1 1 0 1  8.304783
7  0 0 1 0 1 0 0 1 1 1 1 1  8.326722
7  1 0 0 1 1 0 0 1 1 0 1 1  8.444141
8  1 0 0 0 1 0 1 1 1 1 1 1  6.508902
8  1 0 1 0 1 0 0 1 1 1 1 1  7.534036
8  1 0 0 0 1 1 0 1 1 1 1 1  7.694545
8  1 1 0 0 1 0 0 1 1 1 1 1  8.211996
8  1 0 1 0 1 0 1 1 1 0 1 1  8.353004
8  1 0 0 1 1 0 0 1 1 1 1 1  8.401659
 [ reached getOption("max.print") -- omitted 35 rows ]
best.model = which(out$Cp==min(out$Cp))
cbind(as.matrix(out$which),out$Cp)[best.model,]
       1        2        3        4        5        6        7        8        9        A        B        C          
1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 6.456914 
#2b
#predictors 1,5,8,9,10,11,12
names(traindf) #1,5,8,9,10,11,12 = incomegp, agegp, manage, ccarduse, cigbuy, xmasbuy, locintrn
 [1] "incomegp" "house"    "children" "singpar" 
 [5] "agegp"    "bankacc"  "bsocacc"  "manage"  
 [9] "ccarduse" "cigbuy"   "xmasbuy"  "locintrn"
[13] "prodebt" 

Question 3

q3(a): Perform forward stepwise regression using AIC. Allow the minimum model to be the model with only an intercept. Display the model summary of your final model.

ans: see model summary displayed below for q3a.

3(b): How many variables are in your final model? Which are significant at the 0.05 level?

ans: \(8\) variables are in the final model: incomegp, agegp, bsocacc, manage, ccarduse, cigbuy, xmasbuy, and locintrn. Of these, agegp, manage, ccarduse, and locintrn are significant at alpha = \(0.05\).

3(c): Perform backward stepwise selection. Do the variables in your final model differ from forward stepwise regression? If so, how?

ans: No; backward stepwise selection arrives at the same eight-variable model, so the two procedures select identical variables.

3(d): Compare the adjusted R2, Mallow’s Cp, and AICs of the full model, the model found in Question 2, and the model found using forward selection. Which model is preferred based on these criteria and why?

ans: Both reduced models beat the full model on every criterion: the Question 2 model has the lowest Mallow's Cp (\(6.46\), vs. \(6.51\) for stepwise and \(13.00\) for the full model), the stepwise model has the lowest AIC (\(564.35\) vs. \(570.77\)), and the adjusted \(R^2\) computed from the deviances above slightly favors the reduced fit (\(\approx 0.162\) vs. \(\approx 0.155\)). A reduced model is therefore preferred: the extra predictors in the full model are not significant, and the criteria penalize them without a compensating gain in fit.

set.seed(190)
#q3a and b Create the minimum model that only includes the intercept
m0 <- glm(prodebt~1, data=traindf)
m2 <- glm(prodebt~.,data=traindf)
fstep <- step(m0,scope=list(lower=m0,upper=m2),direction="forward")
Start:  AIC=570.77
prodebt ~ incomegp + house + children + singpar + agegp + bankacc + 
    bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn
summary(fstep)

Call:
glm(formula = prodebt ~ incomegp + house + children + singpar + 
    agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + 
    xmasbuy + locintrn, data = traindf)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.90782  -0.45705  -0.03132   0.38732   1.85735  

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.95258    0.34557  11.438  < 2e-16 ***
incomegp     0.05744    0.03648   1.574  0.11659    
house       -0.03541    0.07317  -0.484  0.62877    
children     0.03815    0.04227   0.902  0.36766    
singpar      0.03894    0.18318   0.213  0.83181    
agegp       -0.09529    0.05132  -1.857  0.06446 .  
bankacc      0.08293    0.13348   0.621  0.53493    
bsocacc     -0.10808    0.08977  -1.204  0.22968    
manage      -0.13626    0.04823  -2.825  0.00509 ** 
ccarduse     0.17313    0.05541   3.125  0.00198 ** 
cigbuy      -0.15338    0.09420  -1.628  0.10468    
xmasbuy      0.22044    0.12897   1.709  0.08861 .  
locintrn    -0.12527    0.04739  -2.643  0.00871 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.4455924)

    Null deviance: 143.88  on 273  degrees of freedom
Residual deviance: 116.30  on 261  degrees of freedom
AIC: 570.77

Number of Fisher Scoring iterations: 2
#q3c
bstep <- step(m2,scope=list(lower=m0,upper=m2),direction="backward")
Start:  AIC=570.77
prodebt ~ incomegp + house + children + singpar + agegp + bankacc + 
    bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn

           Df Deviance    AIC
- singpar   1   116.32 568.82
- house     1   116.40 569.02
- bankacc   1   116.47 569.18
- children  1   116.66 569.63
- bsocacc   1   116.95 570.29
<none>          116.30 570.77
- incomegp  1   117.40 571.36
- cigbuy    1   117.48 571.54
- xmasbuy   1   117.60 571.82
- agegp     1   117.84 572.37
- locintrn  1   119.41 576.01
- manage    1   119.86 577.03
- ccarduse  1   120.65 578.83

Step:  AIC=568.82
prodebt ~ incomegp + house + children + agegp + bankacc + bsocacc + 
    manage + ccarduse + cigbuy + xmasbuy + locintrn

           Df Deviance    AIC
- house     1   116.42 567.06
- bankacc   1   116.49 567.21
- children  1   116.71 567.74
- bsocacc   1   116.96 568.32
<none>          116.32 568.82
- incomegp  1   117.42 569.40
- cigbuy    1   117.53 569.64
- xmasbuy   1   117.62 569.87
- agegp     1   117.91 570.53
- locintrn  1   119.46 574.12
- manage    1   119.94 575.23
- ccarduse  1   120.74 577.04

Step:  AIC=567.06
prodebt ~ incomegp + children + agegp + bankacc + bsocacc + manage + 
    ccarduse + cigbuy + xmasbuy + locintrn

           Df Deviance    AIC
- bankacc   1   116.56 565.39
- children  1   116.83 566.01
- bsocacc   1   117.10 566.65
<none>          116.42 567.06
- incomegp  1   117.46 567.48
- cigbuy    1   117.56 567.73
- xmasbuy   1   117.75 568.17
- agegp     1   118.58 570.08
- locintrn  1   119.51 572.22
- manage    1   120.23 573.89
- ccarduse  1   120.82 575.21

Step:  AIC=565.39
prodebt ~ incomegp + children + agegp + bsocacc + manage + ccarduse + 
    cigbuy + xmasbuy + locintrn

           Df Deviance    AIC
- children  1   116.97 564.35
<none>          116.56 565.39
- bsocacc   1   117.43 565.42
- cigbuy    1   117.79 566.27
- xmasbuy   1   117.82 566.34
- incomegp  1   118.05 566.86
- agegp     1   118.66 568.28
- locintrn  1   119.57 570.36
- manage    1   120.25 571.92
- ccarduse  1   121.37 574.47

Step:  AIC=564.35
prodebt ~ incomegp + agegp + bsocacc + manage + ccarduse + cigbuy + 
    xmasbuy + locintrn

           Df Deviance    AIC
<none>          116.97 564.35
- bsocacc   1   117.84 564.38
- cigbuy    1   118.14 565.08
- incomegp  1   118.48 565.87
- xmasbuy   1   118.64 566.22
- locintrn  1   120.12 569.62
- agegp     1   120.35 570.15
- manage    1   120.78 571.12
- ccarduse  1   121.72 573.25
summary(bstep)

Call:
glm(formula = prodebt ~ incomegp + agegp + bsocacc + manage + 
    ccarduse + cigbuy + xmasbuy + locintrn, data = traindf)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.91126  -0.43001  -0.01056   0.39193   1.84378  

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  4.03169    0.31682  12.726  < 2e-16 ***
incomegp     0.06067    0.03276   1.852  0.06519 .  
agegp       -0.12121    0.04383  -2.766  0.00608 ** 
bsocacc     -0.12175    0.08682  -1.402  0.16199    
manage      -0.13824    0.04708  -2.936  0.00362 ** 
ccarduse     0.17746    0.05413   3.279  0.00118 ** 
cigbuy      -0.15044    0.09236  -1.629  0.10452    
xmasbuy      0.24250    0.12485   1.942  0.05316 .  
locintrn    -0.12503    0.04685  -2.669  0.00808 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.4414037)

    Null deviance: 143.88  on 273  degrees of freedom
Residual deviance: 116.97  on 265  degrees of freedom
AIC: 564.35

Number of Fisher Scoring iterations: 2
#q3d
c(Cp(fstep,S2=s2^2), AIC(fstep,k=2),AIC(fstep,k=log(n)));c(Cp(bstep,S2=s2^2), AIC(bstep,k=2),AIC(bstep,k=log(n)))
[1]  13.0000 570.7717 621.3555
[1]   6.508902 564.351239 600.482521

Question 4

q4(a): Perform ridge regression. Find the lambda value that minimizes the generalized CV score.

ans: See output below. Using lm.ridge over lambda <- seq(0,110,by=0.25), the GCV score is minimized at lambda = \(103.25\). For comparison, 10-fold CV with cv.glmnet gives lambda.min = \(1.038303\); the two values are not directly comparable because glmnet uses a different penalty scaling and CV criterion.

q4(b): List the value of coefficients at the optimum lambda value.

ans: See output below for q4b for list of coefficients and their values.

q4(c): How many variables were selected? Give an explanation for this number.

ans: \(12\); all of the original predictors. Ridge regression does not remove predictors: the \(L_2\) penalty shrinks coefficients toward zero without setting any of them exactly to zero, so every predictor stays in the model, merely down-weighted according to how little it contributes.
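
This follows from the form of the ridge objective (written here in standard notation, for a response vector \(y\) and design matrix \(X\)):

\[
\hat{\beta}^{\mathrm{ridge}} = \arg\min_{\beta} \; \lVert y - X\beta \rVert_2^2 + \lambda \lVert \beta \rVert_2^2.
\]

The \(L_2\) penalty is smooth at zero, so increasing \(\lambda\) shrinks every coefficient toward zero but essentially never sets one exactly to zero, which is why all 12 predictors survive.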

set.seed(190)
#q4a
library(glmnet)
y=traindf$prodebt
X <-  traindf
X$prodebt <- NULL
X <-  as.matrix(X)
# Optimize lambda using cross validation
ml.cv = cv.glmnet(X, y, family='poisson', alpha=0, type='deviance', nfolds=10)
cat("CV Optimized lambda:", ml.cv$lambda.min)
CV Optimized lambda: 1.038303
# Fit the ridge model (alpha=0) over a grid of lambda values
ml = glmnet(X, y, family='poisson', alpha=0, nlambda=50)
options(repr.plot.width=8, repr.plot.height=5)
plot(ml,xvar="lambda",label=TRUE,lwd=2)
abline(v=log(ml.cv$lambda.min),col='black',lty = 2,lwd=2)
abline(h=0,col='black',lty = 1,lwd=2)

library(MASS)
lambda <- seq(0,110,by=0.25)
out <- lm.ridge(prodebt~.,data=traindf,lambda=lambda)
#round(out$GCV,5)
which(out$GCV == min(out$GCV))
103.25 
   414 
#q4b and c
coef(out)[which(out$GCV == min(out$GCV)),]
               incomegp       house    children 
 3.70246800  0.05111209 -0.03111007  0.03811592 
    singpar       agegp     bankacc     bsocacc 
 0.04230821 -0.06981990  0.08561161 -0.07643149 
     manage    ccarduse      cigbuy     xmasbuy 
-0.10443288  0.12765092 -0.10780053  0.16060213 
   locintrn 
-0.08965927 
ridge.pred <- as.matrix(cbind(const=1, testdf[,-13]))%*% coef(out)[which(out$GCV == min(out$GCV)),] # test-set predictions from the GCV-optimal ridge fit

Question 5

q5(a): Perform Lasso regression. Find the optimal lambda value using 10 fold CV.

ans: CV Optimized lambda: \(0.0140487313\)

q5(b): Plot the regression coefficient path

ans: See plot below for regression coefficient path for q5b.

q5(c): How many variables were selected? Which are they?

ans: \(11\) of the \(12\) predictors; singpar is zeroed out at the optimal lambda. The selected variables are incomegp, house, children, agegp, bankacc, bsocacc, manage, ccarduse, cigbuy, xmasbuy, and locintrn.
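
The lasso differs from ridge only in the penalty, replacing \(L_2\) with \(L_1\) (standard form):

\[
\hat{\beta}^{\mathrm{lasso}} = \arg\min_{\beta} \; \lVert y - X\beta \rVert_2^2 + \lambda \lVert \beta \rVert_1.
\]

The \(L_1\) penalty has a corner at zero, so a sufficiently large \(\lambda\) sets weak coefficients exactly to zero; here singpar is the only predictor weak enough to be zeroed out at the optimal \(\lambda\).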

set.seed(190)
#q5
library(glmnet)
y=traindf$prodebt
X <-  traindf
X$prodebt <- NULL
X <-  as.matrix(X)
# Optimize lambda using cross validation
ml.cv = cv.glmnet(X, y, family='poisson', alpha=1, type='deviance', nfolds=10)
cat("CV Optimized lambda:", ml.cv$lambda.min)
CV Optimized lambda: 0.01404873
# Create the lasso model with multiple lambda values
ml = glmnet(X, y, family='poisson', alpha=1, nlambda=50)
options(repr.plot.width=8, repr.plot.height=5)
plot(ml,xvar="lambda",label=TRUE,lwd=2)
abline(v=log(ml.cv$lambda.min),col='black',lty = 2,lwd=2)
abline(h=0,col='black',lty = 1,lwd=2)

mlasso = glmnet(X, y, family='poisson', alpha=1, lambda=ml.cv$lambda.min)
coef(mlasso)
13 x 1 sparse Matrix of class "dgCMatrix"
                      s0
(Intercept)  1.362052051
incomegp     0.016385118
house       -0.003460197
children     0.010039001
singpar      .          
agegp       -0.029500852
bankacc      0.015659929
bsocacc     -0.025808910
manage      -0.039178403
ccarduse     0.050923309
cigbuy      -0.036468084
xmasbuy      0.058142176
locintrn    -0.033721278
opt.lambda <- ml.cv$lambda.min
# Test-set predictions from the lasso at the optimal lambda (reused in Question 7);
# type="response" maps back from the log link
lasso.predict <- predict(mlasso, newx=as.matrix(testdf[,-13]), type="response")

Question 6

q6(a): Apply Elastic Net. Give equal weight to both penalties and use 100 values for lambda. Find the optimal lambda value using 10 fold CV.

ans: CV optimized lambda: \(0.02809746\) (see code block below).

q6(b): List the coefficient values at the optimal lambda. How many variables were selected? How do these variables compare to those from Lasso in Question 5?

ans: see code block below for q6b with the list of coefficient values at the optimal lambda. \(11\) of the \(12\) predictors were selected; as with the lasso in Question 5, singpar is dropped and the remaining coefficients are nearly identical to the lasso values.
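
With equal weight on the two penalties, the elastic net objective (in glmnet's parametrization, with \(\alpha = 0.5\)) is

\[
\hat{\beta}^{\mathrm{EN}} = \arg\min_{\beta} \; \lVert y - X\beta \rVert_2^2 + \lambda \left( \frac{1-\alpha}{2} \lVert \beta \rVert_2^2 + \alpha \lVert \beta \rVert_1 \right),
\]

so it retains the lasso's ability to zero out coefficients (again dropping singpar) while adding ridge-style shrinkage to the rest.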

set.seed(190)
#q6a
require(glmnet)
# Optimize lambda using cross validation
ml.cv = cv.glmnet(X, y, family='poisson', alpha=.5, type='deviance', nfolds=10)
cat("CV Optimized lambda:", ml.cv$lambda.min)
CV Optimized lambda: 0.02809746
# Create the en model with multiple lambda values
ml = glmnet(X, y, family='poisson', alpha=.5, nlambda=100)
options(repr.plot.width=8, repr.plot.height=5)
plot(ml,xvar="lambda",label=TRUE,lwd=2)
abline(v=log(ml.cv$lambda.min),col='black',lty = 2,lwd=2)
abline(h=0,col='black',lty = 1,lwd=2)

#q6b
coef(ml,s=ml.cv$lambda.min)
13 x 1 sparse Matrix of class "dgCMatrix"
                       1
(Intercept)  1.360947263
incomegp     0.016373251
house       -0.003512776
children     0.010068786
singpar      .          
agegp       -0.029332001
bankacc      0.015766629
bsocacc     -0.025669176
manage      -0.039044367
ccarduse     0.050693639
cigbuy      -0.036288955
xmasbuy      0.057848959
locintrn    -0.033567200

Question 7

q7(a): Predict prodebt for each of the rows in the test data using the full model, lowest Mallow’s Cp model, and the models found using forward stepwise regression, ridge regression, lasso regression, and elastic net.

ans: see model predictions in code below.

q7(b): Compare the predictions using mean squared prediction error. Which model performed the best?

ans: All models performed very similarly, with mean squared prediction errors between \(0.311\) and \(0.333\). The full linear model performed best, with the lowest MSPE (\(0.3111\)), though its margin over the other models is small.

q7(c): Provide a table listing each method described in Question 7a and the variables selected by each method (see Unit 5.2.3 for examples). Which variables were selected consistently?

ans: The full linear model and ridge regression keep all \(12\) predictors; lasso and elastic net keep \(11\) (both drop singpar); the lowest Mallow's Cp model keeps \(7\) and forward stepwise keeps \(8\). Seven predictors, incomegp, agegp, manage, ccarduse, cigbuy, xmasbuy, and locintrn, were selected by every method.

Method              No. Vars  Variables
Linear Regression   12        incomegp + house + children + singpar + agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn
Lowest Mallows Cp    7        incomegp + agegp + manage + ccarduse + cigbuy + xmasbuy + locintrn
Forward Stepwise     8        incomegp + agegp + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn
LASSO               11        incomegp + house + children + agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn
Ridge Regression    12        incomegp + house + children + singpar + agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn
Elastic Net         11        incomegp + house + children + agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn
#compare all models
full.pred <- predict(model1, testdf)
summary(full.pred)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.755   2.941   3.198   3.170   3.345   3.762 
mallows.model <- glm(prodebt~incomegp + agegp + manage + ccarduse + cigbuy + xmasbuy + locintrn, data=traindf)
mallows.pred <- predict(mallows.model, testdf[,-13])
summary(mallows.pred)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.749   2.946   3.189   3.164   3.296   3.697 
step.model <- glm(formula = prodebt ~ incomegp + agegp + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn, data = traindf)
step.predict <- predict(step.model,testdf)
summary(step.predict)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.769   2.941   3.230   3.178   3.345   3.741 
# lasso.predict was computed in Question 5 at the optimal lambda
summary(lasso.predict)
       1        
 Min.   :2.785  
 1st Qu.:3.019  
 Median :3.208  
 Mean   :3.179  
 3rd Qu.:3.322  
 Max.   :3.684  
#elastic net: predict from the Question 6 fit at its CV-optimal lambda
net.predict <- predict(ml, as.matrix(testdf[,-13]), s=ml.cv$lambda.min, type="response")
summary(net.predict)
       1        
 Min.   :2.786  
 1st Qu.:2.969  
 Median :3.198  
 Mean   :3.169  
 3rd Qu.:3.325  
 Max.   :3.715  
#ridge predictions computed in Question 4
summary(ridge.pred)
       V1       
 Min.   :2.851  
 1st Qu.:2.996  
 Median :3.204  
 Mean   :3.168  
 3rd Qu.:3.296  
 Max.   :3.632  
#q7b - mean square prediction errors
mean((testdf$prodebt - full.pred)^2)
[1] 0.3110787
mean((testdf$prodebt - mallows.pred)^2)
[1] 0.3173751
mean((testdf$prodebt - step.predict)^2)
[1] 0.3137186
mean((testdf$prodebt - lasso.predict)^2)
[1] 0.3132713
mean((testdf$prodebt - ridge.pred)^2)
[1] 0.3334286
mean((testdf$prodebt - net.predict)^2)
[1] 0.3165425
---
title: "R Notebook"
output: html_notebook
---
```{r}
require(faraway)
data(debt)
set.seed(190)
fulldf <- na.omit(debt)
testRows <- sample(nrow(fulldf),0.1*nrow(fulldf))
testdf <- fulldf[testRows,]
traindf <- fulldf[-testRows,]
```
###Question 1###
####q1a: Fit a standard linear regression with the variable prodebt as the response and the other variables as predictors. Display the model summary. 

_ans: see model summary displayed below for q1a._

####q1b: Which variables are significant at the 0.05 level? At the 0.01 level?

_ans: `manage`, `ccarduse`, and `locintrn` are significant at both the 0.05 and 0.01 alpha levels (each p-value is below 0.01); no additional variables reach significance at 0.05, with `agegp` only marginal (p ≈ 0.064)._

####q1c: What are the 10-fold and leave one out cross-validation scores for this model?

_ans: 10-fold MSE = $0.4615591$. LOOCV MSE = $0.4638267$_

####q1d. What are the Mallow's Cp, AIC, and BIC criterion values for this model?

_ans: Cp = $13.0000$. AIC = $570.7717$. BIC = $621.3555$_

```{r}
set.seed(190)
#q1a and q1b
model1 <- glm(prodebt~.,data=traindf)
summary(model1)
#q1c
library(boot)
# Run 10-fold cross-validation of the model1 on the data set
cv.model = cv.glm(data=traindf, glmfit=model1, K=10)
# Extract the MSE
cat('10-fold cross-validation MSE:', cv.model$delta[1])
# Run LOOCV
loocv.model = cv.glm(data=traindf, glmfit=model1, K=nrow(traindf))
# Extract the MSE
cat('\nLeave-one-out-cross-validation MSE:', loocv.model$delta[1])
#q1d
library(CombMSC)
s2 <- sigma(lm(prodebt~.,data=traindf)) #extracts residual standard error from lm
n = nrow(traindf)
c(Cp(model1,S2=(s2^2)), AIC(model1,k=2),AIC(model1,k=log(n)))
```
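
As a quick sanity check on q1d, the built-in `AIC()` and `BIC()` generics should reproduce the values above, since BIC is just AIC with penalty `k = log(n)`:
```{r}
# Cross-check q1d: BIC(model1) should match AIC(model1, k = log(n)) above
c(AIC = AIC(model1), BIC = BIC(model1))
```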

###Question 2###
####q2a.  Compare all models using Mallow's Cp. Display a table indicating the variables included in each model and the corresponding Mallow's Cp value.

_ans: see table below in output for q2a._

####q2b. How many variables are in the model with the lowest Mallow's Cp value? Which variables are they?

_ans: $7$: `incomegp`,`agegp`,`manage`,`ccarduse`,`cigbuy`,`xmasbuy`,`locintrn`._
```{r}
set.seed(190)
#q2a
library(leaps)
out = leaps(traindf[,-c(13)], traindf$prodebt, method = "Cp")
cbind(as.matrix(out$which),out$Cp)
best.model = which(out$Cp==min(out$Cp))
cbind(as.matrix(out$which),out$Cp)[best.model,]

#2b
#predictors 1,5,8,9,10,11,12
names(traindf) #1,5,8,9,10,11,12 = incomegp, agegp, manage, ccarduse, cigbuy, xmasbuy, locintrn
```
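
Rather than mapping the indicator columns to predictor names by hand, the selected variables can be read off programmatically from the objects above (a small sketch):
```{r}
# out$which is a logical matrix with one row per candidate model;
# index the best row into the vector of predictor names
preds <- names(traindf)[-13]
preds[out$which[best.model, ]]
```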

###Question 3###
####q3(a): Perform forward stepwise regression using AIC. Allow the minimum model to be the model with only an intercept. Display the model summary of your final model. 

_ans: see model summary displayed below for q3a._

####3(b): How many variables are in your final model? Which are significant at the 0.05 level? 

_ans: $8$ variables are in the final model: `incomegp`,`agegp`,`bsocacc`,`manage`,`ccarduse`,`cigbuy`,`xmasbuy`, and `locintrn`. Of these, `agegp`,`manage`,`ccarduse`,`locintrn` are significant at alpha = $0.05$._

####3(c): Perform backward stepwise selection. Do the variables in your final model differ from forward stepwise regression? If so, how? 

_ans: No; backward stepwise selection arrives at the same eight-variable model, so the two procedures select identical variables._

####3(d): Compare the adjusted R2, Mallow's Cp, and AICs of the full model, the model found in Question 2, and the model found using forward selection. Which model is preferred based on these criteria and why?

_ans: Both reduced models beat the full model on every criterion: the Question 2 model has the lowest Mallow's Cp ($6.46$, vs. $6.51$ for stepwise and $13.00$ for the full model), the stepwise model has the lowest AIC ($564.35$ vs. $570.77$), and the adjusted $R^2$ computed from the deviances slightly favors the reduced fit ($\approx 0.162$ vs. $\approx 0.155$). A reduced model is therefore preferred: the extra predictors in the full model are not significant, and the criteria penalize them without a compensating gain in fit._
```{r}
set.seed(190)
#q3a and b Create the minimum model that only includes the intercept
m0 <- glm(prodebt~1, data=traindf)
m2 <- glm(prodebt~.,data=traindf)
fstep <- step(m0,scope=list(lower=m0,upper=m2),direction="forward")
summary(fstep)

#q3c
bstep <- step(m2,scope=list(lower=m0,upper=m2),direction="backward")
summary(bstep)

#q3d
c(Cp(fstep,S2=s2^2), AIC(fstep,k=2), AIC(fstep,k=log(n)))
c(Cp(bstep,S2=s2^2), AIC(bstep,k=2), AIC(bstep,k=log(n)))
```
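
q3d also asks for adjusted $R^2$, which `glm` objects do not report; a minimal sketch refitting the two candidates with `lm()`:
```{r}
# Adjusted R^2 is reported by summary.lm, so refit the full and stepwise models with lm()
full.lm <- lm(prodebt ~ ., data = traindf)
step.lm <- lm(prodebt ~ incomegp + agegp + bsocacc + manage + ccarduse +
                cigbuy + xmasbuy + locintrn, data = traindf)
c(full = summary(full.lm)$adj.r.squared,
  stepwise = summary(step.lm)$adj.r.squared)
```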

###Question 4###
####q4(a): Perform ridge regression. Find the lambda value that minimizes the generalized CV score. 

_ans: See output below. Using `lm.ridge` over `lambda <- seq(0,110,by=0.25)`, the GCV score is minimized at lambda = $103.25$. For comparison, 10-fold CV with `cv.glmnet` gives `lambda.min` = $1.038303$; the two values are not directly comparable because glmnet uses a different penalty scaling and CV criterion._

####q4(b): List the value of coefficients at the optimum lambda value.

_ans: See output below for q4b for list of coefficients and their values. _

####q4(c): How many variables were selected? Give an explanation for this number.

_ans: $12$; all of the original predictors. Ridge regression does not remove predictors: the $L_2$ penalty shrinks coefficients toward zero without setting any of them exactly to zero, so every predictor stays in the model, merely down-weighted according to how little it contributes._

```{r}
set.seed(190)
#q4a
library(glmnet)
y=traindf$prodebt
X <-  traindf
X$prodebt <- NULL
X <-  as.matrix(X)
# Optimize lambda using cross validation
ml.cv = cv.glmnet(X, y, family='poisson', alpha=0, type='deviance', nfolds=10)
cat("CV Optimized lambda:", ml.cv$lambda.min)
# Fit the ridge model (alpha=0) over a grid of lambda values
ml = glmnet(X, y, family='poisson', alpha=0, nlambda=50)

options(repr.plot.width=8, repr.plot.height=5)
plot(ml,xvar="lambda",label=TRUE,lwd=2)
abline(v=log(ml.cv$lambda.min),col='black',lty = 2,lwd=2)
abline(h=0,col='black',lty = 1,lwd=2)

library(MASS)
lambda <- seq(0,110,by=0.25)
out <- lm.ridge(prodebt~.,data=traindf,lambda=lambda)
#round(out$GCV,5)
which(out$GCV == min(out$GCV))

#q4b and c
coef(out)[which(out$GCV == min(out$GCV)),]

ridge.pred <- as.matrix(cbind(const=1, testdf[,-13]))%*% coef(out)[which(out$GCV == min(out$GCV)),] # test-set predictions from the GCV-optimal ridge fit
```
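
Since `prodebt` is a continuous score rather than a count, `family='gaussian'` is arguably the more natural choice in `cv.glmnet`; a minimal cross-check (the resulting `lambda.min` will differ from the Poisson fit above because the loss and link change):
```{r}
# Cross-check: ridge (alpha = 0) under a gaussian likelihood, tuned by 10-fold CV on MSE
ridge.cv.g <- cv.glmnet(X, y, family = "gaussian", alpha = 0,
                        type.measure = "mse", nfolds = 10)
ridge.cv.g$lambda.min
```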

###Question 5###
####q5(a): Perform Lasso regression. Find the optimal lambda value using 10 fold CV. 

_ans: CV Optimized lambda: $0.0140487313$_

####q5(b): Plot the regression coefficient path 

_ans: See plot below for regression coefficient path for q5b._

####q5(c): How many variables were selected? Which are they?

_ans: $11$ of the $12$ predictors; `singpar` is zeroed out at the optimal lambda. The selected variables are `incomegp`, `house`, `children`, `agegp`, `bankacc`, `bsocacc`, `manage`, `ccarduse`, `cigbuy`, `xmasbuy`, and `locintrn`._
```{r}
set.seed(190)
#q5
library(glmnet)
y=traindf$prodebt
X <-  traindf
X$prodebt <- NULL
X <-  as.matrix(X)

# Optimize lambda using cross validation
ml.cv = cv.glmnet(X, y, family='poisson', alpha=1, type='deviance', nfolds=10)
cat("CV Optimized lambda:", ml.cv$lambda.min)
# Create the lasso model with multiple lambda values
ml = glmnet(X, y, family='poisson', alpha=1, nlambda=50)

options(repr.plot.width=8, repr.plot.height=5)
plot(ml,xvar="lambda",label=TRUE,lwd=2)
abline(v=log(ml.cv$lambda.min),col='black',lty = 2,lwd=2)
abline(h=0,col='black',lty = 1,lwd=2)

mlasso = glmnet(X, y, family='poisson', alpha=1, lambda=ml.cv$lambda.min)
coef(mlasso)

opt.lambda <- ml.cv$lambda.min
# Test-set predictions from the lasso at the optimal lambda (reused in Question 7);
# type="response" maps back from the log link
lasso.predict <- predict(mlasso, newx=as.matrix(testdf[,-13]), type="response")
```
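
To answer q5c programmatically rather than by eye, the selected predictors are the rows of the coefficient matrix with nonzero entries:
```{r}
# Predictors the lasso kept at the optimal lambda (intercept excluded)
cf <- as.matrix(coef(mlasso))
setdiff(rownames(cf)[cf[, 1] != 0], "(Intercept)")
```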

###Question 6###
####q6(a): Apply Elastic Net. Give equal weight to both penalties and use 100 values for lambda. Find the optimal lambda value using 10 fold CV.

_ans: CV optimized lambda: $0.02809746$ (see code block below)._

####q6(b): List the coefficient values at the optimal lambda. How many variables were selected? How do these variables compare to those from Lasso in Question 5?

_ans: see code block below for q6b with the list of coefficient values at the optimal lambda. $11$ of the $12$ predictors were selected; as with the lasso in Question 5, `singpar` is dropped and the remaining coefficients are nearly identical to the lasso values._

```{r}
set.seed(190)
#q6a
require(glmnet)
# Optimize lambda using cross validation
ml.cv = cv.glmnet(X, y, family='poisson', alpha=.5, type='deviance', nfolds=10)
cat("CV Optimized lambda:", ml.cv$lambda.min)
# Create the en model with multiple lambda values
ml = glmnet(X, y, family='poisson', alpha=.5, nlambda=100)

options(repr.plot.width=8, repr.plot.height=5)
plot(ml,xvar="lambda",label=TRUE,lwd=2)
abline(v=log(ml.cv$lambda.min),col='black',lty = 2,lwd=2)
abline(h=0,col='black',lty = 1,lwd=2)

#q6b
coef(ml,s=ml.cv$lambda.min)
```
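
To make the q6b comparison concrete, the two coefficient vectors can be placed side by side (`mlasso` is the Question 5 fit):
```{r}
# Lasso (Q5) vs. elastic net (Q6) coefficients at their respective optimal lambdas
cmp <- cbind(as.matrix(coef(mlasso)), as.matrix(coef(ml, s = ml.cv$lambda.min)))
colnames(cmp) <- c("lasso", "elastic.net")
round(cmp, 4)
```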

###Question 7###
####q7(a): Predict prodebt for each of the rows in the test data using the full model, lowest Mallow's Cp model, and the models found using forward stepwise regression, ridge regression, lasso regression, and elastic net.

_ans: see model predictions in code below._

####q7(b): Compare the predictions using mean squared prediction error. Which model performed the best?

_ans: All models performed very similarly, with mean squared prediction errors between $0.311$ and $0.333$. The full linear model performed best, with the lowest MSPE ($0.3111$), though its margin over the other models is small._

####q7(c): Provide a table listing each method described in Question 7a and the variables selected by each method (see Unit 5.2.3 for examples). Which variables were selected consistently?

_ans: The full linear model and ridge regression keep all $12$ predictors; lasso and elastic net keep $11$ (both drop `singpar`); the lowest Mallow's Cp model keeps $7$ and forward stepwise keeps $8$. Seven predictors, `incomegp + agegp + manage + ccarduse + cigbuy + xmasbuy + locintrn`, were selected by every method._

| Method            | No. Vars | Variables |
|-------------------|----------|-----------|
| Linear Regression | 12 | incomegp + house + children + singpar + agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn |
| Lowest Mallows Cp | 7  | incomegp + agegp + manage + ccarduse + cigbuy + xmasbuy + locintrn |
| Forward Stepwise  | 8  | incomegp + agegp + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn |
| LASSO             | 11 | incomegp + house + children + agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn |
| Ridge Regression  | 12 | incomegp + house + children + singpar + agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn |
| Elastic Net       | 11 | incomegp + house + children + agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn |

```{r}
#compare all models
full.pred <- predict(model1, testdf)
summary(full.pred)

mallows.model <- glm(prodebt~incomegp + agegp + manage + ccarduse + cigbuy + xmasbuy + locintrn, data=traindf)
mallows.pred <- predict(mallows.model, testdf[,-13])
summary(mallows.pred)

step.model <- glm(formula = prodebt ~ incomegp + agegp + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn, data = traindf)
step.predict <- predict(step.model,testdf)
summary(step.predict)

# lasso.predict was computed in Question 5 at the optimal lambda
summary(lasso.predict)

#elastic net: predict from the Question 6 fit at its CV-optimal lambda
net.predict <- predict(ml, as.matrix(testdf[,-13]), s=ml.cv$lambda.min, type="response")
summary(net.predict)

#ridge predictions computed in Question 4
summary(ridge.pred)

#q7b - mean square prediction errors
mean((testdf$prodebt - full.pred)^2)
mean((testdf$prodebt - mallows.pred)^2)
mean((testdf$prodebt - step.predict)^2)
mean((testdf$prodebt - lasso.predict)^2)
mean((testdf$prodebt - ridge.pred)^2)
mean((testdf$prodebt - net.predict)^2)

```
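
The pairwise `mean()` calls above can also be collapsed into one sorted table, which makes the q7b comparison easier to read (a small convenience sketch using the prediction objects already defined):
```{r}
# Collect the test-set mean squared prediction errors in one sorted vector
mspe <- function(pred) mean((testdf$prodebt - pred)^2)
sort(c(full = mspe(full.pred), mallows = mspe(mallows.pred),
       stepwise = mspe(step.predict), lasso = mspe(lasso.predict),
       ridge = mspe(ridge.pred), enet = mspe(net.predict)))
```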

