require(faraway)
data(debt)
set.seed(190)
fulldf <- na.omit(debt)
testRows <- sample(nrow(fulldf),0.1*nrow(fulldf))
testdf <- fulldf[testRows,]
traindf <- fulldf[-testRows,]
ans: see model summary displayed below for q1a.
ans: manage, ccarduse, and locintrn are significant at both the 0.05 and 0.01 alpha levels; agegp falls just short of 0.05 (p = 0.064).
ans: 10-fold MSE = \(0.4615591\). LOOCV MSE = \(0.4638267\)
ans: Cp = \(13.0000\). AIC = \(570.7717\). BIC = \(621.3555\)
set.seed(190)
#q1a and q1b
model1 <- glm(prodebt~.,data=traindf)
summary(model1)
Call:
glm(formula = prodebt ~ ., data = traindf)
Deviance Residuals:
     Min        1Q    Median        3Q       Max
-1.90782  -0.45705  -0.03132   0.38732   1.85735

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept)   3.95258    0.34557  11.438  < 2e-16 ***
incomegp      0.05744    0.03648   1.574  0.11659
house        -0.03541    0.07317  -0.484  0.62877
children      0.03815    0.04227   0.902  0.36766
singpar       0.03894    0.18318   0.213  0.83181
agegp        -0.09529    0.05132  -1.857  0.06446 .
bankacc       0.08293    0.13348   0.621  0.53493
bsocacc      -0.10808    0.08977  -1.204  0.22968
manage       -0.13626    0.04823  -2.825  0.00509 **
ccarduse      0.17313    0.05541   3.125  0.00198 **
cigbuy       -0.15338    0.09420  -1.628  0.10468
xmasbuy       0.22044    0.12897   1.709  0.08861 .
locintrn     -0.12527    0.04739  -2.643  0.00871 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.4455924)
Null deviance: 143.88 on 273 degrees of freedom
Residual deviance: 116.30 on 261 degrees of freedom
AIC: 570.77
Number of Fisher Scoring iterations: 2
#q1c
library(boot)
# Run 10-fold cross-validation of model1 on the training data
cv.model = cv.glm(data=traindf, glmfit=model1, K=10)
# Extract the MSE
cat('10-fold cross-validation MSE:', cv.model$delta[1])
10-fold cross-validation MSE: 0.4615591
# Run LOOCV
loocv.model = cv.glm(data=traindf, glmfit=model1, K=nrow(traindf))
# Extract the MSE
cat('\nLeave-one-out-cross-validation MSE:', loocv.model$delta[1])
Leave-one-out-cross-validation MSE: 0.4638267
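As an aside, cv.glm returns two values in delta: the raw cross-validation estimate used above and a bias-corrected version. A minimal check, assuming cv.model and loocv.model from the code above:
cv.model$delta    # c(raw CV estimate, bias-corrected estimate)
loocv.model$delta # for LOOCV the two are nearly identical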
#q1d
library(CombMSC)
s2 <- sigma(lm(prodebt~.,data=traindf)) #extracts residual standard error from lm
n = nrow(traindf)
c(Cp(model1,S2=(s2^2)), AIC(model1,k=2),AIC(model1,k=log(n)))
[1] 13.0000 570.7717 621.3555
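For reference, the three criteria computed above are (with \(p = 13\) coefficients including the intercept, and \(\hat\sigma^2\) taken from the full-model fit):

\[ C_p = \frac{\mathrm{RSS}}{\hat\sigma^2} - n + 2p, \qquad \mathrm{AIC} = -2\log L + 2p, \qquad \mathrm{BIC} = -2\log L + p\log n \]

AIC(model1, k=log(n)) computes BIC by swapping the penalty weight \(2\) for \(\log n\). Note that \(C_p = p = 13\) for the full model by construction, since \(\hat\sigma^2\) is estimated from that same model.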
ans: see table below in output for q2a.
ans: \(7\): incomegp, agegp, manage, ccarduse, cigbuy, xmasbuy, locintrn.
set.seed(190)
#q2a
library(leaps)
out = leaps(traindf[,-c(13)], traindf$prodebt, method = "Cp")
cbind(as.matrix(out$which),out$Cp)
1 2 3 4 5 6 7 8 9 A B C
1 0 0 0 0 0 0 0 0 1 0 0 0 34.441083
1 1 0 0 0 0 0 0 0 0 0 0 0 37.145995
1 0 0 0 0 0 0 0 1 0 0 0 0 38.731359
1 0 0 0 0 1 0 0 0 0 0 0 0 42.660981
1 0 0 1 0 0 0 0 0 0 0 0 0 45.607165
1 0 0 0 0 0 0 0 0 0 0 0 1 46.957440
1 0 0 0 0 0 1 0 0 0 0 0 0 48.278428
1 0 0 0 0 0 0 0 0 0 0 1 0 48.354061
1 0 0 0 0 0 0 0 0 0 1 0 0 49.810097
1 0 0 0 0 0 0 1 0 0 0 0 0 51.455025
2 0 0 0 0 0 0 0 1 1 0 0 0 19.779750
2 1 0 0 0 0 0 0 1 0 0 0 0 24.960262
2 0 0 0 0 1 0 0 0 1 0 0 0 25.172463
2 0 0 1 0 0 0 0 0 1 0 0 0 28.251079
2 0 0 0 0 0 0 0 0 1 0 0 1 28.771002
2 1 0 0 0 0 0 0 0 1 0 0 0 30.180958
2 1 0 0 0 0 0 0 0 0 0 0 1 30.485454
2 1 0 0 0 1 0 0 0 0 0 0 0 32.499082
2 0 0 0 0 0 0 0 0 1 0 1 0 33.043153
2 0 0 0 0 0 0 1 0 1 0 0 0 33.207888
3 0 0 0 0 1 0 0 1 1 0 0 0 14.568827
3 1 0 0 0 0 0 0 1 1 0 0 0 16.095549
3 0 0 1 0 0 0 0 1 1 0 0 0 16.126847
3 0 0 0 0 0 0 0 1 1 0 0 1 16.879762
3 0 0 0 0 1 0 0 0 1 0 0 1 18.039589
3 0 0 0 0 0 0 0 1 1 0 1 0 18.291258
3 0 0 0 0 0 1 0 1 1 0 0 0 20.186256
3 0 0 0 0 0 0 0 1 1 1 0 0 20.355828
3 0 0 0 0 0 0 1 1 1 0 0 0 20.788126
3 0 1 0 0 0 0 0 1 1 0 0 0 20.913896
4 0 0 0 0 1 0 0 1 1 0 0 1 10.343902
4 1 0 0 0 0 0 0 1 1 0 0 1 11.938331
4 0 0 0 0 1 0 0 1 1 0 1 0 12.841669
4 1 0 0 0 1 0 0 1 1 0 0 0 12.947177
4 0 0 1 0 0 0 0 1 1 0 0 1 13.285237
4 1 0 1 0 0 0 0 1 1 0 0 0 13.469674
4 0 0 0 0 1 0 0 1 1 1 0 0 14.238375
4 0 0 1 0 1 0 0 1 1 0 0 0 14.457802
4 0 0 0 0 1 1 0 1 1 0 0 0 15.047039
4 0 0 0 0 0 0 0 1 1 0 1 1 15.122201
5 1 0 0 0 1 0 0 1 1 0 0 1 7.788808
5 0 0 0 0 1 0 0 1 1 0 1 1 8.277262
5 1 0 1 0 0 0 0 1 1 0 0 1 9.484143
5 0 0 0 0 1 0 0 1 1 1 0 1 9.826121
5 0 0 0 0 1 1 0 1 1 0 0 1 10.273730
5 0 0 1 0 1 0 0 1 1 0 0 1 10.563902
5 1 0 0 0 0 0 0 1 1 0 1 1 11.096022
5 0 0 0 0 1 0 1 1 1 0 0 1 11.591348
5 1 0 0 0 1 0 0 1 1 0 1 0 11.937732
5 1 1 0 0 0 0 0 1 1 0 0 1 12.078086
6 1 0 0 0 1 0 0 1 1 0 1 1 6.544859
6 0 0 0 0 1 0 0 1 1 1 1 1 7.302879
6 0 0 0 0 1 1 0 1 1 0 1 1 7.879094
6 1 0 0 0 1 0 0 1 1 1 0 1 8.124421
6 1 0 1 0 1 0 0 1 1 0 0 1 8.186881
6 1 0 0 0 1 0 1 1 1 0 0 1 8.388379
6 1 0 0 0 1 1 0 1 1 0 0 1 9.152069
6 0 0 1 0 1 0 0 1 1 0 1 1 9.445733
6 0 0 0 0 1 0 1 1 1 0 1 1 9.464968
6 1 1 0 0 1 0 0 1 1 0 0 1 9.635989
7 1 0 0 0 1 0 0 1 1 1 1 1 6.456914
7 1 0 0 0 1 0 1 1 1 0 1 1 7.137318
7 0 0 0 0 1 1 0 1 1 1 1 1 7.448470
7 1 0 0 0 1 1 0 1 1 0 1 1 7.606385
7 1 0 1 0 1 0 0 1 1 0 1 1 7.745966
7 0 0 0 0 1 0 1 1 1 1 1 1 7.905208
7 1 0 0 0 1 0 1 1 1 1 0 1 8.246065
7 1 0 1 0 1 0 0 1 1 1 0 1 8.304783
7 0 0 1 0 1 0 0 1 1 1 1 1 8.326722
7 1 0 0 1 1 0 0 1 1 0 1 1 8.444141
8 1 0 0 0 1 0 1 1 1 1 1 1 6.508902
8 1 0 1 0 1 0 0 1 1 1 1 1 7.534036
8 1 0 0 0 1 1 0 1 1 1 1 1 7.694545
8 1 1 0 0 1 0 0 1 1 1 1 1 8.211996
8 1 0 1 0 1 0 1 1 1 0 1 1 8.353004
8 1 0 0 1 1 0 0 1 1 1 1 1 8.401659
[ reached getOption("max.print") -- omitted 35 rows ]
best.model = which(out$Cp==min(out$Cp))
cbind(as.matrix(out$which),out$Cp)[best.model,]
1 2 3 4 5
1.000000 0.000000 0.000000 0.000000 1.000000
6 7 8 9 A
0.000000 0.000000 1.000000 1.000000 1.000000
B C
1.000000 1.000000 6.456914
#q2b
#predictors 1,5,8,9,10,11,12
names(traindf) #1,5,8,9,10,11,12 = incomegp, agegp, manage, ccarduse, cigbuy, xmasbuy, locintrn
[1] "incomegp" "house" "children" "singpar"
[5] "agegp" "bankacc" "bsocacc" "manage"
[9] "ccarduse" "cigbuy" "xmasbuy" "locintrn"
[13] "prodebt"
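As a sketch, the selected variable names can also be pulled out programmatically rather than read off by hand (assumes out from the leaps() call above; rows of out$which are logical):
best <- which.min(out$Cp)
names(traindf)[-13][out$which[best,]] # names of the predictors in the lowest-Cp model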
ans: see model summary displayed below for q3a.
ans: All \(12\) predictors remain in the forward model shown below (the search starts from the full model, so no terms can be added); manage, ccarduse, and locintrn are significant at alpha = \(0.05\).
ans: Yes; the backward stepwise search drops singpar, house, bankacc, and children, leaving an \(8\)-variable model (incomegp, agegp, bsocacc, manage, ccarduse, cigbuy, xmasbuy, locintrn).
ans: The backward model wins on all three criteria (Cp \(6.51\) vs \(13.00\), AIC \(564.35\) vs \(570.77\), BIC \(600.48\) vs \(621.36\)); the margins are modest, but they consistently favor the smaller model.
set.seed(190)
#q3a and b Create the minimum model that only includes the intercept
m0 <- glm(prodebt~1, data=traindf)
m2 <- glm(prodebt~.,data=traindf)
# Note: step() begins at the model given as its first argument, so starting
# from m2 with direction="forward" means no terms can be added and fstep is
# simply the full model
fstep <- step(m2, scope=list(lower=m0, upper=m2), direction="forward")
Start: AIC=570.77
prodebt ~ incomegp + house + children + singpar + agegp + bankacc +
bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn
summary(fstep)
Call:
glm(formula = prodebt ~ incomegp + house + children + singpar +
agegp + bankacc + bsocacc + manage + ccarduse + cigbuy +
xmasbuy + locintrn, data = traindf)
Deviance Residuals:
     Min        1Q    Median        3Q       Max
-1.90782  -0.45705  -0.03132   0.38732   1.85735

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept)   3.95258    0.34557  11.438  < 2e-16 ***
incomegp      0.05744    0.03648   1.574  0.11659
house        -0.03541    0.07317  -0.484  0.62877
children      0.03815    0.04227   0.902  0.36766
singpar       0.03894    0.18318   0.213  0.83181
agegp        -0.09529    0.05132  -1.857  0.06446 .
bankacc       0.08293    0.13348   0.621  0.53493
bsocacc      -0.10808    0.08977  -1.204  0.22968
manage       -0.13626    0.04823  -2.825  0.00509 **
ccarduse      0.17313    0.05541   3.125  0.00198 **
cigbuy       -0.15338    0.09420  -1.628  0.10468
xmasbuy       0.22044    0.12897   1.709  0.08861 .
locintrn     -0.12527    0.04739  -2.643  0.00871 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.4455924)
Null deviance: 143.88 on 273 degrees of freedom
Residual deviance: 116.30 on 261 degrees of freedom
AIC: 570.77
Number of Fisher Scoring iterations: 2
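For comparison, a true forward search would start from the intercept-only model; a minimal sketch using m0 and m2 as defined above (output not shown, and the selected model is not assumed here):
fstep.fwd <- step(m0, scope=list(lower=m0, upper=m2), direction="forward", trace=0)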
#q3c
bstep <- step(m2,scope=list(lower=m0,upper=m2),direction="backward")
Start: AIC=570.77
prodebt ~ incomegp + house + children + singpar + agegp + bankacc +
bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn
Df Deviance AIC
- singpar 1 116.32 568.82
- house 1 116.40 569.02
- bankacc 1 116.47 569.18
- children 1 116.66 569.63
- bsocacc 1 116.95 570.29
<none> 116.30 570.77
- incomegp 1 117.40 571.36
- cigbuy 1 117.48 571.54
- xmasbuy 1 117.60 571.82
- agegp 1 117.84 572.37
- locintrn 1 119.41 576.01
- manage 1 119.86 577.03
- ccarduse 1 120.65 578.83
Step: AIC=568.82
prodebt ~ incomegp + house + children + agegp + bankacc + bsocacc +
manage + ccarduse + cigbuy + xmasbuy + locintrn
Df Deviance AIC
- house 1 116.42 567.06
- bankacc 1 116.49 567.21
- children 1 116.71 567.74
- bsocacc 1 116.96 568.32
<none> 116.32 568.82
- incomegp 1 117.42 569.40
- cigbuy 1 117.53 569.64
- xmasbuy 1 117.62 569.87
- agegp 1 117.91 570.53
- locintrn 1 119.46 574.12
- manage 1 119.94 575.23
- ccarduse 1 120.74 577.04
Step: AIC=567.06
prodebt ~ incomegp + children + agegp + bankacc + bsocacc + manage +
ccarduse + cigbuy + xmasbuy + locintrn
Df Deviance AIC
- bankacc 1 116.56 565.39
- children 1 116.83 566.01
- bsocacc 1 117.10 566.65
<none> 116.42 567.06
- incomegp 1 117.46 567.48
- cigbuy 1 117.56 567.73
- xmasbuy 1 117.75 568.17
- agegp 1 118.58 570.08
- locintrn 1 119.51 572.22
- manage 1 120.23 573.89
- ccarduse 1 120.82 575.21
Step: AIC=565.39
prodebt ~ incomegp + children + agegp + bsocacc + manage + ccarduse +
cigbuy + xmasbuy + locintrn
Df Deviance AIC
- children 1 116.97 564.35
<none> 116.56 565.39
- bsocacc 1 117.43 565.42
- cigbuy 1 117.79 566.27
- xmasbuy 1 117.82 566.34
- incomegp 1 118.05 566.86
- agegp 1 118.66 568.28
- locintrn 1 119.57 570.36
- manage 1 120.25 571.92
- ccarduse 1 121.37 574.47
Step: AIC=564.35
prodebt ~ incomegp + agegp + bsocacc + manage + ccarduse + cigbuy +
xmasbuy + locintrn
Df Deviance AIC
<none> 116.97 564.35
- bsocacc 1 117.84 564.38
- cigbuy 1 118.14 565.08
- incomegp 1 118.48 565.87
- xmasbuy 1 118.64 566.22
- locintrn 1 120.12 569.62
- agegp 1 120.35 570.15
- manage 1 120.78 571.12
- ccarduse 1 121.72 573.25
summary(bstep)
Call:
glm(formula = prodebt ~ incomegp + agegp + bsocacc + manage +
ccarduse + cigbuy + xmasbuy + locintrn, data = traindf)
Deviance Residuals:
     Min        1Q    Median        3Q       Max
-1.91126  -0.43001  -0.01056   0.39193   1.84378

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  4.03169    0.31682  12.726  < 2e-16 ***
incomegp     0.06067    0.03276   1.852  0.06519 .
agegp       -0.12121    0.04383  -2.766  0.00608 **
bsocacc     -0.12175    0.08682  -1.402  0.16199
manage      -0.13824    0.04708  -2.936  0.00362 **
ccarduse     0.17746    0.05413   3.279  0.00118 **
cigbuy      -0.15044    0.09236  -1.629  0.10452
xmasbuy      0.24250    0.12485   1.942  0.05316 .
locintrn    -0.12503    0.04685  -2.669  0.00808 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.4414037)
Null deviance: 143.88 on 273 degrees of freedom
Residual deviance: 116.97 on 265 degrees of freedom
AIC: 564.35
Number of Fisher Scoring iterations: 2
#q3d
c(Cp(fstep, S2=s2^2), AIC(fstep, k=2), AIC(fstep, k=log(n)))
c(Cp(bstep, S2=s2^2), AIC(bstep, k=2), AIC(bstep, k=log(n)))
[1] 13.0000 570.7717 621.3555
[1] 6.508902 564.351239 600.482521
ans: See output below. cv.glmnet optimizes \(\lambda = 1.038303\), while a GCV search over lambda <- seq(0,110,by=0.25) with lm.ridge selects \(\lambda = 103.25\); the two packages parameterize the penalty differently, so the values are not directly comparable.
ans: See output below for q4b for list of coefficients and their values.
ans: \(12\); all of the original predictors. Ridge regression does not zero out coefficients; it shrinks them toward zero, with the amount of shrinkage governed by \(\lambda\), so every predictor stays in the model.
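For reference, a sketch of the ridge criterion in its standard form (notation is mine, not taken from the assignment):

\[ \hat\beta^{\text{ridge}} = \arg\min_{\beta}\; \sum_{i=1}^{n}\Big(y_i - \beta_0 - \textstyle\sum_{j=1}^{p} x_{ij}\beta_j\Big)^2 + \lambda \sum_{j=1}^{p} \beta_j^2 \]

The \(\ell_2\) penalty shrinks coefficients smoothly toward zero but never exactly to zero, which is why all \(12\) predictors survive.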
set.seed(190)
#q4a
library(glmnet)
y=traindf$prodebt
X <- traindf
X$prodebt <- NULL
X <- as.matrix(X)
# Optimize lambda using cross-validation (prodebt is a continuous score, so
# family='gaussian' would also be a reasonable choice here)
ml.cv = cv.glmnet(X, y, family='poisson', alpha=0, type.measure='deviance', nfolds=10)
cat("CV Optimized lambda:", ml.cv$lambda.min)
CV Optimized lambda: 1.038303
# Fit the ridge model (alpha=0) over a sequence of lambda values
ml = glmnet(X, y, family='poisson', alpha=0, nlambda=50)
options(repr.plot.width=8, repr.plot.height=5)
plot(ml,xvar="lambda",label=TRUE,lwd=2)
abline(v=log(ml.cv$lambda.min),col='black',lty = 2,lwd=2)
abline(h=0,col='black',lty = 1,lwd=2)
library(MASS)
lambda <- seq(0,110,by=0.25)
out <- lm.ridge(prodebt~.,data=traindf,lambda=lambda)
#round(out$GCV,5)
which(out$GCV == min(out$GCV))
103.25
414
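Equivalently, and a bit more directly (assuming out and lambda from above):
lambda[which.min(out$GCV)] # GCV-optimal ridge lambda, 103.25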
#q4b and c
coef(out)[which(out$GCV == min(out$GCV)),]
incomegp house children
3.70246800 0.05111209 -0.03111007 0.03811592
singpar agegp bankacc bsocacc
0.04230821 -0.06981990 0.08561161 -0.07643149
manage ccarduse cigbuy xmasbuy
-0.10443288 0.12765092 -0.10780053 0.16060213
locintrn
-0.08965927
# Manual ridge predictions on the test set: bind an intercept column and
# multiply by the GCV-optimal coefficient vector
ridge.model <- as.matrix(cbind(const=1, testdf[,-13])) %*% coef(out)[which(out$GCV == min(out$GCV)),]
ans: CV Optimized lambda: \(0.01404873\)
ans: See plot below for regression coefficient path for q5b.
ans: \(11\); the lasso zeroes out singpar (shown as "." in the coefficient output below) and keeps incomegp, house, children, agegp, bankacc, bsocacc, manage, ccarduse, cigbuy, xmasbuy, and locintrn.
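For reference, a sketch of the lasso criterion in its standard gaussian form (the poisson fit used below penalizes the negative log-likelihood instead, but the \(\ell_1\) penalty is the same):

\[ \hat\beta^{\text{lasso}} = \arg\min_{\beta}\; \frac{1}{2n}\sum_{i=1}^{n}\big(y_i - \beta_0 - x_i^{\top}\beta\big)^2 + \lambda \lVert\beta\rVert_1 \]

Unlike the ridge penalty, the \(\ell_1\) penalty can set coefficients exactly to zero, which is how singpar gets dropped.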
set.seed(190)
#q5
library(glmnet)
y=traindf$prodebt
X <- traindf
X$prodebt <- NULL
X <- as.matrix(X)
# Optimize lambda using cross validation
ml.cv = cv.glmnet(X, y, family='poisson', alpha=1, type.measure='deviance', nfolds=10)
cat("CV Optimized lambda:", ml.cv$lambda.min)
CV Optimized lambda: 0.01404873
# Create the lasso model with multiple lambda values
ml = glmnet(X, y, family='poisson', alpha=1, nlambda=50)
options(repr.plot.width=8, repr.plot.height=5)
plot(ml,xvar="lambda",label=TRUE,lwd=2)
abline(v=log(ml.cv$lambda.min),col='black',lty = 2,lwd=2)
abline(h=0,col='black',lty = 1,lwd=2)
mlasso = glmnet(X, y, family='poisson', alpha=1, lambda=ml.cv$lambda.min)
coef(mlasso)
13 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 1.362052051
incomegp 0.016385118
house -0.003460197
children 0.010039001
singpar .
agegp -0.029500852
bankacc 0.015659929
bsocacc -0.025808910
manage -0.039178403
ccarduse 0.050923309
cigbuy -0.036468084
xmasbuy 0.058142176
locintrn -0.033721278
# Lasso predictions on the test set at the CV-optimal lambda;
# type='response' back-transforms from the poisson log link to the prodebt scale
opt.lambda = ml.cv$lambda.min
lasso.predict = predict(mlasso, newx=as.matrix(testdf[,-13]), type='response')
ans: refer to the code block below; the CV-optimal lambda is \(0.02809746\).
ans: see code block below for q6b with the list of coefficient values at the optimal lambda. The elastic net drops singpar and keeps the remaining \(11\) predictors, the same set selected by the lasso model in Question 5.
set.seed(190)
#q6a
require(glmnet)
# Optimize lambda using cross validation
ml.cv = cv.glmnet(X, y, family='poisson', alpha=.5, type.measure='deviance', nfolds=10)
cat("CV Optimized lambda:", ml.cv$lambda.min)
CV Optimized lambda: 0.02809746
# Create the en model with multiple lambda values
ml = glmnet(X, y, family='poisson', alpha=.5, nlambda=100)
options(repr.plot.width=8, repr.plot.height=5)
plot(ml,xvar="lambda",label=TRUE,lwd=2)
abline(v=log(ml.cv$lambda.min),col='black',lty = 2,lwd=2)
abline(h=0,col='black',lty = 1,lwd=2)
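For reference, glmnet's elastic-net penalty mixes the ridge and lasso terms through \(\alpha\) (here \(\alpha = 0.5\)):

\[ \lambda\left[\frac{1-\alpha}{2}\lVert\beta\rVert_2^2 + \alpha\lVert\beta\rVert_1\right] \]

so \(\alpha = 0\) recovers ridge and \(\alpha = 1\) recovers the lasso.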
#q6b
coef(ml,s=ml.cv$lambda.min)
13 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 1.360947263
incomegp 0.016373251
house -0.003512776
children 0.010068786
singpar .
agegp -0.029332001
bankacc 0.015766629
bsocacc -0.025669176
manage -0.039044367
ccarduse 0.050693639
cigbuy -0.036288955
xmasbuy 0.057848959
locintrn -0.033567200
ans: see model predictions in code below.
ans: All models performed very similarly, with mean square prediction errors ranging from \(0.31\) to \(0.33\). The full linear regression model has the lowest MSPE (\(0.311\)).
ans: Two of the models (the full linear regression and ridge) kept all \(12\) original predictors, and the lasso and elastic net kept \(11\), dropping singpar. The lowest Mallows Cp model selected \(7\) variables and the backward stepwise model selected \(8\). The predictors incomegp, agegp, manage, ccarduse, cigbuy, xmasbuy, and locintrn were selected by every model.
| Method | No. Vars | Variables |
|---|---|---|
| Linear Regression | 12 | incomegp + house + children + singpar + agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn |
| Lowest Mallows Cp | 7 | incomegp + agegp + manage + ccarduse + cigbuy + xmasbuy + locintrn |
| Backward Stepwise | 8 | incomegp + agegp + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn |
| LASSO | 11 | incomegp + house + children + agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn |
| Ridge Regression | 12 | incomegp + house + children + singpar + agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn |
| Elastic Net | 11 | incomegp + house + children + agegp + bankacc + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn |
#compare all models
full.pred <- predict(model1, testdf)
summary(full.pred)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.755 2.941 3.198 3.170 3.345 3.762
# Refit the lowest-Cp model selected in q2a
mallows.model <- glm(prodebt~incomegp + agegp + manage + ccarduse + cigbuy + xmasbuy + locintrn, data=traindf)
mallows.pred <- predict(mallows.model, testdf[,-13])
summary(mallows.pred)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.749 2.946 3.189 3.164 3.296 3.697
step.model <- glm(formula = prodebt ~ incomegp + agegp + bsocacc + manage + ccarduse + cigbuy + xmasbuy + locintrn, data = traindf)
step.predict <- predict(step.model,testdf)
summary(step.predict)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.769 2.941 3.230 3.178 3.345 3.741
#lasso.predict was computed in q5 above at the CV-optimal lambda
summary(lasso.predict)
1
Min. :2.785
1st Qu.:3.019
Median :3.208
Mean :3.179
3rd Qu.:3.322
Max. :3.684
# Elastic net: refit over a lambda path (gaussian default here) and predict
# at s = opt.lambda, the lasso CV-optimal value from q5
en.mod = glmnet(as.matrix(X), y, alpha=0.5, nlambda=100)
net.predict <- predict(en.mod, as.matrix(testdf[,-13]), s=opt.lambda)
summary(net.predict)
1
Min. :2.786
1st Qu.:2.969
Median :3.198
Mean :3.169
3rd Qu.:3.325
Max. :3.715
#ridge model defined above
summary(ridge.model)
V1
Min. :2.851
1st Qu.:2.996
Median :3.204
Mean :3.168
3rd Qu.:3.296
Max. :3.632
#q7b - mean square prediction errors
mean((testdf$prodebt - full.pred)^2)
[1] 0.3110787
mean((testdf$prodebt - mallows.pred)^2)
[1] 0.3173751
mean((testdf$prodebt - step.predict)^2)
[1] 0.3137186
mean((testdf$prodebt - lasso.predict)^2)
[1] 0.3132713
mean((testdf$prodebt - ridge.model)^2)
[1] 0.3334286
mean((testdf$prodebt - net.predict)^2)
[1] 0.3165425
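A compact, equivalent way to collect the six MSPEs above into one named vector (a sketch; assumes all of the prediction objects from q7a are in scope):
mspe <- function(pred) mean((testdf$prodebt - pred)^2)
sapply(list(full=full.pred, mallows=mallows.pred, step=step.predict,
            lasso=lasso.predict, ridge=ridge.model, en=net.predict), mspe)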