# Ask the user to pick the CSV file interactively and load it as `teen`.
teen <- read.csv(file = file.choose())
# Preview the first rows to confirm the import looks right.
head(x = teen)
## sex status income verbal gamble
## 1 1 51 2.00 8 0.0
## 2 1 28 2.50 8 0.0
## 3 1 37 2.00 6 0.0
## 4 1 28 7.00 4 7.3
## 5 1 65 2.00 8 19.6
## 6 1 61 3.47 6 0.1
library(faraway)
## Warning: package 'faraway' was built under R version 3.4.2
# Full model: regress `gamble` on every other column of `teen`.
g <- lm(formula = gamble ~ ., data = teen)
# Coefficient table, residual summary and overall fit statistics.
summary(object = g)
##
## Call:
## lm(formula = gamble ~ ., data = teen)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.082 -11.320 -1.451 9.452 94.252
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.55565 17.19680 1.312 0.1968
## sex -22.11833 8.21111 -2.694 0.0101 *
## status 0.05223 0.28111 0.186 0.8535
## income 4.96198 1.02539 4.839 1.79e-05 ***
## verbal -2.95949 2.17215 -1.362 0.1803
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared: 0.5267, Adjusted R-squared: 0.4816
## F-statistic: 11.69 on 4 and 42 DF, p-value: 1.815e-06
#Model Selection Methods
#Backward Elimination
# Starting point for manual backward elimination: the model with all
# predictors. At each later step the least significant term is removed.
g <- lm(formula = gamble ~ ., data = teen)
summary(object = g)
##
## Call:
## lm(formula = gamble ~ ., data = teen)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.082 -11.320 -1.451 9.452 94.252
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.55565 17.19680 1.312 0.1968
## sex -22.11833 8.21111 -2.694 0.0101 *
## status 0.05223 0.28111 0.186 0.8535
## income 4.96198 1.02539 4.839 1.79e-05 ***
## verbal -2.95949 2.17215 -1.362 0.1803
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared: 0.5267, Adjusted R-squared: 0.4816
## F-statistic: 11.69 on 4 and 42 DF, p-value: 1.815e-06
# Elimination step: remove `status` from the current model and refit.
g <- update(g, formula. = . ~ . - status)
summary(object = g)
##
## Call:
## lm(formula = gamble ~ sex + income + verbal, data = teen)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.639 -11.765 -1.594 9.305 93.867
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.1390 14.7686 1.634 0.1095
## sex -22.9602 6.7706 -3.391 0.0015 **
## income 4.8981 0.9551 5.128 6.64e-06 ***
## verbal -2.7468 1.8253 -1.505 0.1397
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.43 on 43 degrees of freedom
## Multiple R-squared: 0.5263, Adjusted R-squared: 0.4933
## F-statistic: 15.93 on 3 and 43 DF, p-value: 4.148e-07
# Elimination step: remove `verbal` from the current model and refit.
g <- update(g, formula. = . ~ . - verbal)
summary(object = g)
##
## Call:
## lm(formula = gamble ~ sex + income, data = teen)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.757 -11.649 0.844 8.659 100.243
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.041 6.394 0.632 0.53070
## sex -21.634 6.809 -3.177 0.00272 **
## income 5.172 0.951 5.438 2.24e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.75 on 44 degrees of freedom
## Multiple R-squared: 0.5014, Adjusted R-squared: 0.4787
## F-statistic: 22.12 on 2 and 44 DF, p-value: 2.243e-07
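# AIC (Akaike information criterion): AIC = -2 * log-likelihood + 2p,
# where p is the number of parameters in the model.
# The best choice of model will balance goodness of fit against model size.
# For linear models, the -2 * log-likelihood (the "deviance") equals
# n * log(RSS / n) up to an additive constant, so AIC = n * log(RSS / n) + 2p.
# We want to minimize AIC.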
#forward
# Candidate predictors that selection is allowed to add (one-sided formula).
f <- ~ sex + status + income + verbal
# Null model: intercept only. This is where forward selection starts.
m0 <- lm(formula = gamble ~ 1, data = teen)
# Forward selection by AIC (penalty k = 2), printing each step's add-one table.
m.forward <- step(
  object = m0,
  scope = f,
  direction = "forward",
  k = 2
)
## Start: AIC=325.34
## gamble ~ 1
##
## Df Sum of Sq RSS AIC
## + income 1 17680.9 28009 304.34
## + sex 1 7598.4 38091 318.79
## + verbal 1 2212.5 43477 325.00
## <none> 45689 325.34
## + status 1 116.2 45573 327.22
##
## Step: AIC=304.34
## gamble ~ income
##
## Df Sum of Sq RSS AIC
## + sex 1 5227.3 22781 296.63
## <none> 28009 304.34
## + status 1 719.8 27289 305.11
## + verbal 1 579.1 27429 305.35
##
## Step: AIC=296.63
## gamble ~ income + sex
##
## Df Sum of Sq RSS AIC
## + verbal 1 1139.78 21642 296.21
## <none> 22781 296.63
## + status 1 201.82 22580 298.21
##
## Step: AIC=296.21
## gamble ~ income + sex + verbal
##
## Df Sum of Sq RSS AIC
## <none> 21642 296.21
## + status 1 17.776 21624 298.18
##Checking answer with: AIC = n*log(RSS/n) + 2p
# extractAIC() returns the equivalent degrees of freedom followed by the AIC;
# the second value should agree with the AIC printed at the final step.
extractAIC(fit = m.forward, k = 2)
## [1] 4.0000 296.2145
# Coefficients and fit statistics of the forward-selected model.
summary(object = m.forward)
##
## Call:
## lm(formula = gamble ~ income + sex + verbal, data = teen)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.639 -11.765 -1.594 9.305 93.867
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.1390 14.7686 1.634 0.1095
## income 4.8981 0.9551 5.128 6.64e-06 ***
## sex -22.9602 6.7706 -3.391 0.0015 **
## verbal -2.7468 1.8253 -1.505 0.1397
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.43 on 43 degrees of freedom
## Multiple R-squared: 0.5263, Adjusted R-squared: 0.4933
## F-statistic: 15.93 on 3 and 43 DF, p-value: 4.148e-07
#backward
# Backward selection by AIC: start from the full model and drop terms.
# update(m0, f) replaces the right-hand side of the intercept-only model
# with the predictors listed in `f`, keeping the response `gamble`.
m1 <- update(m0, f)
# `scope` is given as an explicit list whose `lower` component (intercept
# only) bounds how far terms may be dropped. `trace = FALSE` suppresses the
# per-step printout; the literal FALSE is used because the alias `F` is an
# ordinary variable that can be reassigned.
m.backward <- step(
  m1,
  scope = list(lower = ~1),
  direction = "backward",
  trace = FALSE
)
# Coefficients and fit statistics of the backward-selected model.
summary(m.backward)
##
## Call:
## lm(formula = gamble ~ sex + income + verbal, data = teen)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.639 -11.765 -1.594 9.305 93.867
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.1390 14.7686 1.634 0.1095
## sex -22.9602 6.7706 -3.391 0.0015 **
## income 4.8981 0.9551 5.128 6.64e-06 ***
## verbal -2.7468 1.8253 -1.505 0.1397
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.43 on 43 degrees of freedom
## Multiple R-squared: 0.5263, Adjusted R-squared: 0.4933
## F-statistic: 15.93 on 3 and 43 DF, p-value: 4.148e-07
#stepwise
# Stepwise selection by AIC: begin at the intercept-only model `m0` and, at
# each step, consider both adding a term from `f` and dropping a term already
# in the model. `trace = FALSE` suppresses the per-step printout; the literal
# FALSE is used because the alias `F` is an ordinary variable that can be
# reassigned.
m.stepup <- step(m0, scope = f, direction = "both", trace = FALSE)
# Coefficients and fit statistics of the stepwise-selected model.
summary(m.stepup)
##
## Call:
## lm(formula = gamble ~ income + sex + verbal, data = teen)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.639 -11.765 -1.594 9.305 93.867
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.1390 14.7686 1.634 0.1095
## income 4.8981 0.9551 5.128 6.64e-06 ***
## sex -22.9602 6.7706 -3.391 0.0015 **
## verbal -2.7468 1.8253 -1.505 0.1397
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.43 on 43 degrees of freedom
## Multiple R-squared: 0.5263, Adjusted R-squared: 0.4933
## F-statistic: 15.93 on 3 and 43 DF, p-value: 4.148e-07