# Load the teen gambling data from a CSV file picked through a file dialog.
# NOTE(review): file.choose() requires an interactive session.
teen <- read.csv(file = file.choose())
# Preview the first six rows to confirm the columns were read as expected.
head(teen, n = 6L)
##   sex status income verbal gamble
## 1   1     51   2.00      8    0.0
## 2   1     28   2.50      8    0.0
## 3   1     37   2.00      6    0.0
## 4   1     28   7.00      4    7.3
## 5   1     65   2.00      8   19.6
## 6   1     61   3.47      6    0.1
library(faraway)
## Warning: package 'faraway' was built under R version 3.4.2
# Full model: regress gamble on every other column of teen.
g <- lm(formula = gamble ~ ., data = teen)
summary(g)
## 
## Call:
## lm(formula = gamble ~ ., data = teen)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.082 -11.320  -1.451   9.452  94.252 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.55565   17.19680   1.312   0.1968    
## sex         -22.11833    8.21111  -2.694   0.0101 *  
## status        0.05223    0.28111   0.186   0.8535    
## income        4.96198    1.02539   4.839 1.79e-05 ***
## verbal       -2.95949    2.17215  -1.362   0.1803    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared:  0.5267, Adjusted R-squared:  0.4816 
## F-statistic: 11.69 on 4 and 42 DF,  p-value: 1.815e-06
#Model Selection Methods 

# Backward elimination ----
# Start again from the full model (all columns of teen as predictors).
g <- lm(formula = gamble ~ ., data = teen)
summary(g)
## 
## Call:
## lm(formula = gamble ~ ., data = teen)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.082 -11.320  -1.451   9.452  94.252 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.55565   17.19680   1.312   0.1968    
## sex         -22.11833    8.21111  -2.694   0.0101 *  
## status        0.05223    0.28111   0.186   0.8535    
## income        4.96198    1.02539   4.839 1.79e-05 ***
## verbal       -2.95949    2.17215  -1.362   0.1803    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared:  0.5267, Adjusted R-squared:  0.4816 
## F-statistic: 11.69 on 4 and 42 DF,  p-value: 1.815e-06
# Drop status, the predictor with the largest p-value in the full model
# (p = 0.8535), and refit.
g <- update(g, formula. = . ~ . - status)
summary(g)
## 
## Call:
## lm(formula = gamble ~ sex + income + verbal, data = teen)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.639 -11.765  -1.594   9.305  93.867 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  24.1390    14.7686   1.634   0.1095    
## sex         -22.9602     6.7706  -3.391   0.0015 ** 
## income        4.8981     0.9551   5.128 6.64e-06 ***
## verbal       -2.7468     1.8253  -1.505   0.1397    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.43 on 43 degrees of freedom
## Multiple R-squared:  0.5263, Adjusted R-squared:  0.4933 
## F-statistic: 15.93 on 3 and 43 DF,  p-value: 4.148e-07
# Drop verbal next (p = 0.1397 in the previous fit); sex and income remain.
g <- update(g, formula. = . ~ . - verbal)
summary(g)
## 
## Call:
## lm(formula = gamble ~ sex + income, data = teen)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.757 -11.649   0.844   8.659 100.243 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.041      6.394   0.632  0.53070    
## sex          -21.634      6.809  -3.177  0.00272 ** 
## income         5.172      0.951   5.438 2.24e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.75 on 44 degrees of freedom
## Multiple R-squared:  0.5014, Adjusted R-squared:  0.4787 
## F-statistic: 22.12 on 2 and 44 DF,  p-value: 2.243e-07
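# AIC (Akaike information criterion)
# The best choice of model balances goodness of fit against model size.
# For linear models, -2 * (maximized log-likelihood) equals n * log(RSS / n) up to an additive constant,
# so AIC = n * log(RSS / n) + 2p, where p is the number of estimated coefficients. We want to minimize AIC.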

# Forward selection ----
# Candidate predictors that step() is allowed to add.
f <- ~ sex + status + income + verbal
# Intercept-only model used as the starting point.
m0 <- lm(formula = gamble ~ 1, data = teen)
# k = 2 is the per-parameter penalty, i.e. the usual AIC; tracing is left on
# so every step is printed.
m.forward <- step(
  object = m0,
  scope = f,
  direction = "forward",
  k = 2
)
## Start:  AIC=325.34
## gamble ~ 1
## 
##          Df Sum of Sq   RSS    AIC
## + income  1   17680.9 28009 304.34
## + sex     1    7598.4 38091 318.79
## + verbal  1    2212.5 43477 325.00
## <none>                45689 325.34
## + status  1     116.2 45573 327.22
## 
## Step:  AIC=304.34
## gamble ~ income
## 
##          Df Sum of Sq   RSS    AIC
## + sex     1    5227.3 22781 296.63
## <none>                28009 304.34
## + status  1     719.8 27289 305.11
## + verbal  1     579.1 27429 305.35
## 
## Step:  AIC=296.63
## gamble ~ income + sex
## 
##          Df Sum of Sq   RSS    AIC
## + verbal  1   1139.78 21642 296.21
## <none>                22781 296.63
## + status  1    201.82 22580 298.21
## 
## Step:  AIC=296.21
## gamble ~ income + sex + verbal
## 
##          Df Sum of Sq   RSS    AIC
## <none>                21642 296.21
## + status  1    17.776 21624 298.18
# Cross-check the AIC of the selected model: AIC = n * log(RSS / n) + 2 * p.
# The first value returned is the equivalent degrees of freedom, the second
# is the AIC.
extractAIC(fit = m.forward, k = 2)
## [1]   4.0000 296.2145
summary(m.forward)
## 
## Call:
## lm(formula = gamble ~ income + sex + verbal, data = teen)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.639 -11.765  -1.594   9.305  93.867 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  24.1390    14.7686   1.634   0.1095    
## income        4.8981     0.9551   5.128 6.64e-06 ***
## sex         -22.9602     6.7706  -3.391   0.0015 ** 
## verbal       -2.7468     1.8253  -1.505   0.1397    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.43 on 43 degrees of freedom
## Multiple R-squared:  0.5263, Adjusted R-squared:  0.4933 
## F-statistic: 15.93 on 3 and 43 DF,  p-value: 4.148e-07
# Backward selection by AIC ----
# Refit m0 with every candidate in f. Because f is one-sided, only the
# right-hand side is replaced and the response stays gamble.
m1 <- update(m0, f)
# scope uses the documented list(lower = ...) form, with the intercept-only
# model as the lower bound. trace is spelled FALSE rather than F, because F is
# an ordinary variable that can be reassigned.
m.backward <- step(
  m1,
  scope = list(lower = ~ 1),
  direction = "backward",
  trace = FALSE
)
summary(m.backward)
## 
## Call:
## lm(formula = gamble ~ sex + income + verbal, data = teen)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.639 -11.765  -1.594   9.305  93.867 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  24.1390    14.7686   1.634   0.1095    
## sex         -22.9602     6.7706  -3.391   0.0015 ** 
## income        4.8981     0.9551   5.128 6.64e-06 ***
## verbal       -2.7468     1.8253  -1.505   0.1397    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.43 on 43 degrees of freedom
## Multiple R-squared:  0.5263, Adjusted R-squared:  0.4933 
## F-statistic: 15.93 on 3 and 43 DF,  p-value: 4.148e-07
# Stepwise selection (both directions) by AIC ----
# Starts from the intercept-only model m0, with f as the set of candidate
# terms. trace is spelled FALSE rather than F, because F is an ordinary
# variable that can be reassigned.
m.stepup <- step(m0, scope = f, direction = "both", trace = FALSE)
summary(m.stepup)
## 
## Call:
## lm(formula = gamble ~ income + sex + verbal, data = teen)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.639 -11.765  -1.594   9.305  93.867 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  24.1390    14.7686   1.634   0.1095    
## income        4.8981     0.9551   5.128 6.64e-06 ***
## sex         -22.9602     6.7706  -3.391   0.0015 ** 
## verbal       -2.7468     1.8253  -1.505   0.1397    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.43 on 43 degrees of freedom
## Multiple R-squared:  0.5263, Adjusted R-squared:  0.4933 
## F-statistic: 15.93 on 3 and 43 DF,  p-value: 4.148e-07