Applied_Linear_Model_Final_Exam_Yanxin

#################  Final Exam_Applied Linear Model  ################
## Read in data
setwd('D:\\R Code')
sat_data <- read.table("SAT.txt",header=TRUE);
head(sat_data)

##        STATE  COST RATIO SALARY PERCENT VERBAL MATH TOTAL
## 1    Alabama 4.405  17.2 31.144       8    491  538  1029
## 2     Alaska 8.963  17.6 47.951      47    445  489   934
## 3    Arizona 4.778  19.3 32.175      27    448  496   944
## 4   Arkansas 4.459  17.1 28.934       6    482  523  1005
## 5 California 4.992  24.0 41.078      45    417  485   902
## 6   Colorado 5.443  18.4 34.571      29    462  518   980

str(sat_data)

## 'data.frame':    50 obs. of  8 variables:
##  $ STATE  : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ COST   : num  4.41 8.96 4.78 4.46 4.99 ...
##  $ RATIO  : num  17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
##  $ SALARY : num  31.1 48 32.2 28.9 41.1 ...
##  $ PERCENT: int  8 47 27 6 45 29 81 68 48 65 ...
##  $ VERBAL : int  491 445 448 482 417 462 431 429 420 406 ...
##  $ MATH   : int  538 489 496 523 485 518 477 468 469 448 ...
##  $ TOTAL  : int  1029 934 944 1005 902 980 908 897 889 854 ...

dim(sat_data)

## [1] 50  8

summary(sat_data)

##         STATE         COST           RATIO           SALARY     
##  Alabama   : 1   Min.   :3.656   Min.   :13.80   Min.   :25.99  
##  Alaska    : 1   1st Qu.:4.882   1st Qu.:15.22   1st Qu.:30.98  
##  Arizona   : 1   Median :5.768   Median :16.60   Median :33.29  
##  Arkansas  : 1   Mean   :5.905   Mean   :16.86   Mean   :34.83  
##  California: 1   3rd Qu.:6.434   3rd Qu.:17.57   3rd Qu.:38.55  
##  Colorado  : 1   Max.   :9.774   Max.   :24.30   Max.   :50.05  
##  (Other)   :44                                                  
##     PERCENT          VERBAL           MATH           TOTAL       
##  Min.   : 4.00   Min.   :401.0   Min.   :443.0   Min.   : 844.0  
##  1st Qu.: 9.00   1st Qu.:427.2   1st Qu.:474.8   1st Qu.: 897.2  
##  Median :28.00   Median :448.0   Median :497.5   Median : 945.5  
##  Mean   :35.24   Mean   :457.1   Mean   :508.8   Mean   : 965.9  
##  3rd Qu.:63.00   3rd Qu.:490.2   3rd Qu.:539.5   3rd Qu.:1032.0  
##  Max.   :81.00   Max.   :516.0   Max.   :592.0   Max.   :1107.0  
##

names(sat_data)

## [1] "STATE"   "COST"    "RATIO"   "SALARY"  "PERCENT" "VERBAL"  "MATH"   
## [8] "TOTAL"

sat_dat <- subset(sat_data,select=c(-STATE,-VERBAL,-MATH))
cor(sat_dat)

##               COST        RATIO       SALARY    PERCENT       TOTAL
## COST     1.0000000 -0.371025386  0.869801513  0.5926274 -0.38053700
## RATIO   -0.3710254  1.000000000 -0.001146081 -0.2130536  0.08125382
## SALARY   0.8698015 -0.001146081  1.000000000  0.6167799 -0.43988338
## PERCENT  0.5926274 -0.213053607  0.616779867  1.0000000 -0.88711868
## TOTAL   -0.3805370  0.081253823 -0.439883381 -0.8871187  1.00000000

## Plot the data
library(ggplot2)
library(GGally)
p <- ggpairs(sat_dat, upper = list(continuous = "points")
             , lower = list(continuous = "cor")
)
print(p)

p1 <- ggplot(sat_data, aes(x=COST, y=TOTAL, colour=STATE))
p1 <- p1 + geom_point(size=2, color = "black")
p1 <- p1 + geom_text(size=3, data=sat_data, aes(label = as.character(sat_data$STATE)),vjust=-1)
p1 <- p1 + xlab("Cost Per Pupil")+ylab("Average Total SAT Score")
p1 <- p1 + theme(legend.position = "none")
p1 <- p1 + stat_smooth(aes(group=1),method="lm", se=FALSE,size=1.5)
p1 <- p1 + theme(axis.title = element_text(size = rel(1))) 
print(p1)

p2 <- ggplot(sat_data, aes(x=PERCENT, y=TOTAL, colour=STATE))
p2 <- p2 + geom_point(size=2, color = "black")
p2 <- p2 + geom_text(size=3, data=sat_data, aes(label = as.character(sat_data$STATE)),vjust=-1)
p2 <- p2 + theme(legend.position = "none")
p2 <- p2 + xlab("Percent of Student Taking SAT")+ylab("Average Total SAT Score")
p2 <- p2 + stat_smooth(aes(group=1),method="lm", se=FALSE, size=1.5)
p2 <- p2 + theme(axis.title = element_text(size = rel(1))) 
print(p2)

p3 <- ggplot(sat_data, aes(x=SALARY, y=TOTAL, colour=STATE))
p3 <- p3 + geom_point(size=2, color = "black")
p3 <- p3+ geom_text(size=3, data=sat_data, aes(label = as.character(sat_data$STATE)),vjust=-1)
p3 <- p3 + theme(legend.position = "none")
p3 <- p3 + xlab("Average Annual Teachers' Salary")+ylab("Average Total SAT Score")
p3 <- p3 + stat_smooth(aes(group=1),method="lm", se=FALSE, size=1.5)
p3 <- p3 + theme(axis.title = element_text(size = rel(1))) 
print(p3)

## Fit the estimated regression line
lm.cost <- lm(TOTAL~COST, data=sat_data)
summary(lm.cost)

## 
## Call:
## lm(formula = TOTAL ~ COST, data = sat_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -145.074  -46.821    4.087   40.034  128.489 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1089.294     44.390  24.539  < 2e-16 ***
## COST         -20.892      7.328  -2.851  0.00641 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69.91 on 48 degrees of freedom
## Multiple R-squared:  0.1448, Adjusted R-squared:  0.127 
## F-statistic: 8.128 on 1 and 48 DF,  p-value: 0.006408

anova(lm.cost)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq F value   Pr(>F)   
## COST       1  39722   39722  8.1278 0.006408 **
## Residuals 48 234586    4887                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

lm.percent <- lm(TOTAL~PERCENT, data=sat_data)
summary(lm.percent)

## 
## Call:
## lm(formula = TOTAL ~ PERCENT, data = sat_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -79.158 -27.364   3.308  19.876  66.080 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1053.3204     8.2112  128.28   <2e-16 ***
## PERCENT       -2.4801     0.1862  -13.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.89 on 48 degrees of freedom
## Multiple R-squared:  0.787,  Adjusted R-squared:  0.7825 
## F-statistic: 177.3 on 1 and 48 DF,  p-value: < 2.2e-16

anova(lm.percent)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## PERCENT    1 215875  215875  177.33 < 2.2e-16 ***
## Residuals 48  58433    1217                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

lm.salary <- lm(TOTAL~SALARY, data=sat_data)
summary(lm.salary)

## 
## Call:
## lm(formula = TOTAL ~ SALARY, data = sat_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -147.125  -45.354    4.073   42.193  125.279 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1158.859     57.659  20.098  < 2e-16 ***
## SALARY        -5.540      1.632  -3.394  0.00139 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 67.89 on 48 degrees of freedom
## Multiple R-squared:  0.1935, Adjusted R-squared:  0.1767 
## F-statistic: 11.52 on 1 and 48 DF,  p-value: 0.001391

anova(lm.salary)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq F value   Pr(>F)   
## SALARY     1  53078   53078  11.516 0.001391 **
## Residuals 48 221230    4609                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

############# Multiple regression ################
## Full model: 1 predictor COST, 3 covariates PERCENT, RATIO, and SALARY
library(car)
library(MASS)
sat1 <-lm(TOTAL ~ ., data = sat_dat)
summary(sat1)

## 
## Call:
## lm(formula = TOTAL ~ ., data = sat_dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -90.531 -20.855  -1.746  15.979  66.571 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1045.9715    52.8698  19.784  < 2e-16 ***
## COST           4.4626    10.5465   0.423    0.674    
## RATIO         -3.6242     3.2154  -1.127    0.266    
## SALARY         1.6379     2.3872   0.686    0.496    
## PERCENT       -2.9045     0.2313 -12.559 2.61e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.7 on 45 degrees of freedom
## Multiple R-squared:  0.8246, Adjusted R-squared:  0.809 
## F-statistic: 52.88 on 4 and 45 DF,  p-value: < 2.2e-16

anova(sat1)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## COST       1  39722   39722  37.1436 2.260e-07 ***
## RATIO      1   1143    1143   1.0685 0.3068088    
## SALARY     1  16631   16631  15.5514 0.0002779 ***
## PERCENT    1 168688  168688 157.7379 2.607e-16 ***
## Residuals 45  48124    1069                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Evaluate Collinearity
vif(sat1) # variance inflation factors

##     COST    RATIO   SALARY  PERCENT 
## 9.465320 2.433204 9.217237 1.755090

sqrt(vif(sat1)) > 2 # problem?

##    COST   RATIO  SALARY PERCENT 
##    TRUE   FALSE    TRUE   FALSE

## Evaluate Nonlinearity, component + residual plot 
crPlots(sat1)

## Ceres plots 
ceresPlots(sat1)

## Reduced model: 1 predictor COST, 2 covariates PERCENT, RATIO
sat2 <-lm(TOTAL ~ COST + RATIO + PERCENT, data = sat_dat)
summary(sat2)

## 
## Call:
## lm(formula = TOTAL ~ COST + RATIO + PERCENT, data = sat_dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -92.284 -21.130   1.414  16.709  66.073 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1035.4739    50.3155  20.580   <2e-16 ***
## COST          11.0140     4.4521   2.474   0.0171 *  
## RATIO         -2.0282     2.2071  -0.919   0.3629    
## PERCENT       -2.8491     0.2155 -13.222   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.51 on 46 degrees of freedom
## Multiple R-squared:  0.8227, Adjusted R-squared:  0.8112 
## F-statistic: 71.16 on 3 and 46 DF,  p-value: < 2.2e-16

anova(sat2)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## COST       1  39722   39722  37.5759 1.849e-07 ***
## RATIO      1   1143    1143   1.0809    0.3039    
## PERCENT    1 184816  184816 174.8301 < 2.2e-16 ***
## Residuals 46  48627    1057                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Evaluate Collinearity
vif(sat2) # variance inflation factors

##     COST    RATIO  PERCENT 
## 1.706384 1.159732 1.541453

sqrt(vif(sat2)) > 2

##    COST   RATIO PERCENT 
##   FALSE   FALSE   FALSE

## Evaluate Nonlinearity, component + residual plot 
crPlots(sat2)

## Ceres plots 
ceresPlots(sat2)

###########  Variable Selection  ############
## Best subsets: the leaps package provides best subsets with other selection criteria.
library(leaps)
## First, fit the full model
sat.full <- lm(TOTAL ~ ., data = sat_dat)
summary(sat.full)

## 
## Call:
## lm(formula = TOTAL ~ ., data = sat_dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -90.531 -20.855  -1.746  15.979  66.571 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1045.9715    52.8698  19.784  < 2e-16 ***
## COST           4.4626    10.5465   0.423    0.674    
## RATIO         -3.6242     3.2154  -1.127    0.266    
## SALARY         1.6379     2.3872   0.686    0.496    
## PERCENT       -2.9045     0.2313 -12.559 2.61e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.7 on 45 degrees of freedom
## Multiple R-squared:  0.8246, Adjusted R-squared:  0.809 
## F-statistic: 52.88 on 4 and 45 DF,  p-value: < 2.2e-16

## Second, create the design matrix which leap uses as argument
## R^2 -- for each model size, report best subset of size 4
leaps.r2 <- leaps(x = model.matrix(sat.full)[,-1], y = sat_dat$TOTAL
                  , method = "r2"
                  , int = TRUE, nbest = 1, names = colnames(model.matrix(sat.full))[-1])
leaps.r2

## $which
##    COST RATIO SALARY PERCENT
## 1 FALSE FALSE  FALSE    TRUE
## 2  TRUE FALSE  FALSE    TRUE
## 3 FALSE  TRUE   TRUE    TRUE
## 4  TRUE  TRUE   TRUE    TRUE
## 
## $label
## [1] "(Intercept)" "COST"        "RATIO"       "SALARY"      "PERCENT"    
## 
## $size
## [1] 2 3 4 5
## 
## $r2
## [1] 0.7869795 0.8194726 0.8238643 0.8245623

plot(leaps.r2$size, leaps.r2$r2, main = "R2")
lines(leaps.r2$size,leaps.r2$r2)

## Report the best model (indicate which terms are in the model)
best.model.r2 <- leaps.r2$which[which((leaps.r2$r2 == max(leaps.r2$r2))),]
## These are the variable names for the best model
names(best.model.r2)[best.model.r2]

## [1] "COST"    "RATIO"   "SALARY"  "PERCENT"

## adj-R^2 -- for each model size, report best subset of size 3
leaps.adjr2 <- leaps(x = model.matrix(sat.full)[,-1], y = sat_dat$TOTAL
                     , method = "adjr2"
                     , int = TRUE, nbest = 1, names = colnames(model.matrix(sat.full))[-1])
leaps.adjr2

## $which
##    COST RATIO SALARY PERCENT
## 1 FALSE FALSE  FALSE    TRUE
## 2  TRUE FALSE  FALSE    TRUE
## 3 FALSE  TRUE   TRUE    TRUE
## 4  TRUE  TRUE   TRUE    TRUE
## 
## $label
## [1] "(Intercept)" "COST"        "RATIO"       "SALARY"      "PERCENT"    
## 
## $size
## [1] 2 3 4 5
## 
## $adjr2
## [1] 0.7825416 0.8117906 0.8123772 0.8089679

## Plot model adjusted R^2 vs size of model
plot(leaps.adjr2$size, leaps.adjr2$adjr2, main = "Adj-R2")
lines(leaps.adjr2$size,leaps.adjr2$adjr2)

## Report the best model (indicate which terms are in the model)
best.model.adjr2 <- leaps.adjr2$which[which((leaps.adjr2$adjr2 == max(leaps.adjr2$adjr2))),]
## These are the variable names for the best model
names(best.model.adjr2)[best.model.adjr2]

## [1] "RATIO"   "SALARY"  "PERCENT"

## Cp -- for each model size, report best subset of size 1
leaps.Cp <- leaps(x = model.matrix(sat.full)[,-1], y = sat_dat$TOTAL
                  , method = "Cp"
                  , int = TRUE, nbest = 1, names = colnames(model.matrix(sat.full))[-1])
leaps.Cp

## $which
##    COST RATIO SALARY PERCENT
## 1 FALSE FALSE  FALSE    TRUE
## 2  TRUE FALSE  FALSE    TRUE
## 3 FALSE  TRUE   TRUE    TRUE
## 4  TRUE  TRUE   TRUE    TRUE
## 
## $label
## [1] "(Intercept)" "COST"        "RATIO"       "SALARY"      "PERCENT"    
## 
## $size
## [1] 2 3 4 5
## 
## $Cp
## [1] 8.640040 2.305533 3.179042 5.000000

## Plot model Cp vs size of model
plot(leaps.Cp$size, leaps.Cp$Cp, main = "Cp")
lines(leaps.Cp$size, leaps.Cp$size) # adds the line for Cp = p

## Report the best model (indicate which terms are in the model)
best.model.Cp <- leaps.Cp$which[which((leaps.Cp$Cp == min(leaps.Cp$Cp))),]
## These are the variable names for the best model
names(best.model.Cp)[best.model.Cp]

## [1] "COST"    "PERCENT"

## Another way
leaps <- regsubsets(TOTAL ~.,
                    data=sat_dat, method="exhaustive",
                    nbest=1,really.big=F)

summary(leaps)

## Subset selection object
## Call: regsubsets.formula(TOTAL ~ ., data = sat_dat, method = "exhaustive", 
##     nbest = 1, really.big = F)
## 4 Variables  (and intercept)
##         Forced in Forced out
## COST        FALSE      FALSE
## RATIO       FALSE      FALSE
## SALARY      FALSE      FALSE
## PERCENT     FALSE      FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: exhaustive
##          COST RATIO SALARY PERCENT
## 1  ( 1 ) " "  " "   " "    "*"    
## 2  ( 1 ) "*"  " "   " "    "*"    
## 3  ( 1 ) " "  "*"   "*"    "*"    
## 4  ( 1 ) "*"  "*"   "*"    "*"

plot(leaps,scale="r2")

par(mfrow=c(1,3))
plot(leaps, scale="adjr2")
plot(leaps, scale="Cp")
plot(leaps,scale="bic")

############ Stepwise regression ###############
## step():function specification, object = a fitted model object
## scope = a formula giving the terms to be considered for adding or dropping
## Default is AIC, for BIC, include k = log(n)
## test="F" includes additional information for parameter estimate tests
## Define full model and empty model (just the intercept)
sat.full <- lm(TOTAL ~ ., data = sat_dat)
sat.null <- lm(TOTAL ~ 1, data = sat_dat)

## Forward selection, AIC & BIC with F-tests
sat.forward.AIC <- step(sat.null,trace=1,test="F"
                        , scope=list(lower=sat.null,upper=sat.full)
                        , direction = "forward", k = 2)

## Start:  AIC=432.5
## TOTAL ~ 1
## 
##           Df Sum of Sq    RSS    AIC  F value    Pr(>F)    
## + PERCENT  1    215875  58433 357.18 177.3305 < 2.2e-16 ***
## + SALARY   1     53078 221230 423.75  11.5162  0.001391 ** 
## + COST     1     39722 234586 426.68   8.1278  0.006408 ** 
## <none>                 274308 432.50                       
## + RATIO    1      1811 272497 434.17   0.3190  0.574833    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=357.18
## TOTAL ~ PERCENT
## 
##          Df Sum of Sq   RSS    AIC F value   Pr(>F)   
## + COST    1    8913.1 49520 350.91  8.4595 0.005529 **
## + SALARY  1    5094.8 53338 354.62  4.4893 0.039421 * 
## + RATIO   1    3336.2 55097 356.24  2.8459 0.098235 . 
## <none>                58433 357.18                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=350.91
## TOTAL ~ PERCENT + COST
## 
##          Df Sum of Sq   RSS    AIC F value Pr(>F)
## <none>                49520 350.91               
## + RATIO   1    892.74 48627 352.00  0.8445 0.3629
## + SALARY  1     37.52 49483 352.87  0.0349 0.8527

sat.forward.AIC

## 
## Call:
## lm(formula = TOTAL ~ PERCENT + COST, data = sat_dat)
## 
## Coefficients:
## (Intercept)      PERCENT         COST  
##     993.832       -2.851       12.287

anova(sat.forward.AIC)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## PERCENT    1 215875  215875 204.8888 < 2.2e-16 ***
## COST       1   8913    8913   8.4595  0.005529 ** 
## Residuals 47  49520    1054                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

sat.forward.BIC <- step(sat.null,trace=1,test="F"
                        , scope=list(lower=sat.null,upper=sat.full)
                        , direction = "forward", k = log(dim(sat_dat)[1]))

## Start:  AIC=434.41
## TOTAL ~ 1
## 
##           Df Sum of Sq    RSS    AIC  F value    Pr(>F)    
## + PERCENT  1    215875  58433 361.00 177.3305 < 2.2e-16 ***
## + SALARY   1     53078 221230 427.57  11.5162  0.001391 ** 
## + COST     1     39722 234586 430.50   8.1278  0.006408 ** 
## <none>                 274308 434.41                       
## + RATIO    1      1811 272497 437.99   0.3190  0.574833    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=361
## TOTAL ~ PERCENT
## 
##          Df Sum of Sq   RSS    AIC F value   Pr(>F)   
## + COST    1    8913.1 49520 356.64  8.4595 0.005529 **
## + SALARY  1    5094.8 53338 360.36  4.4893 0.039421 * 
## <none>                58433 361.00                    
## + RATIO   1    3336.2 55097 361.98  2.8459 0.098235 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=356.64
## TOTAL ~ PERCENT + COST
## 
##          Df Sum of Sq   RSS    AIC F value Pr(>F)
## <none>                49520 356.64               
## + RATIO   1    892.74 48627 359.64  0.8445 0.3629
## + SALARY  1     37.52 49483 360.52  0.0349 0.8527

sat.forward.BIC

## 
## Call:
## lm(formula = TOTAL ~ PERCENT + COST, data = sat_dat)
## 
## Coefficients:
## (Intercept)      PERCENT         COST  
##     993.832       -2.851       12.287

anova(sat.forward.BIC)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## PERCENT    1 215875  215875 204.8888 < 2.2e-16 ***
## COST       1   8913    8913   8.4595  0.005529 ** 
## Residuals 47  49520    1054                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Backward selection, AIC & BIC with F-tests
sat.backward.AIC <- step(sat.full,trace=1,test="F"
                         , scope=list(lower=sat.null,upper=sat.full)
                         , direction = "backward", k = 2)

## Start:  AIC=353.48
## TOTAL ~ COST + RATIO + SALARY + PERCENT
## 
##           Df Sum of Sq    RSS    AIC  F value    Pr(>F)    
## - COST     1       191  48315 351.67   0.1790    0.6742    
## - SALARY   1       503  48627 352.00   0.4707    0.4962    
## - RATIO    1      1359  49483 352.87   1.2704    0.2657    
## <none>                  48124 353.48                       
## - PERCENT  1    168688 216812 426.74 157.7379 2.607e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=351.67
## TOTAL ~ RATIO + SALARY + PERCENT
## 
##           Df Sum of Sq    RSS    AIC  F value  Pr(>F)    
## <none>                  48315 351.67                     
## - RATIO    1      5023  53338 354.62   4.7823 0.03388 *  
## - SALARY   1      6782  55097 356.24   6.4566 0.01449 *  
## - PERCENT  1    171126 219441 425.34 162.9252 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

sat.backward.AIC

## 
## Call:
## lm(formula = TOTAL ~ RATIO + SALARY + PERCENT, data = sat_dat)
## 
## Coefficients:
## (Intercept)        RATIO       SALARY      PERCENT  
##    1057.898       -4.639        2.552       -2.913

anova(sat.backward.AIC)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## RATIO      1   1811    1811   1.7242    0.1957    
## SALARY     1  53055   53055  50.5129 6.293e-09 ***
## PERCENT    1 171126  171126 162.9252 < 2.2e-16 ***
## Residuals 46  48315    1050                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

sat.backward.BIC <- step(sat.full,trace=1,test="F"
                         , scope=list(lower=sat.null,upper=sat.full)
                         , direction = "backward", k = log(dim(sat_dat)[1]))

## Start:  AIC=363.04
## TOTAL ~ COST + RATIO + SALARY + PERCENT
## 
##           Df Sum of Sq    RSS    AIC  F value    Pr(>F)    
## - COST     1       191  48315 359.32   0.1790    0.6742    
## - SALARY   1       503  48627 359.64   0.4707    0.4962    
## - RATIO    1      1359  49483 360.52   1.2704    0.2657    
## <none>                  48124 363.04                       
## - PERCENT  1    168688 216812 434.39 157.7379 2.607e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=359.32
## TOTAL ~ RATIO + SALARY + PERCENT
## 
##           Df Sum of Sq    RSS    AIC  F value  Pr(>F)    
## <none>                  48315 359.32                     
## - RATIO    1      5023  53338 360.36   4.7823 0.03388 *  
## - SALARY   1      6782  55097 361.98   6.4566 0.01449 *  
## - PERCENT  1    171126 219441 431.08 162.9252 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

sat.backward.BIC

## 
## Call:
## lm(formula = TOTAL ~ RATIO + SALARY + PERCENT, data = sat_dat)
## 
## Coefficients:
## (Intercept)        RATIO       SALARY      PERCENT  
##    1057.898       -4.639        2.552       -2.913

anova(sat.backward.BIC)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## RATIO      1   1811    1811   1.7242    0.1957    
## SALARY     1  53055   53055  50.5129 6.293e-09 ***
## PERCENT    1 171126  171126 162.9252 < 2.2e-16 ***
## Residuals 46  48315    1050                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Stepwise (both) selection, BIC with F-tests, starting with intermediate model
## Option: trace = 0 does not print each step of the selection
## The anova object provides a summary of the selection steps in order
sat.both.AIC <- step(sat.null, trace = 0, test = "F"
                     , scope=list(lower=sat.null,upper=sat.full)
                     , direction = "both", k = 2)
sat.both.AIC

## 
## Call:
## lm(formula = TOTAL ~ PERCENT + COST, data = sat_dat)
## 
## Coefficients:
## (Intercept)      PERCENT         COST  
##     993.832       -2.851       12.287

anova(sat.both.AIC)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## PERCENT    1 215875  215875 204.8888 < 2.2e-16 ***
## COST       1   8913    8913   8.4595  0.005529 ** 
## Residuals 47  49520    1054                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

sat.both.BIC <- step(sat.null, trace = 0, test = "F"
                     , scope=list(lower=sat.null,upper=sat.full)
                     , direction = "both", k = log(dim(sat_dat)[1]))
sat.both.BIC

## 
## Call:
## lm(formula = TOTAL ~ PERCENT + COST, data = sat_dat)
## 
## Coefficients:
## (Intercept)      PERCENT         COST  
##     993.832       -2.851       12.287

anova(sat.both.BIC)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## PERCENT    1 215875  215875 204.8888 < 2.2e-16 ***
## COST       1   8913    8913   8.4595  0.005529 ** 
## Residuals 47  49520    1054                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## From the above, we can choose the model: TOTAL ~ COST + PERCENT
sat <- subset(sat_dat, select=c(-RATIO,-SALARY))
sat_full <- lm(TOTAL ~ ., data = sat)
summary(sat_full)

## 
## Call:
## lm(formula = TOTAL ~ ., data = sat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -88.400 -22.884   1.968  19.142  68.755 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 993.8317    21.8332  45.519  < 2e-16 ***
## COST         12.2865     4.2243   2.909  0.00553 ** 
## PERCENT      -2.8509     0.2151 -13.253  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.46 on 47 degrees of freedom
## Multiple R-squared:  0.8195, Adjusted R-squared:  0.8118 
## F-statistic: 106.7 on 2 and 47 DF,  p-value: < 2.2e-16

anova(sat_full)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## COST       1  39722   39722  37.701 1.651e-07 ***
## PERCENT    1 185066  185066 175.648 < 2.2e-16 ***
## Residuals 47  49520    1054                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cor(sat)

##               COST    PERCENT      TOTAL
## COST     1.0000000  0.5926274 -0.3805370
## PERCENT  0.5926274  1.0000000 -0.8871187
## TOTAL   -0.3805370 -0.8871187  1.0000000

vif(sat_full)

##     COST  PERCENT 
## 1.541324 1.541324

## Influential observations
## Leverage check: observations 2, 29, 34, and 48 have high leverage for the response variable TOTAL.
## Among them observation 2 is flagged in both graphs. n observations and k parameters
## However, the Cook's distance of observation 2 is less than the cut-off rule of thumb: Di > 4/(n-k-1)
## Therefore, there is no removal of any observtions.
cutoff <- 4/((nrow(sat)-length(sat_full$coefficients))) 
par(mfrow=c(1,1))
qqPlot(sat_full, id.n = 4)

## 48 28 34 29 
##  1  2 49 50

outlierTest(sat_full)

## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferonni p
## 48 -3.006388          0.0042729      0.21365

influenceIndexPlot(sat_full, vars=c("Cook","hat"),  id.n=4)

plot(cooks.distance(sat_full))
abline(h = cutoff, lty=2)

avPlots(sat_full)

## Cook's D plot, identify D values > 4/(n-k-1) 
plot(sat_full, which=4, cook.levels=cutoff)

## Influence Plot 
influencePlot(sat_full, id.method="identify", main="Influence Plot", 
              sub="Circle size is proportial to Cook's Distance")

## Regression diagnostic
## Calculate the residuals and standardized residuals
## Non-normality
pairs(TOTAL ~ COST + PERCENT,data=sat)

fitted <- fitted(sat_full)
cor(fitted, sat$TOTAL)

## [1] 0.9052472

residuals <- residuals(sat_full)
student <- studres(sat_full)
par(mfrow=c(1,2))
plot(fitted,sat$TOTAL,xlab="Fitted Values")
abline(lsfit(fitted,sat$TOTAL))
plot(fitted,student, ylab="Studentized Residuals")

plot(sat$COST,student, ylab="Studentized Residuals")
plot(sat$PERCENT,student, ylab="Studentized Residuals")

## Histogram
par(mfrow=c(1,2))
hist(residuals)
qqnorm(residuals)
qqline(residuals)

hist(student) 
qqnorm(student)
qqline(student)

## Skewness
library(moments)
skewness(residuals)

## [1] -0.01181024

skewness(student)

## [1] -0.0382346

## Increase each teacher's salary by 5 thousands of dollars
sat_new <- sat_dat
sat_new$SALARY <- sat_dat$SALARY+5

## Fit the full model
sat.new1 <- lm(TOTAL ~ ., data = sat_new)
summary(sat.new1)

## 
## Call:
## lm(formula = TOTAL ~ ., data = sat_new)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -90.531 -20.855  -1.746  15.979  66.571 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1037.7819    50.7192  20.461  < 2e-16 ***
## COST           4.4626    10.5465   0.423    0.674    
## RATIO         -3.6242     3.2154  -1.127    0.266    
## SALARY         1.6379     2.3872   0.686    0.496    
## PERCENT       -2.9045     0.2313 -12.559 2.61e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.7 on 45 degrees of freedom
## Multiple R-squared:  0.8246, Adjusted R-squared:  0.809 
## F-statistic: 52.88 on 4 and 45 DF,  p-value: < 2.2e-16

anova(sat.new1)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## COST       1  39722   39722  37.1436 2.260e-07 ***
## RATIO      1   1143    1143   1.0685 0.3068088    
## SALARY     1  16631   16631  15.5514 0.0002779 ***
## PERCENT    1 168688  168688 157.7379 2.607e-16 ***
## Residuals 45  48124    1069                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Evaluate Collinearity
vif(sat.new1) # variance inflation factors

##     COST    RATIO   SALARY  PERCENT 
## 9.465320 2.433204 9.217237 1.755090

sqrt(vif(sat.new1)) > 2 # problem?

##    COST   RATIO  SALARY PERCENT 
##    TRUE   FALSE    TRUE   FALSE

## Reduced model: 1 predictor COST, 2 covariates PERCENT, RATIO
sat.new2 <-lm(TOTAL ~ COST + PERCENT + RATIO, data = sat_new)
summary(sat.new2)

## 
## Call:
## lm(formula = TOTAL ~ COST + PERCENT + RATIO, data = sat_new)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -92.284 -21.130   1.414  16.709  66.073 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1035.4739    50.3155  20.580   <2e-16 ***
## COST          11.0140     4.4521   2.474   0.0171 *  
## PERCENT       -2.8491     0.2155 -13.222   <2e-16 ***
## RATIO         -2.0282     2.2071  -0.919   0.3629    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.51 on 46 degrees of freedom
## Multiple R-squared:  0.8227, Adjusted R-squared:  0.8112 
## F-statistic: 71.16 on 3 and 46 DF,  p-value: < 2.2e-16

anova(sat.new2)

## Analysis of Variance Table
## 
## Response: TOTAL
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## COST       1  39722   39722  37.5759 1.849e-07 ***
## PERCENT    1 185066  185066 175.0665 < 2.2e-16 ***
## RATIO      1    893     893   0.8445    0.3629    
## Residuals 46  48627    1057                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Evaluate Collinearity
vif(sat.new2) # variance inflation factors

##     COST  PERCENT    RATIO 
## 1.706384 1.541453 1.159732

sqrt(vif(sat.new2)) > 2

##    COST PERCENT   RATIO 
##   FALSE   FALSE   FALSE

Applied_Linear_Model_Final_Exam_Yanxin_Li.R

Shuchen

Wed Dec 16 19:54:21 2015