################# Final Exam_Applied Linear Model ################
## Read in data
setwd('D:\\R Code')
sat_data <- read.table("SAT.txt",header=TRUE);
head(sat_data)
## STATE COST RATIO SALARY PERCENT VERBAL MATH TOTAL
## 1 Alabama 4.405 17.2 31.144 8 491 538 1029
## 2 Alaska 8.963 17.6 47.951 47 445 489 934
## 3 Arizona 4.778 19.3 32.175 27 448 496 944
## 4 Arkansas 4.459 17.1 28.934 6 482 523 1005
## 5 California 4.992 24.0 41.078 45 417 485 902
## 6 Colorado 5.443 18.4 34.571 29 462 518 980
str(sat_data)
## 'data.frame': 50 obs. of 8 variables:
## $ STATE : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ COST : num 4.41 8.96 4.78 4.46 4.99 ...
## $ RATIO : num 17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
## $ SALARY : num 31.1 48 32.2 28.9 41.1 ...
## $ PERCENT: int 8 47 27 6 45 29 81 68 48 65 ...
## $ VERBAL : int 491 445 448 482 417 462 431 429 420 406 ...
## $ MATH : int 538 489 496 523 485 518 477 468 469 448 ...
## $ TOTAL : int 1029 934 944 1005 902 980 908 897 889 854 ...
dim(sat_data)
## [1] 50 8
summary(sat_data)
## STATE COST RATIO SALARY
## Alabama : 1 Min. :3.656 Min. :13.80 Min. :25.99
## Alaska : 1 1st Qu.:4.882 1st Qu.:15.22 1st Qu.:30.98
## Arizona : 1 Median :5.768 Median :16.60 Median :33.29
## Arkansas : 1 Mean :5.905 Mean :16.86 Mean :34.83
## California: 1 3rd Qu.:6.434 3rd Qu.:17.57 3rd Qu.:38.55
## Colorado : 1 Max. :9.774 Max. :24.30 Max. :50.05
## (Other) :44
## PERCENT VERBAL MATH TOTAL
## Min. : 4.00 Min. :401.0 Min. :443.0 Min. : 844.0
## 1st Qu.: 9.00 1st Qu.:427.2 1st Qu.:474.8 1st Qu.: 897.2
## Median :28.00 Median :448.0 Median :497.5 Median : 945.5
## Mean :35.24 Mean :457.1 Mean :508.8 Mean : 965.9
## 3rd Qu.:63.00 3rd Qu.:490.2 3rd Qu.:539.5 3rd Qu.:1032.0
## Max. :81.00 Max. :516.0 Max. :592.0 Max. :1107.0
##
names(sat_data)
## [1] "STATE" "COST" "RATIO" "SALARY" "PERCENT" "VERBAL" "MATH"
## [8] "TOTAL"
sat_dat <- subset(sat_data,select=c(-STATE,-VERBAL,-MATH))
cor(sat_dat)
## COST RATIO SALARY PERCENT TOTAL
## COST 1.0000000 -0.371025386 0.869801513 0.5926274 -0.38053700
## RATIO -0.3710254 1.000000000 -0.001146081 -0.2130536 0.08125382
## SALARY 0.8698015 -0.001146081 1.000000000 0.6167799 -0.43988338
## PERCENT 0.5926274 -0.213053607 0.616779867 1.0000000 -0.88711868
## TOTAL -0.3805370 0.081253823 -0.439883381 -0.8871187 1.00000000
## Plot the data
library(ggplot2)
library(GGally)
p <- ggpairs(sat_dat, upper = list(continuous = "points")
, lower = list(continuous = "cor")
)
print(p)

p1 <- ggplot(sat_data, aes(x=COST, y=TOTAL, colour=STATE))
p1 <- p1 + geom_point(size=2, color = "black")
p1 <- p1 + geom_text(size=3, data=sat_data, aes(label = as.character(sat_data$STATE)),vjust=-1)
p1 <- p1 + xlab("Cost Per Pupil")+ylab("Average Total SAT Score")
p1 <- p1 + theme(legend.position = "none")
p1 <- p1 + stat_smooth(aes(group=1),method="lm", se=FALSE,size=1.5)
p1 <- p1 + theme(axis.title = element_text(size = rel(1)))
print(p1)

p2 <- ggplot(sat_data, aes(x=PERCENT, y=TOTAL, colour=STATE))
p2 <- p2 + geom_point(size=2, color = "black")
p2 <- p2 + geom_text(size=3, data=sat_data, aes(label = as.character(sat_data$STATE)),vjust=-1)
p2 <- p2 + theme(legend.position = "none")
p2 <- p2 + xlab("Percent of Student Taking SAT")+ylab("Average Total SAT Score")
p2 <- p2 + stat_smooth(aes(group=1),method="lm", se=FALSE, size=1.5)
p2 <- p2 + theme(axis.title = element_text(size = rel(1)))
print(p2)

p3 <- ggplot(sat_data, aes(x=SALARY, y=TOTAL, colour=STATE))
p3 <- p3 + geom_point(size=2, color = "black")
p3 <- p3+ geom_text(size=3, data=sat_data, aes(label = as.character(sat_data$STATE)),vjust=-1)
p3 <- p3 + theme(legend.position = "none")
p3 <- p3 + xlab("Average Annual Teachers' Salary")+ylab("Average Total SAT Score")
p3 <- p3 + stat_smooth(aes(group=1),method="lm", se=FALSE, size=1.5)
p3 <- p3 + theme(axis.title = element_text(size = rel(1)))
print(p3)

## Fit the estimated regression line
lm.cost <- lm(TOTAL~COST, data=sat_data)
summary(lm.cost)
##
## Call:
## lm(formula = TOTAL ~ COST, data = sat_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -145.074 -46.821 4.087 40.034 128.489
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1089.294 44.390 24.539 < 2e-16 ***
## COST -20.892 7.328 -2.851 0.00641 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.91 on 48 degrees of freedom
## Multiple R-squared: 0.1448, Adjusted R-squared: 0.127
## F-statistic: 8.128 on 1 and 48 DF, p-value: 0.006408
anova(lm.cost)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## COST 1 39722 39722 8.1278 0.006408 **
## Residuals 48 234586 4887
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lm.percent <- lm(TOTAL~PERCENT, data=sat_data)
summary(lm.percent)
##
## Call:
## lm(formula = TOTAL ~ PERCENT, data = sat_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -79.158 -27.364 3.308 19.876 66.080
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1053.3204 8.2112 128.28 <2e-16 ***
## PERCENT -2.4801 0.1862 -13.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.89 on 48 degrees of freedom
## Multiple R-squared: 0.787, Adjusted R-squared: 0.7825
## F-statistic: 177.3 on 1 and 48 DF, p-value: < 2.2e-16
anova(lm.percent)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## PERCENT 1 215875 215875 177.33 < 2.2e-16 ***
## Residuals 48 58433 1217
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lm.salary <- lm(TOTAL~SALARY, data=sat_data)
summary(lm.salary)
##
## Call:
## lm(formula = TOTAL ~ SALARY, data = sat_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -147.125 -45.354 4.073 42.193 125.279
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1158.859 57.659 20.098 < 2e-16 ***
## SALARY -5.540 1.632 -3.394 0.00139 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67.89 on 48 degrees of freedom
## Multiple R-squared: 0.1935, Adjusted R-squared: 0.1767
## F-statistic: 11.52 on 1 and 48 DF, p-value: 0.001391
anova(lm.salary)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## SALARY 1 53078 53078 11.516 0.001391 **
## Residuals 48 221230 4609
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
############# Multiple regression ################
## Full model: 1 predictor COST, 3 covariates PERCENT, RATIO, and SALARY
library(car)
library(MASS)
sat1 <-lm(TOTAL ~ ., data = sat_dat)
summary(sat1)
##
## Call:
## lm(formula = TOTAL ~ ., data = sat_dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.531 -20.855 -1.746 15.979 66.571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1045.9715 52.8698 19.784 < 2e-16 ***
## COST 4.4626 10.5465 0.423 0.674
## RATIO -3.6242 3.2154 -1.127 0.266
## SALARY 1.6379 2.3872 0.686 0.496
## PERCENT -2.9045 0.2313 -12.559 2.61e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.7 on 45 degrees of freedom
## Multiple R-squared: 0.8246, Adjusted R-squared: 0.809
## F-statistic: 52.88 on 4 and 45 DF, p-value: < 2.2e-16
anova(sat1)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## COST 1 39722 39722 37.1436 2.260e-07 ***
## RATIO 1 1143 1143 1.0685 0.3068088
## SALARY 1 16631 16631 15.5514 0.0002779 ***
## PERCENT 1 168688 168688 157.7379 2.607e-16 ***
## Residuals 45 48124 1069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Evaluate Collinearity
vif(sat1) # variance inflation factors
## COST RATIO SALARY PERCENT
## 9.465320 2.433204 9.217237 1.755090
sqrt(vif(sat1)) > 2 # problem?
## COST RATIO SALARY PERCENT
## TRUE FALSE TRUE FALSE
## Evaluate Nonlinearity, component + residual plot
crPlots(sat1)

## Ceres plots
ceresPlots(sat1)

## Reduced model: 1 predictor COST, 2 covariates PERCENT, RATIO
sat2 <-lm(TOTAL ~ COST + RATIO + PERCENT, data = sat_dat)
summary(sat2)
##
## Call:
## lm(formula = TOTAL ~ COST + RATIO + PERCENT, data = sat_dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -92.284 -21.130 1.414 16.709 66.073
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1035.4739 50.3155 20.580 <2e-16 ***
## COST 11.0140 4.4521 2.474 0.0171 *
## RATIO -2.0282 2.2071 -0.919 0.3629
## PERCENT -2.8491 0.2155 -13.222 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.51 on 46 degrees of freedom
## Multiple R-squared: 0.8227, Adjusted R-squared: 0.8112
## F-statistic: 71.16 on 3 and 46 DF, p-value: < 2.2e-16
anova(sat2)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## COST 1 39722 39722 37.5759 1.849e-07 ***
## RATIO 1 1143 1143 1.0809 0.3039
## PERCENT 1 184816 184816 174.8301 < 2.2e-16 ***
## Residuals 46 48627 1057
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Evaluate Collinearity
vif(sat2) # variance inflation factors
## COST RATIO PERCENT
## 1.706384 1.159732 1.541453
sqrt(vif(sat2)) > 2
## COST RATIO PERCENT
## FALSE FALSE FALSE
## Evaluate Nonlinearity, component + residual plot
crPlots(sat2)

## Ceres plots
ceresPlots(sat2)

########### Variable Selection ############
## Best subsets: the leaps package provides best subsets with other selection criteria.
library(leaps)
## First, fit the full model
sat.full <- lm(TOTAL ~ ., data = sat_dat)
summary(sat.full)
##
## Call:
## lm(formula = TOTAL ~ ., data = sat_dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.531 -20.855 -1.746 15.979 66.571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1045.9715 52.8698 19.784 < 2e-16 ***
## COST 4.4626 10.5465 0.423 0.674
## RATIO -3.6242 3.2154 -1.127 0.266
## SALARY 1.6379 2.3872 0.686 0.496
## PERCENT -2.9045 0.2313 -12.559 2.61e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.7 on 45 degrees of freedom
## Multiple R-squared: 0.8246, Adjusted R-squared: 0.809
## F-statistic: 52.88 on 4 and 45 DF, p-value: < 2.2e-16
## Second, create the design matrix which leap uses as argument
## R^2 -- for each model size, report best subset of size 4
leaps.r2 <- leaps(x = model.matrix(sat.full)[,-1], y = sat_dat$TOTAL
, method = "r2"
, int = TRUE, nbest = 1, names = colnames(model.matrix(sat.full))[-1])
leaps.r2
## $which
## COST RATIO SALARY PERCENT
## 1 FALSE FALSE FALSE TRUE
## 2 TRUE FALSE FALSE TRUE
## 3 FALSE TRUE TRUE TRUE
## 4 TRUE TRUE TRUE TRUE
##
## $label
## [1] "(Intercept)" "COST" "RATIO" "SALARY" "PERCENT"
##
## $size
## [1] 2 3 4 5
##
## $r2
## [1] 0.7869795 0.8194726 0.8238643 0.8245623
plot(leaps.r2$size, leaps.r2$r2, main = "R2")
lines(leaps.r2$size,leaps.r2$r2)

## Report the best model (indicate which terms are in the model)
best.model.r2 <- leaps.r2$which[which((leaps.r2$r2 == max(leaps.r2$r2))),]
## These are the variable names for the best model
names(best.model.r2)[best.model.r2]
## [1] "COST" "RATIO" "SALARY" "PERCENT"
## adj-R^2 -- for each model size, report best subset of size 3
leaps.adjr2 <- leaps(x = model.matrix(sat.full)[,-1], y = sat_dat$TOTAL
, method = "adjr2"
, int = TRUE, nbest = 1, names = colnames(model.matrix(sat.full))[-1])
leaps.adjr2
## $which
## COST RATIO SALARY PERCENT
## 1 FALSE FALSE FALSE TRUE
## 2 TRUE FALSE FALSE TRUE
## 3 FALSE TRUE TRUE TRUE
## 4 TRUE TRUE TRUE TRUE
##
## $label
## [1] "(Intercept)" "COST" "RATIO" "SALARY" "PERCENT"
##
## $size
## [1] 2 3 4 5
##
## $adjr2
## [1] 0.7825416 0.8117906 0.8123772 0.8089679
## Plot model adjusted R^2 vs size of model
plot(leaps.adjr2$size, leaps.adjr2$adjr2, main = "Adj-R2")
lines(leaps.adjr2$size,leaps.adjr2$adjr2)

## Report the best model (indicate which terms are in the model)
best.model.adjr2 <- leaps.adjr2$which[which((leaps.adjr2$adjr2 == max(leaps.adjr2$adjr2))),]
## These are the variable names for the best model
names(best.model.adjr2)[best.model.adjr2]
## [1] "RATIO" "SALARY" "PERCENT"
## Cp -- for each model size, report best subset of size 1
leaps.Cp <- leaps(x = model.matrix(sat.full)[,-1], y = sat_dat$TOTAL
, method = "Cp"
, int = TRUE, nbest = 1, names = colnames(model.matrix(sat.full))[-1])
leaps.Cp
## $which
## COST RATIO SALARY PERCENT
## 1 FALSE FALSE FALSE TRUE
## 2 TRUE FALSE FALSE TRUE
## 3 FALSE TRUE TRUE TRUE
## 4 TRUE TRUE TRUE TRUE
##
## $label
## [1] "(Intercept)" "COST" "RATIO" "SALARY" "PERCENT"
##
## $size
## [1] 2 3 4 5
##
## $Cp
## [1] 8.640040 2.305533 3.179042 5.000000
## Plot model Cp vs size of model
plot(leaps.Cp$size, leaps.Cp$Cp, main = "Cp")
lines(leaps.Cp$size, leaps.Cp$size) # adds the line for Cp = p

## Report the best model (indicate which terms are in the model)
best.model.Cp <- leaps.Cp$which[which((leaps.Cp$Cp == min(leaps.Cp$Cp))),]
## These are the variable names for the best model
names(best.model.Cp)[best.model.Cp]
## [1] "COST" "PERCENT"
## Another way
leaps <- regsubsets(TOTAL ~.,
data=sat_dat, method="exhaustive",
nbest=1,really.big=F)
summary(leaps)
## Subset selection object
## Call: regsubsets.formula(TOTAL ~ ., data = sat_dat, method = "exhaustive",
## nbest = 1, really.big = F)
## 4 Variables (and intercept)
## Forced in Forced out
## COST FALSE FALSE
## RATIO FALSE FALSE
## SALARY FALSE FALSE
## PERCENT FALSE FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: exhaustive
## COST RATIO SALARY PERCENT
## 1 ( 1 ) " " " " " " "*"
## 2 ( 1 ) "*" " " " " "*"
## 3 ( 1 ) " " "*" "*" "*"
## 4 ( 1 ) "*" "*" "*" "*"
plot(leaps,scale="r2")

par(mfrow=c(1,3))
plot(leaps, scale="adjr2")
plot(leaps, scale="Cp")
plot(leaps,scale="bic")

############ Stepwise regression ###############
## step():function specification, object = a fitted model object
## scope = a formula giving the terms to be considered for adding or dropping
## Default is AIC, for BIC, include k = log(n)
## test="F" includes additional information for parameter estimate tests
## Define full model and empty model (just the intercept)
sat.full <- lm(TOTAL ~ ., data = sat_dat)
sat.null <- lm(TOTAL ~ 1, data = sat_dat)
## Forward selection, AIC & BIC with F-tests
sat.forward.AIC <- step(sat.null,trace=1,test="F"
, scope=list(lower=sat.null,upper=sat.full)
, direction = "forward", k = 2)
## Start: AIC=432.5
## TOTAL ~ 1
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + PERCENT 1 215875 58433 357.18 177.3305 < 2.2e-16 ***
## + SALARY 1 53078 221230 423.75 11.5162 0.001391 **
## + COST 1 39722 234586 426.68 8.1278 0.006408 **
## <none> 274308 432.50
## + RATIO 1 1811 272497 434.17 0.3190 0.574833
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=357.18
## TOTAL ~ PERCENT
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + COST 1 8913.1 49520 350.91 8.4595 0.005529 **
## + SALARY 1 5094.8 53338 354.62 4.4893 0.039421 *
## + RATIO 1 3336.2 55097 356.24 2.8459 0.098235 .
## <none> 58433 357.18
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=350.91
## TOTAL ~ PERCENT + COST
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 49520 350.91
## + RATIO 1 892.74 48627 352.00 0.8445 0.3629
## + SALARY 1 37.52 49483 352.87 0.0349 0.8527
sat.forward.AIC
##
## Call:
## lm(formula = TOTAL ~ PERCENT + COST, data = sat_dat)
##
## Coefficients:
## (Intercept) PERCENT COST
## 993.832 -2.851 12.287
anova(sat.forward.AIC)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## PERCENT 1 215875 215875 204.8888 < 2.2e-16 ***
## COST 1 8913 8913 8.4595 0.005529 **
## Residuals 47 49520 1054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sat.forward.BIC <- step(sat.null,trace=1,test="F"
, scope=list(lower=sat.null,upper=sat.full)
, direction = "forward", k = log(dim(sat_dat)[1]))
## Start: AIC=434.41
## TOTAL ~ 1
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + PERCENT 1 215875 58433 361.00 177.3305 < 2.2e-16 ***
## + SALARY 1 53078 221230 427.57 11.5162 0.001391 **
## + COST 1 39722 234586 430.50 8.1278 0.006408 **
## <none> 274308 434.41
## + RATIO 1 1811 272497 437.99 0.3190 0.574833
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=361
## TOTAL ~ PERCENT
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## + COST 1 8913.1 49520 356.64 8.4595 0.005529 **
## + SALARY 1 5094.8 53338 360.36 4.4893 0.039421 *
## <none> 58433 361.00
## + RATIO 1 3336.2 55097 361.98 2.8459 0.098235 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=356.64
## TOTAL ~ PERCENT + COST
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 49520 356.64
## + RATIO 1 892.74 48627 359.64 0.8445 0.3629
## + SALARY 1 37.52 49483 360.52 0.0349 0.8527
sat.forward.BIC
##
## Call:
## lm(formula = TOTAL ~ PERCENT + COST, data = sat_dat)
##
## Coefficients:
## (Intercept) PERCENT COST
## 993.832 -2.851 12.287
anova(sat.forward.BIC)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## PERCENT 1 215875 215875 204.8888 < 2.2e-16 ***
## COST 1 8913 8913 8.4595 0.005529 **
## Residuals 47 49520 1054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Backward selection, AIC & BIC with F-tests
sat.backward.AIC <- step(sat.full,trace=1,test="F"
, scope=list(lower=sat.null,upper=sat.full)
, direction = "backward", k = 2)
## Start: AIC=353.48
## TOTAL ~ COST + RATIO + SALARY + PERCENT
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - COST 1 191 48315 351.67 0.1790 0.6742
## - SALARY 1 503 48627 352.00 0.4707 0.4962
## - RATIO 1 1359 49483 352.87 1.2704 0.2657
## <none> 48124 353.48
## - PERCENT 1 168688 216812 426.74 157.7379 2.607e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=351.67
## TOTAL ~ RATIO + SALARY + PERCENT
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 48315 351.67
## - RATIO 1 5023 53338 354.62 4.7823 0.03388 *
## - SALARY 1 6782 55097 356.24 6.4566 0.01449 *
## - PERCENT 1 171126 219441 425.34 162.9252 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sat.backward.AIC
##
## Call:
## lm(formula = TOTAL ~ RATIO + SALARY + PERCENT, data = sat_dat)
##
## Coefficients:
## (Intercept) RATIO SALARY PERCENT
## 1057.898 -4.639 2.552 -2.913
anova(sat.backward.AIC)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## RATIO 1 1811 1811 1.7242 0.1957
## SALARY 1 53055 53055 50.5129 6.293e-09 ***
## PERCENT 1 171126 171126 162.9252 < 2.2e-16 ***
## Residuals 46 48315 1050
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sat.backward.BIC <- step(sat.full,trace=1,test="F"
, scope=list(lower=sat.null,upper=sat.full)
, direction = "backward", k = log(dim(sat_dat)[1]))
## Start: AIC=363.04
## TOTAL ~ COST + RATIO + SALARY + PERCENT
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - COST 1 191 48315 359.32 0.1790 0.6742
## - SALARY 1 503 48627 359.64 0.4707 0.4962
## - RATIO 1 1359 49483 360.52 1.2704 0.2657
## <none> 48124 363.04
## - PERCENT 1 168688 216812 434.39 157.7379 2.607e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=359.32
## TOTAL ~ RATIO + SALARY + PERCENT
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 48315 359.32
## - RATIO 1 5023 53338 360.36 4.7823 0.03388 *
## - SALARY 1 6782 55097 361.98 6.4566 0.01449 *
## - PERCENT 1 171126 219441 431.08 162.9252 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sat.backward.BIC
##
## Call:
## lm(formula = TOTAL ~ RATIO + SALARY + PERCENT, data = sat_dat)
##
## Coefficients:
## (Intercept) RATIO SALARY PERCENT
## 1057.898 -4.639 2.552 -2.913
anova(sat.backward.BIC)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## RATIO 1 1811 1811 1.7242 0.1957
## SALARY 1 53055 53055 50.5129 6.293e-09 ***
## PERCENT 1 171126 171126 162.9252 < 2.2e-16 ***
## Residuals 46 48315 1050
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Stepwise (both) selection, BIC with F-tests, starting with intermediate model
## Option: trace = 0 does not print each step of the selection
## The anova object provides a summary of the selection steps in order
sat.both.AIC <- step(sat.null, trace = 0, test = "F"
, scope=list(lower=sat.null,upper=sat.full)
, direction = "both", k = 2)
sat.both.AIC
##
## Call:
## lm(formula = TOTAL ~ PERCENT + COST, data = sat_dat)
##
## Coefficients:
## (Intercept) PERCENT COST
## 993.832 -2.851 12.287
anova(sat.both.AIC)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## PERCENT 1 215875 215875 204.8888 < 2.2e-16 ***
## COST 1 8913 8913 8.4595 0.005529 **
## Residuals 47 49520 1054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sat.both.BIC <- step(sat.null, trace = 0, test = "F"
, scope=list(lower=sat.null,upper=sat.full)
, direction = "both", k = log(dim(sat_dat)[1]))
sat.both.BIC
##
## Call:
## lm(formula = TOTAL ~ PERCENT + COST, data = sat_dat)
##
## Coefficients:
## (Intercept) PERCENT COST
## 993.832 -2.851 12.287
anova(sat.both.BIC)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## PERCENT 1 215875 215875 204.8888 < 2.2e-16 ***
## COST 1 8913 8913 8.4595 0.005529 **
## Residuals 47 49520 1054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## From the above, we can choose the model: TOTAL ~ COST + PERCENT
sat <- subset(sat_dat, select=c(-RATIO,-SALARY))
sat_full <- lm(TOTAL ~ ., data = sat)
summary(sat_full)
##
## Call:
## lm(formula = TOTAL ~ ., data = sat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88.400 -22.884 1.968 19.142 68.755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 993.8317 21.8332 45.519 < 2e-16 ***
## COST 12.2865 4.2243 2.909 0.00553 **
## PERCENT -2.8509 0.2151 -13.253 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.46 on 47 degrees of freedom
## Multiple R-squared: 0.8195, Adjusted R-squared: 0.8118
## F-statistic: 106.7 on 2 and 47 DF, p-value: < 2.2e-16
anova(sat_full)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## COST 1 39722 39722 37.701 1.651e-07 ***
## PERCENT 1 185066 185066 175.648 < 2.2e-16 ***
## Residuals 47 49520 1054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cor(sat)
## COST PERCENT TOTAL
## COST 1.0000000 0.5926274 -0.3805370
## PERCENT 0.5926274 1.0000000 -0.8871187
## TOTAL -0.3805370 -0.8871187 1.0000000
vif(sat_full)
## COST PERCENT
## 1.541324 1.541324
## Influential observations
## Leverage check: observations 2, 29, 34, and 48 have high leverage for the response variable TOTAL.
## Among them observation 2 is flagged in both graphs. n observations and k parameters
## However, the Cook's distance of observation 2 is less than the cut-off rule of thumb: Di > 4/(n-k-1)
## Therefore, there is no removal of any observtions.
cutoff <- 4/((nrow(sat)-length(sat_full$coefficients)))
par(mfrow=c(1,1))
qqPlot(sat_full, id.n = 4)

## 48 28 34 29
## 1 2 49 50
outlierTest(sat_full)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 48 -3.006388 0.0042729 0.21365
influenceIndexPlot(sat_full, vars=c("Cook","hat"), id.n=4)

plot(cooks.distance(sat_full))
abline(h = cutoff, lty=2)

avPlots(sat_full)

## Cook's D plot, identify D values > 4/(n-k-1)
plot(sat_full, which=4, cook.levels=cutoff)

## Influence Plot
influencePlot(sat_full, id.method="identify", main="Influence Plot",
sub="Circle size is proportial to Cook's Distance")

## Regression diagnostic
## Calculate the residuals and standardized residuals
## Non-normality
pairs(TOTAL ~ COST + PERCENT,data=sat)

fitted <- fitted(sat_full)
cor(fitted, sat$TOTAL)
## [1] 0.9052472
residuals <- residuals(sat_full)
student <- studres(sat_full)
par(mfrow=c(1,2))
plot(fitted,sat$TOTAL,xlab="Fitted Values")
abline(lsfit(fitted,sat$TOTAL))
plot(fitted,student, ylab="Studentized Residuals")

plot(sat$COST,student, ylab="Studentized Residuals")
plot(sat$PERCENT,student, ylab="Studentized Residuals")

## Histogram
par(mfrow=c(1,2))
hist(residuals)
qqnorm(residuals)
qqline(residuals)

hist(student)
qqnorm(student)
qqline(student)

## Skewness
library(moments)
skewness(residuals)
## [1] -0.01181024
skewness(student)
## [1] -0.0382346
## Increase each teacher's salary by 5 thousands of dollars
sat_new <- sat_dat
sat_new$SALARY <- sat_dat$SALARY+5
## Fit the full model
sat.new1 <- lm(TOTAL ~ ., data = sat_new)
summary(sat.new1)
##
## Call:
## lm(formula = TOTAL ~ ., data = sat_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.531 -20.855 -1.746 15.979 66.571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1037.7819 50.7192 20.461 < 2e-16 ***
## COST 4.4626 10.5465 0.423 0.674
## RATIO -3.6242 3.2154 -1.127 0.266
## SALARY 1.6379 2.3872 0.686 0.496
## PERCENT -2.9045 0.2313 -12.559 2.61e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.7 on 45 degrees of freedom
## Multiple R-squared: 0.8246, Adjusted R-squared: 0.809
## F-statistic: 52.88 on 4 and 45 DF, p-value: < 2.2e-16
anova(sat.new1)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## COST 1 39722 39722 37.1436 2.260e-07 ***
## RATIO 1 1143 1143 1.0685 0.3068088
## SALARY 1 16631 16631 15.5514 0.0002779 ***
## PERCENT 1 168688 168688 157.7379 2.607e-16 ***
## Residuals 45 48124 1069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Evaluate Collinearity
vif(sat.new1) # variance inflation factors
## COST RATIO SALARY PERCENT
## 9.465320 2.433204 9.217237 1.755090
sqrt(vif(sat.new1)) > 2 # problem?
## COST RATIO SALARY PERCENT
## TRUE FALSE TRUE FALSE
## Reduced model: 1 predictor COST, 2 covariates PERCENT, RATIO
sat.new2 <-lm(TOTAL ~ COST + PERCENT + RATIO, data = sat_new)
summary(sat.new2)
##
## Call:
## lm(formula = TOTAL ~ COST + PERCENT + RATIO, data = sat_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -92.284 -21.130 1.414 16.709 66.073
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1035.4739 50.3155 20.580 <2e-16 ***
## COST 11.0140 4.4521 2.474 0.0171 *
## PERCENT -2.8491 0.2155 -13.222 <2e-16 ***
## RATIO -2.0282 2.2071 -0.919 0.3629
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.51 on 46 degrees of freedom
## Multiple R-squared: 0.8227, Adjusted R-squared: 0.8112
## F-statistic: 71.16 on 3 and 46 DF, p-value: < 2.2e-16
anova(sat.new2)
## Analysis of Variance Table
##
## Response: TOTAL
## Df Sum Sq Mean Sq F value Pr(>F)
## COST 1 39722 39722 37.5759 1.849e-07 ***
## PERCENT 1 185066 185066 175.0665 < 2.2e-16 ***
## RATIO 1 893 893 0.8445 0.3629
## Residuals 46 48627 1057
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Evaluate Collinearity
vif(sat.new2) # variance inflation factors
## COST PERCENT RATIO
## 1.706384 1.541453 1.159732
sqrt(vif(sat.new2)) > 2
## COST PERCENT RATIO
## FALSE FALSE FALSE