library(tidyverse)
library(faraway)
# View(sat)
  1. Using the sat dataset, fit a model with the total SAT score as the response and expend, salary, ratio and takers as predictors. Perform regression diagnostics on this model to answer the following questions. Display any plots that are relevant. Do not provide any plots about which you have nothing to say. Suggest possible improvements or corrections to the model where appropriate.
  1. Check the constant variance assumption for the errors.
  2. Check the normality assumption.
  3. Check for large leverage points.
  4. Check for outliers.
  5. Check for influential points.
  6. Check the structure of the relationship between the predictors and the response.
model <- lm(total ~ expend + salary + ratio + takers, data = sat)
summary(model)
## 
## Call:
## lm(formula = total ~ expend + salary + ratio + takers, data = sat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -90.531 -20.855  -1.746  15.979  66.571 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1045.9715    52.8698  19.784  < 2e-16 ***
## expend         4.4626    10.5465   0.423    0.674    
## salary         1.6379     2.3872   0.686    0.496    
## ratio         -3.6242     3.2154  -1.127    0.266    
## takers        -2.9045     0.2313 -12.559 2.61e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.7 on 45 degrees of freedom
## Multiple R-squared:  0.8246, Adjusted R-squared:  0.809 
## F-statistic: 52.88 on 4 and 45 DF,  p-value: < 2.2e-16
# (a) Check the constant variance assumption for the errors.
plot(fitted(model), residuals(model), xlab = "Fitted", ylab = "Residuals")
abline(h=0)

Residuals vs. fitted plot - the plot indicates some non-linearity & mild nonconstant variance which prompts some change in the structual form.

plot(fitted(model), sqrt(abs(residuals(model))), xlab = "Fitted", ylab = expression(sqrt(hat(epsilon))))

The plot shows somewhat approximate constant variance.

summary(lm(sqrt(abs(residuals(model))) ~ fitted(model)))
## 
## Call:
## lm(formula = sqrt(abs(residuals(model))) ~ fitted(model))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3097 -1.2366 -0.2234  0.9929  5.0337 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)
## (Intercept)    4.6484524  4.0660807   1.143    0.259
## fitted(model) -0.0001637  0.0041994  -0.039    0.969
## 
## Residual standard error: 1.997 on 48 degrees of freedom
## Multiple R-squared:  3.166e-05,  Adjusted R-squared:  -0.0208 
## F-statistic: 0.00152 on 1 and 48 DF,  p-value: 0.9691
par(mfrow=c(3,3))
n <- 50
# constant variance
for(i in 1:9) {x <- runif(n) ; plot(x, rnorm(n))}

# strong nonconstant variance
for(i in 1:9) {x <- runif(n) ; plot(x, x*rnorm(n))}

# mild nonconstant
for(i in 1:9) {x <- runif(n) ; plot(x, sqrt ((x)) * rnorm(n))}

# nonlinearity
for(i in 1:9) {x <- runif(n) ; plot(x, cos(x*pi/25)+rnorm(n, sd = 1))}

par(mfrow=c(1,1))

Experimenting with a log transformation.

model <- lm(sqrt(total) ~ expend + salary + ratio + takers, data = sat)
summary(model)
## 
## Call:
## lm(formula = sqrt(total) ~ expend + salary + ratio + takers, 
##     data = sat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.43610 -0.34707 -0.02486  0.25943  1.08084 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 32.297536   0.840140  38.443   <2e-16 ***
## expend       0.075430   0.167592   0.450    0.655    
## salary       0.026203   0.037935   0.691    0.493    
## ratio       -0.056489   0.051095  -1.106    0.275    
## takers      -0.046730   0.003675 -12.716   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5197 on 45 degrees of freedom
## Multiple R-squared:  0.8278, Adjusted R-squared:  0.8125 
## F-statistic: 54.08 on 4 and 45 DF,  p-value: < 2.2e-16
plot(fitted(model), residuals(model), xlab = "Fitted", ylab = "Residuals")
abline(h=0)

Seems to have somewhat improved the non-linearity issue. The constant variance assumption is slightly improved.

plot(sat$expend, residuals(model), xlab = "Expend", ylab = "Residuals")
abline(h=0)

plot(sat$salary, residuals(model), xlab = "Salary", ylab = "Residuals")
abline(h=0)

var.test(residuals(model)[sat$expend>5.90526], residuals(model)[sat$exp<5.90526])
## 
##  F test to compare two variances
## 
## data:  residuals(model)[sat$expend > 5.90526] and residuals(model)[sat$exp < 5.90526]
## F = 0.89354, num df = 21, denom df = 27, p-value = 0.8004
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3994815 2.0862745
## sample estimates:
## ratio of variances 
##           0.893537
# mean(sat$expend)

A significant difference is not seen.

  1. Check the normality assumption.
qqnorm(residuals(model), ylab = "Residuals", main = "")
qqline(residuals(model))

The plot shows that the observations approximately fit on the line, and therefore the normality assumption is satisfied.

par(mfrow=c(3,3))
n <- 50 
# normal 
for(i in 1:9) {x <- rnorm(n) ; qqnorm(x) ; qqline(x)}

# lognormal
for(i in 1:9) {x <- exp(rnorm(n)); qqnorm(x); qqline(x)}

# cauchy
for(i in 1:9) {x <- rcauchy(n); qqnorm(x); qqline(x)}

# uniform
for(i in 1:9) {x <- runif(n); qqnorm(x); qqline(x)}

par(mfrow=c(1,1))
shapiro.test(residuals(model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.97976, p-value = 0.5421

Do not reject H0.

  1. Check for large leverage points.
hatv <- hatvalues(model)
head(hatv)
##    Alabama     Alaska    Arizona   Arkansas California   Colorado 
## 0.09537668 0.18030612 0.04931612 0.05382878 0.28211791 0.03014533
sum(hatv)
## [1] 5
states <- row.names(sat)
halfnorm(hatv, labs = states, ylab = "Leverages")

qqnorm(rstandard(model))
abline(0,1)

  1. Check for outliers.
set.seed(123)
testdata <- data.frame(x=1:10,y=1:10+rnorm(10))
lmod <- lm(y ~ x, testdata)
p1 <- c(5.5,12)
lmod1 <- lm(y ~ x, rbind(testdata, p1))
plot(y ~ x, rbind(testdata, p1))
points(5.5, 12, pch=4, cex=2)
abline(lmod)
abline(lmod1, lty=2)

p2 <- c(15,15.1)
lmod2 <- lm(y ~ x, rbind(testdata, p2))
plot(y ~ x, rbind(testdata, p2))
points(15, 15.1, pch=4, cex=2)
abline(lmod)
abline(lmod2, lty=2)

p3 <- c(15,5.1)
lmod3 <- lm(y ~ x, rbind(testdata, p3))
plot(y ~ x, rbind(testdata, p3))
points(15, 5.1, pch=4, cex=2)
abline(lmod)
abline(lmod3, lty=2)

stud <- rstudent(model)
stud[which.max(abs(stud))]
## West Virginia 
##     -3.117753
qt(0.05/(50*2),44)
## [1] -3.525801

As -3.117753 > -3.525801 we conclude that West Virginia is an outlier.

  1. Check for influential points.
cook <- cooks.distance(model)
halfnorm(cook, 3, labs = states, ylab = "Cook's distance")

modeli <- lm(total ~ expend + salary + ratio + takers, data = sat, subset = (cook < max(cook)))
summary(modeli)
## 
## Call:
## lm(formula = total ~ expend + salary + ratio + takers, data = sat, 
##     subset = (cook < max(cook)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -92.118 -18.402   1.808  14.890  67.669 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1093.8460    53.4226  20.475   <2e-16 ***
## expend        -0.9427    10.1922  -0.092    0.927    
## salary         3.0964     2.3283   1.330    0.190    
## ratio         -7.6391     3.4279  -2.229    0.031 *  
## takers        -2.9308     0.2188 -13.397   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.9 on 44 degrees of freedom
## Multiple R-squared:  0.8396, Adjusted R-squared:  0.825 
## F-statistic: 57.58 on 4 and 44 DF,  p-value: < 2.2e-16
summary(model)
## 
## Call:
## lm(formula = sqrt(total) ~ expend + salary + ratio + takers, 
##     data = sat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.43610 -0.34707 -0.02486  0.25943  1.08084 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 32.297536   0.840140  38.443   <2e-16 ***
## expend       0.075430   0.167592   0.450    0.655    
## salary       0.026203   0.037935   0.691    0.493    
## ratio       -0.056489   0.051095  -1.106    0.275    
## takers      -0.046730   0.003675 -12.716   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5197 on 45 degrees of freedom
## Multiple R-squared:  0.8278, Adjusted R-squared:  0.8125 
## F-statistic: 54.08 on 4 and 45 DF,  p-value: < 2.2e-16
plot(dfbeta(model)[,2],ylab = "Change in expend coef")
abline(h=0)

  1. Check the structure of the relationship between the predictors and the response.
d <- residuals(lm(total ~ expend + salary + ratio + takers, data = sat))
m <- residuals(lm(expend ~ salary + ratio + takers, data = sat))
plot(m, d, xlab = "expend residuals", ylab = "sat residuals")
abline(d, m)

coef(lm(d ~m))
##   (Intercept)             m 
## -4.242192e-16  1.970697e-15
coef(model)
## (Intercept)      expend      salary       ratio      takers 
## 32.29753601  0.07543047  0.02620276 -0.05648875 -0.04672981
termplot(model, partial.resid = T, terms = 1)

model1 <- lm(total ~ expend + salary + ratio + takers, data = sat, subset = (expend>6))
model2 <- lm(total ~ expend + salary + ratio + takers, data = sat, subset = (expend<6))
summary(model1)
## 
## Call:
## lm(formula = total ~ expend + salary + ratio + takers, data = sat, 
##     subset = (expend > 6))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.818 -13.456   3.091  15.186  49.887 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 994.7970    87.3748  11.385 1.83e-08 ***
## expend       -8.6936    14.2395  -0.611    0.551    
## salary        4.0757     3.2329   1.261    0.228    
## ratio        -2.8374     5.4074  -0.525    0.608    
## takers       -2.2405     0.3276  -6.838 8.08e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.69 on 14 degrees of freedom
## Multiple R-squared:  0.8209, Adjusted R-squared:  0.7697 
## F-statistic: 16.04 on 4 and 14 DF,  p-value: 3.995e-05
summary(model2)
## 
## Call:
## lm(formula = total ~ expend + salary + ratio + takers, data = sat, 
##     subset = (expend < 6))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.233 -19.527  -5.211  12.643  79.212 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1017.9597    90.6421  11.231 2.93e-11 ***
## expend        18.7994    16.7715   1.121    0.273    
## salary        -0.5454     3.2032  -0.170    0.866    
## ratio         -1.6764     3.8813  -0.432    0.670    
## takers        -3.2461     0.3297  -9.846 4.39e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.26 on 25 degrees of freedom
## Multiple R-squared:  0.8506, Adjusted R-squared:  0.8267 
## F-statistic: 35.59 on 4 and 25 DF,  p-value: 5.544e-10
sat$status <- ifelse(sat$expend > 6, "high", "low")
require(ggplot2)
ggplot(sat, aes(x = takers, y = total, shape = status)) +
  geom_point()

ggplot(sat, aes(x = takers, y = total, shape = status)) +
  geom_point() +
  facet_grid(~ status) +
  stat_smooth(method = "lm")