library(tidyverse)
library(faraway)
# View(sat)
model <- lm(total ~ expend + salary + ratio + takers, data = sat)
summary(model)
##
## Call:
## lm(formula = total ~ expend + salary + ratio + takers, data = sat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.531 -20.855 -1.746 15.979 66.571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1045.9715 52.8698 19.784 < 2e-16 ***
## expend 4.4626 10.5465 0.423 0.674
## salary 1.6379 2.3872 0.686 0.496
## ratio -3.6242 3.2154 -1.127 0.266
## takers -2.9045 0.2313 -12.559 2.61e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.7 on 45 degrees of freedom
## Multiple R-squared: 0.8246, Adjusted R-squared: 0.809
## F-statistic: 52.88 on 4 and 45 DF, p-value: < 2.2e-16
# (a) Check the constant variance assumption for the errors.
plot(fitted(model), residuals(model), xlab = "Fitted", ylab = "Residuals")
abline(h=0)
Residuals vs. fitted plot - the plot indicates some non-linearity & mild nonconstant variance which prompts some change in the structual form.
plot(fitted(model), sqrt(abs(residuals(model))), xlab = "Fitted", ylab = expression(sqrt(hat(epsilon))))
The plot shows somewhat approximate constant variance.
summary(lm(sqrt(abs(residuals(model))) ~ fitted(model)))
##
## Call:
## lm(formula = sqrt(abs(residuals(model))) ~ fitted(model))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3097 -1.2366 -0.2234 0.9929 5.0337
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6484524 4.0660807 1.143 0.259
## fitted(model) -0.0001637 0.0041994 -0.039 0.969
##
## Residual standard error: 1.997 on 48 degrees of freedom
## Multiple R-squared: 3.166e-05, Adjusted R-squared: -0.0208
## F-statistic: 0.00152 on 1 and 48 DF, p-value: 0.9691
par(mfrow=c(3,3))
n <- 50
# constant variance
for(i in 1:9) {x <- runif(n) ; plot(x, rnorm(n))}
# strong nonconstant variance
for(i in 1:9) {x <- runif(n) ; plot(x, x*rnorm(n))}
# mild nonconstant
for(i in 1:9) {x <- runif(n) ; plot(x, sqrt ((x)) * rnorm(n))}
# nonlinearity
for(i in 1:9) {x <- runif(n) ; plot(x, cos(x*pi/25)+rnorm(n, sd = 1))}
par(mfrow=c(1,1))
Experimenting with a log transformation.
model <- lm(sqrt(total) ~ expend + salary + ratio + takers, data = sat)
summary(model)
##
## Call:
## lm(formula = sqrt(total) ~ expend + salary + ratio + takers,
## data = sat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.43610 -0.34707 -0.02486 0.25943 1.08084
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.297536 0.840140 38.443 <2e-16 ***
## expend 0.075430 0.167592 0.450 0.655
## salary 0.026203 0.037935 0.691 0.493
## ratio -0.056489 0.051095 -1.106 0.275
## takers -0.046730 0.003675 -12.716 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5197 on 45 degrees of freedom
## Multiple R-squared: 0.8278, Adjusted R-squared: 0.8125
## F-statistic: 54.08 on 4 and 45 DF, p-value: < 2.2e-16
plot(fitted(model), residuals(model), xlab = "Fitted", ylab = "Residuals")
abline(h=0)
Seems to have somewhat improved the non-linearity issue. The constant variance assumption is slightly improved.
plot(sat$expend, residuals(model), xlab = "Expend", ylab = "Residuals")
abline(h=0)
plot(sat$salary, residuals(model), xlab = "Salary", ylab = "Residuals")
abline(h=0)
var.test(residuals(model)[sat$expend>5.90526], residuals(model)[sat$exp<5.90526])
##
## F test to compare two variances
##
## data: residuals(model)[sat$expend > 5.90526] and residuals(model)[sat$exp < 5.90526]
## F = 0.89354, num df = 21, denom df = 27, p-value = 0.8004
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.3994815 2.0862745
## sample estimates:
## ratio of variances
## 0.893537
# mean(sat$expend)
A significant difference is not seen.
qqnorm(residuals(model), ylab = "Residuals", main = "")
qqline(residuals(model))
The plot shows that the observations approximately fit on the line, and therefore the normality assumption is satisfied.
par(mfrow=c(3,3))
n <- 50
# normal
for(i in 1:9) {x <- rnorm(n) ; qqnorm(x) ; qqline(x)}
# lognormal
for(i in 1:9) {x <- exp(rnorm(n)); qqnorm(x); qqline(x)}
# cauchy
for(i in 1:9) {x <- rcauchy(n); qqnorm(x); qqline(x)}
# uniform
for(i in 1:9) {x <- runif(n); qqnorm(x); qqline(x)}
par(mfrow=c(1,1))
shapiro.test(residuals(model))
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.97976, p-value = 0.5421
Do not reject H0.
hatv <- hatvalues(model)
head(hatv)
## Alabama Alaska Arizona Arkansas California Colorado
## 0.09537668 0.18030612 0.04931612 0.05382878 0.28211791 0.03014533
sum(hatv)
## [1] 5
states <- row.names(sat)
halfnorm(hatv, labs = states, ylab = "Leverages")
qqnorm(rstandard(model))
abline(0,1)
set.seed(123)
testdata <- data.frame(x=1:10,y=1:10+rnorm(10))
lmod <- lm(y ~ x, testdata)
p1 <- c(5.5,12)
lmod1 <- lm(y ~ x, rbind(testdata, p1))
plot(y ~ x, rbind(testdata, p1))
points(5.5, 12, pch=4, cex=2)
abline(lmod)
abline(lmod1, lty=2)
p2 <- c(15,15.1)
lmod2 <- lm(y ~ x, rbind(testdata, p2))
plot(y ~ x, rbind(testdata, p2))
points(15, 15.1, pch=4, cex=2)
abline(lmod)
abline(lmod2, lty=2)
p3 <- c(15,5.1)
lmod3 <- lm(y ~ x, rbind(testdata, p3))
plot(y ~ x, rbind(testdata, p3))
points(15, 5.1, pch=4, cex=2)
abline(lmod)
abline(lmod3, lty=2)
stud <- rstudent(model)
stud[which.max(abs(stud))]
## West Virginia
## -3.117753
qt(0.05/(50*2),44)
## [1] -3.525801
As -3.117753 > -3.525801 we conclude that West Virginia is an outlier.
cook <- cooks.distance(model)
halfnorm(cook, 3, labs = states, ylab = "Cook's distance")
modeli <- lm(total ~ expend + salary + ratio + takers, data = sat, subset = (cook < max(cook)))
summary(modeli)
##
## Call:
## lm(formula = total ~ expend + salary + ratio + takers, data = sat,
## subset = (cook < max(cook)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -92.118 -18.402 1.808 14.890 67.669
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1093.8460 53.4226 20.475 <2e-16 ***
## expend -0.9427 10.1922 -0.092 0.927
## salary 3.0964 2.3283 1.330 0.190
## ratio -7.6391 3.4279 -2.229 0.031 *
## takers -2.9308 0.2188 -13.397 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.9 on 44 degrees of freedom
## Multiple R-squared: 0.8396, Adjusted R-squared: 0.825
## F-statistic: 57.58 on 4 and 44 DF, p-value: < 2.2e-16
summary(model)
##
## Call:
## lm(formula = sqrt(total) ~ expend + salary + ratio + takers,
## data = sat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.43610 -0.34707 -0.02486 0.25943 1.08084
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.297536 0.840140 38.443 <2e-16 ***
## expend 0.075430 0.167592 0.450 0.655
## salary 0.026203 0.037935 0.691 0.493
## ratio -0.056489 0.051095 -1.106 0.275
## takers -0.046730 0.003675 -12.716 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5197 on 45 degrees of freedom
## Multiple R-squared: 0.8278, Adjusted R-squared: 0.8125
## F-statistic: 54.08 on 4 and 45 DF, p-value: < 2.2e-16
plot(dfbeta(model)[,2],ylab = "Change in expend coef")
abline(h=0)
d <- residuals(lm(total ~ expend + salary + ratio + takers, data = sat))
m <- residuals(lm(expend ~ salary + ratio + takers, data = sat))
plot(m, d, xlab = "expend residuals", ylab = "sat residuals")
abline(d, m)
coef(lm(d ~m))
## (Intercept) m
## -4.242192e-16 1.970697e-15
coef(model)
## (Intercept) expend salary ratio takers
## 32.29753601 0.07543047 0.02620276 -0.05648875 -0.04672981
termplot(model, partial.resid = T, terms = 1)
model1 <- lm(total ~ expend + salary + ratio + takers, data = sat, subset = (expend>6))
model2 <- lm(total ~ expend + salary + ratio + takers, data = sat, subset = (expend<6))
summary(model1)
##
## Call:
## lm(formula = total ~ expend + salary + ratio + takers, data = sat,
## subset = (expend > 6))
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.818 -13.456 3.091 15.186 49.887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 994.7970 87.3748 11.385 1.83e-08 ***
## expend -8.6936 14.2395 -0.611 0.551
## salary 4.0757 3.2329 1.261 0.228
## ratio -2.8374 5.4074 -0.525 0.608
## takers -2.2405 0.3276 -6.838 8.08e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.69 on 14 degrees of freedom
## Multiple R-squared: 0.8209, Adjusted R-squared: 0.7697
## F-statistic: 16.04 on 4 and 14 DF, p-value: 3.995e-05
summary(model2)
##
## Call:
## lm(formula = total ~ expend + salary + ratio + takers, data = sat,
## subset = (expend < 6))
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.233 -19.527 -5.211 12.643 79.212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1017.9597 90.6421 11.231 2.93e-11 ***
## expend 18.7994 16.7715 1.121 0.273
## salary -0.5454 3.2032 -0.170 0.866
## ratio -1.6764 3.8813 -0.432 0.670
## takers -3.2461 0.3297 -9.846 4.39e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.26 on 25 degrees of freedom
## Multiple R-squared: 0.8506, Adjusted R-squared: 0.8267
## F-statistic: 35.59 on 4 and 25 DF, p-value: 5.544e-10
sat$status <- ifelse(sat$expend > 6, "high", "low")
require(ggplot2)
ggplot(sat, aes(x = takers, y = total, shape = status)) +
geom_point()
ggplot(sat, aes(x = takers, y = total, shape = status)) +
geom_point() +
facet_grid(~ status) +
stat_smooth(method = "lm")