env <- new.env()
load("../data.rda", envir = env)
df <- get("CH01TA01", envir = env)
names(df) <- c('x', 'y')
fit <- lm(y ~ x, df)
par(mfrow = c(2, 2), pch = 19)
stem(df$x, scale = 3) # (c) Stem-and-Leaf Plot: printed to the console, not the graphics window
##
## The decimal point is 1 digit(s) to the right of the |
##
## 2 | 0
## 3 | 000
## 4 | 00
## 5 | 000
## 6 | 0
## 7 | 000
## 8 | 000
## 9 | 0000
## 10 | 00
## 11 | 00
## 12 | 0
dotchart(df$x, xlab = "Lot Size", main = "(a) Dot Plot")
plot(df$x, type = "b", lty = 2, xlab = "Run", ylab = "Lot Size")
title("(b) Sequence Plot")
boxplot(df$x, horizontal = TRUE, xlab = "Lot Size", main = "(d) Box Plot")
par(mfrow = c(2, 2), pch = 19)
plot(df$x, resid(fit), xlab = "Lot Size", ylab = "Residual")
title("(a) Residual Plot against x")
plot(resid(fit), type = "b", lty = 2, xlab = "Run", ylab = "Residual")
title("(b) Sequence Plot")
boxplot(resid(fit), horizontal = TRUE, xlab = "Residual")
title("(c) Box Plot")
qqnorm(resid(fit), xlab = "Expected", ylab = "Residual", main = "")
title("(d) Normal Probability Plot")
qqline(resid(fit))
df <- get("CH03TA01", envir = env)
names(df) <- c('y', 'x')
fit <- lm(y ~ x, df)
plot(y ~ x, df, ylim = c(0, 8), main = "(a) Scatter Plot")
abline(fit)
plot(df$x, resid(fit), main = "(b) Residual Plot")
abline(0, 0)
cbind(
"Increase in Ridership" = df$y,
"Maps Distributed" = df$x,
"Fitted Values" = round(fitted(fit), 2),
"Residuals" = round(resid(fit), 2))
## Increase in Ridership Maps Distributed Fitted Values Residuals
## 1 0.60 80 1.66 -1.06
## 2 6.70 220 7.75 -1.05
## 3 5.30 140 4.27 1.03
## 4 4.00 120 3.40 0.60
## 5 6.55 180 6.01 0.54
## 6 2.15 100 2.53 -0.38
## 7 6.60 200 6.88 -0.28
## 8 5.75 160 5.14 0.61
df <- get("CH01TA01", envir = env)
names(df) <- c('x', 'y')
fit <- lm(y ~ x, df)
The mean square error could be obtained from the model directly with anova(fit)[2, "Mean Sq"]. Note that MSE is calculated below as SSE / df, i.e., deviance(fit) / df.residual(fit).
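As a quick equivalence check (a sketch; both lines return the same value, output omitted):
anova(fit)[2, "Mean Sq"]         # MSE from the anova table
deviance(fit) / df.residual(fit) # the same value, computed as SSE / df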
# Expected value under normality comes from equation (3.6)
cbind(
"Residual" = round(resid(fit), 2),
"Rank" = rank(resid(fit)),
"Exp. Value under Normality" = round(sqrt(deviance(fit) / df.residual(fit)) *
qnorm((rank(resid(fit)) - 0.375) / (nrow(df) + 0.25)), 2))
## Residual Rank
## 1 51.02 22
## 2 -48.47 5
## 3 -19.88 10
## 4 -7.68 11
## 5 48.72 21
## 6 -52.58 4
## 7 55.21 23
## 8 4.02 15
## 9 -66.39 2
## 10 -83.88 1
## 11 -45.17 6
## 12 -60.28 3
## 13 5.32 16
## 14 -20.77 8
## 15 -20.09 9
## 16 0.61 14
## 17 42.53 20
## 18 27.12 18
## 19 -6.68 12
## 20 -34.09 7
## 21 103.53 25
## 22 84.32 24
## 23 38.83 19
## 24 -5.98 13
## 25 10.72 17
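Incidentally, the plotting positions (k - 0.375) / (n + 0.25) in equation (3.6) are exactly what R's ppoints produces with the Blom constant a = 3/8, so the expected values can be written more compactly (a sketch; output omitted):
MSE <- deviance(fit) / df.residual(fit)
sqrt(MSE) * qnorm(ppoints(nrow(df), a = 3/8))[rank(resid(fit))] # same as the qnorm line above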
The function below carries out the test and prints the conclusion of the decision rule. All of the intermediate quantities are contained in the return object; if that object is not captured, it prints to the screen as shown below.
For completeness, the car package has a Levene test function, leveneTest. It requires a factor by which to group the observations, which we create by applying cut to the independent variable; the test is then run on the response grouped by that factor, as demonstrated below. The outcome of interest is the p-value, which is greater than our alpha: we fail to reject the null hypothesis of equal variance, the same conclusion the authors derived. By default leveneTest centers on group medians (the Brown-Forsythe test), and it uses an F-test instead of the t-test. Unfortunately, the test object carries nothing of interest beyond what is displayed, though the displayed table can still be indexed (see the sketch after the output below).
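To see how cut forms the grouping factor before stepping through the function (group sizes match the $n element of the output below):
with(df, table(cut(x, 2, labels = c("A", "B")))) # A: 13, B: 12 (matches $n below)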
levene <- function(model, alpha = 0.05)
{
f <- function(x)
{
within(x, {
dsqdev = scale(abs(e - median(e)), T, F)^2
d = abs(e - median(e))
})
} # end f
data <- model.frame(model) # Grab the data used in the model
data <- transform(data, # Append the residuals and splitting factor
e = resid(model),
group = cut(x, 2, labels = LETTERS[1:2]))
# Split by group factor and add last TABLE 3.3 columns. Notice the syntax for within()
data.split <- with(data, split(subset(data, select = -group), group))
data.split <- lapply(data.split, f)
# Define the relevant variables for hypothesis test and return object
SSd <- lapply(data.split, function(x) sum(x$dsqdev))
dbar <- lapply(data.split, function(x) mean(x$d))
n <- c(Total = nrow(data), lapply(data.split, nrow)) # List of n's
s <- sqrt( (SSd$A + SSd$B) / (n$Total - 2) ) # sqrt of (3.9a)
tstar <- (dbar$A - dbar$B) / (s * sqrt(1/n$A + 1/n$B)) # (3.9)
tval <- qt(1 - alpha/2, n$Total - 2)
p.value <- 2 * pt(-abs(tstar), n$Total - 2) # two-sided P-value, approx. 0.20
# Print conclusion of the decision rule
if (abs(tstar) <= tval) {
print("error variance is constant")
} else
print("error variance is not constant")
# return split data and defined variables
data <- list(
'data' = data.split,
'p.value' = p.value,
'tstar' = tstar,
'tval' = tval,
'dbar' = unlist(dbar),
'SSd' = unlist(SSd),
'n' = unlist(n),
's' = s)
return(data)
} # end levene function
library(car)
levene(fit)
## [1] "error variance is constant"
## $data
## $data$A
## y x e d dsqdev
## 2 121 30 -48.47 28.5960 263.060
## 3 221 50 -19.88 0.0000 2008.391
## 5 361 70 48.72 68.5960 565.531
## 6 224 60 -52.58 32.7020 146.726
## 10 157 50 -83.88 64.0000 368.061
## 11 160 40 -45.17 25.2980 380.917
## 12 252 70 -60.28 40.4040 19.457
## 14 113 20 -20.77 0.8939 1929.066
## 17 212 30 42.53 62.4040 309.372
## 18 268 50 27.12 47.0000 4.774
## 21 273 30 103.53 123.4040 6176.226
## 23 244 40 38.83 58.7020 192.847
## 25 323 70 10.72 30.5960 202.183
##
## $data$B
## y x e d dsqdev
## 1 399 80 51.0180 53.702 637.648
## 4 376 90 -7.6840 5.000 549.918
## 7 546 120 55.2099 57.894 866.926
## 8 352 80 4.0180 6.702 472.989
## 9 353 100 -66.3861 63.702 1242.681
## 13 389 90 5.3160 8.000 418.216
## 15 435 110 -20.0881 17.404 122.021
## 16 420 100 0.6139 3.298 632.641
## 19 377 90 -6.6840 4.000 597.819
## 20 421 110 -34.0881 31.404 8.724
## 22 468 90 84.3160 87.000 3428.063
## 24 342 80 -5.9820 3.298 632.641
##
##
## $p.value
## [1] 0.2011
##
## $tstar
## [1] 1.316
##
## $tval
## [1] 2.069
##
## $dbar
## A B
## 44.82 28.45
##
## $SSd
## A B
## 12567 9610
##
## $n
## Total A B
## 25 13 12
##
## $s
## [1] 31.05
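Since everything is returned in a list, the object can instead be captured and its elements accessed directly:
lv <- levene(fit) # prints the conclusion but suppresses the full listing
lv$tstar          # 1.316; compare against the critical value lv$tval = 2.069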
with(df, leveneTest(y, cut(x, 2))) # F value is within acceptance region
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.78 0.39
## 23
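The return value of leveneTest is just the displayed anova-style table, so the p-value can still be pulled out by indexing:
lt <- with(df, leveneTest(y, cut(x, 2)))
lt[1, "Pr(>F)"] # 0.39, greater than alpha = 0.05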
R has a function for this test in the lmtest package: bptest. It defaults to the studentized BP test, so an extra argument (student = FALSE) is passed to match the textbook version.
Note: the authors have carried an error from the 4th edition over into the 5th. They claim a p-value of 0.64 > alpha, which is inconsistent with both their own results and the outcome of the bptest command. The true p-value appears to be 1 - 0.64 = 0.36.
library(lmtest)
bptest(fit, student = FALSE)
##
## Breusch-Pagan test
##
## data: fit
## BP = 0.8209, df = 1, p-value = 0.3649
qchisq(1 - 0.05, 1) # BP falls within the chisq acceptance region
## [1] 3.841
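The reported p-value follows directly from the BP statistic, confirming the note above:
1 - pchisq(0.8209, 1) # approximately 0.3649, matching the bptest output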
df <- get("CH03TA04", envir = env)
names(df) <- c('x', 'y')
fit <- lm(y ~ x, df)
as.table(cbind(
"Size of Min. Deposit" = df$x,
"Number of New Accounts" = df$y
))
## Size of Min. Deposit Number of New Accounts
## A 125 160
## B 100 112
## C 200 124
## D 75 28
## E 150 152
## F 175 156
## G 75 42
## H 175 124
## I 125 150
## J 200 104
## K 100 136
anova(fit)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 5141 5141 3.14 0.11
## Residuals 9 14742 1638
plot(y ~ x, df)
abline(fit)
tab <- do.call("cbind", unstack(df, y ~ x))
tab[2, 4] <- NA # The single replicate at the 150 level was recycled into row 2; remove the duplicate
tab <- rbind(tab, colMeans(tab, na.rm = TRUE))
dimnames(tab) <- list(Replicate = c(1:2, "Mean"), Deposit = dimnames(tab)[[2]])
(tab <- as.table(tab))
## Deposit
## Replicate 75 100 125 150 175 200
## 1 28 112 160 152 156 124
## 2 42 136 150 124 104
## Mean 35 124 155 152 140 114
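The duplication happens because cbind recycles shorter columns; a minimal illustration using the 150-level value:
cbind(a = c(28, 42), b = 152) # the lone value in b is recycled into row 2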
While R has a basic anova table for lm objects, there is a trick to obtain the further decomposition into lack of fit and pure error. In particular, for a y ~ x regression, we can model y ~ x + factor(x) to obtain an anova table with the desired values. The alr3 package also contains a function, pureErrorAnova, that gives a more complete anova table, as sketched below.
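For reference, the alr3 route looks like this (a sketch, assuming alr3 is installed; output not shown):
library(alr3)
pureErrorAnova(fit) # regression, lack of fit, and pure error in one table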
For numeric subscripting, the following rows and columns of the anova table will be utilized:

Number   Rows                       Columns
1        x (Regression)             Df
2        factor(x) (lack of fit)    Sum Sq
3        Residuals (pure error)     Mean Sq
fit.aov <- anova(update(fit, . ~ . + factor(x)))
as.table(cbind(
'SS' = c('SSR' = fit.aov[1, 2],
'SSE' = sum(fit.aov[2:3, 2]),
'SSLF' = fit.aov[2, 2],
'SSPE' = fit.aov[3, 2],
'Total' = sum(fit.aov[1:3, 2])),
'Df' = c( fit.aov[1, 1],
sum(fit.aov[2:3, 1]),
fit.aov[2, 1],
fit.aov[3, 1],
sum(fit.aov[1:3, 1])),
'MS' = c( fit.aov[1, 3],
sum(fit.aov[2:3, 2]) / sum(fit.aov[2:3, 1]),
fit.aov[2, 3],
fit.aov[3, 3],
NA)
))
## SS Df MS
## SSR 5141.3 1.0 5141.3
## SSE 14741.6 9.0 1638.0
## SSLF 13593.6 4.0 3398.4
## SSPE 1148.0 5.0 229.6
## Total 19882.9 10.0
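The lack-of-fit F test follows directly from the decomposition above:
fit.aov[2, 3] / fit.aov[3, 3] # F* = MSLF / MSPE = 3398.4 / 229.6 = 14.80
qf(0.99, 4, 5)                # 11.39; F* exceeds it, so the linear fit is inadequate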
df <- get("CH03TA07", envir = env)
names(df) <- c('x', 'y')
fit <- lm(y ~ x, df)
(df <- transform(df, sqrtx = sqrt(x)))
## x y sqrtx
## 1 0.5 42.5 0.7071
## 2 0.5 50.6 0.7071
## 3 1.0 68.5 1.0000
## 4 1.0 80.7 1.0000
## 5 1.5 89.0 1.2247
## 6 1.5 99.6 1.2247
## 7 2.0 105.3 1.4142
## 8 2.0 111.8 1.4142
## 9 2.5 112.3 1.5811
## 10 2.5 125.7 1.5811
par(mfrow = c(2, 2), pch = 19)
plot(y ~ x, df, xlab = "Days", ylab = "Performance")
title("(a) Scatter Plot")
plot(y ~ sqrt(x), df, xlab = expression(sqrt(x)), ylab = "Performance")
title(expression(paste("(b) Scatter Plot against ", sqrt(x))))
plot(resid(fit) ~ sqrt(x), df, xlab = expression(sqrt(x)), ylab = "Residual")
title(expression(paste("(c) Residual Plot against ", sqrt(x))))
qqnorm(resid(fit), xlab = "Expected", ylab = "Residual", main = "")
title("(d) Normal Probability Plot")
df <- get("CH03TA08", envir = env)
names(df) <- c("x", "y", "z")
fit <- lm(z ~ x, df)
with(df, data.frame("Age" = x, "Plasma" = y, "Transform" = z))
## Age Plasma Transform
## 1 0 13.44 1.1284
## 2 0 12.84 1.1086
## 3 0 11.91 1.0759
## 4 0 20.09 1.3030
## 5 0 15.60 1.1931
## 6 1 10.11 1.0048
## 7 1 11.38 1.0561
## 8 1 10.28 1.0120
## 9 1 8.96 0.9523
## 10 1 8.59 0.9340
## 11 2 9.83 0.9926
## 12 2 9.00 0.9542
## 13 2 8.65 0.9370
## 14 2 7.85 0.8949
## 15 2 8.88 0.9484
## 16 3 7.94 0.8998
## 17 3 6.01 0.7789
## 18 3 5.14 0.7110
## 19 3 6.90 0.8388
## 20 3 6.77 0.8306
## 21 4 4.86 0.6866
## 22 4 5.10 0.7076
## 23 4 5.67 0.7536
## 24 4 5.75 0.7597
## 25 4 6.23 0.7945
par(mfrow = c(2, 2), pch = 19)
plot(y ~ x, df, xlab = "Age", ylab = "Plasma Level")
title("(a) Scatter Plot")
plot(z ~ x, df, xlab = "Age")
title("(b) Scatter Plot with Y' = log(Y)")
plot(df$x, resid(fit)*100, xlab = "Age", ylab = "Residual x 100")
title("(c) Residual Plot against X")
abline(0, 0)
qqnorm(resid(fit), xlab = "Expected Valued", ylab = "Residual", main = "")
title("(d) Normal Probability Plot")
R has two Box-Cox functions: boxcox (MASS) and boxCox (car). Both take a linear model object as input, and both produce a graph of the profile log-likelihood, which is maximized, rather than minimizing the error sum of squares as the text does. The resulting lambda is the same.
library(MASS)
library(car)
par(mfrow = c(1, 3))
boxcox(lm(y ~ x, df))
boxCox(lm(y ~ x, df))
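The maximizing lambda can be extracted numerically rather than read off the graph (a sketch using the grid that boxcox returns):
bc <- boxcox(lm(y ~ x, df), plotit = FALSE)
bc$x[which.max(bc$y)] # lambda with the largest profile log-likelihood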
boxcox.sse <- function(lambda, model)
{
x <- model.frame(model)$x
y <- model.frame(model)$y
K2 <- prod(y)^( 1/length(y) ) # (3.36a) geometric mean of y
K1 <- 1 / (lambda * K2^(lambda - 1)) # (3.36b)
W <- if (lambda != 0) K1 * (y^lambda - 1) else K2 * log(y) # (3.36)
# Deviance = Residual Sum of Squares
return(deviance(lm(W ~ x)))
}
lambda <- seq(-2, 2, by = 0.1)
SSE <- sapply(lambda, boxcox.sse, model = lm(y ~ x, df))
plot(lambda, SSE, type = "l", xlab = expression(lambda))
abline(v = -0.5, lty = 3)
cbind('lambda' = lambda, 'SSE' = SSE)
## lambda SSE
## [1,] -2.0 61.86
## [2,] -1.9 57.57
## [3,] -1.8 53.67
## [4,] -1.7 50.12
## [5,] -1.6 46.91
## [6,] -1.5 44.01
## [7,] -1.4 41.43
## [8,] -1.3 39.13
## [9,] -1.2 37.12
## [10,] -1.1 35.38
## [11,] -1.0 33.91
## [12,] -0.9 32.70
## [13,] -0.8 31.76
## [14,] -0.7 31.09
## [15,] -0.6 30.69
## [16,] -0.5 30.56
## [17,] -0.4 30.72
## [18,] -0.3 31.18
## [19,] -0.2 31.95
## [20,] -0.1 33.06
## [21,] 0.0 34.52
## [22,] 0.1 36.37
## [23,] 0.2 38.64
## [24,] 0.3 41.36
## [25,] 0.4 44.59
## [26,] 0.5 48.37
## [27,] 0.6 52.76
## [28,] 0.7 57.84
## [29,] 0.8 63.67
## [30,] 0.9 70.35
## [31,] 1.0 77.98
## [32,] 1.1 86.68
## [33,] 1.2 96.59
## [34,] 1.3 107.85
## [35,] 1.4 120.64
## [36,] 1.5 135.17
## [37,] 1.6 151.65
## [38,] 1.7 170.36
## [39,] 1.8 191.60
## [40,] 1.9 215.69
## [41,] 2.0 243.05
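The grid minimizer can be picked out programmatically as well:
lambda[which.min(SSE)] # -0.5, where SSE attains its minimum of 30.56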
df <- get("CH01TA01", envir = env)
names(df) <- c('x', 'y')
fit <- lm(y ~ x, df)
R has functions for lowess regression (loess, loess.smooth) and for plotting the smooth, and we will simply make use of these facilities. As stated in the Chapter 2 walk-through, we plot confidence bands using the built-in R function predict. The function scatter.smooth draws both the scatter plot and the lowess curve. FIGURE 3.18 will be ignored since it can be generated in the same manner.
with(df, scatter.smooth(x, y))
title("Lowess Curve and Linear Regression Confidence Bands")
plot(y ~ x, df, xlab = "Lot Size", ylab = "Hours")
title("Lowess Curve and Linear Regression Confidence Bands")
with(df, lines(loess.smooth(x, y), col = "red"))
# Gather confidence bands, ordered by x, and add lines to plot
ci <- cbind(model.frame(fit), predict(fit, int = "c"))[order(df$x), ]
lines(lwr ~ x, ci, col = "blue", lty = "dashed" )
lines(upr ~ x, ci, col = "blue", lty = "dashed" )
df <- get("CH03TA10", envir = env)
names(df) <- c("y", "x")
cbind("Plutonium Activity" = df$x, "Alpha Count Rate" = df$y)
## Plutonium Activity Alpha Count Rate
## [1,] 20 0.150
## [2,] 0 0.004
## [3,] 10 0.069
## [4,] 5 0.030
## [5,] 0 0.011
## [6,] 0 0.004
## [7,] 5 0.041
## [8,] 20 0.109
## [9,] 10 0.068
## [10,] 0 0.009
## [11,] 0 0.009
## [12,] 10 0.048
## [13,] 0 0.006
## [14,] 20 0.083
## [15,] 5 0.037
## [16,] 5 0.039
## [17,] 20 0.132
## [18,] 0 0.004
## [19,] 0 0.006
## [20,] 10 0.059
## [21,] 10 0.051
## [22,] 0 0.002
## [23,] 5 0.049
## [24,] 0 0.106
Analytically check the constancy of the error variance with the BP test.
plot(y ~ x, df, pch = 19, xlab = "pCi/g", ylab = "#/sec")
with(df, lines(loess.smooth(x, y))) # Warnings may occur
c(Stat = bptest(lm(y ~ x, df), student = FALSE)$statistic,
Chisq = qchisq(0.95, 1)) # Compare BP to the 5% chi-square critical value
## Stat.BP  Chisq
##  0.7508 3.8415
df <- df[-24, ] # Remove outlier: Record 24
df <- transform(df, sqrty = sqrt(y), sqrtx = sqrt(x))
# Linear Models Summary and Anova
summary(lm(y ~ x, df))
##
## Call:
## lm(formula = y ~ x, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.03477 -0.00406 -0.00103 0.00494 0.03223
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.007033 0.003599 1.95 0.064 .
## x 0.005537 0.000366 15.13 9.1e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0126 on 21 degrees of freedom
## Multiple R-squared: 0.916, Adjusted R-squared: 0.912
## F-statistic: 229 on 1 and 21 DF, p-value: 9.08e-13
anova(lm(y ~ x, df))
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 0.0362 0.0362 229 9.1e-13 ***
## Residuals 21 0.0033 0.0002
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(sqrty ~ x, df))
##
## Call:
## lm(formula = sqrty ~ x, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.07396 -0.02441 0.00011 0.02801 0.05978
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.094760 0.009567 9.91 2.3e-09 ***
## x 0.013365 0.000973 13.74 5.8e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0334 on 21 degrees of freedom
## Multiple R-squared: 0.9, Adjusted R-squared: 0.895
## F-statistic: 189 on 1 and 21 DF, p-value: 5.77e-12
anova(lm(sqrty ~ x, df))
## Analysis of Variance Table
##
## Response: sqrty
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 0.2108 0.2108 189 5.8e-12 ***
## Residuals 21 0.0235 0.0011
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(sqrty ~ sqrtx, df))
##
## Call:
## lm(formula = sqrty ~ sqrtx, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.04119 -0.01054 0.00087 0.01434 0.05801
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.07301 0.00783 9.32 6.5e-09 ***
## sqrtx 0.05731 0.00302 19.00 1.0e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0248 on 21 degrees of freedom
## Multiple R-squared: 0.945, Adjusted R-squared: 0.942
## F-statistic: 361 on 1 and 21 DF, p-value: 1.05e-14
anova(lm(sqrty ~ sqrtx, df))
## Analysis of Variance Table
##
## Response: sqrty
## Df Sum Sq Mean Sq F value Pr(>F)
## sqrtx 1 0.2214 0.2214 361 1e-14 ***
## Residuals 21 0.0129 0.0006
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pplot <- function(formula, data)
{
fit <- lm(formula, data)
plot(resid(fit) ~ fitted(fit), xlab = "Fitted", ylab = "Residual")
title("(b) Residual Plot")
abline(0, 0)
qqnorm(resid(fit), xlab = "Expected", ylab = "Residual", main = "")
qqline(resid(fit))
title("(c) Normal Probability Plot")
}
par(mfrow = c(2, 2), pch = 19)
pplot(y ~ x, df) # Untransformed Plots (FIGURE 3.21)
pplot(sqrty ~ x, df) # Transformed Response Plots (FIGURE 3.22)
pplot(sqrty ~ sqrtx, df) # Transformed Response and Predictor Plots (FIGURE 3.23)
par(mfrow = c(1,1))
fit <- lm(sqrty ~ sqrtx, df)
scatter.smooth(df$sqrtx, df$sqrty, main = "(d) Confidence Band", pch = 19,
xlab = expression(sqrt(x)), ylab = expression(sqrt(y)))
newdata <- data.frame(sqrtx = sort(df$sqrtx))
pp <- predict(fit, newdata, int = "c")
lines(newdata$sqrtx, pp[, 2], col = "blue")
lines(newdata$sqrtx, pp[, 3], col = "blue")