# filename
fn.data <- "http://statacumen.com/teach/ADA2/ADA2_notes_Ch02_indian.dat"
indian <- read.table(fn.data, header=TRUE)
# Create the "fraction of their life" variable
# yrage = years since migration divided by age
indian$yrage <- indian$yrmig / indian$age
# continuous color for wt
# ggplot: Plot the data with linear regression fit and confidence bands
library(ggplot2)
p <- ggplot(indian, aes(x = yrage, y = sysbp, label = id))
p <- p + geom_point(aes(colour=wt), size=2)
# plot labels next to points
p <- p + geom_text(hjust = 0.5, vjust = -0.5, alpha = 0.25, colour = 2)
# plot regression line and confidence band
p <- p + geom_smooth(method = lm)
p <- p + labs(title="Indian sysbp by yrage with continuous wt")
print(p)
# categorical color for wt
indian$wtcat <- rep(NA, nrow(indian))
indian$wtcat <- "M" # init as medium
indian$wtcat[(indian$wt < 60)] <- "L" # update low
indian$wtcat[(indian$wt >= 70)] <- "H" # update high
# define as a factor variable with a specific order
indian$wtcat <- ordered(indian$wtcat, levels=c("L", "M", "H"))
#
library(ggplot2)
p <- ggplot(indian, aes(x = yrage, y = sysbp, label = id))
p <- p + geom_point(aes(colour=wtcat, shape=wtcat), size=2)
library(R.oo) # for ascii code lookup
p <- p + scale_shape_manual(values=charToInt(sort(unique(indian$wtcat))))
# plot regression line and confidence band
p <- p + geom_smooth(method = lm)
p <- p + labs(title="Indian sysbp by yrage with categorical wt")
print(p)
# fit the simple linear regression model
lm.sysbp.yrage <- lm(sysbp ~ yrage, data = indian)
# use Anova() from library(car) to get ANOVA table (Type 3 SS, df)
library(car)
Anova(lm.sysbp.yrage, type=3)
# fit the multiple linear regression model, (" + wt" added)
lm.sysbp.yrage.wt <- lm(sysbp ~ yrage + wt, data = indian)
library(car)
Anova(lm.sysbp.yrage.wt, type=3)
summary(lm.sysbp.yrage.wt)
#### Example: GCE
fn.data <- "http://statacumen.com/teach/ADA2/ADA2_notes_Ch02_gce.dat"
gce <- read.table(fn.data, header=TRUE)
library(ggplot2)
#suppressMessages(suppressWarnings(library(GGally)))
library(GGally)
#p <- ggpairs(gce)
# put scatterplots on top so y axis is vertical
p <- ggpairs(gce, upper = list(continuous = "points"),
lower = list(continuous = "cor"))
print(p)
# correlation matrix and associated p-values testing "H0: rho == 0"
library(Hmisc)
rcorr(as.matrix(gce))
# y ~ x1
lm.y.x1 <- lm(y ~ x1, data = gce)
library(car)
Anova(lm.y.x1, type=3)
summary(lm.y.x1)
# plot diagnistics
par(mfrow=c(2,3))
plot(lm.y.x1, which = c(1,4,6))
plot(gce$x1, lm.y.x1$residuals, main="Residuals vs x1")
# horizontal line at zero
abline(h = 0, col = "gray75")
# Normality of Residuals
library(car)
qqPlot(lm.y.x1$residuals, las = 1, id.n = 3, main="QQ Plot")
# residuals vs order of data
plot(lm.y.x1$residuals, main="Residuals vs Order of data")
# horizontal line at zero
abline(h = 0, col = "gray75")
# y ~ x2
lm.y.x2 <- lm(y ~ x2, data = gce)
library(car)
Anova(lm.y.x2, type=3)
summary(lm.y.x2)
# plot diagnistics
par(mfrow=c(2,3))
plot(lm.y.x2, which = c(1,4,6))
plot(gce$x2, lm.y.x2$residuals, main="Residuals vs x2")
# horizontal line at zero
abline(h = 0, col = "gray75")
# Normality of Residuals
library(car)
qqPlot(lm.y.x2$residuals, las = 1, id.n = 3, main="QQ Plot")
# residuals vs order of data
plot(lm.y.x2$residuals, main="Residuals vs Order of data")
# horizontal line at zero
abline(h = 0, col = "gray75")
# y ~ x1 + x2
lm.y.x1.x2 <- lm(y ~ x1 + x2, data = gce)
library(car)
Anova(lm.y.x1.x2, type=3)
summary(lm.y.x1.x2)
# plot diagnistics
par(mfrow=c(2,3))
plot(lm.y.x1.x2, which = c(1,4,6))
plot(gce$x1, lm.y.x1.x2$residuals, main="Residuals vs x1")
# horizontal line at zero
abline(h = 0, col = "gray75")
plot(gce$x2, lm.y.x1.x2$residuals, main="Residuals vs x2")
# horizontal line at zero
abline(h = 0, col = "gray75")
# Normality of Residuals
library(car)
qqPlot(lm.y.x1.x2$residuals, las = 1, id.n = 3, main="QQ Plot")
Yes, we’ve seen that X1 may be used to predict Y , and that X2 does not explain significantly more variability in the model with X1
The partial regression residual plot, or added variable plot, is a graphical tool that provides information about the need for transformations in a multiple regression model.
library(car)
avPlots(lm.y.x1.x2, id.n=3)
# function to create partial regression plot
partial.regression.plot <- function (y, x, sel, ...) {
m <- as.matrix(x[, -sel])
# residuals of y regressed on all x's except "sel"
y1 <- lm(y ~ m)$res
# residuals of x regressed on all other x's
x1 <- lm(x[, sel] ~ m)$res
# plot residuals of y vs residuals of x
plot( y1 ~ x1, main="Partial regression plot", ylab="y | others", ...)
# add grid
grid(lty = "solid")
# add red regression line
abline(lm(y1 ~ x1), col = "red", lwd = 2)
}
par(mfrow=c(1, 2))
partial.regression.plot(gce$y, cbind(gce$x1, gce$x2), 1, xlab="x1 | others")
partial.regression.plot(gce$y, cbind(gce$x1, gce$x2), 2, xlab="x2 | others")
gce10 <- gce[-10,]
# y ~ x1 + x2
lm.y10.x1.x2 <- lm(y ~ x1 + x2, data = gce10)
library(car)
Anova(lm.y10.x1.x2, type=3)
summary(lm.y10.x1.x2)
# plot diagnistics
par(mfrow=c(2,3))
plot(lm.y10.x1.x2, which = c(1,4,6))
plot(gce10$x1, lm.y10.x1.x2$residuals, main="Residuals vs x1")
# horizontal line at zero
abline(h = 0, col = "gray75")
plot(gce10$x2, lm.y10.x1.x2$residuals, main="Residuals vs x2")
# horizontal line at zero
abline(h = 0, col = "gray75")
# Normality of Residuals
library(car)
qqPlot(lm.y10.x1.x2$residuals, las = 1, id.n = 3, main="QQ Plot")
Maximum likelihood and AIC/BIC
Example: Peru Indian blood pressure
# filename
fn.data <- "http://statacumen.com/teach/ADA2/ADA2_notes_Ch02_indian.dat"
indian <- read.table(fn.data, header=TRUE)
# Create the "fraction of their life" variable
# yrage = years since migration divided by age
indian$yrage <- indian$yrmig / indian$age
# subset of variables we want in our model
indian2 <- subset(indian, select=c("sysbp", "wt", "ht", "chin"
, "fore", "calf", "pulse", "yrage"))
library(ggplot2)
#suppressMessages(suppressWarnings(library(GGally)))
library(GGally)
#p <- ggpairs(indian2)
# put scatterplots on top so y axis is vertical
p <- ggpairs(indian2, upper = list(continuous = "points")
, lower = list(continuous = "cor"))
print(p)
# correlation matrix and associated p-values testing "H0: rho == 0"
library(Hmisc)
rcorr(as.matrix(indian2))
# fit full model
lm.indian2.full <- lm(sysbp ~ wt + ht + chin + fore + calf + pulse + yrage, data = indian2)
library(car)
Anova(lm.indian2.full, type=3)
summary(lm.indian2.full)
# model reduction using update() and subtracting (removing) model terms
lm.indian2.red <- lm.indian2.full;
# remove calf
lm.indian2.red <- update(lm.indian2.red, ~ . - calf ); summary(lm.indian2.red);
# remove pulse
lm.indian2.red <- update(lm.indian2.red, ~ . - pulse); summary(lm.indian2.red);
# remove fore
lm.indian2.red <- update(lm.indian2.red, ~ . - fore ); summary(lm.indian2.red);
# remove ht
lm.indian2.red <- update(lm.indian2.red, ~ . - ht ); summary(lm.indian2.red);
# remove chin
lm.indian2.red <- update(lm.indian2.red, ~ . - chin ); summary(lm.indian2.red);
# all are significant, stop.
# final model: sysbp ~ wt + yrage
lm.indian2.final <- lm.indian2.red
## AIC
# option: test="F" includes additional information
# for parameter estimate tests that we're familiar with
# option: for BIC, include k=log(nrow( [data.frame name] ))
lm.indian2.red.AIC <- step(lm.indian2.full, direction="backward", test="F")
# BIC (not shown)
# step(lm.indian2.full, direction="backward", test="F", k=log(nrow(indian2)))
library(car)
Anova(lm.indian2.final, type=3)
summary(lm.indian2.final)
# plot diagnistics
par(mfrow=c(2,3))
plot(lm.indian2.final, which = c(1,4,6))
plot(indian2$wt, lm.indian2.final$residuals, main="Residuals vs wt")
# horizontal line at zero
abline(h = 0, col = "gray75")
plot(indian2$yrage, lm.indian2.final$residuals, main="Residuals vs yrage")
# horizontal line at zero
abline(h = 0, col = "gray75")
# Normality of Residuals
library(car)
qqPlot(lm.indian2.final$residuals, las = 1, id.n = 3, main="QQ Plot")
library(car)
avPlots(lm.indian2.final, id.n=3)
#### Example: Rat liver
fn.data <- "http://statacumen.com/teach/ADA2/ADA2_notes_Ch03_ratliver.csv"
ratliver <- read.csv(fn.data)
ratliver <- ratliver[,c(4,1,2,3)] # reorder columns so response is the first
library(ggplot2)
#suppressMessages(suppressWarnings(library(GGally)))
library(GGally)
#p <- ggpairs(ratliver)
# put scatterplots on top so y axis is vertical
p <- ggpairs(ratliver, upper = list(continuous = "points"), lower = list(continuous = "cor"))
print(p)
# correlation matrix and associated p-values testing "H0: rho == 0"
library(Hmisc)
rcorr(as.matrix(ratliver))
# fit full model
lm.ratliver.full <- lm(y ~ bodywt + liverwt + dose, data = ratliver)
library(car)
Anova(lm.ratliver.full, type=3)
summary(lm.ratliver.full)
lm.ratliver.red.AIC <- step(lm.ratliver.full, direction="backward", test="F")
lm.ratliver.final <- lm.ratliver.red.AIC
# plot diagnistics
par(mfrow=c(2,3))
plot(lm.ratliver.final, which = c(1,4,6))
plot(ratliver$bodywt, lm.ratliver.final$residuals, main="Residuals vs bodywt")
# horizontal line at zero
abline(h = 0, col = "gray75")
plot(ratliver$dose, lm.ratliver.final$residuals, main="Residuals vs dose")
# horizontal line at zero
abline(h = 0, col = "gray75")
# Normality of Residuals
library(car)
qqPlot(lm.ratliver.final$residuals, las = 1, id.n = 3, main="QQ Plot")
library(car)
avPlots(lm.ratliver.final, id.n=3)
# remove case 3
ratliver3 <- ratliver[-3,]
# fit full model
lm.ratliver3.full <- lm(y ~ bodywt + liverwt + dose, data = ratliver3)
lm.ratliver3.red.AIC <- step(lm.ratliver3.full, direction="backward", test="F")
All varialbles are omitted!
# ggplot: Plot the data with linear regression fit and confidence bands
library(ggplot2)
p <- ggplot(ratliver, aes(x = bodywt, y = dose, label = 1:nrow(ratliver)))
# plot regression line and confidence band
p <- p + geom_smooth(method = lm)
p <- p + geom_point(alpha=1/3)
# plot labels next to points
p <- p + geom_text(hjust = 0.5, vjust = -0.5, alpha = 0.25, colour = 2)
p <- p + labs(title="Rat liver dose by bodywt: rat 3 overdosed")
print(p)