Textbook: Linear Models with R, 2nd, Julian J. Faraway

Diagnostics

Dataset: Savings rates in 50 countries

Description

The savings data frame has 50 rows and 5 columns. The data is averaged over the period 1960-1970.

data(savings,package="faraway")
#?savings
head(savings)
##              sr pop15 pop75     dpi ddpi
## Australia 11.43 29.35  2.87 2329.68 2.87
## Austria   12.07 23.32  4.41 1507.99 3.93
## Belgium   13.17 23.80  4.43 2108.47 3.82
## Bolivia    5.75 41.89  1.67  189.13 0.22
## Brazil    12.88 42.19  0.83  728.47 4.56
## Canada     8.79 31.72  2.85 2982.88 2.43
lmod<- lm(sr~pop15+pop75+dpi+ddpi,savings)

R provides an easy way to produce a useful selection of residual plots.

  1. Residuals vs Fitted (Check constant symmetrical variation, homoscedasticity)

  2. Normal Q-Q plot (Check Normal assumption)

  3. Scale-Location (Check equal variance)

  4. Residuals vs Leverage (Find influential points)

par(mfrow=c(2,2)) #將四張圖 (2*2) 畫在一起
plot(lmod) 

1. Residuals vs Fitted: Check constant symmetrical variation, homoscedasticity

\[ Var(\varepsilon_i)=\sigma^2 \]

Faraway-Fig 6.1
Faraway-Fig 6.1

2. Normal Q-Q plot (Check Normal assumption)

qqnorm(residuals(lmod), ylab="residuals",main="")
qqline(residuals(lmod))

par(mfrow=c(2,3))
n <- 50
for(i in 1:6) {x <- rnorm(n); qqnorm(x); qqline(x)} #normal

for(i in 1:6) {x <- exp(rnorm(n)); qqnorm(x); qqline(x)} #log-normal: skewed distribution

for(i in 1:6) {x <- rcauchy(n); qqnorm(x); qqline(x)} #Cauchy: long-tailed distribution

for(i in 1:6) {x <- runif(n); qqnorm(x); qqline(x)} #Uniform: short-tailed distribution

curve(dnorm(x),xlim=c(-4,4),ylim=c(0,0.4),ylab="density")
curve(dcauchy(x),add=T,col=2)
curve(dunif(x,min=-sqrt(3),max=sqrt(3)),add=T, col=3)
legend("topleft",c("Normal","Cauchy","Uniform"),col=1:3,lty=1)

Shapiro-Wilk test: a formal test for normality (\(H_0\): residuals are normal)

shapiro.test(residuals(lmod))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(lmod)
## W = 0.98698, p-value = 0.8524

3. Scale-Location: Check equal variance

\[ \hat{\varepsilon} = y - \hat{y} = (I-H)y = (I-H)\varepsilon \] \[ I-H= \begin{pmatrix} 1 & 0 & ... & 0 \\ 0 & 1 & ... & 0 \\ \vdots & & & \vdots \\ 0 & 0 & ... & 1 \end{pmatrix}- \begin{pmatrix} h_1 & h_{12} & ... & h_{1n} \\ h_{21} & h_2 & ... & h_{2n} \\ \vdots & & & \vdots \\ h_{n1} & h_{n2} & ... & h_n \end{pmatrix} \] Standardized residuals:

Since \(Var(\hat{\varepsilon})=(I-H)\sigma^2\), \(Var(\hat{\varepsilon}_i)=(1-h_i)\sigma^2\). The standardized residual \(r_i\)

\[ r_i = \frac{\hat{\varepsilon}_i}{\hat{\sigma}\sqrt{(1-h_i)}} \]

lmod<- lm(sr~pop15+pop75+dpi+ddpi,savings)
r<-rstandard(lmod)

x<-model.matrix(~pop15+pop75+dpi+ddpi,savings)
H<-x%*%solve(t(x)%*%x)%*%t(x)
r2<-residuals(lmod)/(summary(lmod)$sigma*sqrt(1-diag(H)))

head(cbind(r,r2))
##                     r          r2
## Australia  0.23520105  0.23520105
## Austria    0.17282943  0.17282943
## Belgium    0.61085760  0.61085760
## Bolivia   -0.19245030 -0.19245030
## Brazil     0.96858807  0.96858807
## Canada    -0.09083873 -0.09083873
plot(lmod$fitted.values, sqrt(abs(r)), ylab = expression(sqrt(r )),xlab= expression(hat(y)))

4. Residuals vs Leverage: Find influential points

\[ H = X(X^TX)^{-1}X^T = \begin{pmatrix} h_1 & h_{12} & ... & h_{1n} \\ h_{21} & h_2 & ... & h_{2n} \\ \vdots & & & \vdots \\ h_{n1} & h_{n2} & ... & h_n \end{pmatrix} \]

  • Leverage:
    \(h_i\) only depend on \(X\)

  • Standardized residual: \[r=\frac{\hat{\varepsilon}_i}{\hat{\sigma}\sqrt{(1-h_i)}}\]

  • The Cook statistics: popular influence diagnostics \[ D_i = \frac{(\hat{y}-\hat{y}_{(i)})^T(\hat{y}-\hat{y}_{(i)})}{p\hat{\sigma}^2}= \frac{1}{p}r_i^2\frac{h_i}{1-h_i} \] Standardized residual \(r_i^2\) increase \(\Rightarrow D_i\) increase.
    Leverage \(h_i\) increase \(\Rightarrow D_i\) increase.

library(faraway)
## Warning: 套件 'faraway' 是用 R 版本 4.3.1 來建造的
lmod <- lm(sr ~ pop15+pop75+dpi+ddpi,savings)
cook <- cooks.distance(lmod)
halfnorm(cook,nlab=3,labs=rownames(savings),ylab="Cook's distances") 

lmodi <- lm(sr ~ pop15+pop75+dpi+ddpi,savings,subset=(cook < max(cook)))
summary(lmodi)$coefficients
##                  Estimate   Std. Error    t value    Pr(>|t|)
## (Intercept) 24.5240459788 8.2240263129  2.9819999 0.004654742
## pop15       -0.3914401268 0.1579094892 -2.4788892 0.017084112
## pop75       -1.2808669233 1.1451820596 -1.1184832 0.269430095
## dpi         -0.0003189001 0.0009293298 -0.3431507 0.733119294
## ddpi         0.6102790264 0.2687784209  2.2705656 0.028122167
summary(lmod)$coefficients
##                  Estimate   Std. Error    t value     Pr(>|t|)
## (Intercept) 28.5660865407 7.3545161062  3.8841558 0.0003338249
## pop15       -0.4611931471 0.1446422248 -3.1885098 0.0026030189
## pop75       -1.6914976767 1.0835989307 -1.5609998 0.1255297940
## dpi         -0.0003369019 0.0009311072 -0.3618293 0.7191731554
## ddpi         0.4096949279 0.1961971276  2.0881801 0.0424711387
plot(dfbeta(lmod)[,2],ylab="Change in pop15 coef")
abline(h=0)
points(which(cook>0.05), dfbeta(lmod)[which(cook>0.05),2],col=2, pch=16)
text(which(cook>0.05)-3, dfbeta(lmod)[which(cook>0.05),2], rownames(savings)[which(cook>0.05)],col=2)

plot(lmod)

Correlated Errors

Dataset: Northern Hemisphere (北半球) temperatures and climate proxies in the last millenia ()

Description

Average Northen Hemisphere Temperature (y: 北半球溫度) from 1856-2000 and eight climate proxies from 1000-2000AD. Data can be used to predict temperatures prior to 1856.

data(globwarm,package="faraway")
lmod.cor <- lm(nhtemp ~ wusa + jasper + westgreen + chesapeake + tornetrask + urals + mongolia + tasman, globwarm)
plot(residuals(lmod.cor) ~ year, na.omit(globwarm), ylab="Residuals")
abline(h=0)

n <- length(residuals(lmod.cor))
plot(tail(residuals(lmod.cor),n-1) ~ head(residuals(lmod.cor),n-1), xlab= expression(hat(epsilon)[i]),ylab=expression(hat(epsilon)[i+1]))
abline(h=0,v=0,col=grey(0.75))

Durbin-Watson test: test for auto-correlation (\(H_0\): errors are uncorrelated)

\[ DW=\frac{\sum_{i=2}^n(\hat{\varepsilon}_i-\hat{\varepsilon}_{i-1})^2}{\sum_{i=1}^n \hat{\varepsilon}_i^2} \]

DW small: positive auto-correlation (reject \(H_0\))

DW large: negative auto-correlation (reject \(H_0\))

#install.packages("lmtest")
library(lmtest)
dwtest(nhtemp ~ wusa + jasper + westgreen + chesapeake + tornetrask + urals + mongolia + tasman, data=globwarm)
## 
##  Durbin-Watson test
## 
## data:  nhtemp ~ wusa + jasper + westgreen + chesapeake + tornetrask +     urals + mongolia + tasman
## DW = 0.81661, p-value = 1.402e-15
## alternative hypothesis: true autocorrelation is greater than 0