Regression:Influential observations

In this post, we will calculate the measures of influence by hand:

Cook’s D
DFFITS
DFBETAS

Also, in this post we will calculate:

General residuals
Predicted residuals
PRESS
Standardised residuals
Studentisied residuals
Leverage points

Model fitting

Lets fit two models, one model with all the observations and another after removing first observation.

attach(cars)

x1 <- lm(speed ~ dist, data = cars) # all observations

x2 <-  lm(speed ~ dist, data = cars[-1,]) # without first obs

Fitted values

The fitted values for both the models

pred_x1 <- predict(x1, cars[1,], se.fit = T)

pred_x2 <- predict(x2, cars[1,], se.fit = T)

c(pred_x1$fit, pred_x2$fit) # Fitted values for x1 and x2

##        1        1 
## 8.615041 8.971019

Design matrix, hat matrix, leverage points

x <- model.matrix(speed ~ dist) # x matrix

y <- speed # response variable

betas <- solve(crossprod(x))%*%crossprod(x,y) # co-efficients

h <- x%*%solve(crossprod(x))%*%t(x) # Hat matrix

leverage <- diag(h) #Leverage points

i <- diag(1, dim(cars)[1]) # Identity matrix

Regular residuals

e <- (i - h)%*%y # regular residuals

Predicted residuals

pi <- e/(1-diag(h)) # predicted residuals

PRESS

press <- sum(pi)

Standardised residuals

di <- e/sqrt(anova(x1)$`Mean Sq`[2]) # standardised residuals

Studentised residuals

ri <- e/sqrt(anova(x1)$`Mean Sq`[2]*(1-diag(h))) # studentised residuals

DFBETAS

#DFBETAS
num <- x1$coefficients[2] - x2$coefficients[2]

denom <- sqrt(anova(x2)$`Mean Sq`[2]*solve(crossprod(x))[4])

df_betas <- num/denom

DFFITS

#DFFITS

num_dffits <-pred_x1$fit - pred_x2$fit

denom_dffits <- sqrt(anova(x2)$`Mean Sq`[2]*diag(h)[1])

df_fits <- num_dffits/denom_dffits

Cook’s D

#COOK'S D

c_num <- t(pred_x2$fit - pred_x1$fit)*(pred_x2$fit - pred_x1$fit)

c_denom <- sum(diag(h))*anova(x1)$`Mean Sq`[2]

cdist <- c_num/c_denom

#COOK'S D(using single fit)

risqaure <- ri^2 #square of studentised residuals

hii <- diag(h) # leverage points

p <- sum(diag(h)) # number of parameters

one_hii <- 1 - diag(h) 

cooks_num <- risqaure*hii

cooks_denom <- p*one_hii

cooksdist <- round(cooks_num/cooks_denom, digits = 3)