Distinction Between Outliers and High Leverage Observations

In this section, we learn the distinction between outliers and high leverage observations. In short:

  • An outlier is a data point whose response y does not follow the general trend of the rest of the data.

  • A data point has high leverage if it has “extreme” predictor x values. With a single predictor, an extreme x value is simply one that is particularly high or low. With multiple predictors, extreme x values may be particularly high or low for one or more predictors, or may be “unusual” combinations of predictor values. For example, with two predictors that are positively correlated, an unusual combination might be a high value of one predictor paired with a low value of the other; the sketch below illustrates this.
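
To make “unusual combinations” concrete, here is a minimal sketch with simulated data (the variables x1, x2, y and the rough 2*p/n leverage cutoff are illustrative, not part of the examples below). The added point is moderate in each predictor separately, but the combination is unusual, and its hat value is large:

#Sketch: high leverage from an unusual combination of predictor values
set.seed(42)
x1 <- rnorm(50)
x2 <- x1 + rnorm(50, sd = 0.3)      # x2 positively correlated with x1
x1 <- c(x1, 1.5); x2 <- c(x2, -1.5) # high x1 paired with low x2
y <- x1 + x2 + rnorm(51)
fit <- lm(y ~ x1 + x2)
hatvalues(fit)[51]                  # leverage of the unusual point
2*length(coef(fit))/51              # rough cutoff: 2*p/n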

Example 1

data_1<-data.frame(
  x=c(0.1,0.45401,1.09765,1.27936,2.20611,2.50064,3.0403,3.23583,4.45308,4.1699,5.28474,5.59238,5.92091,6.66066,6.79953,7.97943,8.41536,8.71607,8.70156,9.16463),
  y=c(-0.0716,4.1673,6.5703,13.815,11.4501,12.9554,20.1575,17.5633,26.0317,22.7573,26.303,30.6885,33.9402,30.9228,34.11,44.4536,46.5022,50.0568,46.5475,45.7762)
)
plot(data_1$x,data_1$y,main="No Outlier & No Influence")
lm_1<-lm(y~x,data=data_1)
abline(lm_1,col="blue")

#Plot Residual Vs Leverage
plot(lm_1, main = "Residual vs Leverage", which=5)

All of the data points follow the general trend of the rest of the data, so there are no outliers (in the y direction), and none of the data points are extreme with respect to x, so there are no high leverage points. Overall, none of the data points appear to be influential with respect to the location of the best-fitting line.
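
As a quick numeric companion to the plot (a sketch using the usual rough cutoffs rather than a formal test), you can compare the largest leverage to 2*p/n and scan the studentized residuals:

#Quick numeric check of the claims above
p <- length(coef(lm_1)); n <- nrow(data_1)
max(hatvalues(lm_1))                # largest leverage; rough cutoff is 2*p/n below
2*p/n
range(rstudent(lm_1))               # most extreme studentized residuals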

Example 2

data_2<-data.frame(
  x=c(0.1,0.45401,1.09765,1.27936,2.20611,2.50064,3.0403,3.23583,4.45308,4.1699,5.28474,5.59238,5.92091,6.66066,6.79953,7.97943,8.41536,8.71607,8.70156,9.16463,4),
  y=c(-0.0716,4.1673,6.5703,13.815,11.4501,12.9554,20.1575,17.5633,26.0317,22.7573,26.303,30.6885,33.9402,30.9228,34.11,44.4536,46.5022,50.0568,46.5475,45.7762,40)
)
plot(data_2$x,data_2$y,main="Only Outlier")
lm_2<-lm(y~x,data=data_2)
abline(lm_2,col="blue")

#Plot Residual Vs Leverage
plot(lm_2,main = "Residual vs Leverage", which=5)

Because the added data point (x = 4, y = 40) lies well above the general trend of the rest of the data, it would be considered an outlier. However, its x value is not extreme, so the point does not have high leverage.

Example 3

data_3<-data.frame(
  x=c(0.1,0.45401,1.09765,1.27936,2.20611,2.50064,3.0403,3.23583,4.45308,4.1699,5.28474,5.59238,5.92091,6.66066,6.79953,7.97943,8.41536,8.71607,8.70156,9.16463,14),
  y=c(-0.0716,4.1673,6.5703,13.815,11.4501,12.9554,20.1575,17.5633,26.0317,22.7573,26.303,30.6885,33.9402,30.9228,34.11,44.4536,46.5022,50.0568,46.5475,45.7762,68)
)
plot(data_3$x,data_3$y,main="Only High Leverage")
lm_3<-lm(y~x,data=data_3)
abline(lm_3,col="blue")

#Plot Residual Vs Leverage
plot(lm_3, main = "Residual vs Leverage", which=5)

The data point at the top right does follow the general trend of the rest of the data, so it is not deemed an outlier here. However, it does have an extreme x value (x = 14), so it has high leverage. Because the point is consistent with the trend, it barely changes the fitted line; high leverage alone does not make a point influential.

Example 4

data_4<-data.frame(
  x=c(0.1,0.45401,1.09765,1.27936,2.20611,2.50064,3.0403,3.23583,4.45308,4.1699,5.28474,5.59238,5.92091,6.66066,6.79953,7.97943,8.41536,8.71607,8.70156,9.16463,13),
  y=c(-0.0716,4.1673,6.5703,13.815,11.4501,12.9554,20.1575,17.5633,26.0317,22.7573,26.303,30.6885,33.9402,30.9228,34.11,44.4536,46.5022,50.0568,46.5475,45.7762,15)
)
plot(data_4$x,data_4$y,main="Outlier & Influence")
lm_4<-lm(y~x,data=data_4)
abline(lm_4,col="blue")

#Plot Residual Vs Leverage
plot(lm_4, main = "Residual vs Leverage", which=5)

The data point at the bottom right (x = 13, y = 15) does not follow the general trend of the rest of the data and also has an extreme x value, so it is both an outlier and a high leverage point. It is also influential: it pulls the fitted line down toward itself.

Bonferroni test for an outlier (J. Fox)
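
The test treats each studentized residual as a t statistic with n - p - 1 degrees of freedom, converts it to a two-sided p-value, and multiplies that p-value by n to account for implicitly testing all n observations (the Bonferroni adjustment). The function below is adapted from John Fox's car package; if you have car installed, you can call car::outlierTest() directly instead.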

outlierTest <- function(model, ...){
  UseMethod("outlierTest")
}

outlierTest.lm <- function(model, cutoff=0.05, n.max=10, order=TRUE, labels=names(rstudent), ...){
  rstudent <- rstudent(model)
  if (length(rstudent) != length(labels))
    stop("Number of labels does not correspond to number of residuals.")
  else names(rstudent) <- labels
  df <- df.residual(model) - 1
  rstudent <- rstudent[!is.na(rstudent)]
  n <- length(rstudent)
  p <- if (class(model)[1] == "glm")
    2*(pnorm(abs(rstudent), lower.tail=FALSE))
  else 2*(pt(abs(rstudent), df, lower.tail=FALSE))
  bp <- n*p
  ord <- if (order) order(bp) else 1:n
  ord <- ord[bp[ord] <= cutoff]
  result <- if (length(ord) == 0){
    which <- which.max(abs(rstudent))
    list(rstudent=rstudent[which], p=p[which], bonf.p=bp[which], signif=FALSE, cutoff=cutoff)
  }
  else {
    if (length(ord) > n.max) ord <- ord[1:n.max]
    result <- list(rstudent=rstudent[ord], p=p[ord], bonf.p=bp[ord], signif=TRUE, cutoff=cutoff)
  }
  class(result)<-"outlierTest"
  result
}

outlierTest.lmerMod <- function(model, ...){
  outlierTest.lm(model, ...)
}

print.outlierTest<-function(x, digits=5, ...){
  if (!x$signif){
    cat("No Studentized residuals with Bonferroni p <", x$cutoff)
    cat("\nLargest |rstudent|:\n")
  }
  bp <- x$bonf.p
  bp[bp > 1] <- NA
  table <- data.frame(rstudent=x$rstudent,
                      "unadjusted p-value"=signif(x$p, digits), "Bonferroni p"=signif(bp, digits),
                      check.names=FALSE)
  rownames(table) <- names(x$rstudent)
  print(table)
  invisible(x)
}

Regression with Influential Obs

#One Influence
mydata<-data.frame(
  x=c(1,2,3,4,5,7,3,2,12,11,15,14,17,22),
  y=c(23,24,23,19,34,35,36,36,34,32,38,41,42,180)
)
plot(mydata$x,mydata$y,main="One Influence At Top Right")
mydata_regression<-lm(y~x,data=mydata)
summary(mydata_regression)
## 
## Call:
## lm(formula = y ~ x, data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.392 -22.608   0.257   9.709  82.338 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    8.473     13.576   0.624  0.54423   
## x              4.054      1.280   3.168  0.00809 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.85 on 12 degrees of freedom
## Multiple R-squared:  0.4555, Adjusted R-squared:  0.4101 
## F-statistic: 10.04 on 1 and 12 DF,  p-value: 0.008094
abline(mydata_regression,col="blue")

Outlier Test

#Bonferroni outlier test on the studentized residuals
outlierTest(mydata_regression)
##    rstudent unadjusted p-value Bonferroni p
## 14  18.8833         9.8899e-10   1.3846e-08
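
Observation 14 has a studentized residual of about 18.9 with a Bonferroni p-value far below the 0.05 cutoff, so the test flags it as an outlier.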

Cook’s Distance

#Cook's Distance
#cooks.distance(mydata_regression)

#Max Cook's Distance
cooks.distance(mydata_regression)[which.max(cooks.distance(mydata_regression))]
##       14 
## 3.693296
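
Cook's distance can also be computed by hand: D_i = (r_i^2 / p) * h_i / (1 - h_i), where r_i is the standardized residual, h_i the leverage, and p the number of coefficients. A minimal sketch verifying this identity against cooks.distance():

#Cook's distance by hand, from standardized residuals and leverages
r <- rstandard(mydata_regression)
h <- hatvalues(mydata_regression)
p <- length(coef(mydata_regression))
all.equal(unname(r^2/p * h/(1 - h)), unname(cooks.distance(mydata_regression)))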

All summaries for Cook’s distance, DFFITS, leverage

alldata<-data.frame(
  x=c(1,2,3,4,5,7,3,2,12,11,15,14,17,22),
  y=c(23,24,23,19,34,35,36,36,34,32,38,41,42,180),
  cook_distance=c(cooks.distance(mydata_regression)),
  dffits=c(dffits(mydata_regression)),
  #dfbetas is omitted here: dfbetas() returns a matrix with one column per coefficient
  leverage=c(hatvalues(mydata_regression))
)

# Peek (row names suppressed)
print(alldata, row.names = FALSE)
##   x   y cook_distance      dffits   leverage
##   1  23  0.0137885203  0.15991716 0.16633907
##   2  24  0.0056029292  0.10163701 0.14250614
##   3  23  0.0004654358  0.02921943 0.12211302
##   4  19  0.0022326538 -0.06407964 0.10515971
##   5  34  0.0016121388  0.05443785 0.09164619
##   7  35  0.0001576538 -0.01700370 0.07493857
##   3  36  0.0196473777  0.19206404 0.12211302
##   2  36  0.0383869834  0.27054372 0.14250614
##  12  34  0.0318951165 -0.24830848 0.09336609
##  11  32  0.0229456022 -0.20958961 0.08280098
##  15  38  0.1026220325 -0.45728687 0.14570025
##  14  41  0.0502492716 -0.31284333 0.12481572
##  17  42  0.2022058962 -0.65529023 0.19778870
##  22 180  3.6932955120 15.04203884 0.38820639
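
Observation 14 dominates every column of the table: its Cook's distance (3.69), DFFITS (15.04), and leverage (0.39) are all far larger than those of the other observations. DFFITS itself measures, in studentized units, how much the fitted value for an observation changes when that observation is deleted, and it can be computed by hand from the studentized residuals and leverages. A minimal sketch checking this against dffits():

#DFFITS by hand, from studentized residuals and leverages
rs <- rstudent(mydata_regression)
h <- hatvalues(mydata_regression)
all.equal(unname(rs * sqrt(h/(1 - h))), unname(dffits(mydata_regression)))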

Plot for Cook’s distance (Influential Obs)

#Plot Cook's Distance
n <- nrow(mydata)
plot(cooks.distance(mydata_regression), main = "Cook's Distance for Influential Obs")
abline(h = 4/n, lty = 2, col = "blue") # add cutoff line
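
Rather than reading the flagged points off the plot, a short sketch lists the observations whose Cook's distance exceeds the 4/n rule of thumb:

#Observations with Cook's distance above the 4/n cutoff
which(cooks.distance(mydata_regression) > 4/n)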

Plot for DFFITS test (Influential Obs)

#Plot DFFITS
n <- nrow(mydata)
p <- length(mydata_regression$coefficients) # p = k + 1 coefficients, including the intercept
plot(dffits(mydata_regression), main = "DFFITS for Influential Obs")
abline(h = c(-2, 2)*sqrt(p/(n - p)), lty = 2, col = "blue") # cutoff: |DFFITS| > 2*sqrt((k+1)/(n-k-1))

Plot for DFBETAS test (Influential Obs)
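
DFBETAS measures, for each coefficient separately, how many standard errors that coefficient changes when an observation is deleted; |DFBETAS| > 2/sqrt(n) is a common rule of thumb for flagging influential observations.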

#Plot DFBETAS (one set of points per coefficient)
n <- nrow(mydata)
matplot(dfbetas(mydata_regression), main = "DFBETAS for Influential Obs", ylab = "DFBETAS")
abline(h = c(-2, 2)/sqrt(n), lty = 2, col = "blue") # cutoff: |DFBETAS| > 2/sqrt(n)

Plot for Residual vs Leverage

Each observation in the dataset is shown as a single point in the plot. The x-axis shows the leverage of each point and the y-axis shows its standardized residual.

Leverage measures how far an observation's predictor values lie from those of the other observations. It quantifies an observation's potential to influence the fit, not the influence itself.

A standardized residual is the difference between an observation's actual response and its fitted value, divided by the estimated standard deviation of that residual.

An observation is influential, meaning the regression coefficients would change noticeably if it were removed, when it combines high leverage with a large standardized residual. Cook's distance combines the two, and the plot marks contours of Cook's distance with dashed lines.
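
As a minimal sketch (an identity check, not part of the original analysis), the standardized residuals on the y-axis can be reproduced from the raw residuals, the residual standard error, and the leverages:

#Standardized residuals by hand, matching rstandard()
e <- residuals(mydata_regression)
h <- hatvalues(mydata_regression)
s <- summary(mydata_regression)$sigma
all.equal(unname(e/(s*sqrt(1 - h))), unname(rstandard(mydata_regression)))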

#Plot Residual Vs Leverage
plot(mydata_regression, main = "Residual vs Leverage", which=5)

We can see that observation #14 in the top right corner falls outside the dashed Cook's distance contours, which marks it as an influential point.

This means that if we removed this observation from the dataset and fit the regression model again, the coefficients would change substantially.

Regression without Influential Obs

# No Influence
mydata_new<-data.frame(
  xnew=c(1,2,3,4,5,7,3,2,12,11,15,14,17),
  ynew=c(23,24,23,19,34,35,36,36,34,32,38,41,42)
)
plot(mydata_new$xnew,mydata_new$ynew,main="No Influence")
mydata_regression_new<-lm(ynew~xnew,data=mydata_new)
summary(mydata_regression_new)
## 
## Call:
## lm(formula = ynew ~ xnew, data = mydata_new)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.988 -3.250 -1.027  3.274  8.837 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.3373     2.6105   9.706 9.95e-07 ***
## xnew          0.9127     0.2848   3.204  0.00839 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.575 on 11 degrees of freedom
## Multiple R-squared:  0.4828, Adjusted R-squared:  0.4358 
## F-statistic: 10.27 on 1 and 11 DF,  p-value: 0.008391
abline(mydata_regression_new,col="blue")

# Bonferroni outlier test on the studentized residuals
outlierTest(mydata_regression_new)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##   rstudent unadjusted p-value Bonferroni p
##  -2.202916           0.052183      0.67838
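
With the influential observation removed, no studentized residual is significant after the Bonferroni correction. The maximum Cook's distance, computed next, is also far smaller than before.
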
# Max Cook's Distance
cooks.distance(mydata_regression_new)[which.max(cooks.distance(mydata_regression_new))]
##         8 
## 0.2670507
alldata_new<-data.frame(
  x=c(1,2,3,4,5,7,3,2,12,11,15,14,17),
  y=c(23,24,23,19,34,35,36,36,34,32,38,41,42),
  cook_distance=c(cooks.distance(mydata_regression_new))
)

# Peek (row names suppressed)
print(alldata_new, row.names = FALSE)
##   x  y cook_distance
##   1 23   0.046714338
##   2 24   0.034202052
##   3 23   0.069136774
##   4 19   0.214929482
##   5 34   0.030079052
##   7 35   0.015661511
##   3 36   0.168558632
##   2 36   0.267050738
##  12 34   0.014848702
##  11 32   0.025775208
##  15 38   0.006507487
##  14 41   0.039144785
##  17 42   0.014510572

Plot for Cook’s distance (No Influential Obs)

# Plot Cook's Distance
n_new <- nrow(mydata_new)
plot(cooks.distance(mydata_regression_new), main = "Cook's Distance for No Influential Obs")
abline(h = 4/n_new, lty = 2, col = "blue") # add cutoff line

Plot for DFFITS test (No Influential Obs)

#Plot DFFITS
n_new <- nrow(mydata_new)
p_new <- length(mydata_regression_new$coefficients) # p = k + 1 coefficients, including the intercept
plot(dffits(mydata_regression_new), main = "DFFITS for No Influential Obs")
abline(h = c(-2, 2)*sqrt(p_new/(n_new - p_new)), lty = 2, col = "blue") # cutoff: |DFFITS| > 2*sqrt((k+1)/(n-k-1))

Plot for DFBETAS test (No Influential Obs)

#Plot DFBETAS (one set of points per coefficient)
n_new <- nrow(mydata_new)
matplot(dfbetas(mydata_regression_new), main = "DFBETAS for No Influential Obs", ylab = "DFBETAS")
abline(h = c(-2, 2)/sqrt(n_new), lty = 2, col = "blue") # cutoff: |DFBETAS| > 2/sqrt(n)

Plot for Residual vs Leverage

In the plot below, observation #8 lies closest to the dashed Cook's distance contour but does not cross it. This means there are no influential points in our regression model.

#Plot Residual Vs Leverage
plot(mydata_regression_new, main = "Residual vs Leverage", which=5)