I want to demonstrate how to identify outliers and high leverage and influential data points. I will be using Advertisement Revenue dataset from website http://www.stat.tamu.edu/~sheather/book/data_sets.php.

Plot the data using boxplot.

library(dplyr)
library(ggplot2)
library(grid)
library(gridExtra)
library(ggrepel)
library(knitr)    # Report display, table format
library(kableExtra)

AdRevDf <- read.csv("https://raw.githubusercontent.com/akulapa/Data621-Week04-Discussion/master/AdRevenue.csv", header= TRUE, stringsAsFactors = F)
AdRevDf$XY <- paste0("(",round(AdRevDf$Circulation,2), " ,", round(AdRevDf$AdRevenue,2),")")

p1 <- ggplot(data = AdRevDf, aes(x = "", y = Circulation)) + geom_boxplot() + labs(title="Boxplot: Circulation",x="") + coord_flip()
p2 <-ggplot(data = AdRevDf, aes(x = "", y = AdRevenue)) + geom_boxplot() + labs(title="Boxplot: Advertisement Revenue",x="") + coord_flip()

AdRevDf$BxPOutlierC <- ifelse(AdRevDf$Circulation>(summary(AdRevDf$Circulation)[5]+1.5*IQR(AdRevDf$Circulation)),'Yes','No')
AdRevDf$BxPOutlierA <- ifelse(AdRevDf$AdRevenue>(summary(AdRevDf$AdRevenue)[5]+1.5*IQR(AdRevDf$AdRevenue)),'Yes','No')
AdRevDf$Outlier <- ifelse((AdRevDf$BxPOutlierC == 'Yes'|AdRevDf$BxPOutlierA == 'Yes'),'Yes','No')

grid.arrange(p1, p2, nrow=2, newpage = F)

According to boxplot any observation that lies outside lower limit \((Q1 - 1.5\times IQR)\) and upper limit \((Q3 + 1.5\times IQR)\) is considered outlier.

Outliers - Circulation

summary(AdRevDf$Circulation)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3310  0.9922  1.6760  3.1180  2.7430 32.7000
AdRevDf %>% filter(BxPOutlierC == 'Yes') %>% arrange(desc(Circulation)) %>% select(Magazine, AdRevenue, Circulation) %>% 
  kable(format="html", caption = "Outliers Based on Circulation") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "left")
Outliers Based on Circulation
Magazine AdRevenue Circulation
Parade (1) 876.907 32.700
AARP The Magazine 487.662 23.434
USA Weekend 640.072 23.041
Reader’s Digest 291.034 10.094
American Profile 291.169 8.162
Better Homes and Gardens 396.865 7.639

Outliers - Advertisement Revenue

summary(AdRevDf$AdRevenue)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    61.1   104.9   133.8   171.1   179.4   876.9
AdRevDf %>% filter(BxPOutlierA == 'Yes') %>% arrange(desc(AdRevenue)) %>% select(Magazine, AdRevenue, Circulation) %>% 
  kable(format="html", caption = "Outliers Based on Advertisement Revenue") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "left")
Outliers Based on Advertisement Revenue
Magazine AdRevenue Circulation
Parade (1) 876.907 32.700
USA Weekend 640.072 23.041
AARP The Magazine 487.662 23.434
Better Homes and Gardens 396.865 7.639
Sports Illustrated 304.185 3.205
Good Housekeeping 291.829 4.741

Regression Equation

AdRevDf1 <- AdRevDf %>% filter(BxPOutlierA == 'No' & BxPOutlierC == 'No')

withOL<-lm(AdRevenue~Circulation, data=AdRevDf)
withoutOL<-lm(AdRevenue~Circulation, data=AdRevDf1)


z <- list(xx = format(coef(withOL)[1], digits = 4),
          yy = format(coef(withOL)[2], digits = 4),
          r2 = format(summary(withOL)$r.squared, digits = 3));

eq <- substitute(italic(hat(y)) == xx + yy %.% italic(x)*","~~italic(r)^2~"="~r2,z)

withOLeq <- as.character(as.expression(eq))

z <- list(xx = format(coef(withoutOL)[1], digits = 4),
          yy = format(coef(withoutOL)[2], digits = 4),
          r2 = format(summary(withoutOL)$r.squared, digits = 3));

eq <- substitute(italic(hat(y)) == xx + yy %.% italic(x)*","~~italic(r)^2~"="~r2,z)

withoutOLeq <- as.character(as.expression(eq))

ggplot(data=AdRevDf, aes(Circulation,AdRevenue)) + 
  geom_point(aes(col=Outlier)) + 
  scale_color_manual(values=c("black", "red")) +
  scale_y_continuous(limits=c(-50,900)) +
  geom_abline(intercept = coef(withOL)[1], slope = coef(withOL)[2], color="blue") +
  geom_abline(intercept = coef(withoutOL)[1], slope = coef(withoutOL)[2], color="purple") +
  geom_text_repel(data=filter(AdRevDf, (BxPOutlierC == 'Yes'|BxPOutlierA == 'Yes')), aes(label=XY), size=3) +
  annotate("text", x = 20, y = 400, label = withOLeq, colour="blue", size = 3, parse=T) +
  annotate("text", x = 20, y = 350, label = "(With Outliers)", colour="blue", size = 3) + 
  annotate("text", x = 10, y = 750, label = withoutOLeq, colour="purple", size = 3, parse=T) +
  annotate("text", x = 10, y = 700, label = "(Without Outliers)", colour="purple", size = 3)

Linear model summary with outliers.

summary(withOL)
## 
## Call:
## lm(formula = AdRevenue ~ Circulation, data = AdRevDf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -147.694  -22.939   -7.845   13.810  131.130 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  99.8095     5.8547   17.05   <2e-16 ***
## Circulation  22.8534     0.9518   24.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.22 on 68 degrees of freedom
## Multiple R-squared:  0.8945, Adjusted R-squared:  0.8929 
## F-statistic: 576.5 on 1 and 68 DF,  p-value: < 2.2e-16

Linear model summary without outliers.

summary(withoutOL)
## 
## Call:
## lm(formula = AdRevenue ~ Circulation, data = AdRevDf1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.973 -12.297  -2.443  10.744  66.429 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   65.522      6.216   10.54 2.81e-15 ***
## Circulation   41.161      3.162   13.02  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.67 on 60 degrees of freedom
## Multiple R-squared:  0.7385, Adjusted R-squared:  0.7342 
## F-statistic: 169.5 on 1 and 60 DF,  p-value: < 2.2e-16

Leverage values

AdRevDf$Leverage <- hatvalues(withOL)

AdRevDfHat <- AdRevDf %>% arrange(desc(Leverage)) %>% slice(1:10) %>% select(Magazine, XY, Leverage, Outlier)

ggplot(AdRevDfHat, aes(seq_along(Leverage), Leverage)) + geom_col() + geom_text(data=AdRevDfHat, aes(label=round(Leverage,2)), size=3, vjust=0) + labs(x="", y="Leverage", title = "Top 10 Leverage Values")

AdRevDfHat %>%  
  kable(format="html", caption = "Top 10 Leverage Values") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "left")
Top 10 Leverage Values
Magazine XY Leverage Outlier
Parade (1) (32.7 ,876.91) 0.4589697 Yes
AARP The Magazine (23.43 ,487.66) 0.2240186 Yes
USA Weekend (23.04 ,640.07) 0.2159826 Yes
Reader’s Digest (10.09 ,291.03) 0.0390123 Yes
American Profile (8.16 ,291.17) 0.0272122 Yes
Better Homes and Gardens (7.64 ,396.87) 0.0246703 Yes
Modern Bride (0.33 ,67.54) 0.0182342 No
Brides Magazine (0.37 ,67.53) 0.0181273 No
New York (0.43 ,61.1) 0.0179587 No
W (0.46 ,76.17) 0.0178772 No

Observations with high leverage.

If leverage value of an observation is greater than three times the mean leverage value (\(3\times \frac{p}{n}\)), it is considered as high leverage observation. p is number of parameters (intercept \(\beta_0\) and slope \(\beta_1\)) and n is number of observations.

Mean leverage value = \(\frac{p}{n} = \frac{2}{70} = 0.0286\)

Three times mean leverage value = 0.0858

AdRevDfHat <- AdRevDf %>% filter(Leverage > 3 * round(2/70,4)) %>% select(Magazine, XY, Leverage, Outlier)

AdRevDfHat %>%  
  kable(format="html", caption = "High Leverage Values") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "left")
High Leverage Values
Magazine XY Leverage Outlier
Parade (1) (32.7 ,876.91) 0.4589697 Yes
USA Weekend (23.04 ,640.07) 0.2159826 Yes
AARP The Magazine (23.43 ,487.66) 0.2240186 Yes

Analysis

In summary, data points are not outliers, but highly influential leverage points. Data points can be excluded from linear model.

References