I want to demonstrate how to identify outliers and high leverage and influential data points. I will be using Advertisement Revenue dataset from website http://www.stat.tamu.edu/~sheather/book/data_sets.php.
Plot the data using boxplot.
library(dplyr)
library(ggplot2)
library(grid)
library(gridExtra)
library(ggrepel)
library(knitr) # Report display, table format
library(kableExtra)
AdRevDf <- read.csv("https://raw.githubusercontent.com/akulapa/Data621-Week04-Discussion/master/AdRevenue.csv", header= TRUE, stringsAsFactors = F)
AdRevDf$XY <- paste0("(",round(AdRevDf$Circulation,2), " ,", round(AdRevDf$AdRevenue,2),")")
p1 <- ggplot(data = AdRevDf, aes(x = "", y = Circulation)) + geom_boxplot() + labs(title="Boxplot: Circulation",x="") + coord_flip()
p2 <-ggplot(data = AdRevDf, aes(x = "", y = AdRevenue)) + geom_boxplot() + labs(title="Boxplot: Advertisement Revenue",x="") + coord_flip()
AdRevDf$BxPOutlierC <- ifelse(AdRevDf$Circulation>(summary(AdRevDf$Circulation)[5]+1.5*IQR(AdRevDf$Circulation)),'Yes','No')
AdRevDf$BxPOutlierA <- ifelse(AdRevDf$AdRevenue>(summary(AdRevDf$AdRevenue)[5]+1.5*IQR(AdRevDf$AdRevenue)),'Yes','No')
AdRevDf$Outlier <- ifelse((AdRevDf$BxPOutlierC == 'Yes'|AdRevDf$BxPOutlierA == 'Yes'),'Yes','No')
grid.arrange(p1, p2, nrow=2, newpage = F)
According to boxplot any observation that lies outside lower limit \((Q1 - 1.5\times IQR)\) and upper limit \((Q3 + 1.5\times IQR)\) is considered outlier.
summary(AdRevDf$Circulation)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3310 0.9922 1.6760 3.1180 2.7430 32.7000
AdRevDf %>% filter(BxPOutlierC == 'Yes') %>% arrange(desc(Circulation)) %>% select(Magazine, AdRevenue, Circulation) %>%
kable(format="html", caption = "Outliers Based on Circulation") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "left")
| Magazine | AdRevenue | Circulation |
|---|---|---|
| Parade (1) | 876.907 | 32.700 |
| AARP The Magazine | 487.662 | 23.434 |
| USA Weekend | 640.072 | 23.041 |
| Reader’s Digest | 291.034 | 10.094 |
| American Profile | 291.169 | 8.162 |
| Better Homes and Gardens | 396.865 | 7.639 |
summary(AdRevDf$AdRevenue)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 61.1 104.9 133.8 171.1 179.4 876.9
AdRevDf %>% filter(BxPOutlierA == 'Yes') %>% arrange(desc(AdRevenue)) %>% select(Magazine, AdRevenue, Circulation) %>%
kable(format="html", caption = "Outliers Based on Advertisement Revenue") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "left")
| Magazine | AdRevenue | Circulation |
|---|---|---|
| Parade (1) | 876.907 | 32.700 |
| USA Weekend | 640.072 | 23.041 |
| AARP The Magazine | 487.662 | 23.434 |
| Better Homes and Gardens | 396.865 | 7.639 |
| Sports Illustrated | 304.185 | 3.205 |
| Good Housekeeping | 291.829 | 4.741 |
AdRevDf1 <- AdRevDf %>% filter(BxPOutlierA == 'No' & BxPOutlierC == 'No')
withOL<-lm(AdRevenue~Circulation, data=AdRevDf)
withoutOL<-lm(AdRevenue~Circulation, data=AdRevDf1)
z <- list(xx = format(coef(withOL)[1], digits = 4),
yy = format(coef(withOL)[2], digits = 4),
r2 = format(summary(withOL)$r.squared, digits = 3));
eq <- substitute(italic(hat(y)) == xx + yy %.% italic(x)*","~~italic(r)^2~"="~r2,z)
withOLeq <- as.character(as.expression(eq))
z <- list(xx = format(coef(withoutOL)[1], digits = 4),
yy = format(coef(withoutOL)[2], digits = 4),
r2 = format(summary(withoutOL)$r.squared, digits = 3));
eq <- substitute(italic(hat(y)) == xx + yy %.% italic(x)*","~~italic(r)^2~"="~r2,z)
withoutOLeq <- as.character(as.expression(eq))
ggplot(data=AdRevDf, aes(Circulation,AdRevenue)) +
geom_point(aes(col=Outlier)) +
scale_color_manual(values=c("black", "red")) +
scale_y_continuous(limits=c(-50,900)) +
geom_abline(intercept = coef(withOL)[1], slope = coef(withOL)[2], color="blue") +
geom_abline(intercept = coef(withoutOL)[1], slope = coef(withoutOL)[2], color="purple") +
geom_text_repel(data=filter(AdRevDf, (BxPOutlierC == 'Yes'|BxPOutlierA == 'Yes')), aes(label=XY), size=3) +
annotate("text", x = 20, y = 400, label = withOLeq, colour="blue", size = 3, parse=T) +
annotate("text", x = 20, y = 350, label = "(With Outliers)", colour="blue", size = 3) +
annotate("text", x = 10, y = 750, label = withoutOLeq, colour="purple", size = 3, parse=T) +
annotate("text", x = 10, y = 700, label = "(Without Outliers)", colour="purple", size = 3)
Linear model summary with outliers.
summary(withOL)
##
## Call:
## lm(formula = AdRevenue ~ Circulation, data = AdRevDf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -147.694 -22.939 -7.845 13.810 131.130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.8095 5.8547 17.05 <2e-16 ***
## Circulation 22.8534 0.9518 24.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.22 on 68 degrees of freedom
## Multiple R-squared: 0.8945, Adjusted R-squared: 0.8929
## F-statistic: 576.5 on 1 and 68 DF, p-value: < 2.2e-16
Linear model summary without outliers.
summary(withoutOL)
##
## Call:
## lm(formula = AdRevenue ~ Circulation, data = AdRevDf1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.973 -12.297 -2.443 10.744 66.429
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.522 6.216 10.54 2.81e-15 ***
## Circulation 41.161 3.162 13.02 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.67 on 60 degrees of freedom
## Multiple R-squared: 0.7385, Adjusted R-squared: 0.7342
## F-statistic: 169.5 on 1 and 60 DF, p-value: < 2.2e-16
Leverage values
AdRevDf$Leverage <- hatvalues(withOL)
AdRevDfHat <- AdRevDf %>% arrange(desc(Leverage)) %>% slice(1:10) %>% select(Magazine, XY, Leverage, Outlier)
ggplot(AdRevDfHat, aes(seq_along(Leverage), Leverage)) + geom_col() + geom_text(data=AdRevDfHat, aes(label=round(Leverage,2)), size=3, vjust=0) + labs(x="", y="Leverage", title = "Top 10 Leverage Values")
AdRevDfHat %>%
kable(format="html", caption = "Top 10 Leverage Values") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "left")
| Magazine | XY | Leverage | Outlier |
|---|---|---|---|
| Parade (1) | (32.7 ,876.91) | 0.4589697 | Yes |
| AARP The Magazine | (23.43 ,487.66) | 0.2240186 | Yes |
| USA Weekend | (23.04 ,640.07) | 0.2159826 | Yes |
| Reader’s Digest | (10.09 ,291.03) | 0.0390123 | Yes |
| American Profile | (8.16 ,291.17) | 0.0272122 | Yes |
| Better Homes and Gardens | (7.64 ,396.87) | 0.0246703 | Yes |
| Modern Bride | (0.33 ,67.54) | 0.0182342 | No |
| Brides Magazine | (0.37 ,67.53) | 0.0181273 | No |
| New York | (0.43 ,61.1) | 0.0179587 | No |
| W | (0.46 ,76.17) | 0.0178772 | No |
Observations with high leverage.
If leverage value of an observation is greater than three times the mean leverage value (\(3\times \frac{p}{n}\)), it is considered as high leverage observation. p is number of parameters (intercept \(\beta_0\) and slope \(\beta_1\)) and n is number of observations.
Mean leverage value = \(\frac{p}{n} = \frac{2}{70} = 0.0286\)
Three times mean leverage value = 0.0858
AdRevDfHat <- AdRevDf %>% filter(Leverage > 3 * round(2/70,4)) %>% select(Magazine, XY, Leverage, Outlier)
AdRevDfHat %>%
kable(format="html", caption = "High Leverage Values") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "left")
| Magazine | XY | Leverage | Outlier |
|---|---|---|---|
| Parade (1) | (32.7 ,876.91) | 0.4589697 | Yes |
| USA Weekend | (23.04 ,640.07) | 0.2159826 | Yes |
| AARP The Magazine | (23.43 ,487.66) | 0.2240186 | Yes |
The \(R^2\) value has decreased from 0.894495 to 0.7385235 when outliers are excluded from the model. We can conclude that relationship between AdRevenue and Circulation is moderately strong in the presence of outliers. Without the presence of outliers relationship is weak.
The standard error of \(\beta_1\) is almost two times more when outliers are removed. Value incresed from 22.8533933 to 41.1611438. Higher value of \(\beta_1\) leads to increase in width of confidence interval for \(\beta_1\). Cofidence interval is calculated as \(\bar x \pm z\times \frac{se}{\sqrt n}\). Where se is standard error of Circulation (\(\beta_1\)).
With and without outliers, linear model resulted in p-value for \(H_0: \beta_1 = 0\) less than 0.0001. We can say with 95% confidence that AdRevenue is function Circulation and both are related.
In the presence of outliers predicted responses(\(\beta_0\)) and estimated slope coefficient (\(\beta_1\)) are changing, we can conclude data points are highly influential.
Based on leverage value we can conclude almost all observations that are identified by boxplot as outliers are high leverage data points.
In summary, data points are not outliers, but highly influential leverage points. Data points can be excluded from linear model.