options(scipen=999)
library(dplyr)
library(tidyverse)
library(knitr)
library(kableExtra)
library(car)
library(ggrepel)
library(ggplot2)
library(plotly)

#Load data
MissAm.df <- read.csv("https://raw.githubusercontent.com/akulapa/Data621-Week08-Discussion/master/MissAmericato2008.csv", header= TRUE, stringsAsFactors = F)
attach(MissAm.df)

state.df <- data.frame(abbreviation = state.abb, State=state.name, stringsAsFactors = F)
MissAm.df <- MissAm.df %>% 
  mutate(InTop10 = ifelse(Top10>0, 1, 0)) %>% 
  inner_join(state.df)

MissAm.glm <- glm(InTop10 ~ LogPopulation + LogContestants + LogTotalArea, family=binomial(link = "logit"), data = MissAm.df)

#Cut of leverage
#we have 4 variables and 50 observations
highLeverageHat = 2 * (3+1)/50

#Leverage values
MissAm.df$hatVal <- hatvalues(MissAm.glm)

#standardized deviance residuals(sdr)
#Get pearson residuals
MissAm.df$pearsonResd <- residuals(MissAm.glm,'pearson')

MissAm.df$sdr <- MissAm.df$pearsonResd / (sqrt(1 - MissAm.df$hatVal))

#Cook's distance
MissAm.df$cookd <- cooks.distance(MissAm.glm)

#High leverage SDR
#data points falling outside 2 standard deviations
highLeverageSdrU <- mean(MissAm.df$sdr) + (2*sd(MissAm.df$sdr))
highLeverageSdrL <- mean(MissAm.df$sdr) - (2*sd(MissAm.df$sdr))

#High leverage based on Cook's distance
#data points falling outside 2 standard deviations
highLeverageCookdU <- mean(MissAm.df$cookd) + (2*sd(MissAm.df$cookd))
highLeverageCookdL <- mean(MissAm.df$cookd) - (2*sd(MissAm.df$cookd))

MissAm.df$Outlier <- ifelse((MissAm.df$hatVal > highLeverageHat | MissAm.df$sdr >  highLeverageSdrU | MissAm.df$sdr <  highLeverageSdrL | MissAm.df$cookd > highLeverageCookdU | MissAm.df$cookd < highLeverageCookdL),'Yes','No')

#summary(MissAm.glm)

Purpose of the discussion is to show convenience of having tooltip on graphs and plots.

Data is downloaded from http://www.stat.tamu.edu/~sheather/book/data_sets.php, and the MissAmericato2008.txt file is tab separated, I converted into comma separated csv file.

Residual plots can be used to assess the quality of the regression. When tooltip is added to the plot, it can provide information such as the value of a specific data point, the name of a group, or the label. I used plotly to add tooltip to the graph.

Plot is generated using influencePlot function from car package. It only provides row names of the data points that have high leverage value. One has to refer to data table to get actual information about data point.

influencePlot(MissAm.glm, col="red",id.n=5)

##       StudRes        Hat      CookD
## 8  -2.2315944 0.22191328 0.45381657
## 11  1.3549485 0.15762426 0.06839409
## 23 -1.2075931 0.17274282 0.05612712
## 25 -2.2347980 0.04654186 0.11263216
## 28  2.2400994 0.07957585 0.18272920
## 34 -0.7564691 0.15896707 0.01655727
## 35 -2.4581400 0.05002072 0.18585607
## 39  0.5455872 0.21222184 0.01187986
## 45 -0.7364479 0.17412150 0.01749629



The plot is generated using ggplot; it provides state names of the data points that have high leverage value. GGPlot allows developers to customize the plot. Due to static nature, too much information will make it look cluttered.

ggplot(data=MissAm.df, aes(hatVal,sdr)) + 
  geom_point(aes(col=Outlier)) + 
  scale_color_manual(values=c("black", "red")) +
  geom_vline(xintercept=highLeverageHat, color="blue") +
  geom_hline(yintercept=c(highLeverageSdrU, highLeverageSdrL), color="blue") +
  geom_text_repel(data=filter(MissAm.df, (Outlier == 'Yes')), aes(hatVal,sdr, label=State), size=3) +
  labs(title = sprintf("High Leverage Data Points Using GGPlot")) + xlab("Leverage(Hat-Values)") +
  ylab("Standardized Deviance Residuals") +
  annotate("text", x = 0.04, y = -2.3, label = 'SDR - Lower Bound', colour="blue", size = 3) + 
  annotate("text", x = 0.04, y = 2.3, label = 'SDR - Upper Bound', colour="blue", size = 3) +
  annotate("text", x = 0.18, y = 2.5, label = 'High Leverage Hat Value', colour="blue", size = 3)

MissAm.df %>% 
  select(State, Top10, InTop10, LogPopulation, LogContestants, LogTotalArea, pearsonResd, hatVal, sdr, cookd, Outlier) %>% 
  #filter(Outlier == 'Yes') %>% 
  kable("html",caption = "Miss America Contest - High Leverage Data Points", row.names=T) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = T, position = "left", font_size = 12) %>%
  scroll_box(width = "100%", height = "250px")
Miss America Contest - High Leverage Data Points
State Top10 InTop10 LogPopulation LogContestants LogTotalArea pearsonResd hatVal sdr cookd Outlier
1 Alabama 6 1 11.9249 3.895894 10.8670 0.2105330 0.0551628 0.2165916 0.0006847 No
2 Alaska 0 0 9.8011 2.708050 13.4049 -0.1032992 0.0317091 -0.1049769 0.0000902 No
3 Arizona 0 0 12.0543 2.862201 11.6439 -0.9519512 0.1297454 -1.0204496 0.0388123 No
4 Arkansas 4 1 11.2702 3.766997 10.8814 0.3544838 0.1334701 0.3808067 0.0055841 No
5 California 5 1 14.0005 3.935739 12.0058 0.1119460 0.0236825 0.1132956 0.0000778 No
6 Colorado 0 0 11.8820 2.944439 11.5530 -1.0054959 0.0894318 -1.0537172 0.0272626 No
7 Connecticut 2 1 11.5742 2.944439 8.6203 0.2997294 0.0753287 0.3116991 0.0019787 No
8 Delaware 0 0 10.3397 2.852631 7.8196 -2.2253921 0.2219133 -2.5228564 0.4538166 Yes
9 Florida 3 1 13.0882 3.725693 11.0937 0.1558948 0.0245477 0.1578442 0.0001567 No
10 Georgia 4 1 12.5558 3.875359 10.9925 0.1634586 0.0300767 0.1659736 0.0002136 No
11 Hawaii 3 1 10.6205 2.564949 9.2994 1.1097716 0.1576243 1.2091514 0.0683941 No
12 Idaho 0 0 10.6565 2.970414 11.3334 -0.5974581 0.1030890 -0.6308597 0.0114359 No
13 Illinois 2 1 13.0341 3.238678 10.9667 0.2811020 0.0551326 0.2891868 0.0012199 No
14 Indiana 2 1 12.3026 3.126761 10.5028 0.3863551 0.0448888 0.3953299 0.0018363 No
15 Iowa 1 1 11.6306 2.879199 10.9380 0.9286648 0.0602478 0.9579709 0.0147086 No
16 Kansas 1 1 11.4371 2.978925 11.3178 1.0825289 0.0612578 1.1172911 0.0203652 No
17 Kentucky 2 1 11.7568 3.433987 10.6068 0.3673354 0.0462511 0.3761368 0.0017152 No
18 Louisiana 2 1 12.1042 3.465736 10.8559 0.3289336 0.0393027 0.3355946 0.0011519 No
19 Maine 0 0 10.6066 2.484907 10.4740 -0.4677695 0.0880701 -0.4898365 0.0057931 No
20 Maryland 3 1 12.1239 3.218876 9.4260 0.2291365 0.0354124 0.2333047 0.0004996 No
21 Massachusetts 2 1 12.4428 2.917771 9.2644 0.2628621 0.0688379 0.2724051 0.0013714 No
22 Michigan 3 1 12.8257 3.367296 11.4795 0.3386953 0.0564323 0.3486765 0.0018178 No
23 Minnesota 0 0 12.1286 2.724579 11.3730 -0.9430982 0.1727428 -1.0368994 0.0561271 Yes
24 Mississippi 3 1 11.6233 3.703768 10.7879 0.3045043 0.0754200 0.3166804 0.0020451 No
25 Missouri 0 0 12.1716 3.526361 11.1520 -2.9664769 0.0465419 -3.0380167 0.1126322 Yes
26 Montana 0 0 10.2920 2.852631 11.8985 -0.3253616 0.0915356 -0.3413597 0.0029353 No
27 Nebraska 0 0 11.0494 2.871680 11.2561 -0.6740112 0.0707396 -0.6991952 0.0093038 No
28 Nevada 1 1 10.9462 2.549445 11.6133 2.7895320 0.0795758 2.9076180 0.1827292 Yes
29 New Hampshire 1 1 10.6631 2.785011 9.1431 0.7623578 0.1392479 0.8217123 0.0273080 No
30 New Jersey 2 1 12.5266 3.242592 9.0735 0.1521253 0.0276865 0.1542760 0.0001694 No
31 New Mexico 0 0 11.0420 3.091042 11.7084 -0.7191120 0.1119656 -0.7631003 0.0183552 No
32 New York 3 1 13.4873 3.072693 10.9070 0.2649769 0.0915608 0.2780098 0.0019475 No
33 North Carolina 2 1 12.5005 3.332205 10.8934 0.3209050 0.0380294 0.3271866 0.0010580 No
34 North Dakota 0 0 10.1843 3.032546 11.1662 -0.5428548 0.1589671 -0.5919394 0.0165573 No
35 Ohio 0 0 12.9287 3.212187 10.7105 -3.6623304 0.0500207 -3.7575127 0.1858561 Yes
36 Oklahoma 5 1 11.6138 3.784190 11.1548 0.3277005 0.1021458 0.3458391 0.0034018 No
37 Oregon 1 1 11.6680 3.100092 11.4966 0.8911398 0.0650090 0.9215992 0.0147635 No
38 Pennsylvania 4 1 13.0212 3.178054 10.7376 0.2748416 0.0569872 0.2830243 0.0012102 No
39 Rhode Island 1 1 10.7402 2.656757 7.3428 0.3727730 0.2122218 0.4199934 0.0118799 Yes
40 South Carolina 1 1 11.8946 3.725693 10.3741 0.2112724 0.0418060 0.2158321 0.0005081 No
41 South Dakota 0 0 10.2306 2.674149 11.2531 -0.3387235 0.0744488 -0.3520831 0.0024928 No
42 Tennessee 1 1 12.1059 3.564827 10.6488 0.2631009 0.0372047 0.2681362 0.0006946 No
43 Texas 7 1 13.4640 3.811097 12.5009 0.2200745 0.0633954 0.2274006 0.0008750 No
44 Utah 2 1 11.5123 4.040123 11.3492 0.2734701 0.1339095 0.2938517 0.0033377 No
45 Vermont 0 0 10.0402 2.335375 9.1710 -0.5235920 0.1741215 -0.5761491 0.0174963 Yes
46 Virginia 3 1 12.4058 3.258097 10.6637 0.3333753 0.0376386 0.3398321 0.0011292 No
47 Washington 1 1 12.2114 2.995732 11.1747 0.6551044 0.0781176 0.6822959 0.0098619 No
48 West Virginia 2 1 10.9746 3.091042 10.0953 0.6807896 0.0730636 0.7071116 0.0098530 No
49 Wisconsin 3 1 12.2358 3.212187 11.0898 0.4719343 0.0454403 0.4830366 0.0027768 No
50 Wyoming 0 0 9.6926 1.909542 11.4908 -0.0859607 0.0188417 -0.0867821 0.0000362 No



Finally, Plotly to the rescue. Plotly allows developers add tooltip to the data points and users can get information instantly. Developers can also customize tooltip and provide sufficient information. It helps users to understand the data point and draw inferences.

pal <- c("blue", "red")
f <- list(
  size = 18,
  color = "#7f7f7f"
)
x <- list(
  title = "Leverage(Hat-Values)",
  titlefont = f
)
y <- list(
  title = "Standardized Deviance Residuals(SDR)",
  titlefont = f
)
l <- list(
  font = list(
    family = "sans-serif",
    size = 12,
    color = "#000"),
  bgcolor = "#E2E2E2",
  bordercolor = "#FFFFFF",
  borderwidth = 2)
p <- plot_ly(MissAm.df, x = ~hatVal, y = ~sdr, color = ~Outlier, mode = "markers", colors = pal,
         hoverinfo = 'text', text = ~paste('Outlier:',Outlier, '<br> State: ', State,'<br> Number of Times in Top 10 List: ', Top10,'<br> Log Population: ', round(LogPopulation,3), '<br> Log Contestants: ', round(LogContestants,3), '<br> Log Total Area:', round(LogTotalArea,3), '<br> Leverage(Hat-Value):', round(hatVal,3),'<br> SDR:', round(sdr,3))) %>% 
  layout(xaxis = x, yaxis = y, title = 'High Leverage Data Points Using Plotly',legend = l) 
p