options(scipen=999)
library(dplyr)
library(tidyverse)
library(knitr)
library(kableExtra)
library(car)
library(ggrepel)
library(ggplot2)
library(plotly)
#Load data
MissAm.df <- read.csv("https://raw.githubusercontent.com/akulapa/Data621-Week08-Discussion/master/MissAmericato2008.csv", header= TRUE, stringsAsFactors = F)
attach(MissAm.df)
state.df <- data.frame(abbreviation = state.abb, State=state.name, stringsAsFactors = F)
MissAm.df <- MissAm.df %>%
mutate(InTop10 = ifelse(Top10>0, 1, 0)) %>%
inner_join(state.df)
MissAm.glm <- glm(InTop10 ~ LogPopulation + LogContestants + LogTotalArea, family=binomial(link = "logit"), data = MissAm.df)
#Cut of leverage
#we have 4 variables and 50 observations
highLeverageHat = 2 * (3+1)/50
#Leverage values
MissAm.df$hatVal <- hatvalues(MissAm.glm)
#standardized deviance residuals(sdr)
#Get pearson residuals
MissAm.df$pearsonResd <- residuals(MissAm.glm,'pearson')
MissAm.df$sdr <- MissAm.df$pearsonResd / (sqrt(1 - MissAm.df$hatVal))
#Cook's distance
MissAm.df$cookd <- cooks.distance(MissAm.glm)
#High leverage SDR
#data points falling outside 2 standard deviations
highLeverageSdrU <- mean(MissAm.df$sdr) + (2*sd(MissAm.df$sdr))
highLeverageSdrL <- mean(MissAm.df$sdr) - (2*sd(MissAm.df$sdr))
#High leverage based on Cook's distance
#data points falling outside 2 standard deviations
highLeverageCookdU <- mean(MissAm.df$cookd) + (2*sd(MissAm.df$cookd))
highLeverageCookdL <- mean(MissAm.df$cookd) - (2*sd(MissAm.df$cookd))
MissAm.df$Outlier <- ifelse((MissAm.df$hatVal > highLeverageHat | MissAm.df$sdr > highLeverageSdrU | MissAm.df$sdr < highLeverageSdrL | MissAm.df$cookd > highLeverageCookdU | MissAm.df$cookd < highLeverageCookdL),'Yes','No')
#summary(MissAm.glm)
Purpose of the discussion is to show convenience of having tooltip on graphs and plots.
Data is downloaded from http://www.stat.tamu.edu/~sheather/book/data_sets.php, and the MissAmericato2008.txt file is tab separated, I converted into comma separated csv file.
Residual plots can be used to assess the quality of the regression. When tooltip is added to the plot, it can provide information such as the value of a specific data point, the name of a group, or the label. I used plotly to add tooltip to the graph.
Plot is generated using influencePlot function from car package. It only provides row names of the data points that have high leverage value. One has to refer to data table to get actual information about data point.
influencePlot(MissAm.glm, col="red",id.n=5)
## StudRes Hat CookD
## 8 -2.2315944 0.22191328 0.45381657
## 11 1.3549485 0.15762426 0.06839409
## 23 -1.2075931 0.17274282 0.05612712
## 25 -2.2347980 0.04654186 0.11263216
## 28 2.2400994 0.07957585 0.18272920
## 34 -0.7564691 0.15896707 0.01655727
## 35 -2.4581400 0.05002072 0.18585607
## 39 0.5455872 0.21222184 0.01187986
## 45 -0.7364479 0.17412150 0.01749629
The plot is generated using ggplot; it provides state names of the data points that have high leverage value. GGPlot allows developers to customize the plot. Due to static nature, too much information will make it look cluttered.
ggplot(data=MissAm.df, aes(hatVal,sdr)) +
geom_point(aes(col=Outlier)) +
scale_color_manual(values=c("black", "red")) +
geom_vline(xintercept=highLeverageHat, color="blue") +
geom_hline(yintercept=c(highLeverageSdrU, highLeverageSdrL), color="blue") +
geom_text_repel(data=filter(MissAm.df, (Outlier == 'Yes')), aes(hatVal,sdr, label=State), size=3) +
labs(title = sprintf("High Leverage Data Points Using GGPlot")) + xlab("Leverage(Hat-Values)") +
ylab("Standardized Deviance Residuals") +
annotate("text", x = 0.04, y = -2.3, label = 'SDR - Lower Bound', colour="blue", size = 3) +
annotate("text", x = 0.04, y = 2.3, label = 'SDR - Upper Bound', colour="blue", size = 3) +
annotate("text", x = 0.18, y = 2.5, label = 'High Leverage Hat Value', colour="blue", size = 3)
MissAm.df %>%
select(State, Top10, InTop10, LogPopulation, LogContestants, LogTotalArea, pearsonResd, hatVal, sdr, cookd, Outlier) %>%
#filter(Outlier == 'Yes') %>%
kable("html",caption = "Miss America Contest - High Leverage Data Points", row.names=T) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = T, position = "left", font_size = 12) %>%
scroll_box(width = "100%", height = "250px")
| State | Top10 | InTop10 | LogPopulation | LogContestants | LogTotalArea | pearsonResd | hatVal | sdr | cookd | Outlier | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Alabama | 6 | 1 | 11.9249 | 3.895894 | 10.8670 | 0.2105330 | 0.0551628 | 0.2165916 | 0.0006847 | No |
| 2 | Alaska | 0 | 0 | 9.8011 | 2.708050 | 13.4049 | -0.1032992 | 0.0317091 | -0.1049769 | 0.0000902 | No |
| 3 | Arizona | 0 | 0 | 12.0543 | 2.862201 | 11.6439 | -0.9519512 | 0.1297454 | -1.0204496 | 0.0388123 | No |
| 4 | Arkansas | 4 | 1 | 11.2702 | 3.766997 | 10.8814 | 0.3544838 | 0.1334701 | 0.3808067 | 0.0055841 | No |
| 5 | California | 5 | 1 | 14.0005 | 3.935739 | 12.0058 | 0.1119460 | 0.0236825 | 0.1132956 | 0.0000778 | No |
| 6 | Colorado | 0 | 0 | 11.8820 | 2.944439 | 11.5530 | -1.0054959 | 0.0894318 | -1.0537172 | 0.0272626 | No |
| 7 | Connecticut | 2 | 1 | 11.5742 | 2.944439 | 8.6203 | 0.2997294 | 0.0753287 | 0.3116991 | 0.0019787 | No |
| 8 | Delaware | 0 | 0 | 10.3397 | 2.852631 | 7.8196 | -2.2253921 | 0.2219133 | -2.5228564 | 0.4538166 | Yes |
| 9 | Florida | 3 | 1 | 13.0882 | 3.725693 | 11.0937 | 0.1558948 | 0.0245477 | 0.1578442 | 0.0001567 | No |
| 10 | Georgia | 4 | 1 | 12.5558 | 3.875359 | 10.9925 | 0.1634586 | 0.0300767 | 0.1659736 | 0.0002136 | No |
| 11 | Hawaii | 3 | 1 | 10.6205 | 2.564949 | 9.2994 | 1.1097716 | 0.1576243 | 1.2091514 | 0.0683941 | No |
| 12 | Idaho | 0 | 0 | 10.6565 | 2.970414 | 11.3334 | -0.5974581 | 0.1030890 | -0.6308597 | 0.0114359 | No |
| 13 | Illinois | 2 | 1 | 13.0341 | 3.238678 | 10.9667 | 0.2811020 | 0.0551326 | 0.2891868 | 0.0012199 | No |
| 14 | Indiana | 2 | 1 | 12.3026 | 3.126761 | 10.5028 | 0.3863551 | 0.0448888 | 0.3953299 | 0.0018363 | No |
| 15 | Iowa | 1 | 1 | 11.6306 | 2.879199 | 10.9380 | 0.9286648 | 0.0602478 | 0.9579709 | 0.0147086 | No |
| 16 | Kansas | 1 | 1 | 11.4371 | 2.978925 | 11.3178 | 1.0825289 | 0.0612578 | 1.1172911 | 0.0203652 | No |
| 17 | Kentucky | 2 | 1 | 11.7568 | 3.433987 | 10.6068 | 0.3673354 | 0.0462511 | 0.3761368 | 0.0017152 | No |
| 18 | Louisiana | 2 | 1 | 12.1042 | 3.465736 | 10.8559 | 0.3289336 | 0.0393027 | 0.3355946 | 0.0011519 | No |
| 19 | Maine | 0 | 0 | 10.6066 | 2.484907 | 10.4740 | -0.4677695 | 0.0880701 | -0.4898365 | 0.0057931 | No |
| 20 | Maryland | 3 | 1 | 12.1239 | 3.218876 | 9.4260 | 0.2291365 | 0.0354124 | 0.2333047 | 0.0004996 | No |
| 21 | Massachusetts | 2 | 1 | 12.4428 | 2.917771 | 9.2644 | 0.2628621 | 0.0688379 | 0.2724051 | 0.0013714 | No |
| 22 | Michigan | 3 | 1 | 12.8257 | 3.367296 | 11.4795 | 0.3386953 | 0.0564323 | 0.3486765 | 0.0018178 | No |
| 23 | Minnesota | 0 | 0 | 12.1286 | 2.724579 | 11.3730 | -0.9430982 | 0.1727428 | -1.0368994 | 0.0561271 | Yes |
| 24 | Mississippi | 3 | 1 | 11.6233 | 3.703768 | 10.7879 | 0.3045043 | 0.0754200 | 0.3166804 | 0.0020451 | No |
| 25 | Missouri | 0 | 0 | 12.1716 | 3.526361 | 11.1520 | -2.9664769 | 0.0465419 | -3.0380167 | 0.1126322 | Yes |
| 26 | Montana | 0 | 0 | 10.2920 | 2.852631 | 11.8985 | -0.3253616 | 0.0915356 | -0.3413597 | 0.0029353 | No |
| 27 | Nebraska | 0 | 0 | 11.0494 | 2.871680 | 11.2561 | -0.6740112 | 0.0707396 | -0.6991952 | 0.0093038 | No |
| 28 | Nevada | 1 | 1 | 10.9462 | 2.549445 | 11.6133 | 2.7895320 | 0.0795758 | 2.9076180 | 0.1827292 | Yes |
| 29 | New Hampshire | 1 | 1 | 10.6631 | 2.785011 | 9.1431 | 0.7623578 | 0.1392479 | 0.8217123 | 0.0273080 | No |
| 30 | New Jersey | 2 | 1 | 12.5266 | 3.242592 | 9.0735 | 0.1521253 | 0.0276865 | 0.1542760 | 0.0001694 | No |
| 31 | New Mexico | 0 | 0 | 11.0420 | 3.091042 | 11.7084 | -0.7191120 | 0.1119656 | -0.7631003 | 0.0183552 | No |
| 32 | New York | 3 | 1 | 13.4873 | 3.072693 | 10.9070 | 0.2649769 | 0.0915608 | 0.2780098 | 0.0019475 | No |
| 33 | North Carolina | 2 | 1 | 12.5005 | 3.332205 | 10.8934 | 0.3209050 | 0.0380294 | 0.3271866 | 0.0010580 | No |
| 34 | North Dakota | 0 | 0 | 10.1843 | 3.032546 | 11.1662 | -0.5428548 | 0.1589671 | -0.5919394 | 0.0165573 | No |
| 35 | Ohio | 0 | 0 | 12.9287 | 3.212187 | 10.7105 | -3.6623304 | 0.0500207 | -3.7575127 | 0.1858561 | Yes |
| 36 | Oklahoma | 5 | 1 | 11.6138 | 3.784190 | 11.1548 | 0.3277005 | 0.1021458 | 0.3458391 | 0.0034018 | No |
| 37 | Oregon | 1 | 1 | 11.6680 | 3.100092 | 11.4966 | 0.8911398 | 0.0650090 | 0.9215992 | 0.0147635 | No |
| 38 | Pennsylvania | 4 | 1 | 13.0212 | 3.178054 | 10.7376 | 0.2748416 | 0.0569872 | 0.2830243 | 0.0012102 | No |
| 39 | Rhode Island | 1 | 1 | 10.7402 | 2.656757 | 7.3428 | 0.3727730 | 0.2122218 | 0.4199934 | 0.0118799 | Yes |
| 40 | South Carolina | 1 | 1 | 11.8946 | 3.725693 | 10.3741 | 0.2112724 | 0.0418060 | 0.2158321 | 0.0005081 | No |
| 41 | South Dakota | 0 | 0 | 10.2306 | 2.674149 | 11.2531 | -0.3387235 | 0.0744488 | -0.3520831 | 0.0024928 | No |
| 42 | Tennessee | 1 | 1 | 12.1059 | 3.564827 | 10.6488 | 0.2631009 | 0.0372047 | 0.2681362 | 0.0006946 | No |
| 43 | Texas | 7 | 1 | 13.4640 | 3.811097 | 12.5009 | 0.2200745 | 0.0633954 | 0.2274006 | 0.0008750 | No |
| 44 | Utah | 2 | 1 | 11.5123 | 4.040123 | 11.3492 | 0.2734701 | 0.1339095 | 0.2938517 | 0.0033377 | No |
| 45 | Vermont | 0 | 0 | 10.0402 | 2.335375 | 9.1710 | -0.5235920 | 0.1741215 | -0.5761491 | 0.0174963 | Yes |
| 46 | Virginia | 3 | 1 | 12.4058 | 3.258097 | 10.6637 | 0.3333753 | 0.0376386 | 0.3398321 | 0.0011292 | No |
| 47 | Washington | 1 | 1 | 12.2114 | 2.995732 | 11.1747 | 0.6551044 | 0.0781176 | 0.6822959 | 0.0098619 | No |
| 48 | West Virginia | 2 | 1 | 10.9746 | 3.091042 | 10.0953 | 0.6807896 | 0.0730636 | 0.7071116 | 0.0098530 | No |
| 49 | Wisconsin | 3 | 1 | 12.2358 | 3.212187 | 11.0898 | 0.4719343 | 0.0454403 | 0.4830366 | 0.0027768 | No |
| 50 | Wyoming | 0 | 0 | 9.6926 | 1.909542 | 11.4908 | -0.0859607 | 0.0188417 | -0.0867821 | 0.0000362 | No |
Finally, Plotly to the rescue. Plotly allows developers add tooltip to the data points and users can get information instantly. Developers can also customize tooltip and provide sufficient information. It helps users to understand the data point and draw inferences.
pal <- c("blue", "red")
f <- list(
size = 18,
color = "#7f7f7f"
)
x <- list(
title = "Leverage(Hat-Values)",
titlefont = f
)
y <- list(
title = "Standardized Deviance Residuals(SDR)",
titlefont = f
)
l <- list(
font = list(
family = "sans-serif",
size = 12,
color = "#000"),
bgcolor = "#E2E2E2",
bordercolor = "#FFFFFF",
borderwidth = 2)
p <- plot_ly(MissAm.df, x = ~hatVal, y = ~sdr, color = ~Outlier, mode = "markers", colors = pal,
hoverinfo = 'text', text = ~paste('Outlier:',Outlier, '<br> State: ', State,'<br> Number of Times in Top 10 List: ', Top10,'<br> Log Population: ', round(LogPopulation,3), '<br> Log Contestants: ', round(LogContestants,3), '<br> Log Total Area:', round(LogTotalArea,3), '<br> Leverage(Hat-Value):', round(hatVal,3),'<br> SDR:', round(sdr,3))) %>%
layout(xaxis = x, yaxis = y, title = 'High Leverage Data Points Using Plotly',legend = l)
p