Preparation

library(ggpubr)
library(ggplot2)
library(stargazer)
#let's load our data
setwd("~/Dropbox/Data Analysis F20 Recitation/Recitation Content/Recitation 4")
Pokemon<-read.csv("Pokemon.csv",header=T,na.strings="?")
#tell R that we are only working with Pokemon data

Faceting

This is a kernel density plot for continuous variables.

ggplot(data = Pokemon) +
  geom_density(aes(x = Speed,fill=Legendary), alpha=0.55)

If we want to visualize Speed for legendary and ordinary pokemons separately, we use faceting with facet_grid(). Let’s show them in two columns:

ggplot(data = Pokemon) +
    geom_density(aes(x = Speed, fill=Legendary), alpha=0.55) +
    facet_grid(rows=vars(Legendary))

ggplot(data = Pokemon) +
    geom_density(aes(x = Speed, fill=Legendary), alpha=0.55) +
    facet_grid(cols=vars(Legendary))

Practice Question

Can you reproduce this?

Question

Answer

Faceting with facet_wrap

Instead of faceting with a variable in the horizontal or vertical direction, facets can be placed next to each other, wrapping with a certain number of columns or rows. The label for each plot will be at the top of the plot.

ggplot(data = Pokemon) +
    geom_density(aes(x = Attack),fill="yellow", alpha=0.55) +
    facet_wrap(vars(Generation), ncol=5)

Statistical Transformations

stat="identity"

ggplot2 supports many different statistical transformations. For example, the “identity” transformation will leave the data “as is”. You can specify which statistical transformation a geom uses by passing it as the stat argument. I manually create the dataset:

library(tidyverse)

LCI <- function(x) {
  mean(x,na.rm=TRUE) - 1.96* plotrix::std.error(x,na.rm=TRUE)
}

HCI <- function(x) {
  mean(x,na.rm=TRUE) + 1.96* plotrix::std.error(x,na.rm=TRUE)
}

mystats<-Pokemon %>% group_by(Generation, Legendary) %>% 
  summarize(mean=mean(Attack, na.rm=TRUE),
            LCI=LCI(Attack),
            HCI=HCI(Attack))

mystats
## # A tibble: 12 x 5
## # Groups:   Generation [6]
##    Generation Legendary  mean   LCI   HCI
##         <int> <chr>     <dbl> <dbl> <dbl>
##  1          1 False      75.0  70.5  79.5
##  2          1 True      121.   88.0 154. 
##  3          2 False      70.7  64.3  77.1
##  4          2 True       99    79.1 119. 
##  5          3 False      77.0  71.5  82.5
##  6          3 True      118.   99.4 137. 
##  7          4 False      79.6  73.5  85.7
##  8          4 True      110.   98.3 121. 
##  9          5 False      78.2  73.7  82.8
## 10          5 True      120.  109.  132. 
## 11          6 False      70.5  64.9  76.0
## 12          6 True      125.  108.  142.
ggplot(mystats)+
  geom_bar(stat="identity", aes(x = factor(Generation), y = mean, fill=Legendary),position=position_dodge(.9))

stat_summary()

With the stat_summary() function, we can summarize y values at distinct x values. For example:

MyMedian <- function(x) {
  median(x,na.rm=TRUE)
}
ggplot(data = Pokemon) + aes(x = factor(Legendary), y = Attack)+
  stat_summary(geom="bar", fun=MyMedian)

Practice Question

Question

Reproduce this figure:

Answer

Themes

We can also change themes using theme_X. A complete list of themes are available here.

f1+theme_gray()

f1+theme_light()

f1+theme_linedraw()

Challenge of the Day!

Question

Can you reproduce this graph?

Hint #1: to retain the order in the graph based on a variable value, do this

data<-data[order(data$mean),] #to sort
data$Type.1<-factor(data$Type.1, levels = data$Type.1)  # convert to factor to retain sorted order in plot.

Hint #2: you should use geom_text() and add text in points using aes(label=varname).

Hint #3: use ggplot()+....+coord_flip() to flip the x and y axis.

Answer

LCI95 <- function(x) {
  mean(x,na.rm=TRUE) - 1.96* plotrix::std.error(x,na.rm=TRUE)
}

HCI95 <- function(x) {
  mean(x,na.rm=TRUE) + 1.96* plotrix::std.error(x,na.rm=TRUE)
}

LCI99 <- function(x) {
  mean(x,na.rm=TRUE) - 2.575* plotrix::std.error(x,na.rm=TRUE)
}

HCI99 <- function(x) {
  mean(x,na.rm=TRUE) + 2.575* plotrix::std.error(x,na.rm=TRUE)
}

data<-Pokemon %>% group_by(Type.1) %>% 
  summarize(mean=mean(Total),
            LCI95=LCI95(Total),
            HCI95=HCI95(Total),
            LCI99=LCI99(Total),
            HCI99=HCI99(Total))

data<-data[order(data$mean),] # sort
data$Type.1<-factor(data$Type.1, levels = data$Type.1)  # convert to factor to retain sorted order in plot.

ggplot(data=data)+
  geom_linerange(aes(x=Type.1, ymin=LCI99, ymax=HCI99), color="black", size=0.5)+
  geom_linerange(aes(x=Type.1, ymin=LCI95, ymax=HCI95), color="blue", size=1)+
  geom_point(aes(x=Type.1, y=mean),size=7, color="black")+
  geom_point(aes(x=Type.1, y=mean),size=6, color="white")+
  geom_text(aes(x=Type.1, y=mean, label=round(mean)),size=2, color="black")+
  theme_minimal()+
  coord_flip()+
  labs(title="Total Strength by Pokemon Type", x="", y="Total Score (average)")

Visualizing Regression Outputs

Let’s say, we have run the following model:

regression <- lm(Attack ~ Speed + Defense + Legendary, data=Pokemon)
library(tidyverse)
library(equatiomatic)
#install.packages("ggeffects")
library(ggeffects)
library(splines)
extract_eq(regression)

\[ \operatorname{Attack} = \alpha + \beta_{1}(\operatorname{Speed}) + \beta_{2}(\operatorname{Defense}) + \beta_{3}(\operatorname{Legendary}_{\operatorname{True}}) + \epsilon \]

stargazer(regression, header=FALSE, type="html")
Dependent variable:
Attack
Speed 0.368***
(0.034)
Defense 0.416***
(0.031)
LegendaryTrue 16.540***
(3.716)
Constant 21.826***
(3.377)
Observations 800
R2 0.349
Adjusted R2 0.347
Residual Std. Error 26.238 (df = 796)
F Statistic 142.233*** (df = 3; 796)
Note: p<0.1; p<0.05; p<0.01
ME.Speed<-ggpredict(regression, terms = "Speed") 

ME.Speed
## 
## # Predicted values of Attack
## # x = Speed
## 
##   x | Predicted |   SE |           95% CI
## -----------------------------------------
##   0 |     50.92 | 2.42 | [ 46.17,  55.67]
##  20 |     58.29 | 1.82 | [ 54.71,  61.86]
##  40 |     65.65 | 1.30 | [ 63.10,  68.20]
##  60 |     73.01 | 0.99 | [ 71.08,  74.95]
## 100 |     87.74 | 1.52 | [ 84.77,  90.71]
## 120 |     95.10 | 2.08 | [ 91.02,  99.19]
## 140 |    102.47 | 2.70 | [ 97.17, 107.76]
## 180 |    117.19 | 4.00 | [109.36, 125.03]
## 
## Adjusted for:
## *   Defense = 70.00
## * Legendary = False
#plot the relationship
plot(ME.Speed)

ME.Defense<-ggpredict(regression, terms = "Defense") 

ME.Defense
## 
## # Predicted values of Attack
## # x = Defense
## 
##   x | Predicted |   SE |           95% CI
## -----------------------------------------
##   0 |     45.76 | 2.41 | [ 41.04,  50.48]
##  30 |     58.23 | 1.61 | [ 55.08,  61.37]
##  60 |     70.70 | 1.03 | [ 68.67,  72.72]
##  90 |     83.17 | 1.12 | [ 80.97,  85.36]
## 110 |     91.48 | 1.53 | [ 88.48,  94.47]
## 140 |    103.95 | 2.32 | [ 99.41, 108.49]
## 170 |    116.42 | 3.18 | [110.18, 122.65]
## 230 |    141.36 | 4.97 | [131.61, 151.10]
## 
## Adjusted for:
## *     Speed = 65.00
## * Legendary = False
#plot the relationship
plot(ME.Defense)

ME.Legendary<-ggpredict(regression, terms = "Legendary") 

ME.Legendary
## 
## # Predicted values of Attack
## # x = Legendary
## 
## x     | Predicted |   SE |         95% CI
## -----------------------------------------
## False |     74.85 | 0.97 | [72.95, 76.75]
## True  |     91.39 | 3.61 | [84.33, 98.46]
## 
## Adjusted for:
## *   Speed = 65.00
## * Defense = 70.00
#plot the relationship
plot(ME.Legendary)

Let’s create a fake dataset for Pokemon strength and assume the relationship between strength and speed is exponential.

b0<-6
b1<-0.5
eps = rnorm(nrow(Pokemon),0, 1)
Pokemon$Strength = b0 + b1*I(Pokemon$Speed^2)+ eps #create fake data

ggplot(data=Pokemon) + geom_point(aes(x=Speed, y=Strength)) #visualize relationship

regression <- lm(Strength ~ I(Speed^2), data=Pokemon) #run regression

summary(regression)
## 
## Call:
## lm(formula = Strength ~ I(Speed^2), data = Pokemon)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.98078 -0.68202 -0.01038  0.70545  3.12279 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.999e+00  5.560e-02   107.9   <2e-16 ***
## I(Speed^2)  5.000e-01  7.889e-06 63378.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.982 on 798 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 4.017e+09 on 1 and 798 DF,  p-value: < 2.2e-16
#calculate the expected values (mean) of dependent variable based on speed
ME.Defense<-ggpredict(regression, terms = "Speed") 

ME.Defense
## 
## # Predicted values of Strength
## # x = Speed
## 
##     x | Predicted |      SE |               95% CI
## --------------------------------------------------
##    25 |    318.49 |    0.05 | [  318.39,   318.59]
##  1089 |  5.93e+05 |    9.31 | [5.93e+05, 5.93e+05]
##  2304 |  2.65e+06 |   41.83 | [2.65e+06, 2.65e+06]
##  3844 |  7.39e+06 |  116.53 | [7.39e+06, 7.39e+06]
##  5625 |  1.58e+07 |  249.57 | [1.58e+07, 1.58e+07]
##  7921 |  3.14e+07 |  494.93 | [3.14e+07, 3.14e+07]
## 10816 |  5.85e+07 |  922.85 | [5.85e+07, 5.85e+07]
## 32400 |  5.25e+08 | 8281.46 | [5.25e+08, 5.25e+08]
#plot the relationship
plot(ME.Defense)

Dummy Variable Regression

Let’s visualize dummy variable regression:

library(multcomp) # This is a cool library you should keep in mind
ideology = rep( c("Liberal", "Moderate", "Fund"), each = 1000)
eps = rnorm(3000, 0, 200)
#I assume liberal average value for Gun is 10, moderate is 15, fund is 25
Gun = 10 + 5*(ideology == "Moderate")+15*(ideology=="Fund") + eps

# Below is something similar to what you need to adopt. If you have further questions on interpreting dummy variable regressions, we can go over during my office hour.
Gunlm = lm(Gun ~ ideology)
summary(Gunlm)
## 
## Call:
## lm(formula = Gun ~ ideology)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -679.92 -134.81    0.23  137.17  637.56 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        28.378      6.205   4.573 4.99e-06 ***
## ideologyLiberal   -19.020      8.775  -2.167   0.0303 *  
## ideologyModerate   -5.864      8.775  -0.668   0.5040    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 196.2 on 2997 degrees of freedom
## Multiple R-squared:  0.001642,   Adjusted R-squared:  0.0009754 
## F-statistic: 2.464 on 2 and 2997 DF,  p-value: 0.08526
library(broom)
regoutput<-tidy(Gunlm, conf.int = TRUE)
regoutput
## # A tibble: 3 x 7
##   term             estimate std.error statistic    p.value conf.low conf.high
##   <chr>               <dbl>     <dbl>     <dbl>      <dbl>    <dbl>     <dbl>
## 1 (Intercept)         28.4       6.20     4.57  0.00000499     16.2     40.5 
## 2 ideologyLiberal    -19.0       8.77    -2.17  0.0303        -36.2     -1.81
## 3 ideologyModerate    -5.86      8.77    -0.668 0.504         -23.1     11.3

Practice Question

Question

Reproduce this figure:

Answer

Interpreting Regression Outputs

But how do I interpret the results? The average for Fund. group is (Intercept) because R dropped it as a reference category. So when ideologyLiberal= ideologyModerate =0, we are talking about ideology Fund, which is our intercept.

#Fundamentalists
summary(glht(Gunlm, linfct = c("`(Intercept)`==0")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lm(formula = Gun ~ ideology)
## 
## Linear Hypotheses:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept) == 0   28.378      6.205   4.573 4.99e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
#The average for Liberal group is 
summary(glht(Gunlm, linfct = c("`(Intercept)`+ideologyLiberal==0")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lm(formula = Gun ~ ideology)
## 
## Linear Hypotheses:
##                                      Estimate Std. Error t value Pr(>|t|)
## `(Intercept)` + ideologyLiberal == 0    9.358      6.205   1.508    0.132
## (Adjusted p values reported -- single-step method)
#The average for Moderate group is 
summary(glht(Gunlm, linfct = c("`(Intercept)`+ideologyModerate==0")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lm(formula = Gun ~ ideology)
## 
## Linear Hypotheses:
##                                       Estimate Std. Error t value Pr(>|t|)    
## `(Intercept)` + ideologyModerate == 0   22.514      6.205   3.628  0.00029 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

Interpreting Interaction Terms

Let’s say, we have run the following model:

regression <- lm(Attack ~ Speed + Defense + Legendary + Defense*Legendary, data=Pokemon)
equatiomatic::extract_eq(regression)

\[ \operatorname{Attack} = \alpha + \beta_{1}(\operatorname{Speed}) + \beta_{2}(\operatorname{Defense}) + \beta_{3}(\operatorname{Legendary}_{\operatorname{True}}) + \beta_{4}(\operatorname{Defense} \times \operatorname{Legendary}_{\operatorname{True}}) + \epsilon \]

stargazer(regression, header=FALSE, type="html")
Dependent variable:
Attack
Speed 0.352***
(0.034)
Defense 0.452***
(0.032)
LegendaryTrue 68.650***
(12.389)
Defense:LegendaryTrue -0.527***
(0.120)
Constant 20.331***
(3.356)
Observations 800
R2 0.364
Adjusted R2 0.361
Residual Std. Error 25.940 (df = 795)
F Statistic 113.989*** (df = 4; 795)
Note: p<0.1; p<0.05; p<0.01

This table doesnt tell us much! Let’s first see how we interpret interaction effects: free recitation note

The effect of Defense when Legendary=TRUE is:

library(multcomp) # This is a cool library you should keep in mind
summary(glht(regression, linfct = c("Defense+`Defense:LegendaryTrue`==0")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lm(formula = Attack ~ Speed + Defense + Legendary + Defense * 
##     Legendary, data = Pokemon)
## 
## Linear Hypotheses:
##                                        Estimate Std. Error t value Pr(>|t|)
## Defense + `Defense:LegendaryTrue` == 0 -0.07576    0.11566  -0.655    0.513
## (Adjusted p values reported -- single-step method)

The effect of Defense when Legendary=FALSE is:

summary(glht(regression, linfct = c("Defense==0")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lm(formula = Attack ~ Speed + Defense + Legendary + Defense * 
##     Legendary, data = Pokemon)
## 
## Linear Hypotheses:
##              Estimate Std. Error t value Pr(>|t|)    
## Defense == 0  0.45151    0.03151   14.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

The effect of Legendary when Defense=2

summary(glht(regression, linfct = c("LegendaryTrue+`Defense:LegendaryTrue`*2==0")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lm(formula = Attack ~ Speed + Defense + Legendary + Defense * 
##     Legendary, data = Pokemon)
## 
## Linear Hypotheses:
##                                                  Estimate Std. Error t value
## LegendaryTrue + `Defense:LegendaryTrue` * 2 == 0    67.60      12.16   5.558
##                                                  Pr(>|t|)    
## LegendaryTrue + `Defense:LegendaryTrue` * 2 == 0 3.72e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

The effect of Legendary when Defense=0

summary(glht(regression, linfct = c("LegendaryTrue==0")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lm(formula = Attack ~ Speed + Defense + Legendary + Defense * 
##     Legendary, data = Pokemon)
## 
## Linear Hypotheses:
##                    Estimate Std. Error t value Pr(>|t|)    
## LegendaryTrue == 0    68.65      12.39   5.541 4.09e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

To visualize the effect of our variables, we will use ggpredict() function from ggeffects package.

ME.Defense<-ggpredict(regression, terms = "Speed")
ME.Defense
## 
## # Predicted values of Attack
## # x = Speed
## 
##   x | Predicted |   SE |           95% CI
## -----------------------------------------
##   0 |     51.94 | 2.41 | [ 47.22,  56.65]
##  20 |     58.97 | 1.81 | [ 55.43,  62.52]
##  40 |     66.01 | 1.29 | [ 63.49,  68.53]
##  60 |     73.05 | 0.98 | [ 71.13,  74.96]
## 100 |     87.12 | 1.51 | [ 84.17,  90.07]
## 120 |     94.16 | 2.07 | [ 90.10,  98.21]
## 140 |    101.19 | 2.69 | [ 95.93, 106.46]
## 180 |    115.26 | 3.98 | [107.47, 123.06]
## 
## Adjusted for:
## *   Defense = 70.00
## * Legendary = False
plot(ME.Defense)

myinteraction<-ggpredict(regression, terms = c("Defense", "Legendary"))
myinteraction
## 
## # Predicted values of Attack
## # x = Defense
## 
## # Legendary = False
## 
##   x | Predicted |   SE |           95% CI
## -----------------------------------------
##   0 |     43.20 | 2.45 | [ 38.40,  48.00]
##  40 |     61.26 | 1.38 | [ 58.55,  63.97]
##  80 |     79.32 | 0.99 | [ 77.37,  81.27]
## 110 |     92.87 | 1.54 | [ 89.84,  95.89]
## 150 |    110.93 | 2.65 | [105.73, 116.12]
## 230 |    147.05 | 5.08 | [137.08, 157.01]
## 
## # Legendary = True
## 
##   x | Predicted |    SE |          95% CI
## -----------------------------------------
##   0 |    111.85 | 12.17 | [88.00, 135.70]
##  40 |    108.82 |  7.84 | [93.46, 124.18]
##  80 |    105.79 |  4.20 | [97.56, 114.01]
## 110 |    103.51 |  3.58 | [96.49, 110.54]
## 150 |    100.48 |  6.63 | [87.49, 113.48]
## 230 |     94.42 | 15.32 | [64.41, 124.44]
## 
## Adjusted for:
## * Speed = 65.00
plot(myinteraction)

myinteraction<-ggpredict(regression, terms = c("Legendary","Defense"))
myinteraction
## 
## # Predicted values of Attack
## # x = Legendary
## 
## # Defense = 42.7
## 
## x     | Predicted |   SE |          95% CI
## ------------------------------------------
## False |     62.48 | 1.32 | [59.89,  65.07]
## True  |    108.61 | 7.56 | [93.80, 123.43]
## 
## # Defense = 73.8
## 
## x     | Predicted |   SE |          95% CI
## ------------------------------------------
## False |     76.52 | 0.96 | [74.64,  78.40]
## True  |    106.26 | 4.65 | [97.15, 115.36]
## 
## # Defense = 105
## 
## x     | Predicted |   SE |          95% CI
## ------------------------------------------
## False |     90.61 | 1.42 | [87.82,  93.40]
## True  |    103.89 | 3.46 | [97.12, 110.67]
## 
## Adjusted for:
## * Speed = 65.00
plot(myinteraction)