ggplot2 - Part 2library(ggpubr)
library(ggplot2)
library(stargazer)
#let's load our data
setwd("~/Dropbox/Data Analysis F20 Recitation/Recitation Content/Recitation 4")
Pokemon<-read.csv("Pokemon.csv",header=T,na.strings="?")
#tell R that we are only working with Pokemon dataThis is a kernel density plot for continuous variables.
ggplot(data = Pokemon) +
geom_density(aes(x = Speed,fill=Legendary), alpha=0.55)If we want to visualize Speed for legendary and ordinary pokemons separately, we use faceting with facet_grid(). Let’s show them in two columns:
ggplot(data = Pokemon) +
geom_density(aes(x = Speed, fill=Legendary), alpha=0.55) +
facet_grid(rows=vars(Legendary))ggplot(data = Pokemon) +
geom_density(aes(x = Speed, fill=Legendary), alpha=0.55) +
facet_grid(cols=vars(Legendary))Can you reproduce this?
facet_wrapInstead of faceting with a variable in the horizontal or vertical direction, facets can be placed next to each other, wrapping with a certain number of columns or rows. The label for each plot will be at the top of the plot.
ggplot(data = Pokemon) +
geom_density(aes(x = Attack),fill="yellow", alpha=0.55) +
facet_wrap(vars(Generation), ncol=5)stat="identity"ggplot2 supports many different statistical transformations. For example, the “identity” transformation will leave the data “as is”. You can specify which statistical transformation a geom uses by passing it as the stat argument. I manually create the dataset:
library(tidyverse)
LCI <- function(x) {
mean(x,na.rm=TRUE) - 1.96* plotrix::std.error(x,na.rm=TRUE)
}
HCI <- function(x) {
mean(x,na.rm=TRUE) + 1.96* plotrix::std.error(x,na.rm=TRUE)
}
mystats<-Pokemon %>% group_by(Generation, Legendary) %>%
summarize(mean=mean(Attack, na.rm=TRUE),
LCI=LCI(Attack),
HCI=HCI(Attack))
mystats## # A tibble: 12 x 5
## # Groups: Generation [6]
## Generation Legendary mean LCI HCI
## <int> <chr> <dbl> <dbl> <dbl>
## 1 1 False 75.0 70.5 79.5
## 2 1 True 121. 88.0 154.
## 3 2 False 70.7 64.3 77.1
## 4 2 True 99 79.1 119.
## 5 3 False 77.0 71.5 82.5
## 6 3 True 118. 99.4 137.
## 7 4 False 79.6 73.5 85.7
## 8 4 True 110. 98.3 121.
## 9 5 False 78.2 73.7 82.8
## 10 5 True 120. 109. 132.
## 11 6 False 70.5 64.9 76.0
## 12 6 True 125. 108. 142.
ggplot(mystats)+
geom_bar(stat="identity", aes(x = factor(Generation), y = mean, fill=Legendary),position=position_dodge(.9))stat_summary()With the stat_summary() function, we can summarize y values at distinct x values. For example:
MyMedian <- function(x) {
median(x,na.rm=TRUE)
}
ggplot(data = Pokemon) + aes(x = factor(Legendary), y = Attack)+
stat_summary(geom="bar", fun=MyMedian)Reproduce this figure:
We can also change themes using theme_X. A complete list of themes are available here.
f1+theme_gray()f1+theme_light()f1+theme_linedraw()Can you reproduce this graph?
Hint #1: to retain the order in the graph based on a variable value, do this
data<-data[order(data$mean),] #to sort
data$Type.1<-factor(data$Type.1, levels = data$Type.1) # convert to factor to retain sorted order in plot.Hint #2: you should use geom_text() and add text in points using aes(label=varname).
Hint #3: use ggplot()+....+coord_flip() to flip the x and y axis.
LCI95 <- function(x) {
mean(x,na.rm=TRUE) - 1.96* plotrix::std.error(x,na.rm=TRUE)
}
HCI95 <- function(x) {
mean(x,na.rm=TRUE) + 1.96* plotrix::std.error(x,na.rm=TRUE)
}
LCI99 <- function(x) {
mean(x,na.rm=TRUE) - 2.575* plotrix::std.error(x,na.rm=TRUE)
}
HCI99 <- function(x) {
mean(x,na.rm=TRUE) + 2.575* plotrix::std.error(x,na.rm=TRUE)
}
data<-Pokemon %>% group_by(Type.1) %>%
summarize(mean=mean(Total),
LCI95=LCI95(Total),
HCI95=HCI95(Total),
LCI99=LCI99(Total),
HCI99=HCI99(Total))
data<-data[order(data$mean),] # sort
data$Type.1<-factor(data$Type.1, levels = data$Type.1) # convert to factor to retain sorted order in plot.
ggplot(data=data)+
geom_linerange(aes(x=Type.1, ymin=LCI99, ymax=HCI99), color="black", size=0.5)+
geom_linerange(aes(x=Type.1, ymin=LCI95, ymax=HCI95), color="blue", size=1)+
geom_point(aes(x=Type.1, y=mean),size=7, color="black")+
geom_point(aes(x=Type.1, y=mean),size=6, color="white")+
geom_text(aes(x=Type.1, y=mean, label=round(mean)),size=2, color="black")+
theme_minimal()+
coord_flip()+
labs(title="Total Strength by Pokemon Type", x="", y="Total Score (average)")Let’s say, we have run the following model:
regression <- lm(Attack ~ Speed + Defense + Legendary, data=Pokemon)library(tidyverse)
library(equatiomatic)
#install.packages("ggeffects")
library(ggeffects)
library(splines)extract_eq(regression)\[ \operatorname{Attack} = \alpha + \beta_{1}(\operatorname{Speed}) + \beta_{2}(\operatorname{Defense}) + \beta_{3}(\operatorname{Legendary}_{\operatorname{True}}) + \epsilon \]
stargazer(regression, header=FALSE, type="html")| Dependent variable: | |
| Attack | |
| Speed | 0.368*** |
| (0.034) | |
| Defense | 0.416*** |
| (0.031) | |
| LegendaryTrue | 16.540*** |
| (3.716) | |
| Constant | 21.826*** |
| (3.377) | |
| Observations | 800 |
| R2 | 0.349 |
| Adjusted R2 | 0.347 |
| Residual Std. Error | 26.238 (df = 796) |
| F Statistic | 142.233*** (df = 3; 796) |
| Note: | p<0.1; p<0.05; p<0.01 |
ME.Speed<-ggpredict(regression, terms = "Speed")
ME.Speed##
## # Predicted values of Attack
## # x = Speed
##
## x | Predicted | SE | 95% CI
## -----------------------------------------
## 0 | 50.92 | 2.42 | [ 46.17, 55.67]
## 20 | 58.29 | 1.82 | [ 54.71, 61.86]
## 40 | 65.65 | 1.30 | [ 63.10, 68.20]
## 60 | 73.01 | 0.99 | [ 71.08, 74.95]
## 100 | 87.74 | 1.52 | [ 84.77, 90.71]
## 120 | 95.10 | 2.08 | [ 91.02, 99.19]
## 140 | 102.47 | 2.70 | [ 97.17, 107.76]
## 180 | 117.19 | 4.00 | [109.36, 125.03]
##
## Adjusted for:
## * Defense = 70.00
## * Legendary = False
#plot the relationship
plot(ME.Speed)ME.Defense<-ggpredict(regression, terms = "Defense")
ME.Defense##
## # Predicted values of Attack
## # x = Defense
##
## x | Predicted | SE | 95% CI
## -----------------------------------------
## 0 | 45.76 | 2.41 | [ 41.04, 50.48]
## 30 | 58.23 | 1.61 | [ 55.08, 61.37]
## 60 | 70.70 | 1.03 | [ 68.67, 72.72]
## 90 | 83.17 | 1.12 | [ 80.97, 85.36]
## 110 | 91.48 | 1.53 | [ 88.48, 94.47]
## 140 | 103.95 | 2.32 | [ 99.41, 108.49]
## 170 | 116.42 | 3.18 | [110.18, 122.65]
## 230 | 141.36 | 4.97 | [131.61, 151.10]
##
## Adjusted for:
## * Speed = 65.00
## * Legendary = False
#plot the relationship
plot(ME.Defense)ME.Legendary<-ggpredict(regression, terms = "Legendary")
ME.Legendary##
## # Predicted values of Attack
## # x = Legendary
##
## x | Predicted | SE | 95% CI
## -----------------------------------------
## False | 74.85 | 0.97 | [72.95, 76.75]
## True | 91.39 | 3.61 | [84.33, 98.46]
##
## Adjusted for:
## * Speed = 65.00
## * Defense = 70.00
#plot the relationship
plot(ME.Legendary)Let’s create a fake dataset for Pokemon strength and assume the relationship between strength and speed is exponential.
b0<-6
b1<-0.5
eps = rnorm(nrow(Pokemon),0, 1)
Pokemon$Strength = b0 + b1*I(Pokemon$Speed^2)+ eps #create fake data
ggplot(data=Pokemon) + geom_point(aes(x=Speed, y=Strength)) #visualize relationshipregression <- lm(Strength ~ I(Speed^2), data=Pokemon) #run regression
summary(regression)##
## Call:
## lm(formula = Strength ~ I(Speed^2), data = Pokemon)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.98078 -0.68202 -0.01038 0.70545 3.12279
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.999e+00 5.560e-02 107.9 <2e-16 ***
## I(Speed^2) 5.000e-01 7.889e-06 63378.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.982 on 798 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 4.017e+09 on 1 and 798 DF, p-value: < 2.2e-16
#calculate the expected values (mean) of dependent variable based on speed
ME.Defense<-ggpredict(regression, terms = "Speed")
ME.Defense##
## # Predicted values of Strength
## # x = Speed
##
## x | Predicted | SE | 95% CI
## --------------------------------------------------
## 25 | 318.49 | 0.05 | [ 318.39, 318.59]
## 1089 | 5.93e+05 | 9.31 | [5.93e+05, 5.93e+05]
## 2304 | 2.65e+06 | 41.83 | [2.65e+06, 2.65e+06]
## 3844 | 7.39e+06 | 116.53 | [7.39e+06, 7.39e+06]
## 5625 | 1.58e+07 | 249.57 | [1.58e+07, 1.58e+07]
## 7921 | 3.14e+07 | 494.93 | [3.14e+07, 3.14e+07]
## 10816 | 5.85e+07 | 922.85 | [5.85e+07, 5.85e+07]
## 32400 | 5.25e+08 | 8281.46 | [5.25e+08, 5.25e+08]
#plot the relationship
plot(ME.Defense)Let’s visualize dummy variable regression:
library(multcomp) # This is a cool library you should keep in mind
ideology = rep( c("Liberal", "Moderate", "Fund"), each = 1000)
eps = rnorm(3000, 0, 200)
#I assume liberal average value for Gun is 10, moderate is 15, fund is 25
Gun = 10 + 5*(ideology == "Moderate")+15*(ideology=="Fund") + eps
# Below is something similar to what you need to adopt. If you have further questions on interpreting dummy variable regressions, we can go over during my office hour.
Gunlm = lm(Gun ~ ideology)
summary(Gunlm)##
## Call:
## lm(formula = Gun ~ ideology)
##
## Residuals:
## Min 1Q Median 3Q Max
## -679.92 -134.81 0.23 137.17 637.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.378 6.205 4.573 4.99e-06 ***
## ideologyLiberal -19.020 8.775 -2.167 0.0303 *
## ideologyModerate -5.864 8.775 -0.668 0.5040
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 196.2 on 2997 degrees of freedom
## Multiple R-squared: 0.001642, Adjusted R-squared: 0.0009754
## F-statistic: 2.464 on 2 and 2997 DF, p-value: 0.08526
library(broom)
regoutput<-tidy(Gunlm, conf.int = TRUE)
regoutput## # A tibble: 3 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 28.4 6.20 4.57 0.00000499 16.2 40.5
## 2 ideologyLiberal -19.0 8.77 -2.17 0.0303 -36.2 -1.81
## 3 ideologyModerate -5.86 8.77 -0.668 0.504 -23.1 11.3
Reproduce this figure:
But how do I interpret the results? The average for Fund. group is (Intercept) because R dropped it as a reference category. So when ideologyLiberal= ideologyModerate =0, we are talking about ideology Fund, which is our intercept.
#Fundamentalists
summary(glht(Gunlm, linfct = c("`(Intercept)`==0")))##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lm(formula = Gun ~ ideology)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) == 0 28.378 6.205 4.573 4.99e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
#The average for Liberal group is
summary(glht(Gunlm, linfct = c("`(Intercept)`+ideologyLiberal==0")))##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lm(formula = Gun ~ ideology)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## `(Intercept)` + ideologyLiberal == 0 9.358 6.205 1.508 0.132
## (Adjusted p values reported -- single-step method)
#The average for Moderate group is
summary(glht(Gunlm, linfct = c("`(Intercept)`+ideologyModerate==0")))##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lm(formula = Gun ~ ideology)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## `(Intercept)` + ideologyModerate == 0 22.514 6.205 3.628 0.00029 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
Let’s say, we have run the following model:
regression <- lm(Attack ~ Speed + Defense + Legendary + Defense*Legendary, data=Pokemon)equatiomatic::extract_eq(regression)\[ \operatorname{Attack} = \alpha + \beta_{1}(\operatorname{Speed}) + \beta_{2}(\operatorname{Defense}) + \beta_{3}(\operatorname{Legendary}_{\operatorname{True}}) + \beta_{4}(\operatorname{Defense} \times \operatorname{Legendary}_{\operatorname{True}}) + \epsilon \]
stargazer(regression, header=FALSE, type="html")| Dependent variable: | |
| Attack | |
| Speed | 0.352*** |
| (0.034) | |
| Defense | 0.452*** |
| (0.032) | |
| LegendaryTrue | 68.650*** |
| (12.389) | |
| Defense:LegendaryTrue | -0.527*** |
| (0.120) | |
| Constant | 20.331*** |
| (3.356) | |
| Observations | 800 |
| R2 | 0.364 |
| Adjusted R2 | 0.361 |
| Residual Std. Error | 25.940 (df = 795) |
| F Statistic | 113.989*** (df = 4; 795) |
| Note: | p<0.1; p<0.05; p<0.01 |
This table doesnt tell us much! Let’s first see how we interpret interaction effects: free recitation note
The effect of Defense when Legendary=TRUE is:
library(multcomp) # This is a cool library you should keep in mind
summary(glht(regression, linfct = c("Defense+`Defense:LegendaryTrue`==0")))##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lm(formula = Attack ~ Speed + Defense + Legendary + Defense *
## Legendary, data = Pokemon)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## Defense + `Defense:LegendaryTrue` == 0 -0.07576 0.11566 -0.655 0.513
## (Adjusted p values reported -- single-step method)
The effect of Defense when Legendary=FALSE is:
summary(glht(regression, linfct = c("Defense==0")))##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lm(formula = Attack ~ Speed + Defense + Legendary + Defense *
## Legendary, data = Pokemon)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## Defense == 0 0.45151 0.03151 14.33 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
The effect of Legendary when Defense=2
summary(glht(regression, linfct = c("LegendaryTrue+`Defense:LegendaryTrue`*2==0")))##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lm(formula = Attack ~ Speed + Defense + Legendary + Defense *
## Legendary, data = Pokemon)
##
## Linear Hypotheses:
## Estimate Std. Error t value
## LegendaryTrue + `Defense:LegendaryTrue` * 2 == 0 67.60 12.16 5.558
## Pr(>|t|)
## LegendaryTrue + `Defense:LegendaryTrue` * 2 == 0 3.72e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
The effect of Legendary when Defense=0
summary(glht(regression, linfct = c("LegendaryTrue==0")))##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lm(formula = Attack ~ Speed + Defense + Legendary + Defense *
## Legendary, data = Pokemon)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## LegendaryTrue == 0 68.65 12.39 5.541 4.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
To visualize the effect of our variables, we will use ggpredict() function from ggeffects package.
ME.Defense<-ggpredict(regression, terms = "Speed")
ME.Defense##
## # Predicted values of Attack
## # x = Speed
##
## x | Predicted | SE | 95% CI
## -----------------------------------------
## 0 | 51.94 | 2.41 | [ 47.22, 56.65]
## 20 | 58.97 | 1.81 | [ 55.43, 62.52]
## 40 | 66.01 | 1.29 | [ 63.49, 68.53]
## 60 | 73.05 | 0.98 | [ 71.13, 74.96]
## 100 | 87.12 | 1.51 | [ 84.17, 90.07]
## 120 | 94.16 | 2.07 | [ 90.10, 98.21]
## 140 | 101.19 | 2.69 | [ 95.93, 106.46]
## 180 | 115.26 | 3.98 | [107.47, 123.06]
##
## Adjusted for:
## * Defense = 70.00
## * Legendary = False
plot(ME.Defense)myinteraction<-ggpredict(regression, terms = c("Defense", "Legendary"))
myinteraction##
## # Predicted values of Attack
## # x = Defense
##
## # Legendary = False
##
## x | Predicted | SE | 95% CI
## -----------------------------------------
## 0 | 43.20 | 2.45 | [ 38.40, 48.00]
## 40 | 61.26 | 1.38 | [ 58.55, 63.97]
## 80 | 79.32 | 0.99 | [ 77.37, 81.27]
## 110 | 92.87 | 1.54 | [ 89.84, 95.89]
## 150 | 110.93 | 2.65 | [105.73, 116.12]
## 230 | 147.05 | 5.08 | [137.08, 157.01]
##
## # Legendary = True
##
## x | Predicted | SE | 95% CI
## -----------------------------------------
## 0 | 111.85 | 12.17 | [88.00, 135.70]
## 40 | 108.82 | 7.84 | [93.46, 124.18]
## 80 | 105.79 | 4.20 | [97.56, 114.01]
## 110 | 103.51 | 3.58 | [96.49, 110.54]
## 150 | 100.48 | 6.63 | [87.49, 113.48]
## 230 | 94.42 | 15.32 | [64.41, 124.44]
##
## Adjusted for:
## * Speed = 65.00
plot(myinteraction)myinteraction<-ggpredict(regression, terms = c("Legendary","Defense"))
myinteraction##
## # Predicted values of Attack
## # x = Legendary
##
## # Defense = 42.7
##
## x | Predicted | SE | 95% CI
## ------------------------------------------
## False | 62.48 | 1.32 | [59.89, 65.07]
## True | 108.61 | 7.56 | [93.80, 123.43]
##
## # Defense = 73.8
##
## x | Predicted | SE | 95% CI
## ------------------------------------------
## False | 76.52 | 0.96 | [74.64, 78.40]
## True | 106.26 | 4.65 | [97.15, 115.36]
##
## # Defense = 105
##
## x | Predicted | SE | 95% CI
## ------------------------------------------
## False | 90.61 | 1.42 | [87.82, 93.40]
## True | 103.89 | 3.46 | [97.12, 110.67]
##
## Adjusted for:
## * Speed = 65.00
plot(myinteraction)