Jennifer Ganeles
4/14/19
This week’s homework explores gender representation in both the Marvel and DC universes using the FiveThirtyEight Comic Characters Dataset found on Kaggle. The data is split into two files for Marvel and DC respectively: marvel-wikia-data.csv and dc-wikia-data.csv.
Using data visualization, I will be exploring the following variables:
Gender: Male or female
APPEARANCES: Number of appearances
Year: Year that character was first introduced
YEARS: Number of years between the character's first appearance and 2014
ALIGN: Whether a character is bad, good, or neutral
ALIVE: Whether a character is living or deceased
Given my previous homework on gender differences in Marvel comics (Homework 7), I hypothesize that males will severely outnumber females in both universes, but females will appear in the comics, on average, more times than males when controlling for alignment, alive status, and years since character was introduced.
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(magrittr)
library(gridExtra)
# MARVEL
# Read the raw Marvel wikia export, then derive the analysis columns.
Marvel <- read_csv("/Users/jenniferganeles/Downloads/marvel-wikia-data.csv")
marvel_comics <- Marvel %>%
  mutate(
    # Collapse the wikia SEX labels into Male / Female / Other.
    Gender = as.factor(recode(
      SEX,
      "Agender Characters" = "Other",
      "Genderfluid Characters" = "Other",
      "Male Characters" = "Male",
      "Female Characters" = "Female"
    )),
    # Shorten the ALIVE labels and store them as a factor.
    ALIVE = as.factor(recode(
      ALIVE,
      "Deceased Characters" = "Deceased",
      "Living Characters" = "Living"
    )),
    # Years elapsed between the character's first appearance and 2014.
    YEARS = 2014 - Year,
    ALIGN = as.factor(ALIGN),
    Universe = "Marvel"
  ) %>%
  # Keep only male/female characters that have a recorded appearance count.
  filter(Gender != "Other", !is.na(Gender), !is.na(APPEARANCES))
# DC
# Read the raw DC wikia export, then derive the analysis columns.
DC <- read_csv("/Users/jenniferganeles/Downloads/dc-wikia-data.csv")
dc_comics <- DC %>%
  mutate(
    # Collapse the wikia SEX labels into Male / Female / Other.
    Gender = as.factor(recode(
      SEX,
      "Genderless Characters" = "Other",
      "Transgender Characters" = "Other",
      "Male Characters" = "Male",
      "Female Characters" = "Female"
    )),
    # Shorten the ALIVE labels and store them as a factor.
    ALIVE = as.factor(recode(
      ALIVE,
      "Deceased Characters" = "Deceased",
      "Living Characters" = "Living"
    )),
    ALIGN = as.factor(ALIGN),
    # Years elapsed between the character's first appearance and 2014
    # (the DC file names the column YEAR, not Year).
    YEARS = 2014 - YEAR,
    Universe = "DC"
  ) %>%
  # Keep only male/female characters that have a recorded appearance count.
  filter(Gender != "Other", !is.na(Gender), !is.na(APPEARANCES))
The following bar graph represents the frequency of male and female characters in both the Marvel and DC universes:
# Count the characters of each gender within each universe.
marvel_gender <- marvel_comics %>%
  group_by(Gender) %>%
  tally()
dc_gender <- dc_comics %>%
  group_by(Gender) %>%
  tally()

# Stack the two tallies and label each with the universe it came from.
# The label is attached directly to each tally instead of being inferred from
# hard-coded row counts, so it stays correct if the data or filters change.
total <- rbind(
  mutate(marvel_gender, Universe = "Marvel"),
  mutate(dc_gender, Universe = "DC")
)

# Side-by-side bars: one pair (Female / Male) per universe.
ggplot(data = total) +
  geom_col(aes(x = Universe, y = n, fill = Gender), position = "dodge") +
  labs(title = "Total Characters: DC vs. Marvel", y = "Number of Characters") +
  theme_tufte() +
  theme(plot.title = element_text(hjust = 0.5))
The following line graphs depict how many characters were introduced each year (from around 1940 to 2014). They show the temporal trend in new male and female characters.
# MARVEL
# Number of characters introduced per year, split by gender.
new_marvel <- tally(group_by(marvel_comics, Year, Gender))
nm <- ggplot(new_marvel, aes(x = Year, y = n, color = Gender)) +
  geom_line(size = 2) +
  labs(y = "New Characters (Marvel)") +
  theme_tufte() +
  theme(plot.title = element_text(hjust = 0.5))

# DC
# Same summary for DC; its first-appearance column is named YEAR.
new_dc <- tally(group_by(dc_comics, YEAR, Gender))
nd <- ggplot(new_dc, aes(x = YEAR, y = n, color = Gender)) +
  geom_line(size = 2) +
  labs(x = "Year", y = "New Characters (DC)") +
  theme_tufte() +
  theme(plot.title = element_text(hjust = 0.5))

# Stack the two time series, Marvel above DC.
grid.arrange(nm, nd, nrow = 2)
Below, one can see the characters with the highest number of appearances in the comics.
# MARVEL
# Rank characters from most to fewest appearances. No grouping is needed for
# a plain sort, so the result is left as an ungrouped table.
marvel_top <- marvel_comics %>%
  arrange(desc(APPEARANCES))
# head() returns at most ten rows and never pads with NA rows.
mpop <- ggplot(
  data = head(marvel_top, 10),
  aes(x = reorder(name, APPEARANCES), y = APPEARANCES)
) +
  geom_col(aes(fill = Gender)) +
  # Named values tie each colour to a gender, so the scale still works if
  # both genders appear in the top ten. "#F8766D" is intended to match the
  # default ggplot2 colour that Female receives in the DC panel.
  scale_fill_manual(values = c("Male" = "turquoise3", "Female" = "#F8766D")) +
  coord_flip() +
  labs(x = "", y = "Appearances", title = "Top Ten Marvel Characters") +
  theme_tufte()

# DC
dc_top <- dc_comics %>%
  arrange(desc(APPEARANCES))
dcpop <- ggplot(
  data = head(dc_top, 10),
  aes(x = reorder(name, APPEARANCES), y = APPEARANCES)
) +
  geom_col(aes(fill = Gender)) +
  coord_flip() +
  labs(x = "", y = "Appearances", title = "Top Ten DC Characters") +
  theme_tufte()

# Stack the two rankings, Marvel above DC.
grid.arrange(mpop, dcpop, nrow = 2)
The following boxplots represent the distribution of appearances for male and female characters. However, it is important to note that only recurring characters (those with 150 or more appearances) were included in this distribution.
library(Hmisc)
# MARVEL
# Restrict to recurring characters: at least 150 appearances.
pop <- filter(marvel_comics, APPEARANCES >= 150)
mbx <- ggplot(pop, aes(x = Gender, y = APPEARANCES, fill = Gender)) +
  geom_boxplot() +
  # Overlay the bootstrapped mean and its confidence limits.
  stat_summary(fun.data = "mean_cl_boot", colour = "red") +
  ggtitle("Marvel") +
  theme_tufte() +
  theme(plot.title = element_text(hjust = 0.5))

# DC
pop2 <- filter(dc_comics, APPEARANCES >= 150)
dcbx <- ggplot(pop2, aes(x = Gender, y = APPEARANCES, fill = Gender)) +
  geom_boxplot() +
  stat_summary(fun.data = "mean_cl_boot", colour = "blue") +
  ggtitle("DC") +
  theme_tufte() +
  theme(plot.title = element_text(hjust = 0.5))

# Show the two universes side by side.
grid.arrange(mbx, dcbx, nrow = 1)
The following regressions represent the relationship between the number of years since a character was introduced and the number of times that character has appeared in the comics. Once again, only frequently recurring characters (those with 150 or more appearances) were included in this model.
# MARVEL
# Scatter of appearances against years since introduction, with a linear fit.
# Mapping fill to Gender splits the data by gender, so one fit is drawn per
# gender.
mlm <- ggplot(pop, aes(x = YEARS, y = APPEARANCES, fill = Gender)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red") +
  ggtitle("Marvel") +
  xlab("Years") +
  ylab("Appearances") +
  theme_tufte() +
  theme(plot.title = element_text(hjust = 0.5))

# DC
dclm <- ggplot(pop2, aes(x = YEARS, y = APPEARANCES, fill = Gender)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("DC") +
  xlab("Years") +
  ylab("Appearances") +
  theme_tufte() +
  theme(plot.title = element_text(hjust = 0.5))

# Show the two universes side by side.
grid.arrange(mlm, dclm, nrow = 1)
The plots below explore the gender difference in median appearances, taking into account whether a character is living or deceased, as well as whether a character is bad, good, or neutral.
# MARVEL
# Drop characters with unknown alignment or alive status. ALIVE is filtered
# here, as it is for DC, so the facets below do not gain a stray NA panel.
marvel_comics2 <- marvel_comics %>%
  filter(!is.na(ALIVE), !is.na(ALIGN))
# Median appearances by gender, one line per alignment, faceted by ALIVE.
int <- ggplot(data = marvel_comics2) +
  aes(x = Gender, color = ALIGN, group = ALIGN, y = APPEARANCES) +
  stat_summary(fun.y = median, geom = "point") +
  stat_summary(fun.y = median, geom = "line") +
  facet_wrap(~ALIVE) +
  labs(title = "Marvel", y = "Appearances (Median)") +
  theme(plot.title = element_text(hjust = 0.5))

# DC
# Same filter, and additionally exclude the "Reformed Criminals" alignment.
dc_comics2 <- dc_comics %>%
  filter(!is.na(ALIVE), !is.na(ALIGN), ALIGN != "Reformed Criminals")
int2 <- ggplot(data = dc_comics2) +
  aes(x = Gender, color = ALIGN, group = ALIGN, y = APPEARANCES) +
  stat_summary(fun.y = median, geom = "point") +
  stat_summary(fun.y = median, geom = "line") +
  facet_wrap(~ALIVE) +
  labs(title = "DC", y = "Appearances (Median)") +
  theme(plot.title = element_text(hjust = 0.5))

# Stack the two plots, Marvel above DC.
grid.arrange(int, int2, nrow = 2)
The plots below use a Poisson model to show the mean gender difference in Marvel appearances while controlling for alignment, alive status, and number of years since introduction.
library(Zelig)
# Poisson regression of appearance counts on gender, controlling for
# alignment, alive status, and years since introduction.
MPoisson <- zelig(
  APPEARANCES ~ Gender + ALIGN + ALIVE + YEARS,
  model = "poisson", data = marvel_comics, cite = FALSE
)

# Two covariate profiles that differ only in the Gender value passed to
# setx(); the remaining covariates are left at setx()'s defaults.
xm <- setx(MPoisson, Gender = "Male")
xf <- setx(MPoisson, Gender = "Female")

# Simulate once and reuse the draws for both the expected values and the
# first difference, so the two panels below describe the same simulation.
s <- sim(MPoisson, x = xm, x1 = xf)

# Expected number of appearances: "x" is the Male profile, "x1" the Female.
evm <- s$get_qi(xvalue = "x", qi = "ev")
evf <- s$get_qi(xvalue = "x1", qi = "ev")
df <- as.data.frame(cbind(evf, evm)) %>%
  rename("Female" = V1, "Male" = V2)
# Long format: one row per simulated draw and gender.
tidd <- df %>%
  gather(Gender, APPEARANCES, 1:2)
appear <- tidd %>%
  group_by(Gender) %>%
  summarise(mean = mean(APPEARANCES), sd = sd(APPEARANCES))
# Density of the simulated expected values, with the mean marked per gender.
mdiff <-
  ggplot(data = tidd, aes(x = APPEARANCES)) +
  geom_density(fill = "red") +
  facet_wrap(~Gender) +
  geom_vline(data = appear, aes(xintercept = mean), color = "black") +
  xlab("Average Number of Appearances (Expected Value)") +
  ggtitle("Gender Difference in Marvel Comic Appearances") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

# Aliases kept so any later code that refers to x / x1 still finds them.
x <- xm
x1 <- xf
# Simulated first difference between the Female (x1) and Male (x) profiles.
fdm <- s$get_qi(xvalue = "x1", qi = "fd")
genderdiff <- as.data.frame(cbind(fdm)) %>%
  rename("Gender Difference" = V1)
gender_diff <- genderdiff %>%
  gather(class, simv)
gf <- gender_diff %>%
  group_by(class) %>%
  summarise(mean = mean(simv), sd = sd(simv))
# Density of the simulated first differences, with their mean marked.
m_diff <-
  ggplot(data = gender_diff, aes(simv)) +
  geom_density(fill = "red") +
  facet_grid(~class) +
  geom_vline(data = gf, aes(xintercept = mean)) +
  labs(x = "Simulated First Difference (Mean)") +
  theme_bw()

# Expected values on top, first difference underneath.
grid.arrange(mdiff, m_diff, nrow = 2)
The plots below use a Poisson model to show the mean gender difference in DC appearances while controlling for alignment, alive status, and number of years since introduction.
# Poisson regression of appearance counts on gender, controlling for
# alignment, alive status, and years since introduction.
DCPoisson <- zelig(
  APPEARANCES ~ Gender + ALIGN + ALIVE + YEARS,
  model = "poisson", data = dc_comics, cite = FALSE
)

# Two covariate profiles that differ only in the Gender value passed to
# setx(); the remaining covariates are left at setx()'s defaults.
xm <- setx(DCPoisson, Gender = "Male")
xf <- setx(DCPoisson, Gender = "Female")

# Simulate once and reuse the draws for both the expected values and the
# first difference, so the two panels below describe the same simulation.
s <- sim(DCPoisson, x = xm, x1 = xf)

# Expected number of appearances: "x" is the Male profile, "x1" the Female.
evm <- s$get_qi(xvalue = "x", qi = "ev")
evf <- s$get_qi(xvalue = "x1", qi = "ev")
df2 <- as.data.frame(cbind(evf, evm)) %>%
  rename("Female" = V1, "Male" = V2)
# Long format: one row per simulated draw and gender.
tidd2 <- df2 %>%
  gather(Gender, APPEARANCES, 1:2)
appear2 <- tidd2 %>%
  group_by(Gender) %>%
  summarise(mean = mean(APPEARANCES), sd = sd(APPEARANCES))
# Density of the simulated expected values, with the mean marked per gender.
mdiff2 <-
  ggplot(data = tidd2, aes(x = APPEARANCES)) +
  geom_density(fill = "blue") +
  facet_wrap(~Gender) +
  geom_vline(data = appear2, aes(xintercept = mean), color = "black") +
  xlab("Average Number of Appearances (Expected Value)") +
  ggtitle("Gender Difference in DC Comic Appearances") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

# Aliases kept so any later code that refers to x / x1 still finds them.
x <- xm
x1 <- xf
# Simulated first difference between the Female (x1) and Male (x) profiles.
fd <- s$get_qi(xvalue = "x1", qi = "fd")
genderdiff2 <- as.data.frame(cbind(fd)) %>%
  rename("Gender Difference" = V1)
gender_diff2 <- genderdiff2 %>%
  gather(class, simv)
gf2 <- gender_diff2 %>%
  group_by(class) %>%
  summarise(mean = mean(simv), sd = sd(simv))
# Density of the simulated first differences, with their mean marked.
m_diff2 <-
  ggplot(data = gender_diff2, aes(simv)) +
  geom_density(fill = "blue") +
  facet_grid(~class) +
  geom_vline(data = gf2, aes(xintercept = mean)) +
  labs(x = "Simulated First Difference (Mean)") +
  theme_bw()

# Expected values on top, first difference underneath.
grid.arrange(mdiff2, m_diff2, nrow = 2)