Introduction

This dataset contains information about Marvel and DC characters since ~1939 until 2014 (August 24th). It was used for the 538 study : Comic Books Are Still Made By Men, For Men And About Men.

In this kernel, I will look at the Gender distribution per publisher, its change vs. Time (idea is that we may see more LGBT characters in recent years), at the most populat characters and try some modelling/prediction with these data (i.e can we predict to which publisher belongs a character given its gender and other characteristics)

[setup]
options(width=100)
knitr::opts_chunk$set(out.width='1000px',dpi=200,message=FALSE,warning=FALSE)
[packages]
#load packages and csv file
library(ggplot2)
library(dplyr)
library(gridExtra)
library(RColorBrewer)
library(grid)
library(wordcloud)
library(plotrix)
library(fmsb)
library(fivethirtyeight)
library(knitr)
#data(package = "fivethirtyeight")
df<-comic_characters

Overview

Here I groupby the data per sex and publisher, calculate raw yields and percentages. I haven’t removed at this step the NA’s since I wanted to get all the characters and only the count matter for these plots.
[Show Code]
#calculate percentage of each gender for each publisher
pop2<-df %>% group_by(sex,publisher) %>% summarise(number = n()) %>% arrange(-number)
res<-pop2 %>% group_by(publisher) %>% mutate(countT= sum(number)) %>% group_by(sex) %>% mutate(percentage=100*number/countT)
res$LABEL <-paste0(round(res$percentage,2))

#make dataframe for the pie chart
pieC<-as.data.frame(pop2 %>% group_by(publisher) %>% select(number) %>% summarise(sum=sum(number)))

#define color palette
#colourCount = length(unique(res$sex))
#getPalette = colorRampPalette(wes_palette("FantasticFox"))

g1<-ggplot(data=res,aes(x=sex,y=percentage,fill=publisher)) + geom_bar(width = 0.9, stat="identity",position='dodge') + 
  theme(axis.text.x = element_text(angle=90, hjust=1),legend.position='none') + 
  geom_text(aes(label=LABEL), position=position_dodge(width=0.9), vjust=-0.25,size=2.5) + 
  scale_fill_manual(values = c("olivedrab","steelblue")) + 
  xlab('') + ylab('Percentage')

g2<-ggplot(pieC,aes(x="",y=sum,fill=publisher)) + geom_bar(stat='identity',width = 1) + 
  coord_polar(theta="y") + theme_void() + theme(axis.text.x=element_blank(),legend.position='bottom') + 
  scale_fill_manual(values = c("olivedrab","steelblue")) + 
  geom_text(aes(y =c(20000,8000), label = paste(pieC$publisher,": ",pieC$sum)))

grid.arrange(g1,g2,ncol=2,widths=2:1)

Comments :

Evolution of gender vs. Time

For the following plots, I groupby the rows per (year,month and sex) for both publishers (Marvel and DC) and count the number of entries. Therefore each bin (color code) is the number of characters created per month.

[Show Code]
#select data with no na's for sex and date
marvel<-as.data.frame(df %>% filter(publisher=='Marvel'))
marvel<-marvel[!is.na(marvel$month) & !is.na(marvel$sex),]
#groupby
res<-marvel %>% group_by(year,month,sex) %>% summarise(number= n())
#re-order month 
res$ordered_month <- factor(res$month, levels = month.name)

res$sex <- factor(res$sex, levels = c("Female Characters", "Male Characters", "Genderfluid Characters","Agender Characters"))
g1<-ggplot(data=res, aes(year,ordered_month)) + geom_tile(aes(fill = number),colour = "white") + 
  scale_fill_gradient(low="steelblue", high="black") + facet_wrap(~sex,ncol=4) + theme(axis.title.x=element_blank(),axis.ticks.x=element_blank(),axis.title.y=element_blank(),axis.ticks.y=element_blank(),legend.position='right',legend.title=element_blank(),legend.key.size = unit(.2, "cm"))+ 
  xlim(1935,2015)

dc<-as.data.frame(df %>% filter(publisher=='DC'))
dc<-dc[!is.na(dc$month) & !is.na(dc$sex),]
dc$month<-ifelse(dc$month=="01","January",dc$month)
dc<-dc %>% filter(month!="Holiday")
tes<-dc%>% group_by(year,month,sex) %>% summarise(number= n())
tes$month<-as.factor(tes$month)
tes$ordered_month <- factor(tes$month, levels = month.name)
tes$sex <- factor(tes$sex, levels = c("Female Characters", "Male Characters", "Genderless Characters","Transgender Characters"))

g2<-ggplot(data=tes, aes(year,ordered_month)) + geom_tile(aes(fill = number),colour = "white") + 
  scale_fill_gradient(low="olivedrab", high="black") + facet_wrap(~sex,ncol=4) + 
  theme(axis.title.x=element_blank(),axis.ticks.x=element_blank(),axis.title.y=element_blank(),axis.ticks.y=element_blank(),legend.position='right',legend.title=element_blank(),legend.key.size = unit(.2, "cm")) + 
  xlim(1935,2015)

#grid.arrange(g1,g2,ncol=1)
grid.arrange(rbind(ggplotGrob(g1), ggplotGrob(g2), size = "last"))

Comments :

Details per characters

in progress, only vs Align for now, code needs optimization

We can look at the characteristics of characters for both publishers. Here I groupby the data for a given feature (among id,align,eye,hair and alive), and report the percentage for each gender in a radarchart.

vs. Hair

[Show Code]
#empty list to hold data
listMarvel<-list()
listDC<-list()

#empty dataframe for summary
summaryMarvel<-data.frame(matrix(vector(),ncol=4))
summaryDC<-data.frame(matrix(vector(),ncol=4))

#create vector of by_sex for each publisher
typeMarvel<-as.data.frame(unique(df %>% filter(publisher=='Marvel') %>% select(sex) %>% na.omit()))
typeDC<-as.data.frame(unique(df %>% filter(publisher=='DC') %>% select(sex) %>% na.omit()))
typeMarvel<-typeMarvel$sex
typeDC<-typeDC$sex

colnames(summaryMarvel)<-typeMarvel
colnames(summaryDC)<-typeDC

for(i in 1:length(typeDC)){
    listDC[[i]]<-as.data.frame(df %>% filter(publisher=='DC') %>%  select(hair,sex) %>% na.omit() %>% group_by(hair) %>% filter(sex==typeDC[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_dc=round(100*number/countT,1)) %>% select(hair,percentage_dc,number))
}
names(listDC)<-typeDC

for(i in 1:length(typeMarvel)){
    listMarvel[[i]]<-as.data.frame(df %>% filter(publisher=='Marvel') %>%  select(hair,sex) %>% na.omit() %>% group_by(hair) %>% filter(sex==typeMarvel[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_marvel=round(100*number/countT,1)) %>% select(hair,percentage_marvel,number))
}
names(listMarvel)<-typeMarvel

tt<-c()
vv<-c()
for(j in 1:4){
  tt[j]<-as.data.frame(listMarvel[[j]] %>% select(hair,percentage_marvel) %>% top_n(n=1))$hair[1]
  vv[j]<-as.data.frame(listDC[[j]] %>% select(hair,percentage_dc) %>% top_n(n=1))$hair[1]
}

summaryMarvel[1,]<-tt
summaryDC[1,]<-vv
#listDC
#listMarvel
Male characters
[Show Code]
merged<-merge(listDC[[1]],listMarvel[[1]],by='hair',all=TRUE)
min<-rep(0,length(merged$hair))
max<-rep(50,length(merged$hair))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$hair
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC male characters','Marvel male characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)

Agender characters
[Show Code]
min<-rep(0,length(listMarvel[[4]]$percentage_marvel))
max<-rep(100,length(listMarvel[[4]]$percentage_marvel))
test<-rbind(max,min,listMarvel[[4]]$percentage_marvel)
colnames(test)<-listMarvel[[4]]$hair
row.names(test)<-c('max','min','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('Marvel Agender characters'), bty = "n", pch=16 ,col=c('steelblue') , text.col = "black", cex=.8, pt.cex=1)

Genderless characters
[Show Code]
min<-rep(0,length(listDC[[3]]$percentage_dc))
max<-rep(50,length(listDC[[3]]$percentage_dc))
test<-rbind(max,min,listDC[[3]]$percentage_dc)
colnames(test)<-listDC[[3]]$hair
row.names(test)<-c('max','min','percentage_dc')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('DC Genderless characters'), bty = "n", pch=16 ,col=c('olivedrab2') , text.col = "black", cex=.8, pt.cex=1)

Female characters
[Show Code]
merged<-merge(listDC[[2]],listMarvel[[2]],by='hair',all=TRUE)
min<-rep(0,length(merged$hair))
max<-rep(50,length(merged$hair))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$hair
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC Female characters','Marvel Female characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)

vs. Eye

[Show Code]
#empty list ot hold data
listMarvel<-list()
listDC<-list()

for(i in 1:length(typeDC)){
    listDC[[i]]<-as.data.frame(df %>% filter(publisher=='DC') %>%  select(eye,sex) %>% na.omit() %>% group_by(eye) %>% filter(sex==typeDC[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_dc=round(100*number/countT,1)) %>% select(eye,percentage_dc,number))
}
names(listDC)<-typeDC

for(i in 1:length(typeMarvel)){
    listMarvel[[i]]<-as.data.frame(df %>% filter(publisher=='Marvel') %>%  select(eye,sex) %>% na.omit() %>% group_by(eye) %>% filter(sex==typeMarvel[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_marvel=round(100*number/countT,1)) %>% select(eye,percentage_marvel,number))
}
names(listMarvel)<-typeMarvel

tt<-c()
vv<-c()
for(j in 1:4){
  tt[j]<-as.data.frame(listMarvel[[j]] %>% select(eye,percentage_marvel) %>% top_n(n=1))$eye[1]
  vv[j]<-as.data.frame(listDC[[j]] %>% select(eye,percentage_dc) %>% top_n(n=1))$eye[1]
}

summaryMarvel[2,]<-tt
summaryDC[2,]<-vv
#listDC
#listMarvel
Male characters
[Show Code]
merged<-merge(listDC[[1]],listMarvel[[1]],by='eye',all=TRUE)
min<-rep(0,length(merged$eye))
max<-rep(50,length(merged$eye))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$eye
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC male characters','Marvel male characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)

Agender characters
[Show Code]
min<-rep(0,length(listMarvel[[4]]$percentage_marvel))
max<-rep(50,length(listMarvel[[4]]$percentage_marvel))
test<-rbind(max,min,listMarvel[[4]]$percentage_marvel)
colnames(test)<-listMarvel[[4]]$eye
row.names(test)<-c('max','min','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('Marvel Agender characters'), bty = "n", pch=16 ,col=c('steelblue') , text.col = "black", cex=.8, pt.cex=1)

Genderless characters
[Show Code]
min<-rep(0,length(listDC[[3]]$percentage_dc))
max<-rep(50,length(listDC[[3]]$percentage_dc))
test<-rbind(max,min,listDC[[3]]$percentage_dc)
colnames(test)<-listDC[[3]]$eye
row.names(test)<-c('max','min','percentage_dc')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('DC Genderless characters'), bty = "n", pch=16 ,col=c('olivedrab2') , text.col = "black", cex=.8, pt.cex=1)

Female characters
[Show Code]
merged<-merge(listDC[[2]],listMarvel[[2]],by='eye',all=TRUE)
min<-rep(0,length(merged$eye))
max<-rep(50,length(merged$eye))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$eye
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC Female characters','Marvel Female characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)

vs. Align

[Show Code]
#empty list ot hold data
listMarvel<-list()
listDC<-list()

for(i in 1:length(typeDC)){
    listDC[[i]]<-as.data.frame(df %>% filter(publisher=='DC') %>%  select(align,sex) %>% na.omit() %>% group_by(align) %>% filter(sex==typeDC[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_dc=round(100*number/countT,1)) %>% select(align,percentage_dc,number))
}
names(listDC)<-typeDC

for(i in 1:length(typeMarvel)){
    listMarvel[[i]]<-as.data.frame(df %>% filter(publisher=='Marvel') %>%  select(align,sex) %>% na.omit() %>% group_by(align) %>% filter(sex==typeMarvel[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_marvel=round(100*number/countT,1)) %>% select(align,percentage_marvel,number))
}
names(listMarvel)<-typeMarvel

tt<-c()
vv<-c()

for(j in 1:4){
  tt[j]<-as.data.frame(listMarvel[[j]] %>% select(align,percentage_marvel) %>% top_n(n=1))$align[1]
  vv[j]<-as.data.frame(listDC[[j]] %>% select(align,percentage_dc) %>% top_n(n=1))$align[1]
}

summaryMarvel[3,]<-tt
summaryDC[3,]<-vv
#listDC
#listMarvel
Male characters
[Show Code]
merged<-merge(listDC[[1]],listMarvel[[1]],by='align',all=TRUE)
min<-rep(0,length(merged$align))
max<-rep(75,length(merged$align))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$align
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC male characters','Marvel male characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)

Agender characters
[Show Code]
min<-rep(0,length(listMarvel[[4]]$percentage_marvel))
max<-rep(100,length(listMarvel[[4]]$percentage_marvel))
test<-rbind(max,min,listMarvel[[4]]$percentage_marvel)
colnames(test)<-listMarvel[[4]]$align
row.names(test)<-c('max','min','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('Marvel Agender characters'), bty = "n", pch=16 ,col=c('steelblue') , text.col = "black", cex=.8, pt.cex=1)

Genderless characters
[Show Code]
min<-rep(0,length(listDC[[3]]$percentage_dc))
max<-rep(100,length(listDC[[3]]$percentage_dc))
test<-rbind(max,min,listDC[[3]]$percentage_dc)
colnames(test)<-listDC[[3]]$align
row.names(test)<-c('max','min','percentage_dc')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('DC Genderless characters'), bty = "n", pch=16 ,col=c('olivedrab2') , text.col = "black", cex=.8, pt.cex=1)

Female characters
[Show Code]
merged<-merge(listDC[[2]],listMarvel[[2]],by='align',all=TRUE)
min<-rep(0,length(merged$align))
max<-rep(75,length(merged$align))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$align
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC Female characters','Marvel Female characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)

vs. Id

[Show Code]
#empty list ot hold data
listMarvel<-list()
listDC<-list()

for(i in 1:length(typeDC)){
    listDC[[i]]<-as.data.frame(df %>% filter(publisher=='DC') %>%  select(id,sex) %>% na.omit() %>% group_by(id) %>% filter(sex==typeDC[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_dc=round(100*number/countT,1)) %>% select(id,percentage_dc,number))
}
names(listDC)<-typeDC

for(i in 1:length(typeMarvel)){
    listMarvel[[i]]<-as.data.frame(df %>% filter(publisher=='Marvel') %>%  select(id,sex) %>% na.omit() %>% group_by(id) %>% filter(sex==typeMarvel[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_marvel=round(100*number/countT,1)) %>% select(id,percentage_marvel,number))
}
names(listMarvel)<-typeMarvel

tt<-c()
vv<-c()

for(j in 1:4){
  tt[j]<-as.data.frame(listMarvel[[j]] %>% select(id,percentage_marvel) %>% top_n(n=1))$id[1]
  vv[j]<-as.data.frame(listDC[[j]] %>% select(id,percentage_dc) %>% top_n(n=1))$id[1]
}


summaryMarvel[4,]<-tt
summaryDC[4,]<-vv

#set finale features
row.names(summaryDC)<-c('Hair','Eye','Align','Id')
row.names(summaryMarvel)<-c('Hair','Eye','Align','Id')
#listDC
#listMarvel
Male characters
[Show Code]
merged<-merge(listDC[[1]],listMarvel[[1]],by='id',all=TRUE)
min<-rep(0,length(merged$id))
max<-rep(75,length(merged$id))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$id
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC male characters','Marvel male characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)

Agender characters
[Show Code]
min<-rep(0,length(listMarvel[[4]]$percentage_marvel))
max<-rep(100,length(listMarvel[[4]]$percentage_marvel))
test<-rbind(max,min,listMarvel[[4]]$percentage_marvel)
colnames(test)<-listMarvel[[4]]$id
row.names(test)<-c('max','min','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('Marvel Agender characters'), bty = "n", pch=16 ,col=c('steelblue') , text.col = "black", cex=.8, pt.cex=1)

Genderless characters
[Show Code]
ggplot(data=as.data.frame(listDC[[3]]),aes(x=id,percentage_dc)) + geom_bar(stat='identity',fill='olivedrab2') + ylab('Percentage')

Female characters
[Show Code]
merged<-merge(listDC[[2]],listMarvel[[2]],by='id',all=TRUE)
min<-rep(0,length(merged$id))
max<-rep(75,length(merged$id))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$id
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC Female characters','Marvel Female characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)

Summary

By taking the highest values for each (feature,sex), we can have a profile of each character.

MARVEL
kable(summaryMarvel, format = "markdown")
Male Characters Female Characters Genderfluid Characters Agender Characters
Hair Black Hair Black Hair Black Hair No Hair
Eye Brown Eyes Blue Eyes Brown Eyes White Eyes
Align Bad Characters Good Characters Good Characters Bad Characters
Id Secret Identity Secret Identity No Dual Identity Secret Identity
DC
kable(summaryDC, format = "markdown")
Male Characters Female Characters Genderless Characters Transgender Characters
Hair Black Hair Black Hair Black Hair NA
Eye Blue Eyes Blue Eyes Photocellular Eyes NA
Align Bad Characters Good Characters Bad Characters Bad Characters
Id Secret Identity Public Identity Public Identity NA

Popularity

By looking at the number of appearances, we can have an idea of the popularity of each super-heroes.
[Show Code]
marvelHeroes<- df %>% filter(publisher=='Marvel') %>% select(name, appearances)
marvelHeroes<-na.omit(marvelHeroes)
#the following regexp remove everything after the start of a parenthesis 
marvelHeroes$name<-gsub( " *\\(.*?\\) *", "",marvelHeroes$name)

DCHeroes<- df %>% filter(publisher=='DC') %>% select(name, appearances)
DCHeroes<-na.omit(DCHeroes)
DCHeroes$name<-gsub( " *\\(.*?\\) *", "",DCHeroes$name)
[Show Code]
colfuncBlue <- colorRampPalette(c("steelblue", "white"))
colfuncGreen <- colorRampPalette(c("olivedrab", "white"))
op <- par(mar=c(1, 2, 2, 1),mfrow=c(1, 2),bg="black")
wordcloud(DCHeroes$name,DCHeroes$appearances,min.freq =200,colors=colfuncGreen(10),scale = c(1.0, 0.5))
wordcloud(marvelHeroes$name,marvelHeroes$appearances,min.freq =400,colors=colfuncBlue(10),scale = c(1., 0.5))

Number of appearances vs. Time

[Show Code]
dd<-as.data.frame(df %>% select(name,sex,date,appearances,publisher,alive) %>% na.omit())
dd$name<-gsub( " *\\(.*?\\) *", "",dd$name)
ggplot(data=dd,aes(x=date,y=appearances)) + 
  geom_point(aes(color=factor(sex)),alpha=.25,size=3) + 
  theme(legend.position='top',legend.title=element_blank(),legend.key.size = unit(.2, "cm")) + 
  geom_text(aes(label=ifelse(appearances >2e3,as.character(name),''),hjust=-.15, vjust=0)) + 
  facet_wrap(~publisher)

There is an issue with the dataset date apparently since the 1st appearance of Superman is reported in 1986, which refers to the reboot of the franchise by DC.

[Show Entry]
##       name             sex       date appearances publisher             alive
## 1 Superman Male Characters 1986-10-01        2496        DC Living Characters

The 10 first top appearances (DC and Marvel together) are trusted by Male Characters. Also this plot is a bit biased because an older character wil have more number of appearances so a better variable to look at is the normalized appearance (# of appearance / number of month since its creation to now)

Number of appearances vs. Time (normalized)

The normalization is (# appearances / # months of publications). There is one issue to take care is that when the character is dead : in that case the normalization is not (today - 1st appearance) but (Death date - 1st appearance). However we do not have this information

#define final date (creation of dataset)
finalDate<-as.POSIXct("2014-08-24", format="%Y-%M-%d")
diffWeek<-function(df){
    myvec<-c()
    for(i in 1:length(df$date)){
      if(df$alive[i] == "Living Characters"){
        #calculate difference in days, then divided by 30
        myvec[i]<-as.numeric(difftime(finalDate,df$date[i],units="weeks"))
      } else if (df$alive[i] == "Deceased Characters"){
        myvec[i]<-as.numeric(difftime(finalDate,df$date[i],units="weeks"))
      }
    }
    return(myvec)
}
dd$weekNumber<-diffWeek(dd)
dd <- dd %>% mutate(weekAppearance = appearances / weekNumber)
#select only the top entries for visualization
ggplot(data=filter(dd,weekAppearance>.3), aes(x=reorder(name,-weekAppearance), y= weekAppearance,fill=factor(publisher))) + geom_bar(width = 0.9, stat="identity") + xlab('') + ylab('# of appearances / week') + theme(axis.text.x = element_text(angle=90, hjust=1),text = element_text(size=6),legend.position='top')

Issue with dead/revived characters
ggplot(data=filter(dd,weekAppearance>.26),aes(x=reorder(name,-weekAppearance),y=weekAppearance,fill=factor(publisher))) + geom_bar(width = 0.9, stat="identity") + theme(axis.text.x = element_text(angle=90, hjust=1),text = element_text(size=6))

dd %>% filter(name=='Vision' | name=="Charles Xavier")
##              name                sex       date appearances publisher               alive
## 1  Charles Xavier    Male Characters 1963-09-01        1233    Marvel Deceased Characters
## 2          Vision    Male Characters 1968-10-01        1007    Marvel   Living Characters
## 3          Vision    Male Characters 2005-08-01         119    Marvel Deceased Characters
## 4  Charles Xavier    Male Characters 2013-01-01          17    Marvel   Living Characters
## 5          Vision    Male Characters 2008-06-01           6    Marvel Deceased Characters
## 6  Charles Xavier    Male Characters 1977-08-01           5    Marvel   Living Characters
## 7  Charles Xavier    Male Characters 1990-10-01           5    Marvel   Living Characters
## 8          Vision    Male Characters 1992-06-01           3    Marvel   Living Characters
## 9          Vision Agender Characters 1998-10-01           1    Marvel   Living Characters
## 10         Vision    Male Characters 2007-09-01           1    Marvel   Living Characters
##    weekNumber weekAppearance
## 1   2634.1734    0.468078523
## 2   2368.8877    0.425094022
## 3    447.0306    0.266201043
## 4     59.8877    0.283864641
## 5    299.1734    0.020055258
## 6   1908.0306    0.002620503
## 7   1221.0306    0.004094902
## 8   1134.0306    0.002645431
## 9    803.6020    0.001244397
## 10   338.3163    0.002955814

Apparently it creates several entries in the dataset which makes the distribution not ordered.

History :

  • version 1 : initial commit