This dataset contains information about Marvel and DC characters since ~1939 until 2014 (August 24th). It was used for the 538 study : Comic Books Are Still Made By Men, For Men And About Men.
In this kernel, I will look at the Gender
distribution per publisher, its change vs. Time (idea is that we may see more LGBT characters in recent years), at the most populat characters and try some modelling/prediction with these data (i.e can we predict to which publisher belongs a character given its gender and other characteristics)
options(width=100)
knitr::opts_chunk$set(out.width='1000px',dpi=200,message=FALSE,warning=FALSE)
#load packages and csv file
library(ggplot2)
library(dplyr)
library(gridExtra)
library(RColorBrewer)
library(grid)
library(wordcloud)
library(plotrix)
library(fmsb)
library(fivethirtyeight)
library(knitr)
#data(package = "fivethirtyeight")
df<-comic_characters
sex
and publisher
, calculate raw yields and percentages. I haven’t removed at this step the NA’s since I wanted to get all the characters and only the count matter for these plots.
#calculate percentage of each gender for each publisher
pop2<-df %>% group_by(sex,publisher) %>% summarise(number = n()) %>% arrange(-number)
res<-pop2 %>% group_by(publisher) %>% mutate(countT= sum(number)) %>% group_by(sex) %>% mutate(percentage=100*number/countT)
res$LABEL <-paste0(round(res$percentage,2))
#make dataframe for the pie chart
pieC<-as.data.frame(pop2 %>% group_by(publisher) %>% select(number) %>% summarise(sum=sum(number)))
#define color palette
#colourCount = length(unique(res$sex))
#getPalette = colorRampPalette(wes_palette("FantasticFox"))
g1<-ggplot(data=res,aes(x=sex,y=percentage,fill=publisher)) + geom_bar(width = 0.9, stat="identity",position='dodge') +
theme(axis.text.x = element_text(angle=90, hjust=1),legend.position='none') +
geom_text(aes(label=LABEL), position=position_dodge(width=0.9), vjust=-0.25,size=2.5) +
scale_fill_manual(values = c("olivedrab","steelblue")) +
xlab('') + ylab('Percentage')
g2<-ggplot(pieC,aes(x="",y=sum,fill=publisher)) + geom_bar(stat='identity',width = 1) +
coord_polar(theta="y") + theme_void() + theme(axis.text.x=element_blank(),legend.position='bottom') +
scale_fill_manual(values = c("olivedrab","steelblue")) +
geom_text(aes(y =c(20000,8000), label = paste(pieC$publisher,": ",pieC$sum)))
grid.arrange(g1,g2,ncol=2,widths=2:1)
Comments :
Male
are more present in both publishers compared to Female
on a (2.5-3):1 ratio, while LGBT characters represents less than 1 percentFor the following plots, I groupby the rows per (year
,month
and sex
) for both publishers (Marvel and DC) and count the number of entries. Therefore each bin (color code) is the number of characters created per month.
#select data with no na's for sex and date
marvel<-as.data.frame(df %>% filter(publisher=='Marvel'))
marvel<-marvel[!is.na(marvel$month) & !is.na(marvel$sex),]
#groupby
res<-marvel %>% group_by(year,month,sex) %>% summarise(number= n())
#re-order month
res$ordered_month <- factor(res$month, levels = month.name)
res$sex <- factor(res$sex, levels = c("Female Characters", "Male Characters", "Genderfluid Characters","Agender Characters"))
g1<-ggplot(data=res, aes(year,ordered_month)) + geom_tile(aes(fill = number),colour = "white") +
scale_fill_gradient(low="steelblue", high="black") + facet_wrap(~sex,ncol=4) + theme(axis.title.x=element_blank(),axis.ticks.x=element_blank(),axis.title.y=element_blank(),axis.ticks.y=element_blank(),legend.position='right',legend.title=element_blank(),legend.key.size = unit(.2, "cm"))+
xlim(1935,2015)
dc<-as.data.frame(df %>% filter(publisher=='DC'))
dc<-dc[!is.na(dc$month) & !is.na(dc$sex),]
dc$month<-ifelse(dc$month=="01","January",dc$month)
dc<-dc %>% filter(month!="Holiday")
tes<-dc%>% group_by(year,month,sex) %>% summarise(number= n())
tes$month<-as.factor(tes$month)
tes$ordered_month <- factor(tes$month, levels = month.name)
tes$sex <- factor(tes$sex, levels = c("Female Characters", "Male Characters", "Genderless Characters","Transgender Characters"))
g2<-ggplot(data=tes, aes(year,ordered_month)) + geom_tile(aes(fill = number),colour = "white") +
scale_fill_gradient(low="olivedrab", high="black") + facet_wrap(~sex,ncol=4) +
theme(axis.title.x=element_blank(),axis.ticks.x=element_blank(),axis.title.y=element_blank(),axis.ticks.y=element_blank(),legend.position='right',legend.title=element_blank(),legend.key.size = unit(.2, "cm")) +
xlim(1935,2015)
#grid.arrange(g1,g2,ncol=1)
grid.arrange(rbind(ggplotGrob(g1), ggplotGrob(g2), size = "last"))
Comments :
female
or male
female
characters appear early for Marvel (~1970) while it was in the late 70’s for DC comicsin progress, only vs Align
for now, code needs optimization
We can look at the characteristics of characters for both publishers. Here I groupby the data for a given feature (among id
,align
,eye
,hair
and alive
), and report the percentage for each gender in a radarchart.
Hair
#empty list to hold data
listMarvel<-list()
listDC<-list()
#empty dataframe for summary
summaryMarvel<-data.frame(matrix(vector(),ncol=4))
summaryDC<-data.frame(matrix(vector(),ncol=4))
#create vector of by_sex for each publisher
typeMarvel<-as.data.frame(unique(df %>% filter(publisher=='Marvel') %>% select(sex) %>% na.omit()))
typeDC<-as.data.frame(unique(df %>% filter(publisher=='DC') %>% select(sex) %>% na.omit()))
typeMarvel<-typeMarvel$sex
typeDC<-typeDC$sex
colnames(summaryMarvel)<-typeMarvel
colnames(summaryDC)<-typeDC
for(i in 1:length(typeDC)){
listDC[[i]]<-as.data.frame(df %>% filter(publisher=='DC') %>% select(hair,sex) %>% na.omit() %>% group_by(hair) %>% filter(sex==typeDC[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_dc=round(100*number/countT,1)) %>% select(hair,percentage_dc,number))
}
names(listDC)<-typeDC
for(i in 1:length(typeMarvel)){
listMarvel[[i]]<-as.data.frame(df %>% filter(publisher=='Marvel') %>% select(hair,sex) %>% na.omit() %>% group_by(hair) %>% filter(sex==typeMarvel[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_marvel=round(100*number/countT,1)) %>% select(hair,percentage_marvel,number))
}
names(listMarvel)<-typeMarvel
tt<-c()
vv<-c()
for(j in 1:4){
tt[j]<-as.data.frame(listMarvel[[j]] %>% select(hair,percentage_marvel) %>% top_n(n=1))$hair[1]
vv[j]<-as.data.frame(listDC[[j]] %>% select(hair,percentage_dc) %>% top_n(n=1))$hair[1]
}
summaryMarvel[1,]<-tt
summaryDC[1,]<-vv
#listDC
#listMarvel
merged<-merge(listDC[[1]],listMarvel[[1]],by='hair',all=TRUE)
min<-rep(0,length(merged$hair))
max<-rep(50,length(merged$hair))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$hair
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC male characters','Marvel male characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)
min<-rep(0,length(listMarvel[[4]]$percentage_marvel))
max<-rep(100,length(listMarvel[[4]]$percentage_marvel))
test<-rbind(max,min,listMarvel[[4]]$percentage_marvel)
colnames(test)<-listMarvel[[4]]$hair
row.names(test)<-c('max','min','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('Marvel Agender characters'), bty = "n", pch=16 ,col=c('steelblue') , text.col = "black", cex=.8, pt.cex=1)
min<-rep(0,length(listDC[[3]]$percentage_dc))
max<-rep(50,length(listDC[[3]]$percentage_dc))
test<-rbind(max,min,listDC[[3]]$percentage_dc)
colnames(test)<-listDC[[3]]$hair
row.names(test)<-c('max','min','percentage_dc')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('DC Genderless characters'), bty = "n", pch=16 ,col=c('olivedrab2') , text.col = "black", cex=.8, pt.cex=1)
merged<-merge(listDC[[2]],listMarvel[[2]],by='hair',all=TRUE)
min<-rep(0,length(merged$hair))
max<-rep(50,length(merged$hair))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$hair
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC Female characters','Marvel Female characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)
Eye
#empty list ot hold data
listMarvel<-list()
listDC<-list()
for(i in 1:length(typeDC)){
listDC[[i]]<-as.data.frame(df %>% filter(publisher=='DC') %>% select(eye,sex) %>% na.omit() %>% group_by(eye) %>% filter(sex==typeDC[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_dc=round(100*number/countT,1)) %>% select(eye,percentage_dc,number))
}
names(listDC)<-typeDC
for(i in 1:length(typeMarvel)){
listMarvel[[i]]<-as.data.frame(df %>% filter(publisher=='Marvel') %>% select(eye,sex) %>% na.omit() %>% group_by(eye) %>% filter(sex==typeMarvel[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_marvel=round(100*number/countT,1)) %>% select(eye,percentage_marvel,number))
}
names(listMarvel)<-typeMarvel
tt<-c()
vv<-c()
for(j in 1:4){
tt[j]<-as.data.frame(listMarvel[[j]] %>% select(eye,percentage_marvel) %>% top_n(n=1))$eye[1]
vv[j]<-as.data.frame(listDC[[j]] %>% select(eye,percentage_dc) %>% top_n(n=1))$eye[1]
}
summaryMarvel[2,]<-tt
summaryDC[2,]<-vv
#listDC
#listMarvel
merged<-merge(listDC[[1]],listMarvel[[1]],by='eye',all=TRUE)
min<-rep(0,length(merged$eye))
max<-rep(50,length(merged$eye))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$eye
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC male characters','Marvel male characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)
min<-rep(0,length(listMarvel[[4]]$percentage_marvel))
max<-rep(50,length(listMarvel[[4]]$percentage_marvel))
test<-rbind(max,min,listMarvel[[4]]$percentage_marvel)
colnames(test)<-listMarvel[[4]]$eye
row.names(test)<-c('max','min','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('Marvel Agender characters'), bty = "n", pch=16 ,col=c('steelblue') , text.col = "black", cex=.8, pt.cex=1)
min<-rep(0,length(listDC[[3]]$percentage_dc))
max<-rep(50,length(listDC[[3]]$percentage_dc))
test<-rbind(max,min,listDC[[3]]$percentage_dc)
colnames(test)<-listDC[[3]]$eye
row.names(test)<-c('max','min','percentage_dc')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('DC Genderless characters'), bty = "n", pch=16 ,col=c('olivedrab2') , text.col = "black", cex=.8, pt.cex=1)
merged<-merge(listDC[[2]],listMarvel[[2]],by='eye',all=TRUE)
min<-rep(0,length(merged$eye))
max<-rep(50,length(merged$eye))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$eye
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC Female characters','Marvel Female characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)
Align
#empty list ot hold data
listMarvel<-list()
listDC<-list()
for(i in 1:length(typeDC)){
listDC[[i]]<-as.data.frame(df %>% filter(publisher=='DC') %>% select(align,sex) %>% na.omit() %>% group_by(align) %>% filter(sex==typeDC[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_dc=round(100*number/countT,1)) %>% select(align,percentage_dc,number))
}
names(listDC)<-typeDC
for(i in 1:length(typeMarvel)){
listMarvel[[i]]<-as.data.frame(df %>% filter(publisher=='Marvel') %>% select(align,sex) %>% na.omit() %>% group_by(align) %>% filter(sex==typeMarvel[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_marvel=round(100*number/countT,1)) %>% select(align,percentage_marvel,number))
}
names(listMarvel)<-typeMarvel
tt<-c()
vv<-c()
for(j in 1:4){
tt[j]<-as.data.frame(listMarvel[[j]] %>% select(align,percentage_marvel) %>% top_n(n=1))$align[1]
vv[j]<-as.data.frame(listDC[[j]] %>% select(align,percentage_dc) %>% top_n(n=1))$align[1]
}
summaryMarvel[3,]<-tt
summaryDC[3,]<-vv
#listDC
#listMarvel
merged<-merge(listDC[[1]],listMarvel[[1]],by='align',all=TRUE)
min<-rep(0,length(merged$align))
max<-rep(75,length(merged$align))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$align
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC male characters','Marvel male characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)
min<-rep(0,length(listMarvel[[4]]$percentage_marvel))
max<-rep(100,length(listMarvel[[4]]$percentage_marvel))
test<-rbind(max,min,listMarvel[[4]]$percentage_marvel)
colnames(test)<-listMarvel[[4]]$align
row.names(test)<-c('max','min','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('Marvel Agender characters'), bty = "n", pch=16 ,col=c('steelblue') , text.col = "black", cex=.8, pt.cex=1)
min<-rep(0,length(listDC[[3]]$percentage_dc))
max<-rep(100,length(listDC[[3]]$percentage_dc))
test<-rbind(max,min,listDC[[3]]$percentage_dc)
colnames(test)<-listDC[[3]]$align
row.names(test)<-c('max','min','percentage_dc')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('DC Genderless characters'), bty = "n", pch=16 ,col=c('olivedrab2') , text.col = "black", cex=.8, pt.cex=1)
merged<-merge(listDC[[2]],listMarvel[[2]],by='align',all=TRUE)
min<-rep(0,length(merged$align))
max<-rep(75,length(merged$align))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$align
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC Female characters','Marvel Female characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)
Id
#empty list ot hold data
listMarvel<-list()
listDC<-list()
for(i in 1:length(typeDC)){
listDC[[i]]<-as.data.frame(df %>% filter(publisher=='DC') %>% select(id,sex) %>% na.omit() %>% group_by(id) %>% filter(sex==typeDC[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_dc=round(100*number/countT,1)) %>% select(id,percentage_dc,number))
}
names(listDC)<-typeDC
for(i in 1:length(typeMarvel)){
listMarvel[[i]]<-as.data.frame(df %>% filter(publisher=='Marvel') %>% select(id,sex) %>% na.omit() %>% group_by(id) %>% filter(sex==typeMarvel[i]) %>% summarise(number= n()) %>% arrange(-number) %>% mutate(countT= sum(number)) %>% mutate(percentage_marvel=round(100*number/countT,1)) %>% select(id,percentage_marvel,number))
}
names(listMarvel)<-typeMarvel
tt<-c()
vv<-c()
for(j in 1:4){
tt[j]<-as.data.frame(listMarvel[[j]] %>% select(id,percentage_marvel) %>% top_n(n=1))$id[1]
vv[j]<-as.data.frame(listDC[[j]] %>% select(id,percentage_dc) %>% top_n(n=1))$id[1]
}
summaryMarvel[4,]<-tt
summaryDC[4,]<-vv
#set finale features
row.names(summaryDC)<-c('Hair','Eye','Align','Id')
row.names(summaryMarvel)<-c('Hair','Eye','Align','Id')
#listDC
#listMarvel
merged<-merge(listDC[[1]],listMarvel[[1]],by='id',all=TRUE)
min<-rep(0,length(merged$id))
max<-rep(75,length(merged$id))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$id
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC male characters','Marvel male characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)
min<-rep(0,length(listMarvel[[4]]$percentage_marvel))
max<-rep(100,length(listMarvel[[4]]$percentage_marvel))
test<-rbind(max,min,listMarvel[[4]]$percentage_marvel)
colnames(test)<-listMarvel[[4]]$id
row.names(test)<-c('max','min','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=.8, y=1., legend = c('Marvel Agender characters'), bty = "n", pch=16 ,col=c('steelblue') , text.col = "black", cex=.8, pt.cex=1)
ggplot(data=as.data.frame(listDC[[3]]),aes(x=id,percentage_dc)) + geom_bar(stat='identity',fill='olivedrab2') + ylab('Percentage')
merged<-merge(listDC[[2]],listMarvel[[2]],by='id',all=TRUE)
min<-rep(0,length(merged$id))
max<-rep(75,length(merged$id))
test<-rbind(max,min,merged$percentage_dc,merged$percentage_marvel)
colnames(test)<-merged$id
row.names(test)<-c('max','min','percentage_dc','percentage_marvel')
test<-as.data.frame(test)
test[is.na(test)]<-0
radarchart( test , axistype=2 , pcol=c('olivedrab2','steelblue') , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=0.6,vlcex=.6,palcex=1.)
legend(x=1, y=1.3, legend = c('DC Female characters','Marvel Female characters'), bty = "n", pch=16 ,col=c('olivedrab2','steelblue') , text.col = "black", cex=.8, pt.cex=1)
By taking the highest values for each (feature
,sex
), we can have a profile of each character.
kable(summaryMarvel, format = "markdown")
Male Characters | Female Characters | Genderfluid Characters | Agender Characters | |
---|---|---|---|---|
Hair | Black Hair | Black Hair | Black Hair | No Hair |
Eye | Brown Eyes | Blue Eyes | Brown Eyes | White Eyes |
Align | Bad Characters | Good Characters | Good Characters | Bad Characters |
Id | Secret Identity | Secret Identity | No Dual Identity | Secret Identity |
kable(summaryDC, format = "markdown")
Male Characters | Female Characters | Genderless Characters | Transgender Characters | |
---|---|---|---|---|
Hair | Black Hair | Black Hair | Black Hair | NA |
Eye | Blue Eyes | Blue Eyes | Photocellular Eyes | NA |
Align | Bad Characters | Good Characters | Bad Characters | Bad Characters |
Id | Secret Identity | Public Identity | Public Identity | NA |
marvelHeroes<- df %>% filter(publisher=='Marvel') %>% select(name, appearances)
marvelHeroes<-na.omit(marvelHeroes)
#the following regexp remove everything after the start of a parenthesis
marvelHeroes$name<-gsub( " *\\(.*?\\) *", "",marvelHeroes$name)
DCHeroes<- df %>% filter(publisher=='DC') %>% select(name, appearances)
DCHeroes<-na.omit(DCHeroes)
DCHeroes$name<-gsub( " *\\(.*?\\) *", "",DCHeroes$name)
colfuncBlue <- colorRampPalette(c("steelblue", "white"))
colfuncGreen <- colorRampPalette(c("olivedrab", "white"))
op <- par(mar=c(1, 2, 2, 1),mfrow=c(1, 2),bg="black")
wordcloud(DCHeroes$name,DCHeroes$appearances,min.freq =200,colors=colfuncGreen(10),scale = c(1.0, 0.5))
wordcloud(marvelHeroes$name,marvelHeroes$appearances,min.freq =400,colors=colfuncBlue(10),scale = c(1., 0.5))
dd<-as.data.frame(df %>% select(name,sex,date,appearances,publisher,alive) %>% na.omit())
dd$name<-gsub( " *\\(.*?\\) *", "",dd$name)
ggplot(data=dd,aes(x=date,y=appearances)) +
geom_point(aes(color=factor(sex)),alpha=.25,size=3) +
theme(legend.position='top',legend.title=element_blank(),legend.key.size = unit(.2, "cm")) +
geom_text(aes(label=ifelse(appearances >2e3,as.character(name),''),hjust=-.15, vjust=0)) +
facet_wrap(~publisher)
There is an issue with the dataset date
apparently since the 1st appearance of Superman is reported in 1986, which refers to the reboot of the franchise by DC.
## name sex date appearances publisher alive
## 1 Superman Male Characters 1986-10-01 2496 DC Living Characters
The 10 first top appearances (DC and Marvel together) are trusted by Male Characters. Also this plot is a bit biased because an older character wil have more number of appearances so a better variable to look at is the normalized appearance (# of appearance / number of month since its creation to now)
The normalization is (# appearances / # months of publications). There is one issue to take care is that when the character is dead : in that case the normalization is not (today - 1st appearance)
but (Death date - 1st appearance)
. However we do not have this information
#define final date (creation of dataset)
finalDate<-as.POSIXct("2014-08-24", format="%Y-%M-%d")
diffWeek<-function(df){
myvec<-c()
for(i in 1:length(df$date)){
if(df$alive[i] == "Living Characters"){
#calculate difference in days, then divided by 30
myvec[i]<-as.numeric(difftime(finalDate,df$date[i],units="weeks"))
} else if (df$alive[i] == "Deceased Characters"){
myvec[i]<-as.numeric(difftime(finalDate,df$date[i],units="weeks"))
}
}
return(myvec)
}
dd$weekNumber<-diffWeek(dd)
dd <- dd %>% mutate(weekAppearance = appearances / weekNumber)
#select only the top entries for visualization
ggplot(data=filter(dd,weekAppearance>.3), aes(x=reorder(name,-weekAppearance), y= weekAppearance,fill=factor(publisher))) + geom_bar(width = 0.9, stat="identity") + xlab('') + ylab('# of appearances / week') + theme(axis.text.x = element_text(angle=90, hjust=1),text = element_text(size=6),legend.position='top')
ggplot(data=filter(dd,weekAppearance>.26),aes(x=reorder(name,-weekAppearance),y=weekAppearance,fill=factor(publisher))) + geom_bar(width = 0.9, stat="identity") + theme(axis.text.x = element_text(angle=90, hjust=1),text = element_text(size=6))
dd %>% filter(name=='Vision' | name=="Charles Xavier")
## name sex date appearances publisher alive
## 1 Charles Xavier Male Characters 1963-09-01 1233 Marvel Deceased Characters
## 2 Vision Male Characters 1968-10-01 1007 Marvel Living Characters
## 3 Vision Male Characters 2005-08-01 119 Marvel Deceased Characters
## 4 Charles Xavier Male Characters 2013-01-01 17 Marvel Living Characters
## 5 Vision Male Characters 2008-06-01 6 Marvel Deceased Characters
## 6 Charles Xavier Male Characters 1977-08-01 5 Marvel Living Characters
## 7 Charles Xavier Male Characters 1990-10-01 5 Marvel Living Characters
## 8 Vision Male Characters 1992-06-01 3 Marvel Living Characters
## 9 Vision Agender Characters 1998-10-01 1 Marvel Living Characters
## 10 Vision Male Characters 2007-09-01 1 Marvel Living Characters
## weekNumber weekAppearance
## 1 2634.1734 0.468078523
## 2 2368.8877 0.425094022
## 3 447.0306 0.266201043
## 4 59.8877 0.283864641
## 5 299.1734 0.020055258
## 6 1908.0306 0.002620503
## 7 1221.0306 0.004094902
## 8 1134.0306 0.002645431
## 9 803.6020 0.001244397
## 10 338.3163 0.002955814
Apparently it creates several entries in the dataset which makes the distribution not ordered.
History :