## Report-wide chunk option: do not echo the R source in the knitted output.
knitr::opts_chunk$set(echo = FALSE)
library(tidyverse)
library(readxl)
library(ggplot2)
library(dplyr)
Text Source: 20 Works of Children's Literature from Project Gutenberg
What are the most frequent verbs used to describe ‘she’ / ‘he’?
## Chart 1: the 30 most gender-specific terms in the whole corpus, drawn as a
## horizontal lollipop chart.

## The relative ratio of the terms' likelihood of being paired with 'she' in
## the whole corpus, calculated by SUM(all_she) / SUM(all_totals). It anchors
## the reference rule and the start of every stem.
corpus_she_rate <- 0.430633

## Keep the rows whose rate_sequence is below 16 or above 199 (15 each), label
## every term with the pronoun it leans towards, and sort by she-rate.
most_she <- subset(all, rate_sequence < 16 | rate_sequence > 199)
most_she <- most_she %>%
  mutate(rate_she = ifelse(all_rate_she >= 0.5, "more she", "more he")) %>%
  arrange(all_rate_she)

## Freeze the sorted order as the factor levels so the axis follows the
## she-rate rather than the alphabet.
most_she$Term <- factor(most_she$Term, levels = most_she$Term)

p <- ggplot(
  most_she,
  aes(x = Term, y = all_rate_she, label = all_rate_she)
) +
  geom_point(aes(colour = rate_she), size = 3) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = 'Top 30 most gender-specific words',
    subtitle = "Collected from 20 children literature"
  ) +
  scale_color_manual(
    values = c("more she" = "#FC3C80", "more he" = "#40B0DF")
  ) +
  ylim(0, 1) +
  ## Stem from the corpus-wide rate to the term's own rate.
  geom_segment(
    aes(x = Term, xend = Term, y = corpus_she_rate, yend = all_rate_she)
  ) +
  geom_hline(yintercept = corpus_she_rate)

## Flip the axes so the terms are listed down the side of the chart.
p + coord_flip()

- Many more verbs are paired with ‘he’ than with ‘she’ in the corpus.
- Female characters are portrayed more through actions that express gentle emotions, feelings, desires and curiosity. The pattern for male characters is less obvious, though a few actions sound more violent and intense.
What are the most gender-specific words used by female authors?
## Chart 2: the 30 most gender-specific terms in the books by female authors,
## drawn as a horizontal lollipop chart.

## Position of the reference rule and the start of every stem.
## NOTE(review): presumably the overall she-rate of the female-author
## sub-corpus, analogous to the whole-corpus chart -- confirm.
female_corpus_she_rate <- 0.56427

## Null-or-zero values are converted to 0.9 for the sake of calculating the
## gender-specific rate. A ratio of exactly 0.5 is therefore only kept when at
## least one of the two female-author counts is not that placeholder, so the
## result is not contaminated; every ratio other than 0.5 is kept.
all3 <- filter(
  all,
  (female_ratio_she == 0.5 & female_she != 0.9) |
    (female_ratio_she == 0.5 & female_he != 0.9) |
    female_ratio_she != 0.5
)
all3 <- arrange(all3, female_ratio_she)

## With the ascending sort, the first 15 rows have the lowest she-ratio and
## the last 15 rows the highest.
female_she <- rbind(head(all3, 15), tail(all3, 15))
female_she$rate_she <- ifelse(
  female_she$female_ratio_she >= 0.5, "more she", "more he"
)

## Sort again and freeze that order as the factor levels of the axis.
most_she <- female_she %>%
  arrange(female_ratio_she) %>%
  mutate(Term = factor(Term, levels = Term))

p3 <- ggplot(
  most_she,
  aes(x = Term, y = female_ratio_she, label = female_ratio_she)
) +
  geom_point(aes(colour = rate_she), size = 3) +
  ylim(0, 1) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = 'Top 30 most gender-specific words by female authors',
    subtitle = "Collected from 6 children literature by female authors"
  ) +
  scale_color_manual(
    values = c("more she" = "#FC3C80", "more he" = "#40B0DF")
  ) +
  geom_segment(
    aes(
      x = Term, xend = Term,
      y = female_corpus_she_rate, yend = female_ratio_she
    )
  ) +
  geom_hline(yintercept = female_corpus_she_rate)

## There are more verbs used to describe female characters in the corpus of
## female writers.
p3 + coord_flip()

- More verbs are paired with ‘she’ than ‘he’ in the corpus of female writers.
- Female characters are portrayed more through actions that express sadness, sensitivity, hesitation, emotions, desires and curiosity. The actions of male characters are more dominant and intense.
What are the most gender-specific words used by male authors?
## Chart 3: the 30 most gender-specific terms in the books by male authors,
## drawn as a horizontal lollipop chart.

## Position of the reference rule and the start of every stem.
## NOTE(review): presumably the overall she-rate of the male-author
## sub-corpus, analogous to the whole-corpus chart -- confirm.
male_corpus_she_rate <- 0.322479

## Null-or-zero values are converted to 0.9 for the sake of calculating the
## gender-specific rate. A ratio of exactly 0.5 is therefore only kept when at
## least one of the two male-author counts is not that placeholder, so the
## result is not contaminated; every ratio other than 0.5 is kept.
all2 <- filter(
  all,
  (male_ratio_she == 0.5 & male_she != 0.9) |
    (male_ratio_she == 0.5 & male_he != 0.9) |
    male_ratio_she != 0.5
)
all2 <- arrange(all2, male_ratio_she)

## With the ascending sort, head() returns the 15 terms with the lowest
## she-ratio and tail() the 15 terms with the highest she-ratio.
male_she <- rbind(head(all2, 15), tail(all2, 15))
male_she$rate_she <- ifelse(
  male_she$male_ratio_she >= 0.5, "more she", "more he"
)

## Sort again and freeze that order as the factor levels of the axis.
most_she <- male_she %>%
  arrange(male_ratio_she) %>%
  mutate(Term = factor(Term, levels = Term))

p2 <- ggplot(
  most_she,
  aes(x = Term, y = male_ratio_she, label = male_ratio_she)
) +
  geom_point(aes(colour = rate_she), size = 3) +
  ylim(0, 1) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = 'Top 30 most gender-specific words by male authors',
    subtitle = "Collected from 14 children literature by male authors"
  ) +
  scale_color_manual(
    values = c("more she" = "#FC3C80", "more he" = "#40B0DF")
  ) +
  geom_segment(
    aes(
      x = Term, xend = Term,
      y = male_corpus_she_rate, yend = male_ratio_she
    )
  ) +
  geom_hline(yintercept = male_corpus_she_rate)

## There are more verbs used to describe male characters in the corpus of
## male writers.
p2 + coord_flip()

- Many more verbs are paired with ‘he’ than with ‘she’ in the corpus of male writers.
- Female characters are portrayed more through actions that express sadness, sensitivity, hesitation, emotions, desires and curiosity. The actions of male characters are more dominant and intense.
Does the author’s gender make any difference?
## Chart 4: scatter plot with one point per term, placing the she-ratio in
## books by female authors (x) against the she-ratio in books by male authors
## (y).

## Colour key: the pronoun the term leans towards in the whole corpus.
all$rate <- ifelse(all$all_rate_she >= 0.5, "more she", "more he")

## One point size per row, proportional to all_totals (scaled by 0.005).
## NOTE(review): all_totals is presumably the term's total number of
## occurrences -- confirm.
size <- all$all_totals * 0.005

p4 <- ggplot(all, aes(female_ratio_she, male_ratio_she)) +
  ## `size` is a per-row vector passed as a fixed parameter, so it produces no
  ## size legend.
  geom_point(aes(colour = rate), size = size) +
  xlim(0, 1) +
  ylim(0, 1) +
  labs(
    x = 'relative collocation ratio with "she" by female authors',
    y = 'relative collocation ratio by male authors',
    title = 'Comparison between authors of opposite genders ',
    subtitle = "How likely will the term appear with 'she'"
  ) +
  scale_color_manual(
    values = c("more she" = "#FC3C80", "more he" = "#40B0DF")
  ) +
  ## Diagonal y = x: on this line both author groups give the term the same
  ## she-ratio.
  geom_abline(slope = 1, intercept = 0)
p4

- The closer a point lies to the diagonal line, the more similarly the word is treated by male and female authors.
Is there any similarity?
## Chart 5: the 30 terms with the smallest abs_all_abs, with the male-author
## and female-author she-ratios of each term drawn as stems from 0.5.
## NOTE(review): abs_all_abs is presumably the absolute gap between the two
## author groups' ratios -- confirm.

## Only preserve terms that appear in both the female-author and the
## male-author lists. Null-or-zero values are converted to 0.9 for the sake of
## calculating the gender-specific rate, so any row still holding that
## placeholder in one of the four counts is dropped in case the result might
## be contaminated.
all6 <- filter(
  all,
  male_she != 0.9,
  male_he != 0.9,
  female_she != 0.9,
  female_he != 0.9
)
## Ascending order: smallest abs_all_abs first. The next chart reads all6 too.
all6 <- arrange(all6, abs_all_abs)

## First 30 rows, then reversed to descending order for the axis.
nodif <- head(all6, 30)
descending_rows <- order(nodif$abs_all_abs, decreasing = TRUE)
nodif <- nodif[descending_rows, ]
nodif$Term <- factor(nodif$Term, levels = nodif$Term)

## Male-author layer, with a reference rule at 0.5.
p6 <- ggplot(nodif) +
  geom_point(
    aes(x = Term, y = male_ratio_she, colour = "male author"),
    size = 3
  ) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = "Top 30 words where author's gender makes least difference"
  ) +
  ylim(0, 1) +
  geom_segment(aes(x = Term, xend = Term, y = 0.5, yend = male_ratio_she)) +
  geom_hline(yintercept = 0.5)

## Female-author layer on top, plus the colour key for both groups.
same <- p6 +
  geom_point(
    aes(x = Term, y = female_ratio_she, colour = "female author"),
    size = 3
  ) +
  geom_segment(aes(x = Term, xend = Term, y = 0.5, yend = female_ratio_she)) +
  scale_color_manual(
    values = c("female author" = "#FFDB15", "male author" = "#8A6FDF")
  )
same + coord_flip()

Is there any difference?
## Chart 6: the 30 terms with the largest abs_all_abs, with the male-author
## and female-author she-ratios of each term drawn as stems from 0.5.
## all6 is the filtered table built for the previous chart, sorted by
## abs_all_abs in ascending order, so its last 30 rows hold the largest values.
howdif <- tail(all6, 30)
descending_rows <- order(howdif$abs_all_abs, decreasing = TRUE)
howdif <- howdif[descending_rows, ]

## Freeze the descending order as the factor levels of the axis.
howdif$Term <- factor(howdif$Term, levels = howdif$Term)

## Male-author layer, with a reference rule at 0.5.
p7 <- ggplot(howdif) +
  geom_point(
    aes(x = Term, y = male_ratio_she, colour = "male author"),
    size = 3
  ) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = "Top 30 words where author's gender makes great difference"
  ) +
  ylim(0, 1) +
  geom_segment(aes(x = Term, xend = Term, y = 0.5, yend = male_ratio_she)) +
  geom_hline(yintercept = 0.5)

## Female-author layer on top, plus the colour key for both groups.
dif <- p7 +
  geom_point(
    aes(x = Term, y = female_ratio_she, colour = "female author"),
    size = 3
  ) +
  geom_segment(aes(x = Term, xend = Term, y = 0.5, yend = female_ratio_she)) +
  scale_color_manual(
    values = c("female author" = "#FFDB15", "male author" = "#8A6FDF")
  )
dif + coord_flip()
