knitr::opts_chunk$set (echo=FALSE)
library(tidyverse)
library(readxl)
library(ggplot2)
library(dplyr)

Text Source: 20 works of children's literature from Project Gutenberg

Collocation Tool: Voyant Tools

## Directory that holds the cleaned term table (machine-specific path).
datadir <- "/Users/clancy/Desktop/digital resources in the humanities/allallall/"
## Read clean.csv with its header row; as.is = TRUE keeps text columns as
## character vectors instead of converting them to factors.
all <- read.csv(
  file = paste0(datadir, "clean.csv"),
  header = TRUE,
  as.is = TRUE
)
## Print per-column summary statistics of the loaded table.
summary(all)
##      Term             female_she       female_he      female_ratio_she
##  Length:210         Min.   :  0.90   Min.   :  0.90   Min.   :0.1304  
##  Class :character   1st Qu.:  3.00   1st Qu.:  3.00   1st Qu.:0.4379  
##  Mode  :character   Median :  6.00   Median :  5.00   Median :0.5505  
##                     Mean   : 15.18   Mean   : 11.74   Mean   :0.5428  
##                     3rd Qu.: 12.00   3rd Qu.:  9.00   3rd Qu.:0.6667  
##                     Max.   :529.00   Max.   :457.00   Max.   :0.8966  
##     male_she         male_he       male_ratio_she       all_she      
##  Min.   :  0.90   Min.   :  0.90   Min.   :0.03614   Min.   :  0.90  
##  1st Qu.:  0.90   1st Qu.:  0.90   1st Qu.:0.27010   1st Qu.:  6.00  
##  Median :  3.00   Median :  9.00   Median :0.36785   Median :  9.00  
##  Mean   : 10.08   Mean   : 20.84   Mean   :0.36398   Mean   : 25.50  
##  3rd Qu.:  8.00   3rd Qu.: 18.00   3rd Qu.:0.50000   3rd Qu.: 19.75  
##  Max.   :357.00   Max.   :615.00   Max.   :0.66667   Max.   :886.00  
##      all_he         all_rate_she     rate_sequence      all_totals     
##  Min.   :   0.90   Min.   :0.03614   Min.   :  1.00   Min.   :   8.90  
##  1st Qu.:   8.00   1st Qu.:0.33333   1st Qu.: 54.25   1st Qu.:  14.25  
##  Median :  13.00   Median :0.40990   Median :108.50   Median :  23.00  
##  Mean   :  33.67   Mean   :0.40325   Mean   :107.97   Mean   :  59.17  
##  3rd Qu.:  28.00   3rd Qu.:0.50000   3rd Qu.:160.75   3rd Qu.:  44.00  
##  Max.   :1072.00   Max.   :0.93525   Max.   :214.00   Max.   :1958.00  
##    total_rank        all_abs          abs_all_abs    
##  Min.   :  1.00   Min.   :-0.36957   Min.   :0.0000  
##  1st Qu.: 57.25   1st Qu.: 0.03444   1st Qu.:0.1120  
##  Median :109.50   Median : 0.19871   Median :0.2143  
##  Mean   :109.22   Mean   : 0.17878   Mean   :0.2349  
##  3rd Qu.:161.75   3rd Qu.: 0.32447   3rd Qu.:0.3315  
##  Max.   :214.00   Max.   : 0.72542   Max.   :0.7254
## female_she: how often a term (verb) appears with 'she' in the texts written by female authors
## all_she: how often a term (verb) appears with 'she' throughout the whole corpus
## female_ratio_she = female_she / (female_she + female_he)
## all_abs = female_ratio_she - male_ratio_she
## abs_all_abs = ABS(all_abs)



What are the most frequent verbs used to describe ‘she’ / ‘he’?

## Keep the terms at both ends of the rate_sequence ranking (ranks below 16
## and ranks above 199 -- 15 each, per the original note). Tag each term by
## whether its corpus-wide "she" rate reaches 0.5, sort ascending by that
## rate, and freeze the sorted order into the Term factor levels so the axis
## is drawn in that order.
most_she <- all %>%
  filter(rate_sequence < 16 | rate_sequence > 199) %>%
  mutate(rate_she = ifelse(all_rate_she >= 0.5, "more she", "more he")) %>%
  arrange(all_rate_she) %>%
  mutate(Term = factor(Term, levels = Term))

## Lollipop chart: one point per term, joined by a segment to the reference
## line at 0.430633. Per the original note, that value is the relative
## likelihood of a term being paired with 'she' in the whole corpus,
## calculated by SUM(all_she) / SUM(all_totals).
p <- ggplot(
  most_she,
  aes(x = Term, y = all_rate_she, label = all_rate_she)
) +
  geom_point(aes(colour = rate_she), size = 3) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = 'Top 30 most gender-specific words',
    subtitle = "Collected from 20 children literature"
  ) +
  scale_color_manual(
    values = c("more she" = "#FC3C80", "more he" = "#40B0DF")
  ) +
  ylim(0, 1) +
  geom_segment(
    aes(x = Term, xend = Term, y = 0.430633, yend = all_rate_she)
  ) +
  geom_hline(yintercept = 0.430633)

## Flip the axes so the terms are listed vertically.
p + coord_flip()




What are the most gender-specific words used by female authors?

## Per the original note, null-or-zero counts were converted to 0.9 so that
## the gender-specific rate could be calculated. A row is dropped here only
## when its female-author ratio is exactly 0.5 and both female_she and
## female_he carry that 0.9 placeholder, so such terms cannot contaminate the
## ranking. The remaining rows are sorted ascending by female_ratio_she.
all3 <- all %>%
  filter(
    female_ratio_she != 0.5 |
      (female_ratio_she == 0.5 & (female_she != 0.9 | female_he != 0.9))
  ) %>%
  arrange(female_ratio_she)

## all3 is in ascending order of female_ratio_she, so head() returns the 15
## lowest "she" ratios and tail() the 15 highest (note that the variable
## names `she` / `he` do not reflect that direction).
she <- head(all3, n = 15)
he <- tail(all3, n = 15)
female_she <- rbind(she, he)

## Colour group: does the female-author "she" ratio reach 0.5?
female_she$rate_she <- ifelse(
  female_she$female_ratio_she >= 0.5,
  "more she",
  "more he"
)

## Sort the 30 selected terms and freeze that order into the Term factor
## levels so the axis follows it.
most_she <- female_she %>%
  arrange(female_ratio_she) %>%
  mutate(Term = factor(Term, levels = Term))

## Lollipop chart with the reference line at 0.56427.
## Observation from the original analysis: there are more verbs used to
## describe female characters in the corpus of female writers.
p3 <- ggplot(
  most_she,
  aes(x = Term, y = female_ratio_she, label = female_ratio_she)
) +
  geom_point(aes(colour = rate_she), size = 3) +
  ylim(0, 1) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = 'Top 30 most gender-specific words by female authors',
    subtitle = "Collected from 6 children literature by female authors"
  ) +
  scale_color_manual(
    values = c("more she" = "#FC3C80", "more he" = "#40B0DF")
  ) +
  geom_segment(
    aes(x = Term, xend = Term, y = 0.56427, yend = female_ratio_she)
  ) +
  geom_hline(yintercept = 0.56427)

## Flip the axes so the terms are listed vertically.
p3 + coord_flip()




What are the most gender-specific words used by male authors?

## Per the original note, null-or-zero counts were converted to 0.9 so that
## the gender-specific rate could be calculated. A row is dropped here only
## when its male-author ratio is exactly 0.5 and both male_she and male_he
## carry that 0.9 placeholder, so such terms cannot contaminate the ranking.
## The remaining rows are sorted ascending by male_ratio_she.
all2 <- all %>%
  filter(
    male_ratio_she != 0.5 |
      (male_ratio_she == 0.5 & (male_she != 0.9 | male_he != 0.9))
  ) %>%
  arrange(male_ratio_she)

## all2 is in ascending order of male_ratio_she, so head() returns the 15
## terms with the lowest "she" ratio and tail() the 15 with the highest
## (the variable names `she` / `he` do not reflect that direction).
she <- head(all2, n = 15)
he <- tail(all2, n = 15)
male_she <- rbind(she, he)

## Colour group: does the male-author "she" ratio reach 0.5?
male_she$rate_she <- ifelse(
  male_she$male_ratio_she >= 0.5,
  "more she",
  "more he"
)

## Sort the 30 selected terms and freeze that order into the Term factor
## levels so the axis follows it.
most_she <- male_she %>%
  arrange(male_ratio_she) %>%
  mutate(Term = factor(Term, levels = Term))

## Lollipop chart with the reference line at 0.322479.
## Observation from the original analysis: there are more verbs used to
## describe male characters in the corpus of male writers.
p2 <- ggplot(
  most_she,
  aes(x = Term, y = male_ratio_she, label = male_ratio_she)
) +
  geom_point(aes(colour = rate_she), size = 3) +
  ylim(0, 1) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = 'Top 30 most gender-specific words by male authors',
    subtitle = "Collected from 14 children literature by male authors"
  ) +
  scale_color_manual(
    values = c("more she" = "#FC3C80", "more he" = "#40B0DF")
  ) +
  geom_segment(
    aes(x = Term, xend = Term, y = 0.322479, yend = male_ratio_she)
  ) +
  geom_hline(yintercept = 0.322479)

## Flip the axes so the terms are listed vertically.
p2 + coord_flip()




Does the author’s gender make any difference?

## Tag every term by whether its corpus-wide "she" rate reaches 0.5.
## Side effect: adds a `rate` column to `all`.
all$rate <- ifelse(all$all_rate_she >= 0.5, "more she", "more he")

## Per-term point size, proportional to the all_totals column. The vector has
## one entry per row of `all`, which is what lets it be passed as a fixed
## (non-aes) size below.
size <- all$all_totals * 0.005

## Scatter plot of each term's "she" ratio among female authors (x) against
## the same ratio among male authors (y). Both axes are limited to [0, 1].
## The axis labels spell "relative" correctly and name the pronoun on both
## axes so the two read as the same quantity.
p4 <- ggplot(all, aes(female_ratio_she, male_ratio_she)) +
  geom_point(aes(colour = rate), size = size) +
  xlim(0, 1) +
  ylim(0, 1) +
  labs(
    x = 'relative collocation ratio with "she" by female authors',
    y = 'relative collocation ratio with "she" by male authors',
    title = 'Comparison between authors of opposite genders',
    subtitle = "How likely will the term appear with 'she'"
  ) +
  scale_color_manual(
    values = c("more she" = "#FC3C80", "more he" = "#40B0DF")
  ) +
  ## Diagonal y = x: terms on this line have the same ratio in both groups.
  geom_abline(slope = 1, intercept = 0)

p4




Is there any similarity?

## Only preserve terms that appear in the lists of both female and male
## authors. Per the original note, null-or-zero counts were converted to 0.9
## for the sake of calculating the gender-specific rate, so any row holding
## that placeholder in one of the four count columns is filtered out in case
## it contaminates the result. Rows are then sorted ascending by abs_all_abs.
all6 <- all %>%
  filter(
    male_she != 0.9,
    male_he != 0.9,
    female_she != 0.9,
    female_he != 0.9
  ) %>%
  arrange(abs_all_abs)

## The first 30 rows of the ascending sort have the smallest abs_all_abs.
## They are re-sorted in decreasing order and that order is frozen into the
## Term factor levels so the axis follows it.
nodif <- head(all6, n = 30)
nodif <- nodif[order(nodif$abs_all_abs, decreasing = TRUE), ]
nodif$Term <- factor(nodif$Term, levels = nodif$Term)

## First pass: male-author ratios as points, each joined by a segment to the
## 0.5 reference line.
p6 <- ggplot(nodif) +
  geom_point(
    aes(x = Term, y = male_ratio_she, colour = "male author"),
    size = 3
  ) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = "Top 30 words where author's gender makes least difference"
  ) +
  ylim(0, 1) +
  geom_segment(
    aes(x = Term, xend = Term, y = 0.5, yend = male_ratio_she)
  ) +
  geom_hline(yintercept = 0.5)

## Second pass: overlay the female-author ratios the same way and assign the
## two colours.
same <- p6 +
  geom_point(
    aes(x = Term, y = female_ratio_she, colour = "female author"),
    size = 3
  ) +
  geom_segment(
    aes(x = Term, xend = Term, y = 0.5, yend = female_ratio_she)
  ) +
  scale_color_manual(
    values = c("female author" = "#FFDB15", "male author" = "#8A6FDF")
  )

## Flip the axes so the terms are listed vertically.
same + coord_flip()




Is there any difference?

## all6 is sorted ascending by abs_all_abs, so its last 30 rows have the
## largest values. They are re-sorted in decreasing order and that order is
## frozen into the Term factor levels so the axis follows it.
howdif <- tail(all6, n = 30)
howdif <- howdif[order(howdif$abs_all_abs, decreasing = TRUE), ]
howdif$Term <- factor(howdif$Term, levels = howdif$Term)

## First pass: male-author ratios as points, each joined by a segment to the
## 0.5 reference line.
p7 <- ggplot(howdif) +
  geom_point(
    aes(x = Term, y = male_ratio_she, colour = "male author"),
    size = 3
  ) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = "Top 30 words where author's gender makes great difference"
  ) +
  ylim(0, 1) +
  geom_segment(
    aes(x = Term, xend = Term, y = 0.5, yend = male_ratio_she)
  ) +
  geom_hline(yintercept = 0.5)

## Second pass: overlay the female-author ratios the same way and assign the
## two colours.
dif <- p7 +
  geom_point(
    aes(x = Term, y = female_ratio_she, colour = "female author"),
    size = 3
  ) +
  geom_segment(
    aes(x = Term, xend = Term, y = 0.5, yend = female_ratio_she)
  ) +
  scale_color_manual(
    values = c("female author" = "#FFDB15", "male author" = "#8A6FDF")
  )

## Flip the axes so the terms are listed vertically.
dif + coord_flip()