## Set echo = FALSE as the default option for all knitr chunks.
knitr::opts_chunk$set(echo = FALSE)
library(tidyverse)
library(readxl)
library(ggplot2)
library(dplyr)

Text Source: 20 works of children's literature from Project Gutenberg

Collocation Tool: Voyant Tools

## Directory holding the cleaned collocation table (absolute path, so the
## script only runs unchanged on a machine that has this folder).
datadir <- "/Users/clancy/Desktop/digital resources in the humanities/allallall/"

## Read the table; as.is = TRUE keeps character columns as character.
## NOTE: the data frame is named `all`, which shadows base::all as a variable
## name for the rest of the script.
all <- read.csv(
  file = paste0(datadir, "clean.csv"),
  header = TRUE,
  as.is = TRUE
)

## Per-column overview of the loaded table.
summary(all)

##      Term             female_she       female_he      female_ratio_she
##  Length:210         Min.   :  0.90   Min.   :  0.90   Min.   :0.1304  
##  Class :character   1st Qu.:  3.00   1st Qu.:  3.00   1st Qu.:0.4379  
##  Mode  :character   Median :  6.00   Median :  5.00   Median :0.5505  
##                     Mean   : 15.18   Mean   : 11.74   Mean   :0.5428  
##                     3rd Qu.: 12.00   3rd Qu.:  9.00   3rd Qu.:0.6667  
##                     Max.   :529.00   Max.   :457.00   Max.   :0.8966  
##     male_she         male_he       male_ratio_she       all_she      
##  Min.   :  0.90   Min.   :  0.90   Min.   :0.03614   Min.   :  0.90  
##  1st Qu.:  0.90   1st Qu.:  0.90   1st Qu.:0.27010   1st Qu.:  6.00  
##  Median :  3.00   Median :  9.00   Median :0.36785   Median :  9.00  
##  Mean   : 10.08   Mean   : 20.84   Mean   :0.36398   Mean   : 25.50  
##  3rd Qu.:  8.00   3rd Qu.: 18.00   3rd Qu.:0.50000   3rd Qu.: 19.75  
##  Max.   :357.00   Max.   :615.00   Max.   :0.66667   Max.   :886.00  
##      all_he         all_rate_she     rate_sequence      all_totals     
##  Min.   :   0.90   Min.   :0.03614   Min.   :  1.00   Min.   :   8.90  
##  1st Qu.:   8.00   1st Qu.:0.33333   1st Qu.: 54.25   1st Qu.:  14.25  
##  Median :  13.00   Median :0.40990   Median :108.50   Median :  23.00  
##  Mean   :  33.67   Mean   :0.40325   Mean   :107.97   Mean   :  59.17  
##  3rd Qu.:  28.00   3rd Qu.:0.50000   3rd Qu.:160.75   3rd Qu.:  44.00  
##  Max.   :1072.00   Max.   :0.93525   Max.   :214.00   Max.   :1958.00  
##    total_rank        all_abs          abs_all_abs    
##  Min.   :  1.00   Min.   :-0.36957   Min.   :0.0000  
##  1st Qu.: 57.25   1st Qu.: 0.03444   1st Qu.:0.1120  
##  Median :109.50   Median : 0.19871   Median :0.2143  
##  Mean   :109.22   Mean   : 0.17878   Mean   :0.2349  
##  3rd Qu.:161.75   3rd Qu.: 0.32447   3rd Qu.:0.3315  
##  Max.   :214.00   Max.   : 0.72542   Max.   :0.7254

## female_she: how often a verb collocates with 'she' in the texts written by female authors
## all_she: how often a verb collocates with 'she' across the whole corpus
## female_ratio_she = female_she / (female_she + female_he)
## all_abs = female_ratio_she - male_ratio_she
## abs_all_abs = ABS(all_abs)

What are the most frequent verbs used to describe ‘she’ / ‘he’?

## Keep the terms at both ends of the "she"-rate ranking:
## rate_sequence 1-15 and rate_sequence 200 and above (15 each).
most_she <- all[which(all$rate_sequence < 16 | all$rate_sequence > 199), ]

## Colour key: does the term lean towards "she" or "he" in the whole corpus?
most_she[["rate_she"]] <- ifelse(
  most_she[["all_rate_she"]] >= 0.5,
  "more she",
  "more he"
)

## Sort ascending by "she" rate and freeze that order in the factor levels so
## the axis lists the terms in ranking order rather than alphabetically.
most_she <- most_she[order(most_she[["all_rate_she"]]), ]
most_she[["Term"]] <- factor(most_she[["Term"]], levels = most_she[["Term"]])

## Reference line: the relative ratio of the terms' likelihood of being paired
## with 'she' in the whole corpus,
## calculated by: SUM(all_she)/SUM(all_totals)
corpus_she_rate <- 0.430633

## Lollipop chart: one point per term, joined to the reference line.
p <- ggplot(
  most_she,
  aes(x = Term, y = all_rate_she, label = all_rate_she)
) +
  geom_point(aes(col = rate_she), size = 3) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = 'Top 30 most gender-specific words',
    subtitle = "Collected from 20 children literature"
  ) +
  scale_color_manual(values = c("more she" = "#FC3C80", "more he" = "#40B0DF")) +
  ylim(0, 1) +
  geom_segment(
    aes(x = Term, xend = Term, y = corpus_she_rate, yend = all_rate_she)
  ) +
  geom_hline(yintercept = corpus_she_rate)

## Flip so the terms read top-to-bottom.
p + coord_flip()

Many more verbs are paired with ‘he’ than with ‘she’ in the corpus.
Female characters are portrayed more through actions that express gentle emotions, feelings, desires and curiosity. The pattern for male characters is less obvious, though a few actions sound more violent and intense.

What are the most gender-specific words used by female authors?

## Null-or-zero values are converted to 0.9 for the sake of calculating the
## gender-specific rate. A ratio of exactly 0.5 where both female-author
## counts are that 0.9 placeholder is therefore an artefact, so those terms
## are filtered out in case the result might be contaminated. Every other
## term is kept (a genuine 0.5 survives as long as one count is not 0.9).
all3 <- filter(
  all,
  female_ratio_she != 0.5 |
    (female_ratio_she == 0.5 & (female_she != 0.9 | female_he != 0.9))
)

## Ascending sort: the head has the lowest "she" ratio, the tail the highest.
all3 <- all3[order(all3$female_ratio_she), ]
he_leaning <- head(all3, 15)   # 15 terms least often paired with "she"
she_leaning <- tail(all3, 15)  # 15 terms most often paired with "she"
female_she <- rbind(he_leaning, she_leaning)

## Colour key, then fix the axis order to the ascending "she" ratio.
female_she$rate_she <- ifelse(
  female_she$female_ratio_she >= 0.5,
  "more she",
  "more he"
)
most_she <- female_she[order(female_she$female_ratio_she), ]
most_she$Term <- factor(most_she$Term, levels = most_she$Term)

## Reference line for the female-author texts (presumably the overall share of
## "she" collocations in those texts -- confirm against the spreadsheet).
female_corpus_she_rate <- 0.56427

p3 <- ggplot(
  most_she,
  aes(x = Term, y = female_ratio_she, label = female_ratio_she)
) +
  geom_point(aes(col = rate_she), size = 3) +
  ylim(0, 1) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = 'Top 30 most gender-specific words by female authors',
    subtitle = "Collected from 6 children literature by female authors"
  ) +
  scale_color_manual(values = c("more she" = "#FC3C80", "more he" = "#40B0DF")) +
  geom_segment(
    aes(x = Term, xend = Term, y = female_corpus_she_rate, yend = female_ratio_she)
  ) +
  geom_hline(yintercept = female_corpus_she_rate)

## Observation: more verbs are used to describe female characters in the
## corpus of female writers.
p3 + coord_flip()

More verbs are paired with ‘she’ than with ‘he’ in the corpus of female writers.
Female characters are portrayed more through actions that express sadness, sensitivity, hesitation, emotions, desires and curiosity. The actions of male characters are more dominant and intense.

What are the most gender-specific words used by male authors?

## Null-or-zero values are converted to 0.9 for the sake of calculating the
## gender-specific rate. A ratio of exactly 0.5 where both male-author counts
## are that 0.9 placeholder is therefore an artefact, so those terms are
## filtered out in case the result might be contaminated. Every other term is
## kept (a genuine 0.5 survives as long as one count is not 0.9).
all2 <- filter(
  all,
  male_ratio_she != 0.5 |
    (male_ratio_she == 0.5 & (male_she != 0.9 | male_he != 0.9))
)

## Ascending sort: the head has the lowest "she" ratio, the tail the highest.
all2 <- all2[order(all2$male_ratio_she), ]
he_leaning <- head(all2, 15)   # 15 terms least often paired with "she"
she_leaning <- tail(all2, 15)  # 15 terms most often paired with "she"
male_she <- rbind(he_leaning, she_leaning)

## Colour key, then fix the axis order to the ascending "she" ratio.
male_she$rate_she <- ifelse(
  male_she$male_ratio_she >= 0.5,
  "more she",
  "more he"
)
most_she <- male_she[order(male_she$male_ratio_she), ]
most_she$Term <- factor(most_she$Term, levels = most_she$Term)

## Reference line for the male-author texts (presumably the overall share of
## "she" collocations in those texts -- confirm against the spreadsheet).
male_corpus_she_rate <- 0.322479

p2 <- ggplot(
  most_she,
  aes(x = Term, y = male_ratio_she, label = male_ratio_she)
) +
  geom_point(aes(col = rate_she), size = 3) +
  ylim(0, 1) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = 'Top 30 most gender-specific words by male authors',
    subtitle = "Collected from 14 children literature by male authors"
  ) +
  scale_color_manual(values = c("more she" = "#FC3C80", "more he" = "#40B0DF")) +
  geom_segment(
    aes(x = Term, xend = Term, y = male_corpus_she_rate, yend = male_ratio_she)
  ) +
  geom_hline(yintercept = male_corpus_she_rate)

## Observation: more verbs are used to describe male characters in the corpus
## of male writers.
p2 + coord_flip()

Many more verbs are paired with ‘he’ than with ‘she’ in the corpus of male writers.
Female characters are portrayed more through actions that express sadness, sensitivity, hesitation, emotions, desires and curiosity. The actions of male characters are more dominant and intense.

Does authors’ gender make any difference?

## Colour key: does the term lean towards "she" or "he" in the whole corpus?
all$rate <- ifelse(all$all_rate_she >= 0.5, "more she", "more he")

## Scatter of each term's "she" ratio under female authors (x) against male
## authors (y). Point size is all_totals * 0.005, mapped inside aes() so it is
## computed from the same rows that are plotted; scale_size_identity() uses
## those numbers directly as sizes instead of rescaling them.
p4 <- ggplot(all, aes(female_ratio_she, male_ratio_she)) +
  geom_point(aes(col = rate, size = all_totals * 0.005)) +
  scale_size_identity() +
  xlim(0, 1) +
  ylim(0, 1) +
  labs(
    x = 'relative collocation ratio with "she" by female authors',
    y = 'relative collocation ratio by male authors',
    title = 'Comparison between authors of opposite genders ',
    subtitle = "How likely will the term appear with 'she'"
  ) +
  scale_color_manual(values = c("more she" = "#FC3C80", "more he" = "#40B0DF")) +
  ## Diagonal y = x: terms on it have the same ratio in both author groups.
  geom_abline(slope = 1, intercept = 0)

p4

If the point is closer to the line, that means the word is treated more similarly by male authors and female authors.

Is there any similarity?

## Only preserve terms that have a real count for both pronouns in both the
## female-author and the male-author lists. Null-or-zero values are converted
## to 0.9 for the sake of calculating the gender-specific rate, so any term
## carrying that placeholder is filtered out in case the result might be
## contaminated.
all6 <- filter(
  all,
  male_she != 0.9,
  male_he != 0.9,
  female_she != 0.9,
  female_he != 0.9
)

## Ascending by the absolute gap between the two author groups' "she" ratios.
all6 <- all6[order(all6$abs_all_abs), ]

## The 30 smallest gaps, re-sorted largest-first, with that order frozen in the
## factor levels so the axis follows it.
nodif <- head(all6, 30)
nodif <- nodif[order(nodif$abs_all_abs, decreasing = TRUE), ]
nodif$Term <- factor(nodif$Term, levels = nodif$Term)

## First layer set: male-author ratio per term, joined to the 0.5 line.
p6 <- ggplot(nodif) +
  geom_point(
    aes(x = Term, y = male_ratio_she, col = "male author"),
    size = 3
  ) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = "Top 30 words where author's gender makes least difference"
  ) +
  ylim(0, 1) +
  geom_segment(aes(x = Term, xend = Term, y = 0.5, yend = male_ratio_she)) +
  geom_hline(yintercept = 0.5)

## Second layer set: female-author ratio for the same terms, plus the colours
## for the two author groups.
same <- p6 +
  geom_point(
    aes(x = Term, y = female_ratio_she, col = "female author"),
    size = 3
  ) +
  geom_segment(aes(x = Term, xend = Term, y = 0.5, yend = female_ratio_she)) +
  scale_color_manual(
    values = c("female author" = "#FFDB15", "male author" = "#8A6FDF")
  )

same + coord_flip()

Is there any difference?

## all6 is sorted ascending by abs_all_abs, so its last 30 rows are the terms
## with the largest gap between the two author groups. Re-sort largest-first
## and freeze that order in the factor levels so the axis follows it.
howdif <- tail(all6, 30)
howdif <- howdif[order(howdif$abs_all_abs, decreasing = TRUE), ]
howdif$Term <- factor(howdif$Term, levels = howdif$Term)

## First layer set: male-author ratio per term, joined to the 0.5 line.
p7 <- ggplot(howdif) +
  geom_point(
    aes(x = Term, y = male_ratio_she, col = "male author"),
    size = 3
  ) +
  labs(
    x = 'terms',
    y = 'relative likelihood of appearing with "she"',
    title = "Top 30 words where author's gender makes great difference"
  ) +
  ylim(0, 1) +
  geom_segment(aes(x = Term, xend = Term, y = 0.5, yend = male_ratio_she)) +
  geom_hline(yintercept = 0.5)

## Second layer set: female-author ratio for the same terms, plus the colours
## for the two author groups.
dif <- p7 +
  geom_point(
    aes(x = Term, y = female_ratio_she, col = "female author"),
    size = 3
  ) +
  geom_segment(aes(x = Term, xend = Term, y = 0.5, yend = female_ratio_she)) +
  scale_color_manual(
    values = c("female author" = "#FFDB15", "male author" = "#8A6FDF")
  )

dif + coord_flip()

Collocation Analysis of Children’s Literature

UCL-SSN:20053281

5/6/2021

Text Source: 20 works of children's literature from Project Gutenberg

Collocation Tool: Voyant Tools

What are the most frequent verbs used to describe ‘she’ / ‘he’?

What are the most gender-specific words used by female authors?

What are the most gender-specific words used by male authors?

Does authors’ gender make any difference?

Is there any similarity?

Is there any difference?