https://rpubs.com/gloukatou/733656

R Markdown

library(childesr)
library(wordbankr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
require(scales)


library(RColorBrewer)
myPalette <- brewer.pal(5, "Set2") 

Load annotations

#####TODOS first part is the descriptive section, to what extent the full range of populations is represented, what are some codes that are missing often,
##### descriptive section and representation of the population
##########systematic review of features
##################confront eg proportion of child population versus proportion of data by eg continent
#############second goal: point out strengths and limitations of the current CHILDES, explain what is missing


annotations1<- read.csv("Childes_corpora-Clara1.csv")
annotations2<- read.csv("Childes_corpora-Camila.csv")
annotations3<- read.csv("Childes_corpora-Lara.csv")

annotations <-rbind(annotations1, annotations2, annotations3) #merge annotations from 3 annotators

annotations<- rename(annotations, lang = Language)
annotations$Bilingualism.in.corpus..yes..no.[annotations$Bilingualism.in.corpus..yes..no. == "yes"] <- "Bilinguals"
annotations$Bilingualism.in.corpus..yes..no.[annotations$Bilingualism.in.corpus..yes..no. == "no"] <- "Monolinguals"

annotations <- annotations %>%  filter(!(Inclusion=="")) #only keep annotated corpora
annotations[annotations==""]<-NA
annotations[annotations=="N/A"]<-NA
annotations[annotations=="N/A (0,33)"]<-NA
annotations[annotations=="N/A (0)"]<-NA
annotations[annotations=="NA (0)"]<-NA

#head(annotations)

253 corpora already annotated.

Optional chunk merging with childesr for extra information

d_transcripts <- get_transcripts()
d_statistics <- get_speaker_statistics()

mergedChiAnnot <- annotations %>% left_join(d_transcripts)
mergedChiAnnot <- mergedChiAnnot %>% left_join(n_utt_d)
mergedChiAnnot <- mergedChiAnnot %>% left_join(n_utt_speaker)
mergedChiAnnot<-mergedChiAnnot[!(is.na(mergedChiAnnot$n_utt) | mergedChiAnnot$n_utt=="" ), ]

if ( length(unique(mergedChiAnnot$corpus_name)) < nrow(annotations) ) {
annotations$corpus_name[!annotations$corpus_name %in% mergedChiAnnot$corpus_name]
  } #print non merged corpora

Corpus inclusion

Based on “Inclusion” annotation column

How many corpora should be included?

inclu_data<- annotations %>%
  group_by(Inclusion) %>%
  count() 

ggplot(inclu_data,aes(y=Inclusion, x=n)) +
  geom_bar(stat="identity", width=1, color="white") +
  theme_minimal() + 
  ggtitle("Corpus inclusion") 

annotations <- annotations %>%
  filter(Inclusion=="yes")  

From the total amount of corpora, 141 have been included in this study. The following analyses will continue only on these included corpora.

Bilinguals

Based on “Bilingualism” annotation column

How many corpora are bilingual?

group_count <-function(data_, column_){
data_<- data_ %>%
  group_by({{column_}}) %>%
    count() 
      return(data_)
}

byBilinguals<-group_count(annotations, Bilingualism.in.corpus..yes..no.)
lbls <- paste(byBilinguals$Bilingualism.in.corpus..yes..no., byBilinguals$n, sep=" ") # add numbers to labels
pie(byBilinguals$n, labels=lbls, border="white", col=myPalette)

head(byBilinguals)
## # A tibble: 4 x 2
## # Groups:   Bilingualism.in.corpus..yes..no. [4]
##   Bilingualism.in.corpus..yes..no.     n
##   <chr>                            <int>
## 1 Bilinguals                          27
## 2 Monolinguals                        29
## 3 trilinguism                          1
## 4 <NA>                                84

84 corpora have NA in “Bilingualism in corpus”.

Siblings

Based on “Number of children with siblings” annotation column

How many of the target children have siblings?

byPart <- annotations %>%
    group_by(lang, Bilingualism.in.corpus..yes..no.) %>%
      summarise(numpar = sum(as.numeric(Number.of.participants), na.rm=TRUE)) 

bySibl<- annotations %>%
    group_by(lang) %>%
      summarise(numChildrenwithSiblings = sum(as.numeric(X..of.children.with.siblings)))

ggplot(bySibl[!is.na(bySibl$numChildrenwithSiblings),], mapping = aes(y=lang, x=numChildrenwithSiblings)) + 
  geom_point() + 
    ggtitle("Languages where number of children with siblings is not unknown") + 
      geom_col()

head(bySibl)
## # A tibble: 6 x 2
##   lang                 numChildrenwithSiblings
##   <chr>                                  <dbl>
## 1 Afrikaan                                   1
## 2 Arabic                                    NA
## 3 Basque                                    NA
## 4 Brazilian Portuguese                      NA
## 5 Cantonese                                  4
## 6 Cantonese-English                          6

Out of 1687 total target children, only 49 have siblings. No information for children in 65 languages (marked as NAs).

Based on “Average number of siblings” annotation column

How many siblings in average?

bySiblnum<-annotations %>%
    group_by(Average.number.of.siblings) %>%
      count()

ggplot(bySiblnum, mapping = aes(x=Average.number.of.siblings, y=n)) + 
  geom_point() + 
    ggtitle("Average number of siblings per corpus") + 
      geom_col()

#ggplot(bySiblnum[!is.na(bySiblnum$Average.number.of.siblings),], mapping = aes(x=Average.number.of.siblings, y=n)) 

annotations$Average.number.of.siblings <- as.numeric(as.character(annotations$Average.number.of.siblings))
head(bySiblnum)
## # A tibble: 6 x 2
## # Groups:   Average.number.of.siblings [6]
##   Average.number.of.siblings     n
##   <chr>                      <int>
## 1 0                             10
## 2 0.25                           1
## 3 0.33                           1
## 4 0.66                           1
## 5 0.75                           1
## 6 0.875                          1

The mean number of siblings has been left NA for 88 out of 141 corpora. In the remaining corpora, the mean number of siblings per corpus is 0.992451 with min 0 and max 2.

Based on “Average number of older siblings” annotation column

How many older siblings in average?

byoldSiblnum<-annotations %>%
    group_by(Average.number.of.older.siblings) %>%
      count()

ggplot(byoldSiblnum, mapping = aes(x=Average.number.of.older.siblings, y=n)) + 
  geom_point() + 
    ggtitle("Average number of older siblings per corpus") + 
      geom_col()

annotations$Average.number.of.older.siblings <- as.numeric(as.character(annotations$Average.number.of.older.siblings))

head(byoldSiblnum)
## # A tibble: 6 x 2
## # Groups:   Average.number.of.older.siblings [6]
##   Average.number.of.older.siblings     n
##   <chr>                            <int>
## 1 0                                   27
## 2 0,5                                  3
## 3 0,6                                  1
## 4 0.25                                 1
## 5 0.3                                  2
## 6 0.5                                  1

The mean number of older siblings has been left NA for 89 out of 141 corpora. In the remaining corpora, the mean number of older siblings per corpus is 0.4288043 with min NA and max 2.

Sessions

Based on “Mean sessions per child” annotation column

How many sessions per language?

annotations <- annotations[!is.na(annotations$Mean...sessions.per.child), ]
bySess<- annotations 
bySess$Mean...sessions.per.child <- sub(",",".",bySess$Mean...sessions.per.child)
bySess$Mean...sessions.per.child <- as.numeric(bySess$Mean...sessions.per.child)

bySess<- bySess %>%
  mutate(sessions = as.numeric(Number.of.participants) * Mean...sessions.per.child) %>%
    group_by(lang, Bilingualism.in.corpus..yes..no.) %>%
      summarise(numses = sum(as.numeric(sessions)))

ggplot(bySess, mapping = aes(y = reorder(lang, numses), numses)) + geom_point() + ggtitle("Number of sessions per language") + geom_col()+ facet_wrap(~Bilingualism.in.corpus..yes..no., scale="free")

Location

Based on “Neighbour.village.city.province.state.country” annotation column

For now only based on continents and countries. Lots of handcoding.

annotations$continent <-NA

annotations$continent[grep("England|Sweden|Spain|Madrid, Spain ; Tenerife, Canary Islands|Madrid, Spain|Navarra, Spain|Estonia|Tartu, Estonia ; Rapla, Estonia|Budapest, Hungary|West Coast, Ireland|Arfon area Gwynedd, North Wales|Netherlands|Belfast, Northern Ireland|Nottingham/Manchester, England|Sweden ; Portugal|Czech Republic|Bucharest, Romania|Postdam, Germany|Stuttgart, Germany|France|Alt penedes, region of catalonia|spain|Naples|Spain, Hungary, Lloret de mar|Spain, Salamanca|England, Brighton|France (Normandy, Marseille, + places visited)|Portugal|italy|Barcelona|Switzerland|Stockholm, Sweden|Iceland|Poznań, Poland|Romania|Antwerp, Belgium|Ireland|Hungary|Athens, Greece|Southern Estonia|Tartu, Estonia|A Coruña, Santiago de Compostela ; Lugo ; Pontevedra, A Estrada ; SPAIN|Moscow, Russia|Spain, Salamanca|Wales", annotations$Neighbour.village.city.province.state.country)]  <- "Europe"

annotations$continent[grep("Rio Cuarto, Cordoba, Argentina|Sao Paulo|Patzcuaro, Michoacan, iMexico|Jamaica", annotations$Neighbour.village.city.province.state.country)]  <- "Central and South America"

annotations$continent[grep("Iran|Israel|Bombay, India|China|Bangkok, Thailand|Tokyo area|Nagoya area|osaka area|Kuwait|Yahud, Israel|Nagoya center|Kusatsu City, Shiga Pref|Hong-Kong|Taiwan|Turkey", annotations$Neighbour.village.city.province.state.country)]  <- "Asia"

annotations$continent[grep("Michigan, USA|USA, Northern Virginia|Canada|California, USA|washington dc|United States|USA", annotations$Neighbour.village.city.province.state.country)]  <- "North America"

annotations$continent[grep("Alexandria, Egypt|Mokhotlong, Lesotho", annotations$Neighbour.village.city.province.state.country)]  <- "Africa"


annotations$country <-NA
annotations$country[grep("England|West Coast, Ireland|Arfon area Gwynedd, North Wales|Belfast, Northern Ireland|Nottingham/Manchester, England|England, Brighton|Ireland|Wales", annotations$Neighbour.village.city.province.state.country)]  <- "UK/Ireland"
annotations$country[grep("Sweden|Sweden ; Portugal|Stockholm, Sweden", annotations$Neighbour.village.city.province.state.country)]  <- "Sweden"
annotations$country[grep("Spain|Madrid, Spain ; Tenerife, Canary Islands|Madrid, Spain|Navarra, Spain|Alt penedes, region of catalonia|spain|Spain, Hungary, Lloret de mar|Spain, Salamanca|Barcelona|A Coruña, Santiago de Compostela ; Lugo ; Pontevedra, A Estrada ; SPAIN|Spain, Salamanca", annotations$Neighbour.village.city.province.state.country)]  <- "Spain"
annotations$country[grep("Estonia|Tartu, Estonia ; Rapla, Estonia|Southern Estonia|Tartu, Estonia", annotations$Neighbour.village.city.province.state.country)]  <- "Estonia"
annotations$country[grep("Budapest, Hungary|Hungary|Spain, Hungary, Lloret de mar", annotations$Neighbour.village.city.province.state.country)]  <- "Hungary"
annotations$country[grep("Sweden ; Portugal|Portugal", annotations$Neighbour.village.city.province.state.country)]  <- "Portugal"
annotations$country[grep("Czech Republic", annotations$Neighbour.village.city.province.state.country)]  <- "Czech Republic"
annotations$country[grep("Bucharest, Romania|Romania", annotations$Neighbour.village.city.province.state.country)]  <- "Romania"
annotations$country[grep("Postdam, Germany|Stuttgart, Germany", annotations$Neighbour.village.city.province.state.country)]  <- "Germany"
annotations$country[grep("France|France (Normandy, Marseille, + places visited)", annotations$Neighbour.village.city.province.state.country)]  <- "France"
annotations$country[grep("Naples|italy", annotations$Neighbour.village.city.province.state.country)]  <- "Italy"
annotations$country[grep("Switzerland", annotations$Neighbour.village.city.province.state.country)]  <- "Switzerland"
annotations$country[grep("Iceland", annotations$Neighbour.village.city.province.state.country)]  <- "Iceland"
annotations$country[grep("Poznań, Poland", annotations$Neighbour.village.city.province.state.country)]  <- "Poland"
annotations$country[grep("Antwerp, Belgium", annotations$Neighbour.village.city.province.state.country)]  <- "Belgium"
annotations$country[grep("Athens, Greece", annotations$Neighbour.village.city.province.state.country)]  <- "Greece"
annotations$country[grep("Moscow, Russia", annotations$Neighbour.village.city.province.state.country)]  <- "Russia"
annotations$country[grep("Rio Cuarto, Cordoba, Argentina", annotations$Neighbour.village.city.province.state.country)]  <- "Argentina"
annotations$country[grep("Sao Paulo", annotations$Neighbour.village.city.province.state.country)]  <- "Brazil"
annotations$country[grep("Patzcuaro, Michoacan, iMexico", annotations$Neighbour.village.city.province.state.country)]  <- "Mexico"
annotations$country[grep("Jamaica", annotations$Neighbour.village.city.province.state.country)]  <- "Jamaica"
annotations$country[grep("Iran", annotations$Neighbour.village.city.province.state.country)]  <- "Iran"
annotations$country[grep("Israel|Yahud, Israel", annotations$Neighbour.village.city.province.state.country)]  <- "Israel"
annotations$country[grep("Bombay, India", annotations$Neighbour.village.city.province.state.country)]  <- "India"
annotations$country[grep("China", annotations$Neighbour.village.city.province.state.country)]  <- "China"
annotations$country[grep("Bangkok, Thailand", annotations$Neighbour.village.city.province.state.country)]  <- "Thailand"
annotations$country[grep("Tokyo area|Nagoya area|osaka area|Nagoya center|Kusatsu City, Shiga Pref", annotations$Neighbour.village.city.province.state.country)]  <- "Japan"
annotations$country[grep("Kuwait", annotations$Neighbour.village.city.province.state.country)]  <- "Kuwait"
annotations$country[grep("Hong-Kong", annotations$Neighbour.village.city.province.state.country)]  <- "Hong-Kong"
annotations$country[grep("Taiwan", annotations$Neighbour.village.city.province.state.country)]  <- "Taiwan"
annotations$country[grep("Turkey", annotations$Neighbour.village.city.province.state.country)]  <- "Turkey"
annotations$country[grep("Michigan, USA|USA, Northern Virginia|California, USA|washington dc|United States|USA", annotations$Neighbour.village.city.province.state.country)]  <- "USA"
annotations$country[grep("Canada", annotations$Neighbour.village.city.province.state.country)]  <- "Canada"
annotations$country[grep("Alexandria, Egypt", annotations$Neighbour.village.city.province.state.country)]  <- "Egypt"
annotations$country[grep("Mokhotlong, Lesotho", annotations$Neighbour.village.city.province.state.country)]  <- "Lesotho"

Corpora by countries and continents

byLocation <- annotations %>%
  select(country, continent) %>%
    group_by(country, continent) %>%
      count()

ggplot(byLocation, aes(x=n, y=country)) +
  geom_bar(stat="identity", width=1, color="white") +
    ggtitle("Corpora by Countries and Continents") +
      facet_wrap(~continent, scale="free") 

head(byLocation)
## # A tibble: 6 x 3
## # Groups:   country, continent [6]
##   country        continent                     n
##   <chr>          <chr>                     <int>
## 1 Argentina      Central and South America     1
## 2 Belgium        Europe                        2
## 3 Brazil         Central and South America     1
## 4 Canada         North America                 2
## 5 China          Asia                          3
## 6 Czech Republic Europe                        1

36 countries are represented in childes. 5 continents. 68 of the 102 corpora are European, and 10 of the corpora are North American.

Based on “Neighbour.village.city.province.state.country” annotation column

Corpora by countries and continents

byLocation1 <- group_count(annotations, continent)
lbls <- paste(byLocation1$continent, byLocation1$n, sep=" ") # add numbers to labels
pie(byLocation1$n, labels=lbls, border="white", col=myPalette, main="Corpora by continent")

head(byLocation1)
## # A tibble: 6 x 2
## # Groups:   continent [6]
##   continent                     n
##   <chr>                     <int>
## 1 Africa                        2
## 2 Asia                         20
## 3 Central and South America     4
## 4 Europe                       66
## 5 North America                 8
## 6 <NA>                          2

Place of recording

Based on “Place.of.recordings.” annotation column

Will need handcoding.

byPlaceofRecording <- group_count(annotations, Place.of.recordings...home..nursery....)
lbls <- paste(byPlaceofRecording$Place.of.recordings...home..nursery...., byPlaceofRecording$n, sep=" ") # add numbers to labels
pie(byPlaceofRecording$n, labels=lbls, border="white", col=myPalette)

head(byPlaceofRecording)
## # A tibble: 6 x 2
## # Groups:   Place.of.recordings...home..nursery.... [6]
##   Place.of.recordings...home..nursery....                                      n
##   <chr>                                                                    <int>
## 1 6 in nursery, 3 at home                                                      1
## 2 home                                                                        69
## 3 home (mostly)                                                                1
## 4 home (TYA) ; home of mother, of paternal grandparents or maternal grand…     1
## 5 home or daycare                                                              1
## 6 home or kindergarten                                                         1

Recording method

Based on “Recording.method..static..portable..both..diary..other..” annotation column

Will need handcoding. For now only prints column

byRecord <- annotations %>% 
  group_by(Presence.of.investigator..yes.no.parent.is.investigator.half.half., Recording.method..static..portable..both..diary..other..) %>%
    summarise(n = n()) %>% 
      arrange(desc(n)) 
 
byRecord %>% 
  filter(!is.na(Presence.of.investigator..yes.no.parent.is.investigator.half.half.), !is.na(Recording.method..static..portable..both..diary..other..)) %>%
    rename( number_of_corpora = n)
## # A tibble: 38 x 3
## # Groups:   Presence.of.investigator..yes.no.parent.is.investigator.half.half.
## #   [9]
##    Presence.of.investigator..yes.… Recording.method..static..p… number_of_corpo…
##    <chr>                           <chr>                                   <int>
##  1 yes                             portable                                    7
##  2 yes                             static                                      5
##  3 yes                             audio                                       3
##  4 yes                             videotape                                   3
##  5 no                              audio recorded                              2
##  6 no                              portable                                    2
##  7 no                              static                                      2
##  8 no                              videotape                                   2
##  9 yes                             audio & videotape                           2
## 10 yes                             audiotape                                   2
## # … with 28 more rows
#lbls <- paste(byRecord$Presence.of.investigator..yes.no.parent.is.investigator.half.half., byRecord$Recording.method..static..portable..both..diary..other.., byRecord$n, sep=" & ") # add numbers to labels
#pie(byRecord$n, labels=lbls, border="white", col=myPalette)

21 corpora have NA in “Presence.of.investigator” column.

33 corpora have NA in “Recording.method” column.

Recording age

Based on “Mean.child.age.at.beginning.of.recordings, Mean.child.age.at.end.of.recordings” annotation columns

byAge <- annotations %>% 
 select(Mean.child.age.at.beginning.of.recordings, Mean.child.age.at.end.of.recordings, country, continent) %>%
  group_by(country, continent) %>%
    summarize(min = min(Mean.child.age.at.beginning.of.recordings), max = max(Mean.child.age.at.end.of.recordings)) %>%
       arrange(desc(min)) 


ggplot(byAge, aes(y=country))+
  geom_linerange(aes(xmin=min,xmax=max),linetype=2,color="blue")+
    geom_point(aes(x=min),size=3,color="red")+
      geom_point(aes(x=max),size=3,color="red")+
        ggtitle("Nim and Max age per country") +
          theme_bw()+
             facet_wrap(~continent, scale="free") 

Yes/No Tiers

Based on “CDS.annotation, English.translation, Morphological.annotation, Only.directed.to.child.transcribed, Correction.of.speech.errors” annotation columns

Mostly NAs. For now just prints column

group_select<-function(data_, column_){
data_ <- data_%>%
    group_by(!!column_) %>%
      select(Corpus, !!column_)
data_ <- data_[ order( data_[,3] ),]
return(data_)
}

byCdsTier <-group_select(annotations, "CDS.annotation..yes.no.")
byCdsTier <- byCdsTier %>% group_by(CDS.annotation..yes.no.) %>%  summarise(n = n())
head(byCdsTier)
## # A tibble: 3 x 2
##   CDS.annotation..yes.no.     n
##   <chr>                   <int>
## 1 no                         11
## 2 yes                         1
## 3 <NA>                       90
byannotTier <-group_select(annotations, "English.translation..yes.no")
byannotTier <- byannotTier %>% group_by(English.translation..yes.no) %>%  summarise(n = n())
head(byannotTier)
## # A tibble: 6 x 2
##   English.translation..yes.no                       n
##   <chr>                                         <int>
## 1 "-"                                               3
## 2 "- "                                              1
## 3 "no"                                             39
## 4 "no (from what I can see in the transcripts)"     5
## 5 "yes"                                             9
## 6  <NA>                                            45
bymorphTier <- group_select(annotations, "Morphological.annotation..yes.no.")
bymorphTier <- bymorphTier %>% group_by(Morphological.annotation..yes.no.) %>%  summarise(n = n())
head(bymorphTier)
## # A tibble: 3 x 2
##   Morphological.annotation..yes.no.     n
##   <chr>                             <int>
## 1 no                                   13
## 2 yes                                  28
## 3 <NA>                                 61
byCDStransTier <- group_select(annotations, "Only.directed.to.child.transcribed..i.e..speech.between.adults.not.transcribed...yes.no")
byCDStransTier <- byCDStransTier %>% group_by(Only.directed.to.child.transcribed..i.e..speech.between.adults.not.transcribed...yes.no) %>%  summarise(n = n())
head(byCDStransTier)
## # A tibble: 5 x 2
##   Only.directed.to.child.transcribed..i.e..speech.between.adults.not.tran…     n
##   <chr>                                                                    <int>
## 1 mostly                                                                       1
## 2 no                                                                           3
## 3 yes                                                                          3
## 4 yes (long conversations between adults were not transcribed)                 1
## 5 <NA>                                                                        94
byCorrections<- group_select(annotations, "Correction.of.speech.errors..i.e..syntactic...transcriptions.not.exactly.matching.speech...yes.no")
byCorrections <- byCorrections %>% group_by(Correction.of.speech.errors..i.e..syntactic...transcriptions.not.exactly.matching.speech...yes.no) %>%  summarise(n = n())
head(byCorrections)
## # A tibble: 3 x 2
##   Correction.of.speech.errors..i.e..syntactic...transcriptions.not.exactl…     n
##   <chr>                                                                    <int>
## 1 no                                                                           3
## 2 yes                                                                          8
## 3 <NA>                                                                        91

Weird variables

Based on “Fertility, Interbirth intervals, Schooling, Community, Healty, Culture, Mortality, Household…” annotation columns

Mostly NAs. For now just prints column

byFertility <- group_select(annotations, "Fertility.rate.of.community")
byFertility <- byFertility %>% group_by(Fertility.rate.of.community) %>%  summarise(n = n())
byFertility
## # A tibble: 2 x 2
##   Fertility.rate.of.community     n
## * <chr>                       <int>
## 1 no                              1
## 2 <NA>                          101
byInterbirth <- group_select(annotations, "Interbirth.intervals")
byInterbirth <- byInterbirth %>% group_by(Interbirth.intervals) %>%  summarise(n = n())
byInterbirth
## # A tibble: 3 x 2
##   Interbirth.intervals     n
## * <chr>                <int>
## 1 Na                       1
## 2 no                       1
## 3 <NA>                   100
bySchooling <- group_select(annotations, "Access.to.schooling.for.children..yes.no.only.elementary.")
bySchooling <- bySchooling %>% group_by(Access.to.schooling.for.children..yes.no.only.elementary.) %>%  summarise(n = n())
bySchooling
## # A tibble: 2 x 2
##   Access.to.schooling.for.children..yes.no.only.elementary.     n
## * <chr>                                                     <int>
## 1 yes                                                           8
## 2 <NA>                                                         94
byCommunity <- group_select(annotations, "Type.of.community..hunter.forager.herder.farmer.work.for.pay.industrial.")
byCommunity <- byCommunity %>% group_by(Type.of.community..hunter.forager.herder.farmer.work.for.pay.industrial.) %>%  summarise(n = n())
byCommunity
## # A tibble: 3 x 2
##   Type.of.community..hunter.forager.herder.farmer.work.for.pay.industrial.     n
## * <chr>                                                                    <int>
## 1 industrial                                                                   3
## 2 rural                                                                        1
## 3 <NA>                                                                        98
byHealth <- group_select(annotations, "Access.to.health.for.children...yes.no.")
byHealth <- byHealth %>% group_by(Access.to.health.for.children...yes.no.) %>%  summarise(n = n())
byHealth 
## # A tibble: 2 x 2
##   Access.to.health.for.children...yes.no.     n
## * <chr>                                   <int>
## 1 yes                                         2
## 2 <NA>                                      100
byCulture <- group_select(annotations, "Type.of.culture..minority..majority.")
byCulture <- byCulture %>% group_by(Type.of.culture..minority..majority.) %>%  summarise(n = n())
byCulture
## # A tibble: 4 x 2
##   Type.of.culture..minority..majority.     n
## * <chr>                                <int>
## 1 majority                                 1
## 2 majority?                                2
## 3 minority                                 2
## 4 <NA>                                    97
byMortality<- group_select(annotations, "Mortality.rate.of.children.in.the.community")
byMortality<- byMortality %>% group_by(Mortality.rate.of.children.in.the.community) %>%  summarise(n = n())
byMortality
## # A tibble: 1 x 2
##   Mortality.rate.of.children.in.the.community     n
## * <chr>                                       <int>
## 1 <NA>                                          102
byLangSpeakers<- group_select(annotations, "Number.of.speakers.of.the.language")
byLangSpeakers<- byLangSpeakers %>% group_by(Number.of.speakers.of.the.language) %>%  summarise(n = n())
byLangSpeakers
## # A tibble: 3 x 2
##   Number.of.speakers.of.the.language     n
## * <chr>                              <int>
## 1 4 million                              1
## 2 70% of population                      1
## 3 <NA>                                 100
byHousehold<- group_select(annotations, "Household.structure..nuclear..extended.")
byhousehold<- byHousehold %>% group_by(Household.structure..nuclear..extended.) %>%  summarise(n = n())
byhousehold
## # A tibble: 3 x 2
##   Household.structure..nuclear..extended.     n
## * <chr>                                   <int>
## 1 extended                                    1
## 2 nuclear                                    10
## 3 <NA>                                       91
byTimespent<- group_select(annotations, "Where.children.spend.their.time..home..nursery..playing.by.the.river....")
byTimespent<- byTimespent %>% group_by(Where.children.spend.their.time..home..nursery..playing.by.the.river....) %>%  summarise(n = n())
byTimespent
## # A tibble: 15 x 2
##    Where.children.spend.their.time..home..nursery..playing.by.the.river..…     n
##  * <chr>                                                                   <int>
##  1 "assistante maternelle francophone de 2;4 a 3;1 ans a mi-temps (20H pa…     1
##  2 "crèche, home"                                                              1
##  3 "crèche/grandmother 1 day a week"                                           1
##  4 "daycare"                                                                   1
##  5 "daycare ; grandparents/aunt "                                              1
##  6 "grandparents ; family with 2 children & a dog "                            1
##  7 "home, nursery"                                                             1
##  8 "home+dutch babysitter, and when school started: english speaking pres…     1
##  9 "kindergarten"                                                              1
## 10 "mother for the first year and then daycare 3h/day"                         1
## 11 "nursery"                                                                   1
## 12 "nursery school"                                                            1
## 13 "nursery school ; home with babysitter "                                    1
## 14 "playgroups & nursery school"                                               1
## 15  <NA>                                                                      88

Parental education, profession

Based on “Parental.education, Parental.profession” annotation columns

byParentEducation<- annotations[!is.na(annotations$Parental.education),] %>%
    group_by(lang, continent, Parental.education) %>%
      count() 

byParentEducation <- byParentEducation %>% 
  rename(
    number_of_corpora = n,
    language = lang,
    Parental.education.NAs_excluded = Parental.education)

byParentEducation
## # A tibble: 17 x 4
## # Groups:   language, continent, Parental.education.NAs_excluded [17]
##    language      continent     Parental.education.NAs_excluded  number_of_corpo…
##    <chr>         <chr>         <chr>                                       <int>
##  1 Basque        Europe        "Fathers : primary education ; …                1
##  2 catalan       Europe        "yes"                                           2
##  3 Czech         Europe        "high school ; university "                     1
##  4 dutch/english Europe        "university graduates"                          1
##  5 Estonian      Europe        "4 mothers w/ higher education …                1
##  6 French        Europe        "college degree"                                1
##  7 French        Europe        "university education"                          2
##  8 french/russi… Europe        "mother has 2 PhDs and father i…                1
##  9 German        Europe        "university degree or some kind…                1
## 10 Hebrew        Asia          "high education"                                1
## 11 Hebrew        Asia          "well-educated"                                 2
## 12 Hungarian/ca… Europe        "Both parents possess universit…                1
## 13 japanese      Asia          "Na"                                            1
## 14 Polish        Europe        "highly-educated"                               1
## 15 Spanish       Central and … "university"                                    1
## 16 Spanish       Europe        "higher degrees"                                1
## 17 Spanish       Europe        "higher education college degre…                1

102 corpora have NA for parental education.

byParentProfession<- annotations[!is.na(annotations$Parental.profession),] %>%
    group_by(lang, continent, Parental.profession) %>%
      count() 

byParentProfession <- byParentProfession %>% 
  rename(
    number_of_corpora = n,
    language = lang,
    Parental.profession.NAs_excluded = Parental.profession)

byParentProfession
## # A tibble: 25 x 4
## # Groups:   language, continent, Parental.profession.NAs_excluded [25]
##    language     continent      Parental.profession.NAs_excluded number_of_corpo…
##    <chr>        <chr>          <chr>                                       <int>
##  1 Basque       Europe         "Fathers : industrial operator …                1
##  2 Brazilian P… Central and S… "fahter linguist professor, mot…                1
##  3 Cantonese    Asia           "varied (mass transport company…                1
##  4 Cantonese-E… Asia           "varied"                                        1
##  5 catalan      Europe         "mother: researcher"                            2
##  6 Czech        Europe         "university scientist ; cultura…                1
##  7 Dutch        Europe         "researcher"                                    2
##  8 dutch/engli… Europe         "mother: part time free lance j…                1
##  9 English      Europe         "researcher"                                    1
## 10 English/Spa… Europe         "mother: linguist (investigator…                1
## # … with 15 more rows

102 corpora have NA for parental education.

Parental SES

Based on “Parental.ses” annotation column

byParentSES<- annotations[!is.na(annotations$Parental.socioeconomic.status),] %>%
    group_by(lang, continent, Parental.socioeconomic.status) %>%
      count() 

byParentSES <- byParentSES %>% 
  rename(
    number_of_corpora = n,
    language = lang,
    Parental.socioeconomic.status.NAs_excluded = Parental.socioeconomic.status)

byParentSES 
## # A tibble: 26 x 4
## # Groups:   language, continent, Parental.socioeconomic.status.NAs_excluded
## #   [26]
##    language       continent         Parental.socioeconomic.sta… number_of_corpo…
##    <chr>          <chr>             <chr>                                  <int>
##  1 Basque         Europe            medium                                     1
##  2 Brazilian Por… Central and Sout… upper-middle class                         1
##  3 Cantonese      Asia              working class                              1
##  4 catalan        Europe            middle class                               1
##  5 Dutch          Europe            lower-middle to middle-mid…                1
##  6 dutch/english  Europe            upper middle class                         1
##  7 English        Europe            middle-class                               1
##  8 English        Europe            predominantly middle-class                 1
##  9 English        Europe            upper working class                        1
## 10 English/Spani… Europe            middle class                               1
## # … with 16 more rows

102 corpora have NA for parental education.

Number of children in CHILDES

Based on “Number.of.participants” annotation column

Number of target children per language

byPart %>%      
  ggplot(mapping = aes(y = reorder(lang, numpar), numpar)) + 
    geom_point() + 
      ggtitle("Number of target children per language") + 
        geom_col()+ 
          facet_wrap(~Bilingualism.in.corpus..yes..no., scale="free") +
            labs(y= "Languages", x = "Number of target children")

byPart_nona <- byPart[!is.na(byPart$Bilingualism.in.corpus..yes..no.),]
byPart_mono <- byPart_nona %>%
  filter(Bilingualism.in.corpus..yes..no.=="Monolinguals")
byPart_bili <- byPart_nona %>%
  filter(Bilingualism.in.corpus..yes..no.=="Bilinguals")

head(byPart)
## # A tibble: 6 x 3
## # Groups:   lang [5]
##   lang                 Bilingualism.in.corpus..yes..no. numpar
##   <chr>                <chr>                             <dbl>
## 1 Afrikaan             <NA>                                  2
## 2 Arabic               <NA>                                 10
## 3 Basque               Bilinguals                           38
## 4 Basque               Monolinguals                          8
## 5 Brazilian Portuguese Monolinguals                          1
## 6 Cantonese            <NA>                                  8

0 corpora have NA in “Number of participants”. There is data from 153 target children in Monolingual corpora. There is data from 1318 target children in corpora which have not been classified as Monolingual/Bilingual. There is data from 215 target children in Bilingual corpora. Overall, the mean number of target children per language is 23.109589 with min 1 and max 475.

Based on “Number.of.participants” annotation column

Number of target children per continent

byPartCont <- annotations %>%
    group_by(continent) %>%
      select(Number.of.participants, continent) %>%
      summarise(numpar = sum(as.numeric(Number.of.participants), na.rm=TRUE)) 
## Warning in mask$eval_all_summarise(quo): NAs introduced by coercion
byPartCont %>%      
  ggplot(mapping = aes(y = reorder(continent, numpar), numpar)) + 
      ggtitle("Number of target children per continent") + 
        geom_col()+
           labs(y= "Continents", x = "Number of target children")

head(byPartCont)
## # A tibble: 6 x 2
##   continent                 numpar
##   <chr>                      <dbl>
## 1 Africa                        14
## 2 Asia                         274
## 3 Central and South America      5
## 4 Europe                       301
## 5 North America                 42
## 6 <NA>                           4

301 target children in Europe, 14 in Africa, 42 in North America, 274in Asia, 5 in Central and South America.