https://rpubs.com/gloukatou/733656
library(childesr)
library(wordbankr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
require(scales)
library(RColorBrewer)
# Shared colour palette used by the pie charts further down.
myPalette <- brewer.pal(n = 5, name = "Set2")

# TODO: the first part is the descriptive section: to what extent the full
#   range of populations is represented, and which codes are often missing.
# - Descriptive section and representation of the population.
# - Systematic review of features.
# - Confront e.g. the proportion of the child population with the proportion
#   of data, e.g. by continent.
# - Second goal: point out strengths and limitations of the current CHILDES
#   and explain what is missing.

# One annotation sheet per annotator, stacked into a single table whose
# `Language` column is renamed to `lang`.
annotations1 <- read.csv("Childes_corpora-Clara1.csv")
annotations2 <- read.csv("Childes_corpora-Camila.csv")
annotations3 <- read.csv("Childes_corpora-Lara.csv")
annotations <- rbind(annotations1, annotations2, annotations3) %>%
  rename(lang = Language)

# Replace the yes/no codes of the bilingualism column with readable labels.
annotations$Bilingualism.in.corpus..yes..no. <- replace(
  annotations$Bilingualism.in.corpus..yes..no.,
  annotations$Bilingualism.in.corpus..yes..no. == "yes",
  "Bilinguals"
)
annotations$Bilingualism.in.corpus..yes..no. <- replace(
  annotations$Bilingualism.in.corpus..yes..no.,
  annotations$Bilingualism.in.corpus..yes..no. == "no",
  "Monolinguals"
)

# Only keep annotated corpora, i.e. rows whose Inclusion cell is not empty.
annotations <- filter(annotations, Inclusion != "")

# Turn every cell holding one of the "not available" spellings into NA.
for (na_token in c("", "N/A", "N/A (0,33)", "N/A (0)", "NA (0)")) {
  annotations[annotations == na_token] <- NA
}
# head(annotations)
253 corpora already annotated.
# Fetch the transcript table and the speaker statistics table (these look like
# the childesr accessors -- the package is loaded at the top of the script).
d_transcripts <- get_transcripts()
# NOTE(review): d_statistics is not used anywhere in the visible script.
d_statistics <- get_speaker_statistics()
# Attach the transcript table to the annotations. No `by` is given, so the
# join uses whichever column names the two tables have in common.
mergedChiAnnot <- annotations %>% left_join(d_transcripts)
# NOTE(review): n_utt_d and n_utt_speaker are not defined in the visible part
# of this script -- confirm they are created before these two joins run.
mergedChiAnnot <- mergedChiAnnot %>% left_join(n_utt_d)
mergedChiAnnot <- mergedChiAnnot %>% left_join(n_utt_speaker)
# Drop the rows whose n_utt is missing or empty.
mergedChiAnnot<-mergedChiAnnot[!(is.na(mergedChiAnnot$n_utt) | mergedChiAnnot$n_utt=="" ), ]
# If the merged table holds fewer distinct corpus names than there are
# annotation rows, list the annotated corpus names absent from the merge.
if ( length(unique(mergedChiAnnot$corpus_name)) < nrow(annotations) ) {
annotations$corpus_name[!annotations$corpus_name %in% mergedChiAnnot$corpus_name]
} #print non merged corpora
# Number of corpora for each inclusion decision, drawn as horizontal bars.
inclu_data <- count(group_by(annotations, Inclusion))
ggplot(data = inclu_data, mapping = aes(x = n, y = Inclusion)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  theme_minimal() +
  ggtitle("Corpus inclusion")

# From here on, work only with the corpora marked as included.
annotations <- filter(annotations, Inclusion == "yes")
Of the total number of corpora, 141 have been included in this study. The following analyses are carried out only on these included corpora.
# Tally the rows of `data_` for each distinct value of `column_`.
#
# data_   : table to summarise.
# column_ : grouping column, given as a bare (unquoted) name.
# Returns the grouped tibble produced by count(), with the tally in `n`.
group_count <- function(data_, column_) {
  count(group_by(data_, {{ column_ }}))
}
# Corpora per bilingualism status, shown as a pie chart with counts in the
# slice labels.
byBilinguals <- annotations %>%
  group_count(Bilingualism.in.corpus..yes..no.)
lbls <- paste(byBilinguals$Bilingualism.in.corpus..yes..no., byBilinguals$n)
pie(x = byBilinguals$n, labels = lbls, col = myPalette, border = "white")
head(byBilinguals)
## # A tibble: 4 x 2
## # Groups: Bilingualism.in.corpus..yes..no. [4]
## Bilingualism.in.corpus..yes..no. n
## <chr> <int>
## 1 Bilinguals 27
## 2 Monolinguals 29
## 3 trilinguism 1
## 4 <NA> 84
84 corpora have NA in “Bilingualism in corpus”.
# Total number of participants per language and bilingualism status.
byPart <- summarise(
  group_by(annotations, lang, Bilingualism.in.corpus..yes..no.),
  numpar = sum(as.numeric(Number.of.participants), na.rm = TRUE)
)

# Per-language total of children with siblings. The sum has no na.rm, so it
# is NA as soon as one corpus of that language has no value.
bySibl <- summarise(
  group_by(annotations, lang),
  numChildrenwithSiblings = sum(as.numeric(X..of.children.with.siblings))
)

# Plot only the languages whose total is not NA.
bySibl %>%
  filter(!is.na(numChildrenwithSiblings)) %>%
  ggplot(mapping = aes(x = numChildrenwithSiblings, y = lang)) +
  geom_point() +
  ggtitle("Languages where number of children with siblings is not unknown") +
  geom_col()
head(bySibl)
## # A tibble: 6 x 2
## lang numChildrenwithSiblings
## <chr> <dbl>
## 1 Afrikaan 1
## 2 Arabic NA
## 3 Basque NA
## 4 Brazilian Portuguese NA
## 5 Cantonese 4
## 6 Cantonese-English 6
Out of 1687 total target children, only 49 have siblings. No information for children in 65 languages (marked as NAs).
# Some annotators wrote decimals with a comma (e.g. "0,5"). as.numeric() turns
# such values into NA, and the tally below would count "0,5" and "0.5" as two
# different groups, so normalise the decimal mark before anything else.
annotations$Average.number.of.siblings <- sub(
  ",", ".", as.character(annotations$Average.number.of.siblings),
  fixed = TRUE
)

# Number of corpora for each reported average number of siblings.
bySiblnum <- annotations %>%
  group_by(Average.number.of.siblings) %>%
  count()
ggplot(bySiblnum, mapping = aes(x = Average.number.of.siblings, y = n)) +
  geom_point() +
  ggtitle("Average number of siblings per corpus") +
  geom_col()
#ggplot(bySiblnum[!is.na(bySiblnum$Average.number.of.siblings),], mapping = aes(x=Average.number.of.siblings, y=n))

# Keep the column numeric from here on. This happens after the tally, so the
# plot above still uses the values as discrete labels.
annotations$Average.number.of.siblings <- as.numeric(annotations$Average.number.of.siblings)
head(bySiblnum)
## # A tibble: 6 x 2
## # Groups: Average.number.of.siblings [6]
## Average.number.of.siblings n
## <chr> <int>
## 1 0 10
## 2 0.25 1
## 3 0.33 1
## 4 0.66 1
## 5 0.75 1
## 6 0.875 1
The mean number of siblings has been left NA for 88 out of 141 corpora. In the remaining corpora, the mean number of siblings per corpus is 0.992451 with min 0 and max 2.
# This column mixes decimal commas and decimal points ("0,5" and "0.5").
# Left as is, the two spellings are tallied as separate groups and as.numeric()
# turns the comma form into NA, so normalise the decimal mark first.
annotations$Average.number.of.older.siblings <- sub(
  ",", ".", as.character(annotations$Average.number.of.older.siblings),
  fixed = TRUE
)

# Number of corpora for each reported average number of older siblings.
byoldSiblnum <- annotations %>%
  group_by(Average.number.of.older.siblings) %>%
  count()
ggplot(byoldSiblnum, mapping = aes(x = Average.number.of.older.siblings, y = n)) +
  geom_point() +
  ggtitle("Average number of older siblings per corpus") +
  geom_col()

# Keep the column numeric from here on. This happens after the tally, so the
# plot above still uses the values as discrete labels.
annotations$Average.number.of.older.siblings <- as.numeric(annotations$Average.number.of.older.siblings)
head(byoldSiblnum)
## # A tibble: 6 x 2
## # Groups: Average.number.of.older.siblings [6]
## Average.number.of.older.siblings n
## <chr> <int>
## 1 0 27
## 2 0,5 3
## 3 0,6 1
## 4 0.25 1
## 5 0.3 2
## 6 0.5 1
The mean number of older siblings has been left NA for 89 out of 141 corpora. In the remaining corpora, the mean number of older siblings per corpus is 0.4288043 with min NA and max 2.
# NOTE(review): this permanently removes every corpus without a
# sessions-per-child value from `annotations`, so all later sections run on
# this subset -- confirm that is intended and not a filter meant for bySess only.
annotations <- annotations[!is.na(annotations$Mean...sessions.per.child), ]

# Estimated number of sessions per language and bilingualism status:
# participants x mean sessions per child, summed over the corpora.
bySess <- annotations
# Accept a decimal comma before converting to numeric.
bySess$Mean...sessions.per.child <- sub(",", ".", bySess$Mean...sessions.per.child)
bySess$Mean...sessions.per.child <- as.numeric(bySess$Mean...sessions.per.child)
bySess <- bySess %>%
  mutate(sessions = as.numeric(Number.of.participants) * Mean...sessions.per.child) %>%
  group_by(lang, Bilingualism.in.corpus..yes..no.) %>%
  summarise(numses = sum(as.numeric(sessions)))

# `scales` is spelled out in full; the previous `scale =` only worked through
# partial argument matching.
ggplot(bySess, mapping = aes(x = numses, y = reorder(lang, numses))) +
  geom_point() +
  ggtitle("Number of sessions per language") +
  geom_col() +
  facet_wrap(~Bilingualism.in.corpus..yes..no., scales = "free")
# Derive `continent` and `country` from the free-text location column.
#
# Each entry below maps a label to a regular expression (alternatives joined
# by "|"). Entries are applied in order, so when a location matches several
# entries the LAST matching one wins (e.g. "Sweden ; Portugal" ends up as
# "Portugal"). Locations matching no entry stay NA.
# The patterns are regular expressions, not literal strings: the parenthesised
# "France (...)" alternative is read as a regex group; such rows are still
# caught by the plain "France" alternative.

# Label each element of `text` with the name of the last pattern it matches.
label_by_pattern <- function(text, patterns) {
  labels <- rep(NA_character_, length(text))
  for (label in names(patterns)) {
    labels[grep(patterns[[label]], text)] <- label
  }
  labels
}

continent_patterns <- c(
  "Europe" = "England|Sweden|Spain|Madrid, Spain ; Tenerife, Canary Islands|Madrid, Spain|Navarra, Spain|Estonia|Tartu, Estonia ; Rapla, Estonia|Budapest, Hungary|West Coast, Ireland|Arfon area Gwynedd, North Wales|Netherlands|Belfast, Northern Ireland|Nottingham/Manchester, England|Sweden ; Portugal|Czech Republic|Bucharest, Romania|Postdam, Germany|Stuttgart, Germany|France|Alt penedes, region of catalonia|spain|Naples|Spain, Hungary, Lloret de mar|Spain, Salamanca|England, Brighton|France (Normandy, Marseille, + places visited)|Portugal|italy|Barcelona|Switzerland|Stockholm, Sweden|Iceland|Poznań, Poland|Romania|Antwerp, Belgium|Ireland|Hungary|Athens, Greece|Southern Estonia|Tartu, Estonia|A Coruña, Santiago de Compostela ; Lugo ; Pontevedra, A Estrada ; SPAIN|Moscow, Russia|Spain, Salamanca|Wales",
  "Central and South America" = "Rio Cuarto, Cordoba, Argentina|Sao Paulo|Patzcuaro, Michoacan, iMexico|Jamaica",
  "Asia" = "Iran|Israel|Bombay, India|China|Bangkok, Thailand|Tokyo area|Nagoya area|osaka area|Kuwait|Yahud, Israel|Nagoya center|Kusatsu City, Shiga Pref|Hong-Kong|Taiwan|Turkey",
  "North America" = "Michigan, USA|USA, Northern Virginia|Canada|California, USA|washington dc|United States|USA",
  "Africa" = "Alexandria, Egypt|Mokhotlong, Lesotho"
)

country_patterns <- c(
  # "Netherlands" is recognised by the continent mapping but previously had no
  # country entry, leaving those corpora with country NA. It is listed first
  # so that every pre-existing entry still takes precedence over it.
  "Netherlands" = "Netherlands",
  "UK/Ireland" = "England|West Coast, Ireland|Arfon area Gwynedd, North Wales|Belfast, Northern Ireland|Nottingham/Manchester, England|England, Brighton|Ireland|Wales",
  "Sweden" = "Sweden|Sweden ; Portugal|Stockholm, Sweden",
  "Spain" = "Spain|Madrid, Spain ; Tenerife, Canary Islands|Madrid, Spain|Navarra, Spain|Alt penedes, region of catalonia|spain|Spain, Hungary, Lloret de mar|Spain, Salamanca|Barcelona|A Coruña, Santiago de Compostela ; Lugo ; Pontevedra, A Estrada ; SPAIN|Spain, Salamanca",
  "Estonia" = "Estonia|Tartu, Estonia ; Rapla, Estonia|Southern Estonia|Tartu, Estonia",
  "Hungary" = "Budapest, Hungary|Hungary|Spain, Hungary, Lloret de mar",
  "Portugal" = "Sweden ; Portugal|Portugal",
  "Czech Republic" = "Czech Republic",
  "Romania" = "Bucharest, Romania|Romania",
  "Germany" = "Postdam, Germany|Stuttgart, Germany",
  "France" = "France|France (Normandy, Marseille, + places visited)",
  "Italy" = "Naples|italy",
  "Switzerland" = "Switzerland",
  "Iceland" = "Iceland",
  "Poland" = "Poznań, Poland",
  "Belgium" = "Antwerp, Belgium",
  "Greece" = "Athens, Greece",
  "Russia" = "Moscow, Russia",
  "Argentina" = "Rio Cuarto, Cordoba, Argentina",
  "Brazil" = "Sao Paulo",
  "Mexico" = "Patzcuaro, Michoacan, iMexico",
  "Jamaica" = "Jamaica",
  "Iran" = "Iran",
  "Israel" = "Israel|Yahud, Israel",
  "India" = "Bombay, India",
  "China" = "China",
  "Thailand" = "Bangkok, Thailand",
  "Japan" = "Tokyo area|Nagoya area|osaka area|Nagoya center|Kusatsu City, Shiga Pref",
  "Kuwait" = "Kuwait",
  "Hong-Kong" = "Hong-Kong",
  "Taiwan" = "Taiwan",
  "Turkey" = "Turkey",
  "USA" = "Michigan, USA|USA, Northern Virginia|California, USA|washington dc|United States|USA",
  "Canada" = "Canada",
  "Egypt" = "Alexandria, Egypt",
  "Lesotho" = "Mokhotlong, Lesotho"
)

annotations$continent <- label_by_pattern(
  annotations$Neighbour.village.city.province.state.country,
  continent_patterns
)
annotations$country <- label_by_pattern(
  annotations$Neighbour.village.city.province.state.country,
  country_patterns
)
# Number of corpora per country, with one facet per continent.
byLocation <- annotations %>%
  select(country, continent) %>%
  group_by(country, continent) %>%
  count()

# `scales` is spelled out in full; the previous `scale =` only worked through
# partial argument matching.
ggplot(byLocation, aes(x = n, y = country)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  ggtitle("Corpora by Countries and Continents") +
  facet_wrap(~continent, scales = "free")
head(byLocation)
## # A tibble: 6 x 3
## # Groups: country, continent [6]
## country continent n
## <chr> <chr> <int>
## 1 Argentina Central and South America 1
## 2 Belgium Europe 2
## 3 Brazil Central and South America 1
## 4 Canada North America 2
## 5 China Asia 3
## 6 Czech Republic Europe 1
36 countries and 5 continents are represented in CHILDES. 68 of the 102 corpora are European, and 10 of the corpora are North American.
# Corpora per continent as a pie chart, with counts in the slice labels.
byLocation1 <- annotations %>%
  group_count(continent)
lbls <- paste(byLocation1$continent, byLocation1$n)
pie(
  x = byLocation1$n,
  labels = lbls,
  col = myPalette,
  border = "white",
  main = "Corpora by continent"
)
head(byLocation1)
## # A tibble: 6 x 2
## # Groups: continent [6]
## continent n
## <chr> <int>
## 1 Africa 2
## 2 Asia 20
## 3 Central and South America 4
## 4 Europe 66
## 5 North America 8
## 6 <NA> 2
# Corpora per place of recording as a pie chart, with counts in the labels.
byPlaceofRecording <- annotations %>%
  group_count(Place.of.recordings...home..nursery....)
lbls <- paste(
  byPlaceofRecording$Place.of.recordings...home..nursery....,
  byPlaceofRecording$n
)
pie(x = byPlaceofRecording$n, labels = lbls, col = myPalette, border = "white")
head(byPlaceofRecording)
## # A tibble: 6 x 2
## # Groups: Place.of.recordings...home..nursery.... [6]
## Place.of.recordings...home..nursery.... n
## <chr> <int>
## 1 6 in nursery, 3 at home 1
## 2 home 69
## 3 home (mostly) 1
## 4 home (TYA) ; home of mother, of paternal grandparents or maternal grand… 1
## 5 home or daycare 1
## 6 home or kindergarten 1
# Number of corpora for every combination of investigator presence and
# recording method, most frequent combination first.
byRecord <- arrange(
  summarise(
    group_by(
      annotations,
      Presence.of.investigator..yes.no.parent.is.investigator.half.half.,
      Recording.method..static..portable..both..diary..other..
    ),
    n = n()
  ),
  desc(n)
)

# Show only the combinations where both fields are filled in.
byRecord %>%
  filter(
    !(is.na(Presence.of.investigator..yes.no.parent.is.investigator.half.half.) |
      is.na(Recording.method..static..portable..both..diary..other..))
  ) %>%
  rename(number_of_corpora = n)
## # A tibble: 38 x 3
## # Groups: Presence.of.investigator..yes.no.parent.is.investigator.half.half.
## # [9]
## Presence.of.investigator..yes.… Recording.method..static..p… number_of_corpo…
## <chr> <chr> <int>
## 1 yes portable 7
## 2 yes static 5
## 3 yes audio 3
## 4 yes videotape 3
## 5 no audio recorded 2
## 6 no portable 2
## 7 no static 2
## 8 no videotape 2
## 9 yes audio & videotape 2
## 10 yes audiotape 2
## # … with 28 more rows
#lbls <- paste(byRecord$Presence.of.investigator..yes.no.parent.is.investigator.half.half., byRecord$Recording.method..static..portable..both..diary..other.., byRecord$n, sep=" & ") # add numbers to labels
#pie(byRecord$n, labels=lbls, border="white", col=myPalette)
21 corpora have NA in “Presence.of.investigator” column.
33 corpora have NA in “Recording.method” column.
# Per country: the smallest mean age at the beginning of recordings and the
# largest mean age at the end of recordings, drawn as a range per country and
# faceted by continent.
# min()/max() are called without na.rm, so one missing age makes the country's
# value NA.
# NOTE(review): if the age columns are read as character, min()/max() compare
# them as strings -- confirm they are numeric.
byAge <- annotations %>%
  select(Mean.child.age.at.beginning.of.recordings, Mean.child.age.at.end.of.recordings, country, continent) %>%
  group_by(country, continent) %>%
  summarize(
    min = min(Mean.child.age.at.beginning.of.recordings),
    max = max(Mean.child.age.at.end.of.recordings)
  ) %>%
  arrange(desc(min))

# Title typo fixed ("Nim" -> "Min"); `scales` is spelled out in full rather
# than relying on partial argument matching of `scale`.
ggplot(byAge, aes(y = country)) +
  geom_linerange(aes(xmin = min, xmax = max), linetype = 2, color = "blue") +
  geom_point(aes(x = min), size = 3, color = "red") +
  geom_point(aes(x = max), size = 3, color = "red") +
  ggtitle("Min and Max age per country") +
  theme_bw() +
  facet_wrap(~continent, scales = "free")
# Keep the corpus identifier and one annotation column, grouped and sorted by
# that annotation column.
#
# data_   : table with a `Corpus` column and the column named by `column_`.
# column_ : name of the annotation column, given as a string.
# Returns a grouped tibble with the columns `Corpus` and `column_`, rows
# ordered by `column_` (NA last).
group_select <- function(data_, column_) {
  # A string injected with `!!` is taken by group_by() as a constant value, not
  # as a column reference: the data used to be grouped by a one-value dummy
  # column, and the sort relied on that dummy being column 1 (`data_[, 3]`).
  # Converting the name to a symbol makes group_by() use the real column.
  column_sym <- as.name(column_)
  data_ <- data_ %>%
    group_by(!!column_sym) %>%
    select(Corpus, !!column_sym)
  # Sort by name instead of by position.
  data_[order(data_[[column_]]), ]
}
# Number of corpora for each value of the CDS annotation column.
byCdsTier <- annotations %>%
  group_select("CDS.annotation..yes.no.") %>%
  group_by(CDS.annotation..yes.no.) %>%
  summarise(n = n())
head(byCdsTier)
## # A tibble: 3 x 2
## CDS.annotation..yes.no. n
## <chr> <int>
## 1 no 11
## 2 yes 1
## 3 <NA> 90
# Number of corpora for each value of the English translation column.
byannotTier <- annotations %>%
  group_select("English.translation..yes.no") %>%
  group_by(English.translation..yes.no) %>%
  summarise(n = n())
head(byannotTier)
## # A tibble: 6 x 2
## English.translation..yes.no n
## <chr> <int>
## 1 "-" 3
## 2 "- " 1
## 3 "no" 39
## 4 "no (from what I can see in the transcripts)" 5
## 5 "yes" 9
## 6 <NA> 45
# Number of corpora for each value of the morphological annotation column.
bymorphTier <- annotations %>%
  group_select("Morphological.annotation..yes.no.") %>%
  group_by(Morphological.annotation..yes.no.) %>%
  summarise(n = n())
head(bymorphTier)
## # A tibble: 3 x 2
## Morphological.annotation..yes.no. n
## <chr> <int>
## 1 no 13
## 2 yes 28
## 3 <NA> 61
# Number of corpora for each value of the "only child-directed speech
# transcribed" column.
byCDStransTier <- annotations %>%
  group_select("Only.directed.to.child.transcribed..i.e..speech.between.adults.not.transcribed...yes.no") %>%
  group_by(Only.directed.to.child.transcribed..i.e..speech.between.adults.not.transcribed...yes.no) %>%
  summarise(n = n())
head(byCDStransTier)
## # A tibble: 5 x 2
## Only.directed.to.child.transcribed..i.e..speech.between.adults.not.tran… n
## <chr> <int>
## 1 mostly 1
## 2 no 3
## 3 yes 3
## 4 yes (long conversations between adults were not transcribed) 1
## 5 <NA> 94
# Number of corpora for each value of the "correction of speech errors" column.
byCorrections <- annotations %>%
  group_select("Correction.of.speech.errors..i.e..syntactic...transcriptions.not.exactly.matching.speech...yes.no") %>%
  group_by(Correction.of.speech.errors..i.e..syntactic...transcriptions.not.exactly.matching.speech...yes.no) %>%
  summarise(n = n())
head(byCorrections)
## # A tibble: 3 x 2
## Correction.of.speech.errors..i.e..syntactic...transcriptions.not.exactl… n
## <chr> <int>
## 1 no 3
## 2 yes 8
## 3 <NA> 91
# Number of corpora for each value of the community fertility rate column.
byFertility <- annotations %>%
  group_select("Fertility.rate.of.community") %>%
  group_by(Fertility.rate.of.community) %>%
  summarise(n = n())
byFertility
## # A tibble: 2 x 2
## Fertility.rate.of.community n
## * <chr> <int>
## 1 no 1
## 2 <NA> 101
# Number of corpora for each value of the interbirth intervals column.
byInterbirth <- annotations %>%
  group_select("Interbirth.intervals") %>%
  group_by(Interbirth.intervals) %>%
  summarise(n = n())
byInterbirth
## # A tibble: 3 x 2
## Interbirth.intervals n
## * <chr> <int>
## 1 Na 1
## 2 no 1
## 3 <NA> 100
# Number of corpora for each value of the access-to-schooling column.
bySchooling <- annotations %>%
  group_select("Access.to.schooling.for.children..yes.no.only.elementary.") %>%
  group_by(Access.to.schooling.for.children..yes.no.only.elementary.) %>%
  summarise(n = n())
bySchooling
## # A tibble: 2 x 2
## Access.to.schooling.for.children..yes.no.only.elementary. n
## * <chr> <int>
## 1 yes 8
## 2 <NA> 94
# Number of corpora for each value of the type-of-community column.
byCommunity <- annotations %>%
  group_select("Type.of.community..hunter.forager.herder.farmer.work.for.pay.industrial.") %>%
  group_by(Type.of.community..hunter.forager.herder.farmer.work.for.pay.industrial.) %>%
  summarise(n = n())
byCommunity
## # A tibble: 3 x 2
## Type.of.community..hunter.forager.herder.farmer.work.for.pay.industrial. n
## * <chr> <int>
## 1 industrial 3
## 2 rural 1
## 3 <NA> 98
# Number of corpora for each value of the access-to-health column.
byHealth <- annotations %>%
  group_select("Access.to.health.for.children...yes.no.") %>%
  group_by(Access.to.health.for.children...yes.no.) %>%
  summarise(n = n())
byHealth
## # A tibble: 2 x 2
## Access.to.health.for.children...yes.no. n
## * <chr> <int>
## 1 yes 2
## 2 <NA> 100
# Number of corpora for each value of the type-of-culture column.
byCulture <- annotations %>%
  group_select("Type.of.culture..minority..majority.") %>%
  group_by(Type.of.culture..minority..majority.) %>%
  summarise(n = n())
byCulture
## # A tibble: 4 x 2
## Type.of.culture..minority..majority. n
## * <chr> <int>
## 1 majority 1
## 2 majority? 2
## 3 minority 2
## 4 <NA> 97
# Number of corpora for each value of the child mortality rate column.
byMortality <- annotations %>%
  group_select("Mortality.rate.of.children.in.the.community") %>%
  group_by(Mortality.rate.of.children.in.the.community) %>%
  summarise(n = n())
byMortality
## # A tibble: 1 x 2
## Mortality.rate.of.children.in.the.community n
## * <chr> <int>
## 1 <NA> 102
# Number of corpora for each value of the number-of-speakers column.
byLangSpeakers <- annotations %>%
  group_select("Number.of.speakers.of.the.language") %>%
  group_by(Number.of.speakers.of.the.language) %>%
  summarise(n = n())
byLangSpeakers
## # A tibble: 3 x 2
## Number.of.speakers.of.the.language n
## * <chr> <int>
## 1 4 million 1
## 2 70% of population 1
## 3 <NA> 100
# Number of corpora for each value of the household structure column.
# Two objects are kept: `byHousehold` holds the selected rows and
# `byhousehold` (lower-case h) holds the tally.
byHousehold <- group_select(annotations, "Household.structure..nuclear..extended.")
byhousehold <- summarise(
  group_by(byHousehold, Household.structure..nuclear..extended.),
  n = n()
)
byhousehold
## # A tibble: 3 x 2
## Household.structure..nuclear..extended. n
## * <chr> <int>
## 1 extended 1
## 2 nuclear 10
## 3 <NA> 91
# Number of corpora for each value of the "where children spend their time"
# column.
byTimespent <- annotations %>%
  group_select("Where.children.spend.their.time..home..nursery..playing.by.the.river....") %>%
  group_by(Where.children.spend.their.time..home..nursery..playing.by.the.river....) %>%
  summarise(n = n())
byTimespent
## # A tibble: 15 x 2
## Where.children.spend.their.time..home..nursery..playing.by.the.river..… n
## * <chr> <int>
## 1 "assistante maternelle francophone de 2;4 a 3;1 ans a mi-temps (20H pa… 1
## 2 "crèche, home" 1
## 3 "crèche/grandmother 1 day a week" 1
## 4 "daycare" 1
## 5 "daycare ; grandparents/aunt " 1
## 6 "grandparents ; family with 2 children & a dog " 1
## 7 "home, nursery" 1
## 8 "home+dutch babysitter, and when school started: english speaking pres… 1
## 9 "kindergarten" 1
## 10 "mother for the first year and then daycare 3h/day" 1
## 11 "nursery" 1
## 12 "nursery school" 1
## 13 "nursery school ; home with babysitter " 1
## 14 "playgroups & nursery school" 1
## 15 <NA> 88
# Corpora with a known parental education, counted per language, continent
# and education description; columns are renamed for display.
byParentEducation <- annotations %>%
  filter(!is.na(Parental.education)) %>%
  group_by(lang, continent, Parental.education) %>%
  count() %>%
  rename(
    number_of_corpora = n,
    language = lang,
    Parental.education.NAs_excluded = Parental.education
  )
byParentEducation
## # A tibble: 17 x 4
## # Groups: language, continent, Parental.education.NAs_excluded [17]
## language continent Parental.education.NAs_excluded number_of_corpo…
## <chr> <chr> <chr> <int>
## 1 Basque Europe "Fathers : primary education ; … 1
## 2 catalan Europe "yes" 2
## 3 Czech Europe "high school ; university " 1
## 4 dutch/english Europe "university graduates" 1
## 5 Estonian Europe "4 mothers w/ higher education … 1
## 6 French Europe "college degree" 1
## 7 French Europe "university education" 2
## 8 french/russi… Europe "mother has 2 PhDs and father i… 1
## 9 German Europe "university degree or some kind… 1
## 10 Hebrew Asia "high education" 1
## 11 Hebrew Asia "well-educated" 2
## 12 Hungarian/ca… Europe "Both parents possess universit… 1
## 13 japanese Asia "Na" 1
## 14 Polish Europe "highly-educated" 1
## 15 Spanish Central and … "university" 1
## 16 Spanish Europe "higher degrees" 1
## 17 Spanish Europe "higher education college degre… 1
102 corpora have NA for parental education.
# Corpora with a known parental profession, counted per language, continent
# and profession description; columns are renamed for display.
byParentProfession <- annotations %>%
  filter(!is.na(Parental.profession)) %>%
  group_by(lang, continent, Parental.profession) %>%
  count() %>%
  rename(
    number_of_corpora = n,
    language = lang,
    Parental.profession.NAs_excluded = Parental.profession
  )
byParentProfession
## # A tibble: 25 x 4
## # Groups: language, continent, Parental.profession.NAs_excluded [25]
## language continent Parental.profession.NAs_excluded number_of_corpo…
## <chr> <chr> <chr> <int>
## 1 Basque Europe "Fathers : industrial operator … 1
## 2 Brazilian P… Central and S… "fahter linguist professor, mot… 1
## 3 Cantonese Asia "varied (mass transport company… 1
## 4 Cantonese-E… Asia "varied" 1
## 5 catalan Europe "mother: researcher" 2
## 6 Czech Europe "university scientist ; cultura… 1
## 7 Dutch Europe "researcher" 2
## 8 dutch/engli… Europe "mother: part time free lance j… 1
## 9 English Europe "researcher" 1
## 10 English/Spa… Europe "mother: linguist (investigator… 1
## # … with 15 more rows
102 corpora have NA for parental profession.
# Corpora with a known parental socioeconomic status, counted per language,
# continent and status description; columns are renamed for display.
byParentSES <- annotations %>%
  filter(!is.na(Parental.socioeconomic.status)) %>%
  group_by(lang, continent, Parental.socioeconomic.status) %>%
  count() %>%
  rename(
    number_of_corpora = n,
    language = lang,
    Parental.socioeconomic.status.NAs_excluded = Parental.socioeconomic.status
  )
byParentSES
## # A tibble: 26 x 4
## # Groups: language, continent, Parental.socioeconomic.status.NAs_excluded
## # [26]
## language continent Parental.socioeconomic.sta… number_of_corpo…
## <chr> <chr> <chr> <int>
## 1 Basque Europe medium 1
## 2 Brazilian Por… Central and Sout… upper-middle class 1
## 3 Cantonese Asia working class 1
## 4 catalan Europe middle class 1
## 5 Dutch Europe lower-middle to middle-mid… 1
## 6 dutch/english Europe upper middle class 1
## 7 English Europe middle-class 1
## 8 English Europe predominantly middle-class 1
## 9 English Europe upper working class 1
## 10 English/Spani… Europe middle class 1
## # … with 16 more rows
102 corpora have NA for parental socioeconomic status.
# Number of target children per language, one facet per bilingualism status.
# `scales` is spelled out in full (the previous `scale =` only worked through
# partial argument matching) and the x aesthetic is named explicitly.
byPart %>%
  ggplot(mapping = aes(x = numpar, y = reorder(lang, numpar))) +
  geom_point() +
  ggtitle("Number of target children per language") +
  geom_col() +
  facet_wrap(~Bilingualism.in.corpus..yes..no., scales = "free") +
  labs(y = "Languages", x = "Number of target children")

# Subsets of the per-language totals: rows with a known bilingualism status,
# then the monolingual and the bilingual rows separately.
byPart_nona <- byPart[!is.na(byPart$Bilingualism.in.corpus..yes..no.), ]
byPart_mono <- byPart_nona %>%
  filter(Bilingualism.in.corpus..yes..no. == "Monolinguals")
byPart_bili <- byPart_nona %>%
  filter(Bilingualism.in.corpus..yes..no. == "Bilinguals")
head(byPart)
## # A tibble: 6 x 3
## # Groups: lang [5]
## lang Bilingualism.in.corpus..yes..no. numpar
## <chr> <chr> <dbl>
## 1 Afrikaan <NA> 2
## 2 Arabic <NA> 10
## 3 Basque Bilinguals 38
## 4 Basque Monolinguals 8
## 5 Brazilian Portuguese Monolinguals 1
## 6 Cantonese <NA> 8
0 corpora have NA in “Number of participants”. There is data from 153 target children in Monolingual corpora. There is data from 1318 target children in corpora which have not been classified as Monolingual/Bilingual. There is data from 215 target children in Bilingual corpora. Overall, the mean number of target children per language is 23.109589 with min 1 and max 475.
# Total number of participants per continent; values that cannot be read as
# numbers become NA and are left out of the sum.
byPartCont <- annotations %>%
  select(Number.of.participants, continent) %>%
  group_by(continent) %>%
  summarise(numpar = sum(as.numeric(Number.of.participants), na.rm = TRUE))
## Warning in mask$eval_all_summarise(quo): NAs introduced by coercion
# Bar chart of target children per continent, continents ordered by total.
ggplot(byPartCont, mapping = aes(x = numpar, y = reorder(continent, numpar))) +
  ggtitle("Number of target children per continent") +
  geom_col() +
  labs(y = "Continents", x = "Number of target children")
head(byPartCont)
## # A tibble: 6 x 2
## continent numpar
## <chr> <dbl>
## 1 Africa 14
## 2 Asia 274
## 3 Central and South America 5
## 4 Europe 301
## 5 North America 42
## 6 <NA> 4
301 target children in Europe, 14 in Africa, 42 in North America, 274 in Asia, 5 in Central and South America.