Rbind different corpora.
library(tidyverse)
library(lme4)
library(ggeffects)
library(wordbankr)
library(psych)
library(reshape2)
library(ggpubr)
library(rstatix)
library(data.table)
library(gridExtra)
# Prepare a token-level CHILDES table for frequency counting.
#
# corpus_: table with (at least) the columns speaker_code, pos and lemma.
# Returns the table restricted to rows whose speaker_code is not "CHI" and
# whose pos is not "PUNCT", with the lemma column lower-cased.
#
# Disabled step, kept for reference: it removed every utterance containing an
# annotation lemma (incomplete info).
#   annot <- c("xxx", "yyy", "www", "-", "'")
#   annotUtt <- filter(corpus_, lemma %in% annot)
#   annotUttID <- unique(annotUtt$utterance_id)
#   corpus_ <- filter(corpus_, !(utterance_id %in% annotUttID))
clean_childes <- function(corpus_) {
  # Both conditions must hold: not target-child speech, not punctuation.
  kept_tokens <- filter(corpus_, speaker_code != "CHI", pos != "PUNCT")
  mutate(kept_tokens, lemma = tolower(lemma))
}
# Read and clean the CHILDES token tables for the requested languages.
#
# language_list: character vector of language names. "French" and "Italian"
#   are supported; any other value is ignored with a warning.
# data_dir: directory containing french_1403.csv and italian_1403.csv.
#   Defaults to the location that was previously hard-coded.
#
# Returns a named list of data.tables with elements `french` and/or
# `italian` (in that order), one per requested language. A language that was
# not requested is simply absent from the list; previously this case failed
# with "object not found".
load_data <- function(language_list,
                      data_dir = "/Users/lscpuser/Documents/fyssen-project") {
  csv_files <- c(French = "french_1403.csv", Italian = "italian_1403.csv")

  unsupported <- setdiff(language_list, names(csv_files))
  if (length(unsupported) > 0) {
    warning(
      "Ignoring unsupported language(s): ",
      paste(unsupported, collapse = ", "),
      call. = FALSE
    )
  }

  # Keep the fixed French-then-Italian order regardless of the request order.
  requested <- names(csv_files)[names(csv_files) %in% language_list]

  corpora <- lapply(requested, function(lang) {
    path <- file.path(data_dir, csv_files[[lang]])
    if (!file.exists(path)) {
      stop("Corpus file not found: ", path, call. = FALSE)
    }
    data.table(clean_childes(read_csv(path)))
  })
  names(corpora) <- tolower(requested)
  corpora
}
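Clean utterances by removing punctuation and target-child speech. (Removal of incomplete sentences, i.e. utterances containing annotation lemmas, is implemented in `clean_childes` but currently commented out.)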
# Load and clean the corpora. NOTE(review): `language_list` is not defined in
# the visible part of this script -- presumably c("French", "Italian"), since
# both corpus$french and corpus$italian are used below; confirm.
corpus<-load_data(language_list)
# Print a per-column summary of every cleaned corpus.
lapply(corpus, function(x) {
summary(x)
})
## $french
## X1 text lemma lex
## Min. : 11 Length:1716925 Length:1716925 Length:1716925
## 1st Qu.: 755447 Class :character Class :character Class :character
## Median :1442050 Mode :character Mode :character Mode :character
## Mean :1435194
## 3rd Qu.:2100678
## Max. :2970632
## pos tag dependency morph
## Length:1716925 Length:1716925 Length:1716925 Length:1716925
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## prefix prefix_ suffix suffix_
## Min. :7.187e+15 Length:1716925 Min. :4.050e+02 Length:1716925
## 1st Qu.:2.985e+18 Class :character 1st Qu.:5.902e+18 Class :character
## Median :9.153e+18 Mode :character Median :1.114e+19 Mode :character
## Mean :9.142e+18 Mean :1.038e+19
## 3rd Qu.:1.537e+19 3rd Qu.:1.462e+19
## Max. :1.800e+19 Max. :1.844e+19
## sentiment utterance_id target_child_id speaker_code
## Min. :0 Min. :17476553 Min. :23235 Length:1716925
## 1st Qu.:0 1st Qu.:17654464 1st Qu.:23275 Class :character
## Median :0 Median :17798673 Median :23312 Mode :character
## Mean :0 Mean :17787554 Mean :23319
## 3rd Qu.:0 3rd Qu.:17921273 3rd Qu.:23373
## Max. :0 Max. :18149594 Max. :23412
## corpus_name transcript_id language
## Length:1716925 Min. :44628 Length:1716925
## Class :character 1st Qu.:44898 Class :character
## Mode :character Median :45107 Mode :character
## Mean :45032
## 3rd Qu.:45175
## Max. :45263
##
## $italian
## X1 text lemma lex
## Min. : 4 Length:247618 Length:247618 Length:247618
## 1st Qu.:118387 Class :character Class :character Class :character
## Median :225572 Mode :character Mode :character Mode :character
## Mean :224898
## 3rd Qu.:333171
## Max. :437481
## pos tag dependency morph
## Length:247618 Length:247618 Length:247618 Length:247618
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## prefix prefix_ suffix suffix_
## Min. :7.187e+15 Length:247618 Min. :1.675e+16 Length:247618
## 1st Qu.:2.985e+18 Class :character 1st Qu.:5.098e+18 Class :character
## Median :1.190e+19 Mode :character Median :9.232e+18 Mode :character
## Mean :1.020e+19 Mean :9.781e+18
## 3rd Qu.:1.560e+19 3rd Qu.:1.405e+19
## Max. :1.800e+19 Max. :1.845e+19
## sentiment utterance_id target_child_id speaker_code
## Min. :0 Min. :7388853 Min. :14314 Length:247618
## 1st Qu.:0 1st Qu.:7425971 1st Qu.:14341 Class :character
## Median :0 Median :7450137 Median :14377 Mode :character
## Mean :0 Mean :7453151 Mean :14373
## 3rd Qu.:0 3rd Qu.:7481870 3rd Qu.:14412
## Max. :0 Max. :7514844 Max. :14421
## corpus_name transcript_id language
## Length:247618 Min. :22462 Length:247618
## Class :character 1st Qu.:22498 Class :character
## Mode :character Median :22531 Mode :character
## Mean :22538
## 3rd Qu.:22583
## Max. :22616
Count the number of times each child hears a lemma, and divide it by the total number of lemma tokens that child hears. This creates a ‘rawFrequency’ column whose values sum to 1 within each target child.
#Count each lemma and sum of lemmas for each target child
# Compute per-child lemma frequencies from a cleaned token table.
#
# corpus_: token-level table with columns lemma, target_child_id, language.
#
# Returns one row per (lemma, target_child_id, language) with:
#   CountLemma          - number of tokens of the lemma for that child
#   countAllLemmasChild - total number of tokens for that child
#   rawFrequency        - CountLemma / countAllLemmasChild (sums to 1 per child)
#   FrequencyLog        - log(1 + rawFrequency * 100)
#   FrequencyLogMean    - mean FrequencyLog of the lemma across children
# The result keeps the grouping left behind by the first summarize().
frequency_raw <- function(corpus_) {
  # Tokens of each lemma heard by each child.
  lemma_counts <- corpus_ %>%
    group_by(lemma, target_child_id, language) %>%
    summarize(CountLemma = n())

  # Total tokens heard by each child.
  child_totals <- lemma_counts %>%
    group_by(target_child_id, language) %>%
    summarize(countAllLemmasChild = sum(CountLemma))

  # Join keys are spelled out so that a column added to either table later
  # cannot silently become part of the key.
  per_child <- lemma_counts %>%
    left_join(child_totals, by = c("target_child_id", "language")) %>%
    mutate(
      rawFrequency = CountLemma / countAllLemmasChild,
      # Rescale and log-transform to avoid working with very small values.
      FrequencyLog = log(1 + rawFrequency * 100)
    )

  # Average log frequency of each lemma across children.
  lemma_means <- per_child %>%
    group_by(lemma, language) %>%
    summarize(FrequencyLogMean = mean(FrequencyLog))

  left_join(per_child, lemma_means, by = c("lemma", "language"))
}
Model frequency by corpus size, and get intercept coefficient as a proxy for frequency:
# Add a model-based frequency estimate to the per-child frequency table.
#
# corpus: cleaned token-level table, passed on to frequency_raw().
#
# For every (language, lemma) a linear model of rawFrequency on the child's
# remaining corpus size (countAllLemmasChild - CountLemma) is fitted across
# children; its intercept is stored as `interceptmodel`.
#
# Returns the frequency_raw() table with two extra columns joined by
# (language, lemma): `data` (the lemma's nested per-child rows) and
# `interceptmodel`.
#
# NOTE(review): the previous formula `y ~ (a - b)` used `-` in its formula
# sense (drop term b), so the model was effectively `y ~ a`. The arithmetic
# difference is now protected with I(); intercepts reported from earlier runs
# will therefore shift slightly.
frequency_model <- function(corpus) {
  corpus_frequency <- frequency_raw(corpus)

  # One row per (language, lemma); `data` holds that lemma's per-child rows.
  nested <- corpus_frequency %>%
    group_by(language, lemma) %>%
    nest()

  # Intercept of the per-lemma regression. With a single child the slope is
  # not estimable and the intercept equals that child's rawFrequency.
  fit_intercept <- function(child_rows) {
    fit <- lm(
      rawFrequency ~ I(countAllLemmasChild - CountLemma),
      data = child_rows
    )
    coef(fit)[["(Intercept)"]]
  }

  models_ <- nested %>%
    mutate(interceptmodel = map_dbl(data, fit_intercept))

  # nest() already yields exactly one row per (language, lemma), so no
  # de-duplication is needed before joining.
  corpus_frequency %>%
    left_join(models_, by = c("language", "lemma"))
}
# Per-child lemma frequencies plus the per-lemma model intercept, one table
# per language.
frequency_it<- frequency_model(corpus$italian)
frequency_fr<- frequency_model(corpus$french)
# TEST1 frequency metric:
# Sanity check: the raw frequencies of one child must add up to 1.
frequency_it %>%
group_by(target_child_id) %>%
summarize (sum(rawFrequency)) # expected: 1 for each child
## # A tibble: 11 x 2
## target_child_id `sum(rawFrequency)`
## * <dbl> <dbl>
## 1 14314 1
## 2 14325 1
## 3 14333 1
## 4 14341 1
## 5 14357 1
## 6 14377 1
## 7 14406 1
## 8 14408 1
## 9 14412 1
## 10 14415 1
## 11 14421 1
# TEST2 frequency metric:
# Sort the French table by FrequencyLog, largest first, to inspect the
# highest values.
frequency_fr %>%
arrange(desc(FrequencyLog)) # maximum values come first
## # A tibble: 40,897 x 10
## # Groups: lemma, target_child_id [40,897]
## lemma target_child_id language CountLemma countAllLemmasC… rawFrequency
## <chr> <dbl> <chr> <int> <int> <dbl>
## 1 être 23381 French 3626 54442 0.0666
## 2 le 23373 French 3792 61746 0.0614
## 3 être 23312 French 9213 155399 0.0593
## 4 le 23235 French 7965 138177 0.0576
## 5 être 23259 French 329 5856 0.0562
## 6 être 23288 French 8709 156818 0.0555
## 7 le 23243 French 12613 227150 0.0555
## 8 être 23296 French 5872 106207 0.0553
## 9 être 23235 French 7370 138177 0.0533
## 10 le 23288 French 8280 156818 0.0528
## # … with 40,887 more rows, and 4 more variables: FrequencyLog <dbl>,
## # FrequencyLogMean <dbl>, data <list>, interceptmodel <dbl>
Get wordbank data for language and aoa for each lemma:
# Lexical class labels. These match the element names used on the Wordbank
# lists below (e.g. aoa_fr$nouns). NOTE(review): the vector itself is not
# referenced anywhere in the visible code -- confirm whether it is still needed.
list_class<-c("nouns", "verbs", "adjectives", "function_words", "other")
# Download Wordbank "WS" form data for one language and attach an age of
# acquisition (AoA) estimate to every response row.
#
# lang_: Wordbank language name, e.g. "Italian" or "French (French)".
#
# Returns a list of tables, one per value of `lexical_class`, containing the
# instrument data (column `definition` renamed to `lemma`) joined with the
# fit_aoa() result; rows whose `value` is NA are dropped.
load_wordbank <- function(lang_) {
  # Keep vocabulary items only (drops the grammar items).
  word_items <- filter(get_item_data(language = lang_, form = "WS"), type == "word")

  responses <- get_instrument_data(
    language = lang_, form = "WS", items = word_items$item_id,
    administrations = TRUE, iteminfo = TRUE
  )
  names(responses) <- replace(names(responses), names(responses) == "definition", "lemma")

  # Author's note from the original run: 145 NAs out of 680.
  aoa_fit <- fit_aoa(responses, measure = "produces", method = "glmrob", proportion = 0.5)
  names(aoa_fit) <- replace(names(aoa_fit), names(aoa_fit) == "definition", "lemma")

  # Joined on whatever columns the two tables share.
  joined <- left_join(unique(responses), unique(aoa_fit))
  answered <- joined[!is.na(joined$value), ] # remove WB lemmas with no value
  split(answered, answered$lexical_class)
}
# Wordbank data with AoA estimates, split by lexical class, for each language.
aoa_it<-load_wordbank("Italian")
aoa_fr<-load_wordbank("French (French)")
# Pad df2 so that it contains every `name` that occurs in df1.
#
# df1, df2: tables with a `name` column; df2 (and normally df1) also carries
#   lemma, pos and CountLemma.
#
# For each `name` present in df1 but missing from df2 one filler row with
# CountLemma = 1 is appended to df2. When nothing is missing, df2 is returned
# unchanged (this case used to fail with "replacement has 1 row, data has 0").
# Filler rows take their lemma and pos from df1; if df1 lacks those columns
# the previous placeholders are used (lemma = name, pos = NA).
same_size_df <- function(df1, df2) {
  missing_names <- setdiff(df1$name, df2$name)
  if (length(missing_names) == 0) {
    return(df2)
  }

  src <- match(missing_names, df1$name)
  # Value of column `col` in df1 for the missing names, or `fallback`.
  from_df1 <- function(col, fallback) {
    if (col %in% names(df1)) as.character(df1[[col]][src]) else fallback
  }

  filler <- data.frame(
    lemma = from_df1("lemma", as.character(missing_names)),
    pos = from_df1("pos", NA_character_),
    CountLemma = 1,
    name = as.character(missing_names),
    stringsAsFactors = FALSE
  )
  rbind(df2, filler)
}
# Split-half reliability of the corpus frequency counts.
#
# dataAoa: Wordbank table with a `lemma` column (defines which lemmas count).
# corpus:  token-level corpus table with `lemma` and `pos` columns.
#
# Every token is randomly assigned to one of two halves with probability 0.5.
# Lemma/pos pairs are counted within each half, restricted to Wordbank lemmas,
# padded with same_size_df() so both halves list the same names, and the two
# count vectors are compared with Kendall's correlation, which is returned.
split_half_cor <- function(dataAoa, corpus) {
  wb_lemmas <- unique(dataAoa$lemma)

  # TRUE -> first half, FALSE -> second half; one draw per token.
  in_first <- sample(c(TRUE, FALSE), nrow(corpus), replace = TRUE, prob = c(0.5, 0.5))
  tokens_a <- corpus[in_first, ]
  tokens_b <- corpus[!in_first, ]

  # Count lemma/pos pairs in one half and keep only Wordbank lemmas.
  count_half <- function(tokens) {
    counts <- summarize(group_by(tokens, lemma, pos), CountLemma = n())
    counts <- counts[counts$lemma %in% wb_lemmas, ]
    # Combined key, so the same lemma with different pos stays distinct.
    counts$name <- paste(counts$lemma, "-", counts$pos)
    counts[order(counts$name), ]
  }
  half_a <- count_half(tokens_a)
  half_b <- count_half(tokens_b)

  # Give both halves the same set of names.
  half_b <- same_size_df(half_a, half_b)
  half_a <- same_size_df(half_b, half_a)

  # Re-sort so that row i refers to the same name in both halves.
  half_a <- half_a[order(half_a$name), ]
  half_b <- half_b[order(half_b$name), ]

  cor(half_a$CountLemma, half_b$CountLemma, method = "kendall")
}
# Spearman-Brown adjustment of a split-half correlation: 2r / (1 + r).
#
# r: numeric correlation(s).
# Returns the adjusted value(s), same length as r.
sbformula <- function(r) {
  doubled <- 2 * r
  doubled / (1 + r)
}
# Split-half reliability of the corpus frequency counts, per language and
# lexical class. Each call draws its own random split and no seed is set in
# the visible code, so the values differ from run to run.
r_fr_nouns <- split_half_cor (aoa_fr$nouns, corpus$french)
r_fr_adj <- split_half_cor (aoa_fr$adjectives, corpus$french)
r_fr_verbs <- split_half_cor (aoa_fr$verbs, corpus$french)
r_fr_other <- split_half_cor (aoa_fr$other, corpus$french)
r_fr_fw <- split_half_cor (aoa_fr$function_words, corpus$french)
r_it_nouns <- split_half_cor (aoa_it$nouns, corpus$italian)
r_it_adj <- split_half_cor (aoa_it$adjectives, corpus$italian)
r_it_verbs <- split_half_cor (aoa_it$verbs, corpus$italian)
r_it_other <- split_half_cor (aoa_it$other, corpus$italian)
r_it_fw <- split_half_cor (aoa_it$function_words, corpus$italian)
French nouns: 0.9091857, 0.952433 French adjectives: 0.9089918, 0.9523266 French verbs: 0.9361389, 0.9670163 French other: 0.8936086,0.9438155 French function words: 0.922108,0.9594757
Italian nouns: 0.7785536, 0.8754908 Italian adjectives: 0.7861569, 0.8802776 Italian verbs: 0.8135557, 0.8971941 Italian other: 0.8562832,0.9225782 Italian function words: 0.9075423,0.9515305
# Cronbach's alpha of the raw lemma frequencies, treating each target child
# as an "item" and each Wordbank lemma as an observation.
#
# dataAoa: Wordbank table with a `lemma` column (defines which lemmas count).
# corpus_frequency_: per-child frequency table with columns lemma,
#   rawFrequency and target_child_id.
#
# Returns the raw alpha (numeric scalar) reported by psych::alpha().
#
# Fixes relative to the previous version:
#   * the result was read as `a$raw_`, which does not exist on the
#     psych::alpha() result and therefore always returned NULL; the value
#     lives in `a$total$raw_alpha`;
#   * child columns were taken from position 3 onwards although they start
#     at position 2 (right after `lemma_`), so the first child was dropped.
#
# NOTE(review): a lemma never heard by a child is NA in the wide table, not
# 0 -- confirm that this is the intended treatment.
cronebach_alpha <- function(dataAoa, corpus_frequency_) {
  wblemmas <- unique(dataAoa$lemma) # unique wordbank lemmas

  long <- corpus_frequency_ %>%
    ungroup() %>%
    select(lemma, rawFrequency, target_child_id)
  # Keep only lemmas with corresponding items in wordbank.
  long <- long[long$lemma %in% wblemmas, ]

  long_df <- data.frame(
    lemma_ = long$lemma,
    target_child_id_ = long$target_child_id,
    freq_ = long$rawFrequency
  )
  # One row per lemma, one column per child.
  wide <- tidyr::spread(long_df, target_child_id_, freq_)

  # Every column except the lemma label is a child.
  child_cols <- setdiff(colnames(wide), "lemma_")
  child <- wide[, child_cols, drop = FALSE]

  # Namespace-qualified: ggplot2 also exports a function called alpha().
  a <- psych::alpha(child)
  a$total$raw_alpha
}
# Cronbach's alpha of the raw frequencies across children, per lexical class.
# All five classes are computed for French; only nouns for Italian.
r_fr_nouns_a <- cronebach_alpha (aoa_fr$nouns, frequency_fr)
r_fr_adj_a <- cronebach_alpha (aoa_fr$adjectives, frequency_fr)
r_fr_verbs_a <- cronebach_alpha (aoa_fr$verbs, frequency_fr)
r_fr_other_a <- cronebach_alpha (aoa_fr$other, frequency_fr)
r_fr_fw_a <- cronebach_alpha (aoa_fr$function_words, frequency_fr)
r_it_nouns_a <- cronebach_alpha (aoa_it$nouns, frequency_it)
French nouns a: ,
French adjectives a: , French verbs a: ,
French other a: , French function words a: ,
Italian nouns a: ,
# Split-half reliability of the Wordbank age-of-acquisition (AoA) estimates.
#
# lang_: Wordbank language name, e.g. "Italian" or "French (French)".
# clas_: lexical class to keep, e.g. "nouns".
#
# The administrations of the "WS" form are randomly divided into two groups
# (probability 0.5 each), AoA is fitted separately on each group with
# fit_aoa(), and Kendall's correlation between the two sets of estimates is
# returned (items with a missing estimate are ignored).
split_half_cor_aoa <- function(lang_, clas_) {
  # Word items of the requested lexical class.
  class_items <- get_item_data(language = lang_, form = "WS") %>%
    filter(type == "word" & lexical_class == clas_)
  # "item_12" -> "12", to match the num_item_id column of the instrument data.
  ids <- gsub("item_", "", unique(class_items$item_id), fixed = TRUE)

  items <- get_instrument_data(language = lang_, form = "WS", administrations = TRUE) %>%
    filter(num_item_id %in% ids)

  # Randomly split the administrations (not the items) into two groups.
  admin_ids <- unique(items$data_id)
  in_first <- sample(c(TRUE, FALSE), length(admin_ids), replace = TRUE, prob = c(0.5, 0.5))
  adminfirst <- items %>% filter(data_id %in% admin_ids[in_first])
  adminsecond <- items %>% filter(data_id %in% admin_ids[!in_first])

  # AoA for each group.
  aoafirst <- fit_aoa(adminfirst, method = "glmrob", proportion = 0.5)
  aoasecond <- fit_aoa(adminsecond, method = "glmrob", proportion = 0.5)

  # Pair the estimates by item rather than by row position, so an item that
  # is missing from, or ordered differently in, one half cannot misalign
  # the comparison.
  key <- "num_item_id"
  if (key %in% names(aoafirst) && key %in% names(aoasecond)) {
    paired <- merge(
      as.data.frame(aoafirst)[, c(key, "aoa")],
      as.data.frame(aoasecond)[, c(key, "aoa")],
      by = key, suffixes = c("_first", "_second")
    )
    return(cor(paired$aoa_first, paired$aoa_second,
               use = "complete.obs", method = "kendall"))
  }

  # Fallback when no item key is available: positional pairing, but only if
  # both halves have the same number of rows.
  if (nrow(aoafirst) != nrow(aoasecond)) {
    stop("AoA tables of the two halves cannot be aligned", call. = FALSE)
  }
  cor(aoafirst$aoa, aoasecond$aoa, use = "complete.obs", method = "kendall")
}
# Split-half reliability of the AoA estimates, per language and lexical class.
# Each call downloads the Wordbank data again and draws its own random split.
r_fr_nouns_aoa <- split_half_cor_aoa ("French (French)", "nouns")
r_fr_adj_aoa <- split_half_cor_aoa ("French (French)", "adjectives")
# French verbs are disabled here; the reason is not recorded -- TODO confirm.
#r_fr_verbs_aoa <- split_half_cor_aoa ("French (French)", "verbs")
r_fr_other_aoa <- split_half_cor_aoa ("French (French)", "other")
r_fr_fw_aoa <- split_half_cor_aoa ("French (French)", "function_words")
r_it_nouns_aoa <- split_half_cor_aoa ("Italian", "nouns")
r_it_adj_aoa <- split_half_cor_aoa ("Italian", "adjectives")
r_it_verbs_aoa <- split_half_cor_aoa ("Italian", "verbs")
r_it_other_aoa <- split_half_cor_aoa ("Italian", "other")
r_it_fw_aoa <- split_half_cor_aoa ("Italian", "function_words")
French nouns Aoa: 0.9170325, 0.9567209 French adjectives AoA: 0.8930176, 0.9434858 French other AoA: 0.9136377,0.9548701 French function words AoA: 0.7452548,0.8540355
Italian nouns Aoa: 0.9228946, 0.9599014 Italian adjectives AoA: 0.9383637, 0.9682019 Italian verbs AoA: 0.9187637, 0.9576621 Italian other AoA: 0.8921086,0.9429782 Italian function words AoA: 0.9054159,0.9503604
Merge aoa and childes db, get final db “CHILDES_WB_short”:
# Join Wordbank AoA estimates with CHILDES frequency measures, one row per
# Wordbank item.
#
# dataAoa: Wordbank table (one lexical class) with columns lemma,
#   lexical_category, aoa and num_item_id; it may contain many rows per item.
# corpus_frequency_: frequency table with columns lemma, language,
#   FrequencyLogMean and interceptmodel.
#
# Returns a data frame with columns lemma, FrequencyLogMean, interceptmodel,
# lexical_category, aoa, num_item_id, sorted by lemma, restricted to lemmas
# present in both sources and to items with a non-missing aoa.
#
# The Wordbank table is reduced to its distinct item-level rows *before* the
# merge. The rows and their order are the same as when merging the full
# per-administration table and de-duplicating afterwards, but far less data
# is joined; only the row names differ (they are now consecutive).
mergeAoaChildes <- function(dataAoa, corpus_frequency_) {
  wblemmas <- unique(dataAoa$lemma) # unique wordbank lemmas

  # Frequency measures of the Wordbank lemmas, one row per lemma/language.
  in_wordbank <- corpus_frequency_[corpus_frequency_$lemma %in% wblemmas, ]
  corpus_frequency_short <- in_wordbank %>%
    ungroup() %>%
    select(lemma, language, FrequencyLogMean, interceptmodel)
  corpus_frequency_short <- unique(corpus_frequency_short)

  # Item-level Wordbank information with a usable AoA estimate.
  item_aoa <- dataAoa %>%
    select(lemma, lexical_category, aoa, num_item_id) %>%
    distinct()
  item_aoa <- item_aoa[!is.na(item_aoa$aoa), ]

  CHILDES_WB <- merge(x = item_aoa, y = corpus_frequency_short, by = "lemma")
  CHILDES_WB_short <- CHILDES_WB %>%
    select(lemma, FrequencyLogMean, interceptmodel, lexical_category, aoa, num_item_id)
  unique(CHILDES_WB_short)
}
# French analysis tables (Wordbank AoA joined with CHILDES frequency), one
# per lexical class.
db_fr_nouns<- mergeAoaChildes(aoa_fr$nouns, frequency_fr)
db_fr_verbs<- mergeAoaChildes(aoa_fr$verbs, frequency_fr)
db_fr_adj<- mergeAoaChildes(aoa_fr$adjectives, frequency_fr)
# Print the adjectives table for inspection.
db_fr_adj
## lemma FrequencyLogMean interceptmodel lexical_category aoa num_item_id
## 1 attention 0.137263254 9.370544e-04 predicates 25 419
## 356 bien 0.415506458 8.158470e-03 predicates 27 425
## 769 bleu 0.054982879 5.779297e-04 predicates 26 428
## 1076 cassé 0.025427159 3.187934e-04 predicates 22 431
## 1751 dur 0.025259731 2.205437e-05 predicates 26 440
## 2232 fatigué 0.001476048 2.057404e-05 predicates 27 443
## 2509 froid 0.031991488 5.152067e-04 predicates 23 446
## 2945 jaune 0.047214506 5.768112e-04 predicates 27 450
## 3224 joli 0.050884202 7.027436e-04 predicates 26 451
## 3535 malade 0.017539046 4.384274e-04 predicates 27 455
## 4298 noir 0.019546901 2.178486e-04 predicates 29 463
## 4536 orange 0.022223556 2.706328e-04 predicates 29 464
## 4782 parti 0.003713586 5.507267e-05 predicates 22 465
## 5231 propre 0.017563599 3.976709e-04 predicates 27 470
## 5510 rouge 0.071982186 1.257882e-03 predicates 26 471
## 5823 sale 0.016988734 2.415072e-04 predicates 23 472
## 6666 triste 0.005230208 5.890709e-05 predicates 30 477
## 7073 vite 0.045267476 3.081426e-04 predicates 27 483
# French "other" class: build the table and print it for inspection.
db_fr_other<- mergeAoaChildes(aoa_fr$other, frequency_fr)
db_fr_other
## lemma FrequencyLogMean interceptmodel lexical_category aoa num_item_id
## 1 aie 0.0014554610 1.427080e-05 other 19 1
## 1051 après 0.1515661777 1.349419e-03 other 27 586
## 1642 bain 0.0408513657 3.355171e-04 other 19 16
## 2162 bébé 0.0890503501 1.111531e-03 other 18 391
## 2739 bonjour 0.0312476463 3.742174e-04 other 22 17
## 3191 bravo 0.0872096653 1.699627e-03 other 19 19
## 4028 chut 0.0182661550 2.054636e-04 other 21 21
## 4806 clown 0.0167824724 5.508127e-04 other 26 392
## 5136 cocorico 0.0015341776 2.851071e-05 other 30 4
## 5394 coucou 0.0557649675 6.703171e-04 other 18 22
## 6116 crèche 0.0155480380 4.968859e-04 other 29 184
## 6364 dame 0.0115265254 4.397470e-05 other 25 394
## 6722 dehors 0.0161243100 2.134732e-04 other 23 185
## 7346 demain 0.0166755362 1.703467e-04 other 28 590
## 7757 docteur 0.0195588967 4.144584e-04 other 26 395
## 8092 école 0.0483364719 8.096942e-04 other 25 186
## 8599 enfant 0.0392454472 6.486279e-04 other 28 396
## 9441 fille 0.0391946223 6.066743e-04 other 25 398
## 9786 forêt 0.0051735161 6.170795e-05 other 30 190
## 9968 frère 0.0234439532 4.191399e-04 other 30 399
## 10192 garçon 0.0351677325 6.467403e-04 other 25 400
## 10916 goûter 0.0280264382 6.277802e-04 other 26 27
## 11810 grrrr 0.0009068303 2.432962e-04 other 25 6
## 12487 jour 0.0289332472 2.505931e-04 other 30 593
## 12694 magasin 0.0043083112 5.888421e-05 other 28 192
## 13163 maison 0.0583039890 6.426059e-04 other 23 193
## 14227 matin 0.0264098573 1.876658e-04 other 30 595
## 14430 merci 0.0938108370 1.149518e-03 other 18 29
## 14976 meuh 0.0067746022 1.436260e-04 other 18 7
## 15482 miaou 0.0060345689 9.303134e-05 other 19 9
## 15966 monsieur 0.0624028506 1.252204e-03 other 24 407
## 17186 nuit 0.0172615812 3.329053e-04 other 26 596
## 17696 oui 0.7822438131 1.326424e-02 other 20 32
## 18837 parc 0.0094040014 2.569022e-04 other 28 194
## 19299 plage 0.0032233035 1.332759e-05 other 29 196
## 19707 pompier 0.0103292881 1.997968e-04 other 28 416
## 19968 salut 0.0067356567 1.110628e-04 other 28 35
## 20258 sieste 0.0089311129 1.853827e-04 other 27 36
## 20954 travail 0.0129630873 2.279375e-04 other 26 199
# French function words: build the table and print it for inspection.
db_fr_fw<- mergeAoaChildes(aoa_fr$function_words, frequency_fr)
db_fr_fw
## lemma FrequencyLogMean interceptmodel lexical_category aoa num_item_id
## 1 à 0.598376262 7.081328e-03 function_words 26 622
## 346 aller 0.999844152 1.775329e-02 function_words 27 598
## 869 aussi 0.182842101 2.105051e-03 function_words 28 649
## 1135 autre 0.213564269 2.513844e-03 function_words 30 650
## 1372 avec 0.325474003 4.549970e-03 function_words 28 630
## 1644 ça 0.036328125 4.392286e-04 function_words 25 665
## 2413 chez 0.041025686 4.343159e-04 function_words 28 631
## 2850 dans 0.410794314 5.532106e-03 function_words 26 632
## 3180 de 1.125054220 2.374640e-02 function_words 29 633
## 3425 dehors 0.016124310 2.134732e-04 function_words 25 634
## 3802 derrière 0.026845040 3.096356e-04 function_words 28 635
## 4203 elle 0.002310198 2.946515e-05 function_words 30 668
## 4421 encore 0.205448418 2.141529e-03 function_words 20 653
## 4958 et 0.869845097 1.602332e-02 function_words 28 687
## 5714 fait 0.073023150 7.606505e-04 function_words 28 607
## 5986 ici 0.075186547 9.055816e-04 function_words 26 638
## 6312 il 1.121415487 1.833189e-02 function_words 29 670
## 6547 je 0.892414340 1.260185e-02 function_words 30 672
## 6770 là 0.821261858 1.148032e-02 function_words 21 639
## 7383 loin 0.019324151 2.119081e-04 function_words 28 641
## 7637 lui 0.706998960 9.703350e-03 function_words 30 674
## 8035 moi 0.353769278 4.837002e-03 function_words 25 676
## 8871 où 0.330531743 4.104750e-03 function_words 25 617
## 9243 pas 0.958674305 1.606340e-02 function_words 22 656
## 9709 pour 0.334025982 4.064731e-03 function_words 28 643
## 9960 pourquoi 0.099399869 1.126199e-03 function_words 28 618
## 10369 qui 0.486452305 6.813911e-03 function_words 28 620
## 10645 quoi 0.375288566 5.200633e-03 function_words 26 621
## 11192 sous 0.028613700 4.268765e-04 function_words 29 645
## 11417 sur 0.249277162 3.254560e-03 function_words 27 646
## 11842 vouloir 0.547763634 7.799171e-03 function_words 30 614
# Italian analysis tables (Wordbank AoA joined with CHILDES frequency), one
# per lexical class.
db_it_nouns<- mergeAoaChildes(aoa_it$nouns, frequency_it)
db_it_verbs<- mergeAoaChildes(aoa_it$verbs, frequency_it)
db_it_adj<- mergeAoaChildes(aoa_it$adjectives, frequency_it)
# Print the adjectives table for inspection.
db_it_adj
## lemma FrequencyLogMean interceptmodel lexical_category aoa
## 1 addormentato 0.004596719 9.005842e-05 predicates 29
## 753 alto 0.025062460 7.166983e-05 predicates 25
## 1505 amaro 0.014263614 3.605804e-04 predicates 31
## 2257 arancione 0.040020598 -1.452827e-04 predicates 30
## 3009 arrabbiato 0.008127064 1.249584e-04 predicates 27
## 3761 asciutto 0.006160739 1.244247e-04 predicates 27
## 4513 attento 0.046821428 6.525205e-04 predicates 29
## 5265 bagnato 0.021042998 3.815002e-04 predicates 24
## 6017 bello 0.197788404 2.596453e-03 predicates 20
## 6769 bianco 0.045363704 3.328443e-04 predicates 26
## 7521 blu 0.029461868 -5.241462e-05 predicates 25
## 8273 brutto 0.033249824 3.546348e-04 predicates 21
## 9025 buio 0.013931490 1.505579e-04 predicates 22
## 9777 buono 0.067095096 7.006293e-04 predicates 22
## 10529 caldo 0.023497030 3.899878e-04 predicates 22
## 11281 carino 0.004313898 2.035831e-05 predicates 31
## 12033 cattivo 0.031166380 4.018006e-04 predicates 24
## 12785 contento 0.019213151 2.653005e-04 predicates 32
## 13537 corto 0.004983072 7.189403e-05 predicates 32
## 15041 dolce 0.016213803 2.785136e-04 predicates 27
## 15793 duro 0.009434323 5.777302e-05 predicates 28
## 16545 felice 0.005786867 1.148014e-04 predicates 32
## 17297 ferito 0.001326524 1.327404e-05 predicates 34
## 18049 finito 0.028672135 2.221979e-04 predicates 26
## 18801 forte 0.049944559 6.852048e-04 predicates 28
## 19553 freddo 0.007920054 1.405329e-04 predicates 23
## 20305 gentile 0.006904879 1.335528e-04 predicates 34
## 21057 giallo 0.052780705 -5.541887e-04 predicates 25
## 21809 leggero 0.019822200 1.866329e-04 predicates 33
## 22561 lento 0.001326524 1.327404e-05 predicates 34
## 23313 lungo 0.064604026 9.339161e-04 predicates 28
## 24065 malato 0.020486344 3.949564e-04 predicates 28
## 24817 marrone 0.028958927 3.212651e-04 predicates 31
## 25569 morbido 0.032777073 5.982313e-04 predicates 30
## 26321 nero 0.041683955 3.591475e-04 predicates 28
## 27073 nuovo 0.042920087 2.736452e-04 predicates 28
## 27825 piano 0.007932877 7.964426e-05 predicates 26
## 28577 piccolo 0.076396816 6.766792e-04 predicates 23
## 29329 pieno 0.013852750 9.354978e-05 predicates 29
## 30081 povero 0.032520483 2.240110e-04 predicates 33
## 30833 pulito 0.011548966 2.033521e-04 predicates 25
## 31585 rosso 0.067919229 1.049582e-04 predicates 24
## 32337 rotto 0.053233260 7.114556e-04 predicates 21
## 33089 sbagliato 0.003144953 3.149904e-05 predicates 31
## 33841 spaventato 0.006653384 6.675567e-05 predicates 33
## 34593 sporco 0.018276985 2.706008e-04 predicates 22
## 35345 stanco 0.013066198 1.932643e-04 predicates 26
## 36097 sveglio 0.026107485 5.587303e-04 predicates 28
## 36849 ultimo 0.005934764 3.027159e-05 predicates 34
## 37601 vecchio 0.017532868 3.421765e-04 predicates 31
## 38353 verde 0.039187319 1.837683e-05 predicates 26
## 39105 vuoto 0.010384558 1.370612e-04 predicates 28
## num_item_id
## 1 504
## 753 505
## 1505 506
## 2257 507
## 3009 508
## 3761 509
## 4513 510
## 5265 511
## 6017 512
## 6769 513
## 7521 514
## 8273 515
## 9025 516
## 9777 517
## 10529 518
## 11281 520
## 12033 521
## 12785 522
## 13537 523
## 15041 525
## 15793 526
## 16545 527
## 17297 528
## 18049 529
## 18801 530
## 19553 531
## 20305 532
## 21057 533
## 21809 535
## 22561 536
## 23313 537
## 24065 538
## 24817 539
## 25569 540
## 26321 541
## 27073 542
## 27825 545
## 28577 546
## 29329 547
## 30081 548
## 30833 549
## 31585 550
## 32337 551
## 33089 553
## 33841 556
## 34593 557
## 35345 558
## 36097 560
## 36849 562
## 37601 563
## 38353 565
## 39105 566
# Italian "other" class: build the table and print it for inspection.
db_it_other<- mergeAoaChildes(aoa_it$other, frequency_it)
db_it_other
## lemma FrequencyLogMean interceptmodel lexical_category aoa
## 1 asilo 0.036044244 4.721329e-04 other 24
## 753 bar 0.019952716 4.592303e-04 other 29
## 1505 bosco 0.012504391 1.417388e-04 other 29
## 2257 bravo 0.171609239 1.709279e-03 other 21
## 3761 campagna 0.015291155 6.581188e-05 other 32
## 4513 casa 0.110395214 1.437522e-03 other 21
## 5265 chiesa 0.010498481 2.475443e-04 other 30
## 6769 città 0.014327904 1.758433e-04 other 32
## 7521 coccodè 0.007896842 1.518153e-04 other 21
## 8273 domani 0.034065088 4.019861e-04 other 26
## 9025 donna 0.011886646 1.698930e-04 other 33
## 9777 dottore 0.026658323 5.525189e-04 other 24
## 10529 festa 0.010485374 9.805064e-05 other 27
## 11281 fratello 0.016311700 2.589707e-04 other 31
## 12033 giardino 0.020516003 3.382131e-04 other 27
## 12785 giorno 0.040388542 3.715680e-04 other 29
## 13537 giostra 0.001326524 1.327404e-05 other 26
## 14289 ieri 0.041597656 4.335704e-04 other 31
## 15041 lavoro 0.008238747 1.517020e-04 other 25
## 16545 mare 0.089601214 1.371452e-03 other 21
## 17297 mattina 0.021013316 3.555752e-04 other 30
## 18049 mercato 0.005304219 5.622189e-05 other 30
## 19553 montagna 0.026023894 3.546883e-04 other 29
## 21809 negozio 0.001326524 1.327404e-05 other 29
## 24817 notte 0.014549982 1.049265e-04 other 26
## 25569 oggi 0.044771672 3.838566e-04 other 29
## 26321 ospedale 0.009559563 1.499621e-04 other 31
## 27825 poliziotto 0.006122968 6.141752e-05 other 31
## 28577 pomeriggio 0.001326524 1.327404e-05 other 34
## 29329 presto 0.008325382 3.739995e-04 other 30
## 30081 scuola 0.052558264 8.030118e-04 other 24
## 30833 sera 0.021862317 2.903378e-04 other 30
## 32337 soldato 0.012909772 2.612284e-04 other 36
## 33089 sorella 0.006683515 1.002088e-05 other 32
## 33841 spiaggia 0.007175454 8.622374e-05 other 29
## 34593 supermercato 0.001326524 1.327404e-05 other 30
## 35345 uomo 0.040327415 8.767560e-04 other 31
## 36097 via 0.241581043 2.941935e-03 other 19
## 36849 vigile 0.008896778 1.280033e-04 other 31
## 38353 zio 0.065971284 1.249610e-03 other 19
## 39105 zoo 0.013924973 9.582252e-05 other 32
## num_item_id
## 1 325
## 753 326
## 1505 327
## 2257 378
## 3761 328
## 4513 329
## 5265 330
## 6769 332
## 7521 194
## 8273 568
## 9025 353
## 9777 354
## 10529 333
## 11281 355
## 12033 334
## 12785 570
## 13537 335
## 14289 571
## 15041 336
## 16545 337
## 17297 572
## 18049 338
## 19553 339
## 21809 340
## 24817 573
## 25569 574
## 26321 341
## 27825 366
## 28577 575
## 29329 576
## 30081 343
## 30833 577
## 32337 370
## 33089 371
## 33841 344
## 34593 345
## 35345 372
## 36097 398
## 36849 373
## 38353 375
## 39105 346
# Italian function words: build the table and print it for inspection.
db_it_fw<- mergeAoaChildes(aoa_it$function_words, frequency_it)
db_it_fw
## lemma FrequencyLogMean interceptmodel lexical_category aoa num_item_id
## 1 a 0.875920981 1.520108e-02 function_words 26 610
## 753 che 1.049254052 2.104496e-02 function_words 30 596
## 1505 chi 0.539141768 8.602536e-03 function_words 27 603
## 2257 ci 0.515593130 7.067654e-03 function_words 33 593
## 3009 come 0.626041950 9.114532e-03 function_words 31 604
## 3761 con 0.386757129 4.155337e-03 function_words 28 613
## 4513 così 0.262457972 2.799918e-03 function_words 29 651
## 5265 da 0.235618427 2.631727e-03 function_words 27 611
## 6017 davanti 0.017541638 2.576666e-04 function_words 30 624
## 6769 dentro 0.235308611 2.857147e-03 function_words 27 620
## 7521 di 0.607931639 8.052515e-03 function_words 25 609
## 8273 dietro 0.029341518 2.870030e-04 function_words 29 625
## 9025 dove 0.455214158 6.606138e-03 function_words 27 605
## 9777 e 1.097594481 2.129571e-02 function_words 26 638
## 9780 e 1.097594481 2.129571e-02 function_words 27 652
## 11281 ecco 0.271886870 2.842194e-03 function_words 23 660
## 12033 fuori 0.104895533 1.402436e-03 function_words 25 621
## 12785 giù 0.059061199 4.354619e-04 function_words 23 615
## 13537 il 1.633086989 4.365417e-02 function_words 28 626
## 14289 in 0.364076692 3.522000e-03 function_words 31 612
## 15041 io 0.327471159 4.255063e-03 function_words 21 579
## 15793 la 1.451234715 3.560798e-02 function_words 26 628
## 16545 lei 0.084878496 1.117875e-03 function_words 32 582
## 17297 lo 0.063386184 1.003811e-03 function_words 29 627
## 18049 lontano 0.009306079 7.083074e-05 function_words 29 622
## 18801 loro 0.064918040 9.564582e-04 function_words 35 585
## 19553 lui 0.116291219 1.312676e-03 function_words 30 581
## 20305 ma 0.659845246 1.086208e-02 function_words 31 653
## 21057 molto 0.212765819 2.736212e-03 function_words 30 662
## 21809 nessuno 0.016320038 2.082957e-04 function_words 30 663
## 22561 niente 0.081139531 9.974170e-04 function_words 27 664
## 23313 noi 0.043998556 5.685382e-04 function_words 31 583
## 24065 per 0.285452224 3.447167e-03 function_words 30 616
## 24817 poco 0.023979613 1.374663e-04 function_words 24 665
## 25569 quale 0.094342866 1.422346e-03 function_words 31 607
## 26321 quando 0.184732854 2.456190e-03 function_words 31 608
## 27073 se 0.287755796 2.598505e-03 function_words 34 656
## 27825 si 0.765124391 1.226932e-02 function_words 29 595
## 28577 sopra 0.067055126 7.508139e-04 function_words 26 618
## 29329 sotto 0.066725183 4.618105e-04 function_words 25 619
## 30081 su 0.156877035 1.707746e-03 function_words 26 614
## 30833 tanto 0.073912172 4.174869e-04 function_words 24 667
## 31585 troppo 0.048910912 5.569208e-04 function_words 31 668
## 32337 tu 0.191673021 1.668384e-03 function_words 24 580
## 33089 tutto 0.459518294 5.737280e-03 function_words 25 669
## 34593 vicino 0.038381948 3.747489e-04 function_words 29 623
Plot frequency and aoa using log frequency and model intercept frequency:
# Scatter plot of AoA against mean log frequency, each point labelled with
# its lemma.
#
# db: table with columns FrequencyLogMean, aoa and lemma.
# x_limits: length-2 numeric vector with the x-axis limits, or NULL for no
#   limits. The default c(0, 0.6) reproduces the previously hard-coded range.
#   Points outside the limits are removed from the plot by ggplot2, so pass a
#   wider range (or NULL) for classes with larger FrequencyLogMean values,
#   e.g. function words.
#
# Returns a ggplot object.
plot_frequency1 <- function(db, x_limits = c(0, 0.6)) {
  p <- ggplot(db, aes(x = FrequencyLogMean, y = aoa, label = lemma)) +
    geom_point() +
    # The label aesthetic is inherited from the plot-level mapping.
    geom_text(hjust = 0, vjust = 0)
  if (!is.null(x_limits)) {
    p <- p + xlim(x_limits[1], x_limits[2])
  }
  p
}
# Scatter plot of AoA against the model-intercept frequency, each point
# labelled with its lemma.
#
# db: table with columns interceptmodel, aoa and lemma.
# Returns a ggplot object.
plot_frequency2 <- function(db) {
  point_mapping <- aes(x = interceptmodel, y = aoa, label = lemma)
  scatter <- ggplot(data = db, mapping = point_mapping) + geom_point()
  scatter + geom_text(mapping = aes(label = lemma), hjust = 0, vjust = 0)
}
# Compare the two frequency measures: mean log frequency against the model
# intercept, with a fitted linear trend line.
#
# db: table with columns interceptmodel, FrequencyLogMean and lemma.
# Returns a ggplot object.
plot_frequency3 <- function(db) {
  measure_mapping <- aes(x = interceptmodel, y = FrequencyLogMean, label = lemma)
  canvas <- ggplot(data = db, mapping = measure_mapping)
  canvas + geom_point() + geom_smooth(method = "lm")
}
# AoA vs. mean log frequency, one panel per lexical class.
# NOTE(review): the French grids omit db_fr_verbs while the Italian grids
# include verbs -- confirm this is intended.
ggarrange(plot_frequency1(db_fr_nouns), plot_frequency1(db_fr_adj), plot_frequency1(db_fr_fw), plot_frequency1(db_fr_other), ncol=2, nrow=2, common.legend = TRUE, legend="right")
ggarrange(plot_frequency1(db_it_nouns), plot_frequency1(db_it_adj), plot_frequency1(db_it_fw), plot_frequency1(db_it_other), plot_frequency1(db_it_verbs), ncol=2, nrow=3, common.legend = TRUE, legend="right")
# Mean log frequency vs. model intercept, one panel per lexical class.
# NOTE(review): the French grid requests nrow=3 for four panels, unlike the
# nrow=2 used for the four French panels above.
ggarrange(plot_frequency3(db_fr_nouns), plot_frequency3(db_fr_adj), plot_frequency3(db_fr_fw), plot_frequency3(db_fr_other), ncol=2, nrow=3, common.legend = TRUE, legend="right")
ggarrange(plot_frequency3(db_it_nouns), plot_frequency3(db_it_adj), plot_frequency3(db_it_fw), plot_frequency3(db_it_other), plot_frequency3(db_it_verbs), ncol=2, nrow=3, common.legend = TRUE, legend="right")
# Adjusted R-squared of the simple linear regression aoa ~ FrequencyLogMean.
#
# db: table with numeric columns aoa and FrequencyLogMean.
# Returns a single numeric value.
regression_option1 <- function(db) {
  fit <- lm(aoa ~ FrequencyLogMean, data = db)
  fit_summary <- summary(fit)
  fit_summary[["adj.r.squared"]]
}
# Adjusted R-squared of aoa ~ FrequencyLogMean for every analysis table.
# French verbs (db_fr_verbs) are not included here.
it_nouns_R2<-regression_option1(db_it_nouns)
it_verbs_R2<-regression_option1(db_it_verbs)
it_adj_R2<-regression_option1(db_it_adj)
it_fw_R2<-regression_option1(db_it_fw)
it_other_R2<-regression_option1(db_it_other)
fr_nouns_R2<-regression_option1(db_fr_nouns)
fr_adj_R2<-regression_option1(db_fr_adj)
fr_fw_R2<-regression_option1(db_fr_fw)
fr_other_R2<-regression_option1(db_fr_other)
# Plot the predicted AoA as a function of FrequencyLogMean for each table.
# The same linear model as in regression_option1() is refitted for each plot.
ggpredict(lm(aoa~ FrequencyLogMean, data=db_it_nouns), c("FrequencyLogMean")) %>% plot()
ggpredict(lm(aoa~ FrequencyLogMean, data=db_it_verbs), c("FrequencyLogMean")) %>% plot()
ggpredict(lm(aoa~ FrequencyLogMean, data=db_it_adj), c("FrequencyLogMean")) %>% plot()
ggpredict(lm(aoa~ FrequencyLogMean, data=db_it_fw), c("FrequencyLogMean")) %>% plot()
ggpredict(lm(aoa~ FrequencyLogMean, data=db_it_other), c("FrequencyLogMean")) %>% plot()
ggpredict(lm(aoa~ FrequencyLogMean, data=db_fr_nouns), c("FrequencyLogMean")) %>% plot()
ggpredict(lm(aoa~ FrequencyLogMean, data=db_fr_adj), c("FrequencyLogMean")) %>% plot()
ggpredict(lm(aoa~ FrequencyLogMean, data=db_fr_fw), c("FrequencyLogMean")) %>% plot()
ggpredict(lm(aoa~ FrequencyLogMean, data=db_fr_other), c("FrequencyLogMean")) %>% plot()
#option2 <- glmer(value1 ~ aoa * FrequencyLogMean + (1|lexical_category), data=CHILDES_WB)
#anova(option2)
#qplot( x = CHILDES_WB$intercept, fill = CHILDES_WB$`value == "produces"`, geom = "histogram", main = "Frequency distribution for WB items at 17 months", xlab = "Frequency of WB items")
Adjusted R2 Italian nouns Adjusted R2 : 0.1730756 Italian adjectives Adjusted R2 : 0.2226383 Italian other Adjusted R2 : 0.4064931 Italian function words Adjusted R2 : -0.0159714
French nouns Adjusted R2 : 0.3402412 French adjectives Adjusted R2 : -0.0583493 French other Adjusted R2 : 0.0488026 French function words Adjusted R2 : -0.0342725
# For nouns and function words in each language: the product of the two
# split-half reliabilities (AoA x corpus frequency), followed by the adjusted
# R-squared of the AoA ~ frequency regression for the same class.
# NOTE(review): presumably the product serves as a ceiling against which the
# R-squared is judged -- confirm the intended interpretation.
# The "## [1]" lines are the values printed by the recorded run.

# French nouns
r_fr_nouns_aoa*r_fr_nouns
## [1] 0.8337529
fr_nouns_R2
## [1] 0.3402412
# Italian nouns
r_it_nouns_aoa*r_it_nouns
## [1] 0.7185229
it_nouns_R2
## [1] 0.1730756
# French function words
r_fr_fw_aoa*r_fr_fw
## [1] 0.6872054
fr_fw_R2
## [1] -0.03427254
# Italian function words
r_it_fw_aoa*r_it_fw
## [1] 0.8217032
it_fw_R2
## [1] -0.01597142