spacy_regression

Load saved CHILDES .csv corpus for a language (here French and Italian).

Rbind different corpora.

library(tidyverse)
library(lme4)
library(ggeffects)
library(wordbankr)
library(psych)
library(reshape2)
library(ggpubr)
library(rstatix)
library(data.table)
library(gridExtra)

clean_childes <- function(corpus_) {
  # Prepare a CHILDES token table for input-frequency counting.
  #
  # Args:
  #   corpus_: data frame with (at least) the columns `speaker_code`, `pos`
  #     and `lemma`.
  #
  # Returns:
  #   `corpus_` without the rows spoken by the target child ("CHI"), without
  #   punctuation tokens ("PUNCT"), and with `lemma` lower-cased.
  #
  # Disabled step, kept for reference: dropping every utterance that contains
  # an annotation marker (incomplete information).
  #   annot <- c("xxx", "yyy", "www", "-", "'")
  #   annotUttID <- unique(filter(corpus_, lemma %in% annot)$utterance_id)
  #   corpus_ <- filter(corpus_, !(utterance_id %in% annotUttID))
  corpus_ %>%
    filter(speaker_code != "CHI") %>% # keep only speech addressed to the child
    filter(pos != "PUNCT") %>%
    mutate(lemma = tolower(lemma))
}

load_data <- function(language_list,
                      data_dir = "/Users/lscpuser/Documents/fyssen-project") {
  # Read and clean the saved CHILDES token tables for the requested languages.
  #
  # Args:
  #   language_list: character vector (or list) of language names. Only
  #     "French" and "Italian" are recognised; other entries are ignored.
  #   data_dir: directory holding `french_1403.csv` and `italian_1403.csv`.
  #     Defaults to the location used in the original analysis.
  #
  # Returns:
  #   A named list of data.tables, with elements `french` and/or `italian`
  #   (always in that order), one per requested language.
  #
  # Raises:
  #   An error when none of the supported languages is requested.
  corpus_files <- c(French = "french_1403.csv", Italian = "italian_1403.csv")

  # Keep the fixed French/Italian order regardless of the order (or
  # duplication) of entries in `language_list`.
  requested <- names(corpus_files)[names(corpus_files) %in% language_list]
  if (length(requested) == 0) {
    stop(
      "No supported language requested; expected one or more of: ",
      paste(names(corpus_files), collapse = ", "),
      call. = FALSE
    )
  }

  corpora <- lapply(requested, function(lang) {
    raw <- read_csv(file.path(data_dir, corpus_files[[lang]]))
    data.table(clean_childes(raw))
  })
  names(corpora) <- tolower(requested)
  corpora
}

Clean CHILDES corpus.

Clean utterances by removing punctuation and target-child speech (the step that removes incomplete, annotation-marked utterances is currently commented out in `clean_childes`).

# `language_list` is not defined in the visible part of this document; fall
# back to the two languages analysed here unless it was set earlier.
if (!exists("language_list")) {
  language_list <- c("French", "Italian")
}

# Named list of cleaned corpora: corpus$french and corpus$italian.
corpus <- load_data(language_list)

# Column-wise summary of each cleaned corpus.
lapply(corpus, summary)

## $french
##        X1              text              lemma               lex           
##  Min.   :     11   Length:1716925     Length:1716925     Length:1716925    
##  1st Qu.: 755447   Class :character   Class :character   Class :character  
##  Median :1442050   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1435194                                                           
##  3rd Qu.:2100678                                                           
##  Max.   :2970632                                                           
##      pos                tag             dependency           morph          
##  Length:1716925     Length:1716925     Length:1716925     Length:1716925    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      prefix            prefix_              suffix            suffix_         
##  Min.   :7.187e+15   Length:1716925     Min.   :4.050e+02   Length:1716925    
##  1st Qu.:2.985e+18   Class :character   1st Qu.:5.902e+18   Class :character  
##  Median :9.153e+18   Mode  :character   Median :1.114e+19   Mode  :character  
##  Mean   :9.142e+18                      Mean   :1.038e+19                     
##  3rd Qu.:1.537e+19                      3rd Qu.:1.462e+19                     
##  Max.   :1.800e+19                      Max.   :1.844e+19                     
##    sentiment  utterance_id      target_child_id speaker_code      
##  Min.   :0   Min.   :17476553   Min.   :23235   Length:1716925    
##  1st Qu.:0   1st Qu.:17654464   1st Qu.:23275   Class :character  
##  Median :0   Median :17798673   Median :23312   Mode  :character  
##  Mean   :0   Mean   :17787554   Mean   :23319                     
##  3rd Qu.:0   3rd Qu.:17921273   3rd Qu.:23373                     
##  Max.   :0   Max.   :18149594   Max.   :23412                     
##  corpus_name        transcript_id     language        
##  Length:1716925     Min.   :44628   Length:1716925    
##  Class :character   1st Qu.:44898   Class :character  
##  Mode  :character   Median :45107   Mode  :character  
##                     Mean   :45032                     
##                     3rd Qu.:45175                     
##                     Max.   :45263                     
## 
## $italian
##        X1             text              lemma               lex           
##  Min.   :     4   Length:247618      Length:247618      Length:247618     
##  1st Qu.:118387   Class :character   Class :character   Class :character  
##  Median :225572   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :224898                                                           
##  3rd Qu.:333171                                                           
##  Max.   :437481                                                           
##      pos                tag             dependency           morph          
##  Length:247618      Length:247618      Length:247618      Length:247618     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      prefix            prefix_              suffix            suffix_         
##  Min.   :7.187e+15   Length:247618      Min.   :1.675e+16   Length:247618     
##  1st Qu.:2.985e+18   Class :character   1st Qu.:5.098e+18   Class :character  
##  Median :1.190e+19   Mode  :character   Median :9.232e+18   Mode  :character  
##  Mean   :1.020e+19                      Mean   :9.781e+18                     
##  3rd Qu.:1.560e+19                      3rd Qu.:1.405e+19                     
##  Max.   :1.800e+19                      Max.   :1.845e+19                     
##    sentiment  utterance_id     target_child_id speaker_code      
##  Min.   :0   Min.   :7388853   Min.   :14314   Length:247618     
##  1st Qu.:0   1st Qu.:7425971   1st Qu.:14341   Class :character  
##  Median :0   Median :7450137   Median :14377   Mode  :character  
##  Mean   :0   Mean   :7453151   Mean   :14373                     
##  3rd Qu.:0   3rd Qu.:7481870   3rd Qu.:14412                     
##  Max.   :0   Max.   :7514844   Max.   :14421                     
##  corpus_name        transcript_id     language        
##  Length:247618      Min.   :22462   Length:247618     
##  Class :character   1st Qu.:22498   Class :character  
##  Mode  :character   Median :22531   Mode  :character  
##                     Mean   :22538                     
##                     3rd Qu.:22583                     
##                     Max.   :22616

Measure raw frequency for CHILDES lemmas.

Count the number of times each child hears a lemma and divide it by the total number of lemma tokens that child hears. This creates a ‘rawFrequency’ column whose values sum to 1 within each target child.

#Count each lemma and sum of lemmas for each target child

frequency_raw <- function(corpus_) {
  # Per-child relative frequency of every lemma, plus its mean log-frequency.
  #
  # Args:
  #   corpus_: cleaned token table with columns `lemma`, `target_child_id`
  #     and `language` (one row per token).
  #
  # Returns:
  #   One row per (lemma, target_child_id, language) with the columns
  #     CountLemma          - tokens of the lemma heard by the child
  #     countAllLemmasChild - all lemma tokens heard by the child
  #     rawFrequency        - CountLemma / countAllLemmasChild
  #     FrequencyLog        - log(1 + 100 * rawFrequency)
  #     FrequencyLogMean    - mean FrequencyLog of the lemma over the children
  #                           for whom the lemma has a row
  #   The joins use the columns the tables have in common.

  # Token count of each lemma for each child.
  per_child <- corpus_ %>%
    group_by(lemma, target_child_id, language) %>%
    summarize(CountLemma = n())

  # Total number of lemma tokens for each child.
  child_totals <- per_child %>%
    group_by(target_child_id, language) %>%
    summarize(countAllLemmasChild = sum(CountLemma))

  # Relative frequency, then a log transform to avoid very small values.
  per_child <- per_child %>%
    left_join(child_totals) %>%
    mutate(
      rawFrequency = CountLemma / countAllLemmasChild,
      FrequencyLog = log(1 + rawFrequency * 100)
    )

  # Average the log-frequency of each lemma across children.
  lemma_means <- per_child %>%
    group_by(lemma, language) %>%
    summarize(FrequencyLogMean = mean(FrequencyLog))

  left_join(per_child, lemma_means)
}

Model frequency for CHILDES lemmas and get intercept.

Model frequency by corpus size, and get intercept coefficient as a proxy for frequency:

frequency_model <- function(corpus) {
  # Add a model-based frequency estimate to the per-child frequency table.
  #
  # For every (language, lemma) pair a linear model of the lemma's relative
  # frequency on the child's corpus size is fitted across children, and the
  # intercept is kept as the frequency proxy.
  #
  # Args:
  #   corpus: cleaned token table, passed unchanged to frequency_raw().
  #
  # Returns:
  #   The output of frequency_raw() with two extra columns joined on
  #   lemma/language: `data` (the nested per-lemma rows, kept so the column
  #   layout is unchanged) and `interceptmodel` (the fitted intercept).
  corpus_frequency <- frequency_raw(corpus)

  # The previous formula was `rawFrequency ~ (countAllLemmasChild -
  # CountLemma)`. In a formula `-` removes a term instead of subtracting, so
  # the fitted model was already `rawFrequency ~ countAllLemmasChild`; it is
  # written out explicitly here and yields the same intercepts.
  # NOTE(review): if the predictor was meant to be the number of *other*
  # tokens, use I(countAllLemmasChild - CountLemma) - that changes the
  # estimates, so confirm before switching.
  fit_intercept <- function(lemma_rows) {
    fit <- lm(rawFrequency ~ countAllLemmasChild, data = lemma_rows)
    coef(fit)[["(Intercept)"]]
  }

  # One row per (language, lemma); with a single child the intercept is simply
  # that child's rawFrequency (the slope is not estimable).
  intercepts <- corpus_frequency %>%
    group_by(language, lemma) %>%
    nest() %>%
    mutate(interceptmodel = map_dbl(data, fit_intercept))

  corpus_frequency %>%
    left_join(intercepts, by = c("lemma", "language"))
}

# Per-child lemma frequencies plus model intercepts, one table per language.
frequency_it <- frequency_model(corpus$italian)
frequency_fr <- frequency_model(corpus$french)

# TEST1 frequency metric: raw frequencies should sum to 1 for each child.
frequency_it %>%
  group_by(target_child_id) %>%
  summarize(sum(rawFrequency))

## # A tibble: 11 x 2
##    target_child_id `sum(rawFrequency)`
##  *           <dbl>               <dbl>
##  1           14314                   1
##  2           14325                   1
##  3           14333                   1
##  4           14341                   1
##  5           14357                   1
##  6           14377                   1
##  7           14406                   1
##  8           14408                   1
##  9           14412                   1
## 10           14415                   1
## 11           14421                   1

# TEST2 frequency metric: show the maximum log-frequency values first.
arrange(frequency_fr, desc(FrequencyLog))

## # A tibble: 40,897 x 10
## # Groups:   lemma, target_child_id [40,897]
##    lemma target_child_id language CountLemma countAllLemmasC… rawFrequency
##    <chr>           <dbl> <chr>         <int>            <int>        <dbl>
##  1 être            23381 French         3626            54442       0.0666
##  2 le              23373 French         3792            61746       0.0614
##  3 être            23312 French         9213           155399       0.0593
##  4 le              23235 French         7965           138177       0.0576
##  5 être            23259 French          329             5856       0.0562
##  6 être            23288 French         8709           156818       0.0555
##  7 le              23243 French        12613           227150       0.0555
##  8 être            23296 French         5872           106207       0.0553
##  9 être            23235 French         7370           138177       0.0533
## 10 le              23288 French         8280           156818       0.0528
## # … with 40,887 more rows, and 4 more variables: FrequencyLog <dbl>,
## #   FrequencyLogMean <dbl>, data <list>, interceptmodel <dbl>

WordBank lemmas and AoA.

Get wordbank data for language and aoa for each lemma:

# Lexical classes used to split the Wordbank items.
list_class <- c(
  "nouns",
  "verbs",
  "adjectives",
  "function_words",
  "other"
)

load_wordbank <- function(lang_) {
  # Download Wordbank "WS" word items for a language and attach fitted AoA.
  #
  # Args:
  #   lang_: Wordbank language name, e.g. "Italian" or "French (French)".
  #
  # Returns:
  #   A list of data frames, one per value of `lexical_class`, holding the
  #   instrument responses (rows with a missing `value` removed) joined with
  #   the fitted age of acquisition. The `definition` column is renamed
  #   `lemma`.

  # Word items only (grammar items are dropped).
  word_items <- get_item_data(language = lang_, form = "WS") %>%
    filter(type == "word")

  responses <- get_instrument_data(
    language = lang_,
    form = "WS",
    items = word_items$item_id,
    administrations = TRUE,
    iteminfo = TRUE
  )
  names(responses)[names(responses) == "definition"] <- "lemma"

  # Age at which 50% of children produce the item; 145 NAs out of 680.
  aoa_fit <- fit_aoa(
    responses,
    measure = "produces",
    method = "glmrob",
    proportion = 0.5
  )
  names(aoa_fit)[names(aoa_fit) == "definition"] <- "lemma"

  # Join on the columns the two tables share, then drop rows without a value.
  with_aoa <- left_join(unique(responses), unique(aoa_fit))
  with_aoa <- with_aoa[!is.na(with_aoa$value), ]

  split(with_aoa, with_aoa$lexical_class)
}

# Wordbank responses with AoA, split by lexical class, for each language.
aoa_it <- load_wordbank("Italian")
aoa_fr <- load_wordbank("French (French)")

Reliability_frequency: half-split and Spearman-Brown

same_size_df <- function(df1, df2) {
  # Pad `df2` with the entries of `df1` that it lacks, giving each a count of 1.
  #
  # Args:
  #   df1, df2: data frames with the columns `lemma`, `pos`, `CountLemma` and
  #     `name` (the "lemma - pos" label used to match entries).
  #
  # Returns:
  #   `df2` followed by one row per `name` found in `df1` but not in `df2`.
  #   In the added rows `pos` is NA, `CountLemma` is 1, and both `lemma` and
  #   `name` hold the label from `df1$name` (unchanged from the earlier
  #   implementation). When nothing is missing, `df2` is returned as is
  #   instead of failing on an empty padding table.
  missing_names <- setdiff(df1$name, df2$name)
  if (length(missing_names) == 0) {
    return(df2)
  }

  padding <- data.frame(
    lemma = missing_names,
    pos = NA,
    CountLemma = 1,
    name = missing_names,
    stringsAsFactors = FALSE
  )
  rbind(df2, padding)
}

split_half_cor <- function(dataAoa, corpus) {
  # Split-half reliability of lemma counts for the lemmas of one Wordbank class.
  #
  # Args:
  #   dataAoa: Wordbank table whose `lemma` column lists the lemmas to keep.
  #   corpus: cleaned token table with `lemma` and `pos` columns.
  #
  # Returns:
  #   Kendall correlation between the lemma/pos counts of two random halves of
  #   the corpus tokens. The split is random, so repeated calls differ.
  wb_lemmas <- unique(dataAoa$lemma)

  # Each token is assigned to the first half with probability 0.5.
  in_first <- sample(
    c(TRUE, FALSE),
    nrow(corpus),
    replace = TRUE,
    prob = c(0.5, 0.5)
  )
  first_tokens <- corpus[in_first, ]
  second_tokens <- corpus[!in_first, ]

  # Count tokens per lemma/pos, keep Wordbank lemmas, label and sort by label.
  tally_half <- function(tokens) {
    counts <- tokens %>%
      group_by(lemma, pos) %>%
      summarize(CountLemma = n())
    counts <- counts[counts$lemma %in% wb_lemmas, ]
    counts$name <- paste(counts$lemma, "-", counts$pos)
    counts[order(counts$name), ]
  }
  firsthalf <- tally_half(first_tokens)
  secondhalf <- tally_half(second_tokens)

  # Entries present in only one half are added to the other with a count of 1,
  # so both halves cover the same labels.
  secondhalf <- same_size_df(firsthalf, secondhalf)
  firsthalf <- same_size_df(secondhalf, firsthalf)

  # Re-sort so the two count vectors are aligned label by label.
  firsthalf <- firsthalf[order(firsthalf$name), ]
  secondhalf <- secondhalf[order(secondhalf$name), ]

  cor(firsthalf$CountLemma, secondhalf$CountLemma, method = "kendall")
}

sbformula <- function(r) {
  # Spearman-Brown adjustment of a split-half correlation: 2r / (1 + r).
  #
  # Args:
  #   r: numeric correlation(s) between the two halves.
  #
  # Returns:
  #   The adjusted reliability, same length as `r`.
  doubled <- 2 * r
  doubled / (1 + r)
}

# Split-half reliability of corpus frequency, per lexical class.
# Each call draws a fresh random split, so the call order is kept as is.

# French
r_fr_nouns <- split_half_cor(aoa_fr$nouns, corpus$french)
r_fr_adj <- split_half_cor(aoa_fr$adjectives, corpus$french)
r_fr_verbs <- split_half_cor(aoa_fr$verbs, corpus$french)
r_fr_other <- split_half_cor(aoa_fr$other, corpus$french)
r_fr_fw <- split_half_cor(aoa_fr$function_words, corpus$french)

# Italian
r_it_nouns <- split_half_cor(aoa_it$nouns, corpus$italian)
r_it_adj <- split_half_cor(aoa_it$adjectives, corpus$italian)
r_it_verbs <- split_half_cor(aoa_it$verbs, corpus$italian)
r_it_other <- split_half_cor(aoa_it$other, corpus$italian)
r_it_fw <- split_half_cor(aoa_it$function_words, corpus$italian)

French nouns: 0.9091857, 0.952433 French adjectives: 0.9089918, 0.9523266 French verbs: 0.9361389, 0.9670163 French other: 0.8936086,0.9438155 French function words: 0.922108,0.9594757

Italian nouns: 0.7785536, 0.8754908 Italian adjectives: 0.7861569, 0.8802776 Italian verbs: 0.8135557, 0.8971941 Italian other: 0.8562832,0.9225782 Italian function words: 0.9075423,0.9515305

Reliability_frequency: Cronbach’s alpha - treat each word as item and each child as subject

cronebach_alpha <- function(dataAoa, corpus_frequency_) {
  # Cronbach's alpha of lemma frequency across children for one Wordbank class.
  #
  # Args:
  #   dataAoa: Wordbank table whose `lemma` column lists the lemmas to keep.
  #   corpus_frequency_: per-child frequency table with the columns `lemma`,
  #     `rawFrequency` and `target_child_id`.
  #
  # Returns:
  #   The raw alpha (`total$raw_alpha` of the psych::alpha() result). The
  #   earlier code returned `a$raw_`, which matches no element of that result
  #   and was therefore always NULL.
  #
  # The table handed to psych::alpha() has one row per lemma and one column
  # per child, so each child is a column (an "item" in psych's terms).
  # NOTE(review): a lemma a child never heard is NA in the wide table, not 0;
  # confirm that this is the intended treatment.
  wb_lemmas <- unique(dataAoa$lemma)

  long <- corpus_frequency_ %>%
    ungroup() %>%
    select(lemma, rawFrequency, target_child_id)
  long <- long[long$lemma %in% wb_lemmas, ] # lemmas with a Wordbank item only

  # Reshape to wide: first column is the lemma, then one column per child.
  df <- data.frame(
    lemma_ = long$lemma,
    target_child_id_ = long$target_child_id,
    freq_ = long$rawFrequency
  )
  wide <- tidyr::spread(df, target_child_id_, freq_)

  # Every column except the lemma is a child. The earlier `3:ncol(...)`
  # started one column too late and silently dropped the first child.
  child_cols <- setdiff(colnames(wide), "lemma_")
  child <- wide[, child_cols, drop = FALSE]

  # Namespace-qualified because ggplot2 also exports a function named alpha().
  a <- psych::alpha(child)
  a$total$raw_alpha
}

# Cronbach's alpha of corpus frequency, per lexical class.

# French
r_fr_nouns_a <- cronebach_alpha(aoa_fr$nouns, frequency_fr)
r_fr_adj_a <- cronebach_alpha(aoa_fr$adjectives, frequency_fr)
r_fr_verbs_a <- cronebach_alpha(aoa_fr$verbs, frequency_fr)
r_fr_other_a <- cronebach_alpha(aoa_fr$other, frequency_fr)
r_fr_fw_a <- cronebach_alpha(aoa_fr$function_words, frequency_fr)

# Italian (nouns only)
r_it_nouns_a <- cronebach_alpha(aoa_it$nouns, frequency_it)

French nouns a: ,
French adjectives a: , French verbs a: ,
French other a: , French function words a: ,

Italian nouns a: ,

Reliability_AoA:

split_half_cor_aoa <- function(lang_, clas_) {
  # Split-half reliability of the fitted AoA for one lexical class.
  #
  # Args:
  #   lang_: Wordbank language name, e.g. "Italian".
  #   clas_: value of `lexical_class` to keep, e.g. "nouns".
  #
  # Returns:
  #   Kendall correlation (complete observations only) between the AoA
  #   estimates fitted on two random halves of the administrations. The split
  #   is random, so repeated calls differ.
  #   NOTE(review): the two AoA vectors are paired by row position; this
  #   assumes both fits return the same items in the same order - confirm.

  # Word items of the requested lexical class, as bare item numbers.
  class_items <- get_item_data(language = lang_, form = "WS") %>%
    filter(type == "word" & lexical_class == clas_)
  item_numbers <- gsub(
    pattern = "item_",
    replacement = "",
    x = unique(class_items$item_id),
    fixed = TRUE
  )

  # All responses of the instrument, restricted to those items.
  responses <- get_instrument_data(
    language = lang_,
    form = "WS",
    administrations = TRUE
  ) %>%
    filter(num_item_id %in% item_numbers)

  # Each administration goes to the first half with probability 0.5.
  admin_ids <- unique(responses$data_id)
  in_first <- sample(
    c(TRUE, FALSE),
    length(admin_ids),
    replace = TRUE,
    prob = c(0.5, 0.5)
  )
  first_admins <- admin_ids[in_first]
  second_admins <- admin_ids[!in_first]

  first_responses <- filter(responses, data_id %in% first_admins)
  second_responses <- filter(responses, data_id %in% second_admins)

  # AoA fitted separately on each half.
  aoa_first <- fit_aoa(first_responses, method = "glmrob", proportion = 0.5)
  aoa_second <- fit_aoa(second_responses, method = "glmrob", proportion = 0.5)

  cor(aoa_first$aoa, aoa_second$aoa, use = "complete.obs", method = "kendall")
}

# Split-half reliability of AoA, per lexical class.
# Each call draws a fresh random split, so the call order is kept as is.

# French (the verbs call is disabled)
r_fr_nouns_aoa <- split_half_cor_aoa("French (French)", "nouns")
r_fr_adj_aoa <- split_half_cor_aoa("French (French)", "adjectives")
# r_fr_verbs_aoa <- split_half_cor_aoa("French (French)", "verbs")
r_fr_other_aoa <- split_half_cor_aoa("French (French)", "other")
r_fr_fw_aoa <- split_half_cor_aoa("French (French)", "function_words")

# Italian
r_it_nouns_aoa <- split_half_cor_aoa("Italian", "nouns")
r_it_adj_aoa <- split_half_cor_aoa("Italian", "adjectives")
r_it_verbs_aoa <- split_half_cor_aoa("Italian", "verbs")
r_it_other_aoa <- split_half_cor_aoa("Italian", "other")
r_it_fw_aoa <- split_half_cor_aoa("Italian", "function_words")

French nouns Aoa: 0.9170325, 0.9567209 French adjectives AoA: 0.8930176, 0.9434858 French other AoA: 0.9136377,0.9548701 French function words AoA: 0.7452548,0.8540355

Italian nouns Aoa: 0.9228946, 0.9599014 Italian adjectives AoA: 0.9383637, 0.9682019 Italian verbs AoA: 0.9187637, 0.9576621 Italian other AoA: 0.8921086,0.9429782 Italian function words AoA: 0.9054159,0.9503604

merge CHILDES and WORDBANK

Merge aoa and childes db, get final db “CHILDES_WB_short”:

mergeAoaChildes <- function(dataAoa, corpus_frequency_) {
  # Combine Wordbank AoA with CHILDES frequency, one row per lemma/item.
  #
  # Args:
  #   dataAoa: Wordbank table for one lexical class, with the columns `lemma`,
  #     `lexical_category`, `aoa` and `num_item_id`.
  #   corpus_frequency_: frequency table with the columns `lemma`, `language`,
  #     `FrequencyLogMean` and `interceptmodel`.
  #
  # Returns:
  #   Data frame with the columns lemma, FrequencyLogMean, interceptmodel,
  #   lexical_category, aoa and num_item_id; duplicate rows and rows without
  #   an AoA are removed.
  wb_lemmas <- unique(dataAoa$lemma)

  # Frequency information for the lemmas that exist in Wordbank, reduced to
  # the columns that matter and de-duplicated.
  lemma_frequency <- corpus_frequency_[corpus_frequency_$lemma %in% wb_lemmas, ] %>%
    ungroup() %>%
    select(lemma, language, FrequencyLogMean, interceptmodel) %>%
    unique()

  # Number of lemmas after merging: 474.
  merged <- merge(x = dataAoa, y = lemma_frequency, by = "lemma")

  merged_short <- unique(
    select(
      merged,
      lemma, FrequencyLogMean, interceptmodel, lexical_category, aoa,
      num_item_id
    )
  )

  merged_short[!is.na(merged_short$aoa), ]
}

# French: merged frequency/AoA tables per lexical class.
db_fr_nouns <- mergeAoaChildes(aoa_fr$nouns, frequency_fr)
db_fr_verbs <- mergeAoaChildes(aoa_fr$verbs, frequency_fr)
db_fr_adj <- mergeAoaChildes(aoa_fr$adjectives, frequency_fr)
db_fr_adj

##          lemma FrequencyLogMean interceptmodel lexical_category aoa num_item_id
## 1    attention      0.137263254   9.370544e-04       predicates  25         419
## 356       bien      0.415506458   8.158470e-03       predicates  27         425
## 769       bleu      0.054982879   5.779297e-04       predicates  26         428
## 1076     cassé      0.025427159   3.187934e-04       predicates  22         431
## 1751       dur      0.025259731   2.205437e-05       predicates  26         440
## 2232   fatigué      0.001476048   2.057404e-05       predicates  27         443
## 2509     froid      0.031991488   5.152067e-04       predicates  23         446
## 2945     jaune      0.047214506   5.768112e-04       predicates  27         450
## 3224      joli      0.050884202   7.027436e-04       predicates  26         451
## 3535    malade      0.017539046   4.384274e-04       predicates  27         455
## 4298      noir      0.019546901   2.178486e-04       predicates  29         463
## 4536    orange      0.022223556   2.706328e-04       predicates  29         464
## 4782     parti      0.003713586   5.507267e-05       predicates  22         465
## 5231    propre      0.017563599   3.976709e-04       predicates  27         470
## 5510     rouge      0.071982186   1.257882e-03       predicates  26         471
## 5823      sale      0.016988734   2.415072e-04       predicates  23         472
## 6666    triste      0.005230208   5.890709e-05       predicates  30         477
## 7073      vite      0.045267476   3.081426e-04       predicates  27         483

# French: "other" class.
db_fr_other <- mergeAoaChildes(aoa_fr$other, frequency_fr)
db_fr_other

##          lemma FrequencyLogMean interceptmodel lexical_category aoa num_item_id
## 1          aie     0.0014554610   1.427080e-05            other  19           1
## 1051     après     0.1515661777   1.349419e-03            other  27         586
## 1642      bain     0.0408513657   3.355171e-04            other  19          16
## 2162      bébé     0.0890503501   1.111531e-03            other  18         391
## 2739   bonjour     0.0312476463   3.742174e-04            other  22          17
## 3191     bravo     0.0872096653   1.699627e-03            other  19          19
## 4028      chut     0.0182661550   2.054636e-04            other  21          21
## 4806     clown     0.0167824724   5.508127e-04            other  26         392
## 5136  cocorico     0.0015341776   2.851071e-05            other  30           4
## 5394    coucou     0.0557649675   6.703171e-04            other  18          22
## 6116    crèche     0.0155480380   4.968859e-04            other  29         184
## 6364      dame     0.0115265254   4.397470e-05            other  25         394
## 6722    dehors     0.0161243100   2.134732e-04            other  23         185
## 7346    demain     0.0166755362   1.703467e-04            other  28         590
## 7757   docteur     0.0195588967   4.144584e-04            other  26         395
## 8092     école     0.0483364719   8.096942e-04            other  25         186
## 8599    enfant     0.0392454472   6.486279e-04            other  28         396
## 9441     fille     0.0391946223   6.066743e-04            other  25         398
## 9786     forêt     0.0051735161   6.170795e-05            other  30         190
## 9968     frère     0.0234439532   4.191399e-04            other  30         399
## 10192   garçon     0.0351677325   6.467403e-04            other  25         400
## 10916   goûter     0.0280264382   6.277802e-04            other  26          27
## 11810    grrrr     0.0009068303   2.432962e-04            other  25           6
## 12487     jour     0.0289332472   2.505931e-04            other  30         593
## 12694  magasin     0.0043083112   5.888421e-05            other  28         192
## 13163   maison     0.0583039890   6.426059e-04            other  23         193
## 14227    matin     0.0264098573   1.876658e-04            other  30         595
## 14430    merci     0.0938108370   1.149518e-03            other  18          29
## 14976     meuh     0.0067746022   1.436260e-04            other  18           7
## 15482    miaou     0.0060345689   9.303134e-05            other  19           9
## 15966 monsieur     0.0624028506   1.252204e-03            other  24         407
## 17186     nuit     0.0172615812   3.329053e-04            other  26         596
## 17696      oui     0.7822438131   1.326424e-02            other  20          32
## 18837     parc     0.0094040014   2.569022e-04            other  28         194
## 19299    plage     0.0032233035   1.332759e-05            other  29         196
## 19707  pompier     0.0103292881   1.997968e-04            other  28         416
## 19968    salut     0.0067356567   1.110628e-04            other  28          35
## 20258   sieste     0.0089311129   1.853827e-04            other  27          36
## 20954  travail     0.0129630873   2.279375e-04            other  26         199

# French: function words.
db_fr_fw <- mergeAoaChildes(aoa_fr$function_words, frequency_fr)
db_fr_fw

##          lemma FrequencyLogMean interceptmodel lexical_category aoa num_item_id
## 1            à      0.598376262   7.081328e-03   function_words  26         622
## 346      aller      0.999844152   1.775329e-02   function_words  27         598
## 869      aussi      0.182842101   2.105051e-03   function_words  28         649
## 1135     autre      0.213564269   2.513844e-03   function_words  30         650
## 1372      avec      0.325474003   4.549970e-03   function_words  28         630
## 1644        ça      0.036328125   4.392286e-04   function_words  25         665
## 2413      chez      0.041025686   4.343159e-04   function_words  28         631
## 2850      dans      0.410794314   5.532106e-03   function_words  26         632
## 3180        de      1.125054220   2.374640e-02   function_words  29         633
## 3425    dehors      0.016124310   2.134732e-04   function_words  25         634
## 3802  derrière      0.026845040   3.096356e-04   function_words  28         635
## 4203      elle      0.002310198   2.946515e-05   function_words  30         668
## 4421    encore      0.205448418   2.141529e-03   function_words  20         653
## 4958        et      0.869845097   1.602332e-02   function_words  28         687
## 5714      fait      0.073023150   7.606505e-04   function_words  28         607
## 5986       ici      0.075186547   9.055816e-04   function_words  26         638
## 6312        il      1.121415487   1.833189e-02   function_words  29         670
## 6547        je      0.892414340   1.260185e-02   function_words  30         672
## 6770        là      0.821261858   1.148032e-02   function_words  21         639
## 7383      loin      0.019324151   2.119081e-04   function_words  28         641
## 7637       lui      0.706998960   9.703350e-03   function_words  30         674
## 8035       moi      0.353769278   4.837002e-03   function_words  25         676
## 8871        où      0.330531743   4.104750e-03   function_words  25         617
## 9243       pas      0.958674305   1.606340e-02   function_words  22         656
## 9709      pour      0.334025982   4.064731e-03   function_words  28         643
## 9960  pourquoi      0.099399869   1.126199e-03   function_words  28         618
## 10369      qui      0.486452305   6.813911e-03   function_words  28         620
## 10645     quoi      0.375288566   5.200633e-03   function_words  26         621
## 11192     sous      0.028613700   4.268765e-04   function_words  29         645
## 11417      sur      0.249277162   3.254560e-03   function_words  27         646
## 11842  vouloir      0.547763634   7.799171e-03   function_words  30         614

# Italian: merged frequency/AoA tables per lexical class.
db_it_nouns <- mergeAoaChildes(aoa_it$nouns, frequency_it)
db_it_verbs <- mergeAoaChildes(aoa_it$verbs, frequency_it)
db_it_adj <- mergeAoaChildes(aoa_it$adjectives, frequency_it)
db_it_adj

##              lemma FrequencyLogMean interceptmodel lexical_category aoa
## 1     addormentato      0.004596719   9.005842e-05       predicates  29
## 753           alto      0.025062460   7.166983e-05       predicates  25
## 1505         amaro      0.014263614   3.605804e-04       predicates  31
## 2257     arancione      0.040020598  -1.452827e-04       predicates  30
## 3009    arrabbiato      0.008127064   1.249584e-04       predicates  27
## 3761      asciutto      0.006160739   1.244247e-04       predicates  27
## 4513       attento      0.046821428   6.525205e-04       predicates  29
## 5265       bagnato      0.021042998   3.815002e-04       predicates  24
## 6017         bello      0.197788404   2.596453e-03       predicates  20
## 6769        bianco      0.045363704   3.328443e-04       predicates  26
## 7521           blu      0.029461868  -5.241462e-05       predicates  25
## 8273        brutto      0.033249824   3.546348e-04       predicates  21
## 9025          buio      0.013931490   1.505579e-04       predicates  22
## 9777         buono      0.067095096   7.006293e-04       predicates  22
## 10529        caldo      0.023497030   3.899878e-04       predicates  22
## 11281       carino      0.004313898   2.035831e-05       predicates  31
## 12033      cattivo      0.031166380   4.018006e-04       predicates  24
## 12785     contento      0.019213151   2.653005e-04       predicates  32
## 13537        corto      0.004983072   7.189403e-05       predicates  32
## 15041        dolce      0.016213803   2.785136e-04       predicates  27
## 15793         duro      0.009434323   5.777302e-05       predicates  28
## 16545       felice      0.005786867   1.148014e-04       predicates  32
## 17297       ferito      0.001326524   1.327404e-05       predicates  34
## 18049       finito      0.028672135   2.221979e-04       predicates  26
## 18801        forte      0.049944559   6.852048e-04       predicates  28
## 19553       freddo      0.007920054   1.405329e-04       predicates  23
## 20305      gentile      0.006904879   1.335528e-04       predicates  34
## 21057       giallo      0.052780705  -5.541887e-04       predicates  25
## 21809      leggero      0.019822200   1.866329e-04       predicates  33
## 22561        lento      0.001326524   1.327404e-05       predicates  34
## 23313        lungo      0.064604026   9.339161e-04       predicates  28
## 24065       malato      0.020486344   3.949564e-04       predicates  28
## 24817      marrone      0.028958927   3.212651e-04       predicates  31
## 25569      morbido      0.032777073   5.982313e-04       predicates  30
## 26321         nero      0.041683955   3.591475e-04       predicates  28
## 27073        nuovo      0.042920087   2.736452e-04       predicates  28
## 27825        piano      0.007932877   7.964426e-05       predicates  26
## 28577      piccolo      0.076396816   6.766792e-04       predicates  23
## 29329        pieno      0.013852750   9.354978e-05       predicates  29
## 30081       povero      0.032520483   2.240110e-04       predicates  33
## 30833       pulito      0.011548966   2.033521e-04       predicates  25
## 31585        rosso      0.067919229   1.049582e-04       predicates  24
## 32337        rotto      0.053233260   7.114556e-04       predicates  21
## 33089    sbagliato      0.003144953   3.149904e-05       predicates  31
## 33841   spaventato      0.006653384   6.675567e-05       predicates  33
## 34593       sporco      0.018276985   2.706008e-04       predicates  22
## 35345       stanco      0.013066198   1.932643e-04       predicates  26
## 36097      sveglio      0.026107485   5.587303e-04       predicates  28
## 36849       ultimo      0.005934764   3.027159e-05       predicates  34
## 37601      vecchio      0.017532868   3.421765e-04       predicates  31
## 38353        verde      0.039187319   1.837683e-05       predicates  26
## 39105        vuoto      0.010384558   1.370612e-04       predicates  28
##       num_item_id
## 1             504
## 753           505
## 1505          506
## 2257          507
## 3009          508
## 3761          509
## 4513          510
## 5265          511
## 6017          512
## 6769          513
## 7521          514
## 8273          515
## 9025          516
## 9777          517
## 10529         518
## 11281         520
## 12033         521
## 12785         522
## 13537         523
## 15041         525
## 15793         526
## 16545         527
## 17297         528
## 18049         529
## 18801         530
## 19553         531
## 20305         532
## 21057         533
## 21809         535
## 22561         536
## 23313         537
## 24065         538
## 24817         539
## 25569         540
## 26321         541
## 27073         542
## 27825         545
## 28577         546
## 29329         547
## 30081         548
## 30833         549
## 31585         550
## 32337         551
## 33089         553
## 33841         556
## 34593         557
## 35345         558
## 36097         560
## 36849         562
## 37601         563
## 38353         565
## 39105         566

# Build the table for the Italian "other" items by passing the AoA items and
# the Italian frequency table to mergeAoaChildes(), then print the result.
db_it_other <- mergeAoaChildes(aoa_it$other, frequency_it)
db_it_other

##              lemma FrequencyLogMean interceptmodel lexical_category aoa
## 1            asilo      0.036044244   4.721329e-04            other  24
## 753            bar      0.019952716   4.592303e-04            other  29
## 1505         bosco      0.012504391   1.417388e-04            other  29
## 2257         bravo      0.171609239   1.709279e-03            other  21
## 3761      campagna      0.015291155   6.581188e-05            other  32
## 4513          casa      0.110395214   1.437522e-03            other  21
## 5265        chiesa      0.010498481   2.475443e-04            other  30
## 6769         città      0.014327904   1.758433e-04            other  32
## 7521       coccodè      0.007896842   1.518153e-04            other  21
## 8273        domani      0.034065088   4.019861e-04            other  26
## 9025         donna      0.011886646   1.698930e-04            other  33
## 9777       dottore      0.026658323   5.525189e-04            other  24
## 10529        festa      0.010485374   9.805064e-05            other  27
## 11281     fratello      0.016311700   2.589707e-04            other  31
## 12033     giardino      0.020516003   3.382131e-04            other  27
## 12785       giorno      0.040388542   3.715680e-04            other  29
## 13537      giostra      0.001326524   1.327404e-05            other  26
## 14289         ieri      0.041597656   4.335704e-04            other  31
## 15041       lavoro      0.008238747   1.517020e-04            other  25
## 16545         mare      0.089601214   1.371452e-03            other  21
## 17297      mattina      0.021013316   3.555752e-04            other  30
## 18049      mercato      0.005304219   5.622189e-05            other  30
## 19553     montagna      0.026023894   3.546883e-04            other  29
## 21809      negozio      0.001326524   1.327404e-05            other  29
## 24817        notte      0.014549982   1.049265e-04            other  26
## 25569         oggi      0.044771672   3.838566e-04            other  29
## 26321     ospedale      0.009559563   1.499621e-04            other  31
## 27825   poliziotto      0.006122968   6.141752e-05            other  31
## 28577   pomeriggio      0.001326524   1.327404e-05            other  34
## 29329       presto      0.008325382   3.739995e-04            other  30
## 30081       scuola      0.052558264   8.030118e-04            other  24
## 30833         sera      0.021862317   2.903378e-04            other  30
## 32337      soldato      0.012909772   2.612284e-04            other  36
## 33089      sorella      0.006683515   1.002088e-05            other  32
## 33841     spiaggia      0.007175454   8.622374e-05            other  29
## 34593 supermercato      0.001326524   1.327404e-05            other  30
## 35345         uomo      0.040327415   8.767560e-04            other  31
## 36097          via      0.241581043   2.941935e-03            other  19
## 36849       vigile      0.008896778   1.280033e-04            other  31
## 38353          zio      0.065971284   1.249610e-03            other  19
## 39105          zoo      0.013924973   9.582252e-05            other  32
##       num_item_id
## 1             325
## 753           326
## 1505          327
## 2257          378
## 3761          328
## 4513          329
## 5265          330
## 6769          332
## 7521          194
## 8273          568
## 9025          353
## 9777          354
## 10529         333
## 11281         355
## 12033         334
## 12785         570
## 13537         335
## 14289         571
## 15041         336
## 16545         337
## 17297         572
## 18049         338
## 19553         339
## 21809         340
## 24817         573
## 25569         574
## 26321         341
## 27825         366
## 28577         575
## 29329         576
## 30081         343
## 30833         577
## 32337         370
## 33089         371
## 33841         344
## 34593         345
## 35345         372
## 36097         398
## 36849         373
## 38353         375
## 39105         346

# Build the table for the Italian function-word items by passing the AoA items
# and the Italian frequency table to mergeAoaChildes(), then print the result.
# NOTE(review): the printed result lists lemma "e" twice (num_item_id 638 and
# 652) with identical frequency values -- confirm this duplication is intended
# before the table is used for plotting or regression.
db_it_fw <- mergeAoaChildes(aoa_it$function_words, frequency_it)
db_it_fw

##         lemma FrequencyLogMean interceptmodel lexical_category aoa num_item_id
## 1           a      0.875920981   1.520108e-02   function_words  26         610
## 753       che      1.049254052   2.104496e-02   function_words  30         596
## 1505      chi      0.539141768   8.602536e-03   function_words  27         603
## 2257       ci      0.515593130   7.067654e-03   function_words  33         593
## 3009     come      0.626041950   9.114532e-03   function_words  31         604
## 3761      con      0.386757129   4.155337e-03   function_words  28         613
## 4513     così      0.262457972   2.799918e-03   function_words  29         651
## 5265       da      0.235618427   2.631727e-03   function_words  27         611
## 6017  davanti      0.017541638   2.576666e-04   function_words  30         624
## 6769   dentro      0.235308611   2.857147e-03   function_words  27         620
## 7521       di      0.607931639   8.052515e-03   function_words  25         609
## 8273   dietro      0.029341518   2.870030e-04   function_words  29         625
## 9025     dove      0.455214158   6.606138e-03   function_words  27         605
## 9777        e      1.097594481   2.129571e-02   function_words  26         638
## 9780        e      1.097594481   2.129571e-02   function_words  27         652
## 11281    ecco      0.271886870   2.842194e-03   function_words  23         660
## 12033   fuori      0.104895533   1.402436e-03   function_words  25         621
## 12785     giù      0.059061199   4.354619e-04   function_words  23         615
## 13537      il      1.633086989   4.365417e-02   function_words  28         626
## 14289      in      0.364076692   3.522000e-03   function_words  31         612
## 15041      io      0.327471159   4.255063e-03   function_words  21         579
## 15793      la      1.451234715   3.560798e-02   function_words  26         628
## 16545     lei      0.084878496   1.117875e-03   function_words  32         582
## 17297      lo      0.063386184   1.003811e-03   function_words  29         627
## 18049 lontano      0.009306079   7.083074e-05   function_words  29         622
## 18801    loro      0.064918040   9.564582e-04   function_words  35         585
## 19553     lui      0.116291219   1.312676e-03   function_words  30         581
## 20305      ma      0.659845246   1.086208e-02   function_words  31         653
## 21057   molto      0.212765819   2.736212e-03   function_words  30         662
## 21809 nessuno      0.016320038   2.082957e-04   function_words  30         663
## 22561  niente      0.081139531   9.974170e-04   function_words  27         664
## 23313     noi      0.043998556   5.685382e-04   function_words  31         583
## 24065     per      0.285452224   3.447167e-03   function_words  30         616
## 24817    poco      0.023979613   1.374663e-04   function_words  24         665
## 25569   quale      0.094342866   1.422346e-03   function_words  31         607
## 26321  quando      0.184732854   2.456190e-03   function_words  31         608
## 27073      se      0.287755796   2.598505e-03   function_words  34         656
## 27825      si      0.765124391   1.226932e-02   function_words  29         595
## 28577   sopra      0.067055126   7.508139e-04   function_words  26         618
## 29329   sotto      0.066725183   4.618105e-04   function_words  25         619
## 30081      su      0.156877035   1.707746e-03   function_words  26         614
## 30833   tanto      0.073912172   4.174869e-04   function_words  24         667
## 31585  troppo      0.048910912   5.569208e-04   function_words  31         668
## 32337      tu      0.191673021   1.668384e-03   function_words  24         580
## 33089   tutto      0.459518294   5.737280e-03   function_words  25         669
## 34593  vicino      0.038381948   3.747489e-04   function_words  29         623

Plot frequency

Plot age of acquisition (aoa) against word frequency, using either the mean log frequency (FrequencyLogMean) or the model-intercept frequency (interceptmodel):

# Scatter plot of `aoa` against `FrequencyLogMean`, with every point labelled
# by its lemma.
#
# Arguments:
#   db        Data frame with the columns `FrequencyLogMean`, `aoa` and `lemma`.
#   x_limits  Numeric vector of length 2 with the x-axis limits, or NULL to let
#             ggplot2 derive the range from the data. Rows whose
#             `FrequencyLogMean` falls outside the limits are removed from the
#             plot (ggplot2 emits a warning). The default, c(0, 0.6), keeps the
#             window that used to be hard-coded; pass a wider range (or NULL)
#             for tables with larger frequencies.
#
# Returns: a ggplot object.
plot_frequency1 <- function(db, x_limits = c(0, 0.6)) {
  ggplot(db, aes(x = FrequencyLogMean, y = aoa, label = lemma)) +
    geom_point() +
    # geom_text() inherits `label = lemma` from the plot-level mapping, so the
    # mapping does not need to be repeated here.
    geom_text(hjust = 0, vjust = 0) +
    scale_x_continuous(limits = x_limits)
  # + facet_wrap(~lexical_category, nrow = 2)
}

# Scatter plot of `aoa` against `interceptmodel`, with every point labelled by
# its lemma. `db` must provide the columns `interceptmodel`, `aoa` and `lemma`.
# Returns a ggplot object.
plot_frequency2 <- function(db) {
  point_mapping <- aes(x = interceptmodel, y = aoa, label = lemma)
  base_plot <- ggplot(data = db, mapping = point_mapping)

  base_plot +
    geom_point() +
    geom_text(mapping = aes(label = lemma), hjust = 0, vjust = 0)
  # + facet_wrap(~lexical_category, nrow = 2)
}

# Scatter plot of `FrequencyLogMean` (y) against `interceptmodel` (x) with a
# linear-model smoother on top. `db` must provide the columns `interceptmodel`,
# `FrequencyLogMean` and `lemma`. Returns a ggplot object.
plot_frequency3 <- function(db) {
  freq_mapping <- aes(x = interceptmodel, y = FrequencyLogMean, label = lemma)

  ggplot(data = db, mapping = freq_mapping) +
    geom_point() +
    geom_smooth(method = "lm")
  # + facet_wrap(~lexical_category)
}

# French tables: aoa vs. mean log frequency, one panel per table.
ggarrange(
  plotlist = lapply(
    list(db_fr_nouns, db_fr_adj, db_fr_fw, db_fr_other),
    plot_frequency1
  ),
  ncol = 2, nrow = 2, common.legend = TRUE, legend = "right"
)

# Italian tables: aoa vs. mean log frequency, one panel per table.
ggarrange(
  plotlist = lapply(
    list(db_it_nouns, db_it_adj, db_it_fw, db_it_other, db_it_verbs),
    plot_frequency1
  ),
  ncol = 2, nrow = 3, common.legend = TRUE, legend = "right"
)

# French tables: mean log frequency vs. model-intercept frequency.
# NOTE(review): four panels on a 2 x 3 grid leave the last row empty, while the
# first French figure uses nrow = 2 -- confirm whether nrow = 2 was intended.
ggarrange(
  plotlist = lapply(
    list(db_fr_nouns, db_fr_adj, db_fr_fw, db_fr_other),
    plot_frequency3
  ),
  ncol = 2, nrow = 3, common.legend = TRUE, legend = "right"
)

# Italian tables: mean log frequency vs. model-intercept frequency.
ggarrange(
  plotlist = lapply(
    list(db_it_nouns, db_it_adj, db_it_fw, db_it_other, db_it_verbs),
    plot_frequency3
  ),
  ncol = 2, nrow = 3, common.legend = TRUE, legend = "right"
)

Regression

# Fit the simple linear model aoa ~ FrequencyLogMean on `db` and return the
# adjusted R-squared of that fit as a single number.
# `db` must provide the columns `aoa` and `FrequencyLogMean`.
regression_option1 <- function(db) {
  freq_fit <- lm(aoa ~ FrequencyLogMean, data = db)
  fit_summary <- summary(freq_fit)
  fit_summary$adj.r.squared
}

# Adjusted R2 of aoa ~ FrequencyLogMean for each Italian table.
it_nouns_R2 <- regression_option1(db_it_nouns)
it_verbs_R2 <- regression_option1(db_it_verbs)
it_adj_R2   <- regression_option1(db_it_adj)
it_fw_R2    <- regression_option1(db_it_fw)
it_other_R2 <- regression_option1(db_it_other)

# Adjusted R2 of aoa ~ FrequencyLogMean for each French table.
fr_nouns_R2 <- regression_option1(db_fr_nouns)
fr_adj_R2   <- regression_option1(db_fr_adj)
fr_fw_R2    <- regression_option1(db_fr_fw)
fr_other_R2 <- regression_option1(db_fr_other)

# Predicted aoa as a function of FrequencyLogMean, one plot per Italian table.
ggpredict(
  lm(aoa ~ FrequencyLogMean, data = db_it_nouns),
  terms = "FrequencyLogMean"
) %>%
  plot()

ggpredict(
  lm(aoa ~ FrequencyLogMean, data = db_it_verbs),
  terms = "FrequencyLogMean"
) %>%
  plot()

ggpredict(
  lm(aoa ~ FrequencyLogMean, data = db_it_adj),
  terms = "FrequencyLogMean"
) %>%
  plot()

ggpredict(
  lm(aoa ~ FrequencyLogMean, data = db_it_fw),
  terms = "FrequencyLogMean"
) %>%
  plot()

ggpredict(
  lm(aoa ~ FrequencyLogMean, data = db_it_other),
  terms = "FrequencyLogMean"
) %>%
  plot()

# Predicted aoa as a function of FrequencyLogMean, one plot per French table.
ggpredict(
  lm(aoa ~ FrequencyLogMean, data = db_fr_nouns),
  terms = "FrequencyLogMean"
) %>%
  plot()

ggpredict(
  lm(aoa ~ FrequencyLogMean, data = db_fr_adj),
  terms = "FrequencyLogMean"
) %>%
  plot()

ggpredict(
  lm(aoa ~ FrequencyLogMean, data = db_fr_fw),
  terms = "FrequencyLogMean"
) %>%
  plot()

ggpredict(
  lm(aoa ~ FrequencyLogMean, data = db_fr_other),
  terms = "FrequencyLogMean"
) %>%
  plot()

#option2 <- glmer(value1 ~ aoa * FrequencyLogMean + (1|lexical_category), data=CHILDES_WB) 
#anova(option2)

#qplot( x = CHILDES_WB$intercept, fill = CHILDES_WB$`value == "produces"`, geom = "histogram", main = "Frequency distribution for WB items at 17 months",  xlab = "Frequency of WB items")

Adjusted R2

Italian adjusted R2 — nouns: 0.1730756; adjectives: 0.2226383; other: 0.4064931; function words: -0.0159714.

French adjusted R2 — nouns: 0.3402412; adjectives: -0.0583493; other: 0.0488026; function words: -0.0342725.

# For nouns and function words in each language, print the product of the two
# r_* values (defined earlier in the file; presumably correlation coefficients
# -- TODO confirm) next to the adjusted R2 of the aoa ~ FrequencyLogMean fit.
r_fr_nouns_aoa * r_fr_nouns

## [1] 0.8337529

fr_nouns_R2

## [1] 0.3402412

r_it_nouns_aoa * r_it_nouns

## [1] 0.7185229

it_nouns_R2

## [1] 0.1730756

r_fr_fw_aoa * r_fr_fw

## [1] 0.6872054

fr_fw_R2

## [1] -0.03427254

r_it_fw_aoa * r_it_fw

## [1] 0.8217032

it_fw_R2

## [1] -0.01597142