# childes_pipeline
#
# Source: https://rpubs.com/gloukatou/739504

library(tidyverse)
library(childesr)
library(data.table)
library(stringr)

# Global placeholder for the target word; get_data() treats an empty string
# as "no specific word requested".
# NOTE(review): the functions visible in this file take `word` as an
# argument, so this global does not appear to be read by them - confirm
# before removing.
word <- ""

# Fetch CHILDES data with get_data() and assemble the requested per-word
# metrics into a single table keyed by `gloss`.
#
# Args:
#   args_:     named list forwarded to get_data() through do.call(); its
#              "lang" entry is also used to label the output rows.
#   column:    column name forwarded to frequency() and mlu().
#   freq:      if TRUE, include the columns returned by frequency().
#   uttlength: if TRUE, include the columns returned by mlu().
#   wordlen:   if TRUE, include the columns returned by wordlength().
#
# Returns:
#   The requested metric tables full-joined on `gloss`, de-duplicated, with
#   an extra character column `args` holding as.character(args_["lang"]).
#
# Raises:
#   An error if none of `freq`, `uttlength`, `wordlen` is TRUE.
get_childes_metrics <- function(args_, column = "language", freq = FALSE,
                                uttlength = FALSE, wordlen = FALSE) {
  data_ <- do.call(get_data, args_)

  # Collect only the metrics that were asked for. Joining all three
  # unconditionally fails whenever a flag is left FALSE, because the
  # corresponding table is never created.
  metrics <- list()
  if (isTRUE(freq)) {
    metrics$freq <- frequency(data_$tokens, column)
  }
  if (isTRUE(uttlength)) {
    metrics$uttlength <- mlu(data_$utterances, data_$tokens, column)
  }
  if (isTRUE(wordlen)) {
    metrics$wordlen <- wordlength(data_$tokens)
  }

  if (length(metrics) == 0) {
    stop(
      "No metric requested: set at least one of `freq`, `uttlength` or ",
      "`wordlen` to TRUE.",
      call. = FALSE
    )
  }

  # `gloss` is the only column the metric tables share, so this is the same
  # key the implicit natural join would pick.
  joined <- Reduce(function(x, y) full_join(x, y, by = "gloss"), metrics)

  joined %>%
    distinct() %>%
    mutate(args = as.character(args_["lang"]))
}

# Download speaker statistics, utterances and tokens from CHILDES (via
# childesr) for the given filters.
#
# Args:
#   lang:                 language filter passed to get_utterances() and
#                         get_tokens().
#   corpus:               corpus filter passed to all three childesr calls.
#   speaker_role:         speaker role(s) to include.
#   speaker_role_exclude: speaker role(s) to exclude.
#   child_age:            target-child age filter.
#   child_sex:            target-child sex filter.
#   pos:                  part-of-speech filter; only used when `word` is
#                         non-empty.
#   word:                 token pattern(s) to restrict the tokens to. The
#                         default "" keeps every token.
#
# Returns:
#   A list of three data.tables: `utterances`, `speakers`, `tokens`. The
#   tokens carry three extra columns (`corpuscount`, `speakercount`,
#   `targetchildcount`) holding the number of tokens per corpus, per speaker
#   role and per target child, counted over all tokens matching the filters.
get_data <- function(lang = NULL,
                     corpus = NULL,
                     speaker_role = NULL,
                     speaker_role_exclude = "Target_Child",
                     child_age = NULL,
                     child_sex = NULL,
                     pos = NULL,
                     word = "") {
  speakers <- get_speaker_statistics(
    corpus = corpus, role = speaker_role,
    role_exclude = speaker_role_exclude, age = child_age, sex = child_sex
  )

  utterances <- get_utterances(
    language = lang, corpus = corpus, role = speaker_role,
    role_exclude = speaker_role_exclude, age = child_age, sex = child_sex
  )

  all_tokens <- get_tokens(
    language = lang, corpus = corpus, role = speaker_role,
    role_exclude = speaker_role_exclude, age = child_age, sex = child_sex,
    token = "*"
  )

  # Token totals at three levels, attached to every token row.
  corpus_counts <- count(all_tokens, corpus_name, name = "corpuscount")
  speaker_counts <- count(all_tokens, speaker_role, name = "speakercount")
  child_counts <- count(all_tokens, target_child_name,
                        name = "targetchildcount")
  all_tokens <- all_tokens %>%
    left_join(corpus_counts, by = "corpus_name") %>%
    left_join(speaker_counts, by = "speaker_role") %>%
    left_join(child_counts, by = "target_child_name")

  # Length-safe test: a plain `word != ""` fails on a missing, NULL or
  # multi-element `word`.
  has_word <- length(word) > 0 && any(!is.na(word) & nzchar(word))

  if (has_word) {
    # Natural join on every shared token column; its purpose is to carry the
    # count columns computed above over to the word-restricted tokens.
    tokens <- get_tokens(
      language = lang, corpus = corpus, role = speaker_role,
      role_exclude = speaker_role_exclude, age = child_age, sex = child_sex,
      part_of_speech = pos, token = word
    ) %>%
      left_join(all_tokens)
  } else {
    tokens <- all_tokens
  }

  list(
    utterances = data.table(utterances),
    speakers = data.table(speakers),
    tokens = data.table(tokens)
  )
}

#predictors:word_counts sent_lengths final_counts solo_counts concreteness frequency word_length valence babiness
#level on which user wants their predictor e.g. target_child_id, speaker_id, transcript_id

# Relative frequency of each gloss among all rows of `tokens`.
#
# Args:
#   tokens: token table with a `gloss` column.
#   column: intended grouping level (e.g. target_child_id, speaker_id,
#           corpus_name).
#
# Returns:
#   A table with columns `gloss`, `freq` (count divided by nrow(tokens)) and
#   `logfreq` (natural log of `freq`).
#
# NOTE(review): `!!column` injects the string itself, so the grouping is on a
# constant rather than on the column it names, and `column` currently has no
# effect on the result. The denominator is likewise the overall row count;
# per-level denominators (speakercount, targetchildcount, corpuscount) are
# still to be wired in.
frequency <- function(tokens, column = "language") {
  n_tokens <- nrow(tokens)

  per_gloss <- count(group_by(tokens, !!column, gloss))
  per_gloss <- rename(per_gloss, wordcount = n)
  per_gloss <- mutate(
    per_gloss,
    freq = wordcount / n_tokens,
    logfreq = log(freq)
  )

  select(ungroup(per_gloss), gloss, freq, logfreq)
}
  

# Mean length of the utterances each gloss occurs in.
#
# Utterance length is the number of space-separated pieces of the utterance's
# `gloss`. Each token row is matched to its utterance through
# tokens$utterance_id == utterances$id, and the lengths are averaged per
# token gloss.
#
# Args:
#   utterances: utterance table with `id` and `gloss` columns.
#   tokens:     token table with `gloss` and `utterance_id` columns.
#   column:     currently unused (grouping level still to be implemented).
#
# Returns:
#   A table with one row per gloss and columns `gloss`, `mlu`.
mlu <- function(utterances, tokens, column = "language") {
  utterance_lengths <- utterances %>%
    # lengths() always returns an integer vector, even for zero rows, whereas
    # sapply(..., length) degrades to an empty list there.
    mutate(utt_length = lengths(strsplit(gloss, " "))) %>%
    select(utterance_id = id, utt_length)

  tokens %>%
    left_join(utterance_lengths, by = "utterance_id") %>%
    group_by(gloss) %>%
    summarise(mlu = mean(utt_length))
}

# Character length of each distinct gloss.
#
# Args:
#   tokens: token table with a `gloss` column.
#
# Returns:
#   A table with one row per distinct gloss and columns `gloss` and
#   `wordcount`. Despite its name, `wordcount` holds the number of characters
#   in the gloss (str_count() with its default pattern); the name is kept
#   because downstream output uses it.
wordlength <- function(tokens) {
  tokens %>%
    # De-duplicate on gloss before measuring. A distinct() applied to the
    # full token rows removes almost nothing, leaving one row per token.
    distinct(gloss) %>%
    mutate(wordcount = str_count(gloss))
}

# Example run: Italian data, no specific target word, all three metrics.
args_ <- list(lang = "ita", word = "")

output <- get_childes_metrics(
  args_,
  uttlength = TRUE,
  freq = TRUE,
  wordlen = TRUE
)

# Display the resulting table.
output

## # A tibble: 10,424 x 6
##    gloss        freq logfreq   mlu wordcount args 
##    <chr>       <dbl>   <dbl> <dbl>     <int> <chr>
##  1 'un    0.00000340  -12.6   5            3 ita  
##  2 a      0.0128       -4.36  8.86         1 ita  
##  3 A      0.0000136   -11.2   8            1 ita  
##  4 a'     0.0000170   -11.0   7.6          2 ita  
##  5 aaah   0.00000340  -12.6   3            4 ita  
##  6 aaam   0.00000340  -12.6  11            4 ita  
##  7 aah    0.0000986    -9.22  5.14         3 ita  
##  8 aamm   0.00000340  -12.6  10            4 ita  
##  9 Ababwa 0.00000340  -12.6   2            6 ita  
## 10 abbaia 0.0000238   -10.6   4.14         6 ita  
## # … with 10,414 more rows

#write.csv(output,"/Users/lscpuser/Documents/output_.csv" )