# Source: https://rpubs.com/gloukatou/739504
library(tidyverse)
library(childesr)
library(data.table)
library(stringr)
# Global empty token pattern. Note: get_data() declares its own `word`
# parameter, so this global is not what get_data() reads.
word <- ""
# Fetch CHILDES data and assemble the requested per-word metrics.
#
# Args:
#   args_:     Named list of arguments forwarded to get_data() via do.call().
#              Its "lang" entry is also copied into the `args` output column.
#   column:    Grouping column name passed on to frequency() and mlu().
#   freq:      If TRUE, include the table returned by frequency().
#   uttlength: If TRUE, include the table returned by mlu().
#   wordlen:   If TRUE, include the table returned by wordlength().
#
# Returns:
#   The requested metric tables full-joined on `gloss`, de-duplicated, plus an
#   `args` column holding as.character(args_["lang"]).
#
# Raises an error when none of the three metrics is requested.
get_childes_metrics <- function(args_, column = "language", freq = FALSE,
                                uttlength = FALSE, wordlen = FALSE) {
  is_requested <- function(flag) isTRUE(as.logical(flag))

  data_ <- do.call(get_data, args_)

  # Collect only the tables that were asked for. Previously all three were
  # joined unconditionally, so leaving any flag FALSE failed with
  # "object not found".
  metric_tables <- list()
  if (is_requested(freq)) {
    metric_tables[["freq"]] <- frequency(data_$tokens, column)
  }
  if (is_requested(uttlength)) {
    metric_tables[["mlu"]] <- mlu(data_$utterances, data_$tokens, column)
  }
  if (is_requested(wordlen)) {
    metric_tables[["wordlen"]] <- wordlength(data_$tokens)
  }

  if (length(metric_tables) == 0) {
    stop(
      "No metric requested: set at least one of `freq`, `uttlength`, ",
      "`wordlen` to TRUE.",
      call. = FALSE
    )
  }

  # Every metric table is keyed by `gloss`; join them in a fixed order
  # (frequency, mlu, word length) and drop duplicated rows.
  metrics <- Reduce(
    function(left, right) full_join(left, right, by = "gloss"),
    metric_tables
  ) %>%
    distinct()

  metrics %>%
    mutate(args = as.character(args_["lang"]))
}
# Download speaker statistics, utterances and tokens through childesr and
# annotate every token with how many tokens share its corpus, speaker role and
# target child.
#
# Args:
#   lang, corpus, speaker_role, speaker_role_exclude, child_age, child_sex:
#         Filters forwarded to the childesr getters (`lang` is not passed to
#         get_speaker_statistics()).
#   pos:  Part-of-speech filter; only applied when `word` is given.
#   word: Token pattern(s) to keep. "" (the default), NULL or NA means
#         "keep every token".
#
# Returns:
#   A list of three data.tables: `utterances`, `speakers` and `tokens`.
#   `tokens` carries the extra columns `corpuscount`, `speakercount` and
#   `targetchildcount`, always computed over the unfiltered token set.
get_data <- function(lang = NULL,
                     corpus = NULL,
                     speaker_role = NULL,
                     speaker_role_exclude = "Target_Child",
                     child_age = NULL,
                     child_sex = NULL,
                     pos = NULL,
                     word = "") {
  speakers <- get_speaker_statistics(
    corpus = corpus, role = speaker_role,
    role_exclude = speaker_role_exclude, age = child_age, sex = child_sex
  )
  utterances <- get_utterances(
    language = lang, corpus = corpus, role = speaker_role,
    role_exclude = speaker_role_exclude, age = child_age, sex = child_sex
  )
  all_tokens <- get_tokens(
    language = lang, corpus = corpus, role = speaker_role,
    role_exclude = speaker_role_exclude, age = child_age, sex = child_sex,
    token = "*"
  )

  # Totals per corpus / speaker role / target child over ALL tokens.
  corpus_counts <- all_tokens %>% count(corpus_name, name = "corpuscount")
  speaker_counts <- all_tokens %>% count(speaker_role, name = "speakercount")
  child_counts <- all_tokens %>%
    count(target_child_name, name = "targetchildcount")

  all_tokens <- all_tokens %>%
    left_join(corpus_counts, by = "corpus_name") %>%
    left_join(speaker_counts, by = "speaker_role") %>%
    left_join(child_counts, by = "target_child_name")

  # `word` used to be a required argument compared with `!=`, which errored
  # when it was missing, NULL or longer than one element.
  word <- word[!is.na(word) & nzchar(word)]

  if (length(word) > 0) {
    word_tokens <- get_tokens(
      language = lang, corpus = corpus, role = speaker_role,
      role_exclude = speaker_role_exclude, age = child_age, sex = child_sex,
      part_of_speech = pos, token = word
    )
    # Attach the totals by matching on every column the two tables share.
    shared_cols <- intersect(names(word_tokens), names(all_tokens))
    tokens <- left_join(word_tokens, all_tokens, by = shared_cols)
  } else {
    tokens <- all_tokens
  }

  list(
    utterances = data.table(utterances),
    speakers = data.table(speakers),
    tokens = data.table(tokens)
  )
}
# Predictors: word_counts, sent_lengths, final_counts, solo_counts,
# concreteness, frequency, word_length, valence, babiness.
# Level at which the user wants the predictor computed,
# e.g. target_child_id, speaker_id, transcript_id.
# Relative frequency of each gloss among all rows of `tokens`.
#
# Args:
#   tokens: Token table with a `gloss` column.
#   column: Intended grouping level (e.g. target_child_id, speaker_id,
#           corpus_name).
#
# Returns:
#   A table with columns `gloss`, `freq` (count / nrow(tokens)) and `logfreq`
#   (natural log of `freq`).
#
# NOTE(review): `!!column` injects the *value* of `column`. For a character
# string such as the default "language" that is a constant, so rows are in
# effect grouped by `gloss` only and `column` does not change the result.
# Kept as-is to preserve behaviour; TODO use per-group totals (speakercount,
# targetchildcount, corpuscount) once grouping by `column` is implemented.
# This function also masks stats::frequency().
frequency <- function(tokens, column = "language") {
  n_tokens <- nrow(tokens)

  gloss_counts <- count(group_by(tokens, !!column, gloss))
  gloss_counts <- rename(gloss_counts, wordcount = n)
  gloss_counts <- mutate(
    gloss_counts,
    freq = wordcount / n_tokens,
    logfreq = log(freq)
  )

  select(ungroup(gloss_counts), gloss, freq, logfreq)
}
# Mean length of the utterances in which each gloss occurs.
#
# Args:
#   utterances: Utterance table with columns `id` and `gloss` (the utterance
#               text).
#   tokens:     Token table with columns `utterance_id` and `gloss`.
#   column:     Currently unused (grouping level still to be implemented).
#
# Returns:
#   A table with one row per gloss and columns `gloss` and `mlu`. `mlu` is NA
#   for a gloss if any of its tokens has no matching utterance.
mlu <- function(utterances, tokens, column = "language") {
  # Utterance length = number of pieces after splitting the text on single
  # spaces. lengths() always yields an integer vector, including for a
  # zero-row table, where sapply() returned an empty list.
  utterance_lengths <- utterances %>%
    mutate(utt_length = lengths(strsplit(gloss, " "))) %>%
    select(utterance_id = id, utt_length)

  tokens %>%
    left_join(utterance_lengths, by = "utterance_id") %>%
    group_by(gloss) %>%
    summarise(mlu = mean(utt_length))
}
# Length of each distinct gloss.
#
# Args:
#   tokens: Token table with a `gloss` column.
#
# Returns:
#   A table with one row per distinct gloss and columns `gloss` and
#   `wordcount`. Despite its name, `wordcount` is str_count(gloss) with the
#   default pattern, i.e. the number of characters in the gloss.
wordlength <- function(tokens) {
  # Reduce to the gloss column before de-duplicating. Running distinct() on
  # the full token rows removed nothing, so the result kept one row per token.
  tokens %>%
    select(gloss) %>%
    distinct() %>%
    mutate(wordcount = str_count(gloss))
}
# Example run: all three metrics for language "ita", with no word filter.
args_ <- list(lang = "ita", word = "")
output <- get_childes_metrics(
  args_,
  freq = TRUE,
  uttlength = TRUE,
  wordlen = TRUE
)
output
## # A tibble: 10,424 x 6
## gloss freq logfreq mlu wordcount args
## <chr> <dbl> <dbl> <dbl> <int> <chr>
## 1 'un 0.00000340 -12.6 5 3 ita
## 2 a 0.0128 -4.36 8.86 1 ita
## 3 A 0.0000136 -11.2 8 1 ita
## 4 a' 0.0000170 -11.0 7.6 2 ita
## 5 aaah 0.00000340 -12.6 3 4 ita
## 6 aaam 0.00000340 -12.6 11 4 ita
## 7 aah 0.0000986 -9.22 5.14 3 ita
## 8 aamm 0.00000340 -12.6 10 4 ita
## 9 Ababwa 0.00000340 -12.6 2 6 ita
## 10 abbaia 0.0000238 -10.6 4.14 6 ita
## # … with 10,414 more rows
#write.csv(output,"/Users/lscpuser/Documents/output_.csv" )