library(knitr)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = T, tidy = F)

library(tidyverse)
library(langcog)
library(feather)
library(lme4)

theme_set(theme_classic(base_size = 10))
item_key <- read_csv("data/item_key.csv")  %>%
  mutate(num_item_id = as.character(num_item_id))

item_data <- read_csv("data/item_data.csv") %>%
  select(1,4) %>%
  select(num_item_id, category) %>%
  mutate(num_item_id = as.character(num_item_id))

SUBTLEXUS_PATH <- "data/SUBTLEX-NL.cd-above2.with-pos.txt"
subtlexus_norms <- read_tsv(SUBTLEXUS_PATH) %>%
  janitor::clean_names()  %>%
  select(word, lg10cd) %>%
  rename(log_subt_cd = lg10cd)

SEMANTIC_DIV_PATH <- "data/semD.txt"
semantic_diversity <- read_tsv(SEMANTIC_DIV_PATH) %>%
    janitor::clean_names()  %>%
   select(word, sem_d)

word_bank_hyper <- read_csv("data/wordbank_hypernyms.csv") %>%
  filter(uni_lemma != "feet") %>%
  select(uni_lemma, hypernyms)  %>%
  left_join(item_key %>% select(uni_lemma, num_item_id), by = "uni_lemma") %>%
  left_join(item_data, by = "num_item_id")   %>%
  select(num_item_id, uni_lemma, category, hypernyms) %>%
  mutate_if(is.character, as.factor) %>%
  left_join(subtlexus_norms, by = c("uni_lemma" = "word")) %>%
  left_join(semantic_diversity, by = c("uni_lemma" = "word")) %>%
  ungroup()

Contextual Diversity and Hypernyms

Hypernyms and semantic diversity

ggplot(word_bank_hyper, aes(x = sem_d, y = hypernyms)) +
  geom_point() +
  geom_smooth(method = "lm")

cor.test(word_bank_hyper$sem_d, word_bank_hyper$hypernyms)
## 
##  Pearson's product-moment correlation
## 
## data:  word_bank_hyper$sem_d and word_bank_hyper$hypernyms
## t = -10.331, df = 379, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5436068 -0.3864686
## sample estimates:
##        cor 
## -0.4687378

Hypernyms and contextual diversity

ggplot(word_bank_hyper, aes(x = log_subt_cd, y = hypernyms)) +
  geom_point() +
  geom_smooth(method = "lm")

cor.test(word_bank_hyper$log_subt_cd, word_bank_hyper$hypernyms)
## 
##  Pearson's product-moment correlation
## 
## data:  word_bank_hyper$log_subt_cd and word_bank_hyper$hypernyms
## t = 1.3131, df = 321, p-value = 0.1901
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.0363272  0.1807786
## sample estimates:
##        cor 
## 0.07309155

Semantic and contextual diversity

ggplot(word_bank_hyper, aes(x = sem_d, y = log_subt_cd)) +
  geom_point() +
  geom_smooth(method = "lm")

cor.test(word_bank_hyper$sem_d, word_bank_hyper$log_subt_cd)
## 
##  Pearson's product-moment correlation
## 
## data:  word_bank_hyper$sem_d and word_bank_hyper$log_subt_cd
## t = 1.6073, df = 329, p-value = 0.109
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01972118  0.19421922
## sample estimates:
##        cor 
## 0.08826686

Predicting vocab change at the kid/session level

mcdi_path <- "../7_mcdi/data/train_sample_longitud_mcdi.csv"
mcdi_path2 <- "../7_mcdi/data/test_sample_longitud_mcdi.csv"

cdi_data <- read_csv(mcdi_path) %>%
  bind_rows(read_csv(mcdi_path2)) %>%
  select(-study_id, -study, -birthday, -session_date, -total_num_sessions,
         -num_langs, -hard_of_hearing, -mcdi_type, -languages, -extra_categories) %>%
   arrange(child_id, session_num)

clean_items  <- cdi_data %>%
  distinct(item) %>%
  rowwise()%>%
  mutate(item_clean = str_trim(str_split(item, "\\(")[[1]][1])) 

# get produced words by kid
produced_words <- cdi_data %>%
  filter(value > 0)  %>%
  select(-value)  %>%
  left_join(item_key %>% select(item, num_item_id), by = "item")  %>%
  mutate_if(is.character, as.factor) %>%
  left_join(clean_items)

#write_csv(produced_words, "data/produced_words_tidy.csv")
#produced_words <- read_csv("data/produced_words_tidy.csv") %>%
#    mutate_if(is.character, as.factor) 

freqs <- read_csv("../3_kid_vocabs/data/childes_adult_word_freq.csv") %>%
  select(word, log_freq)
# get mean hypernyms score by kid
hypernyms_score_by_kid <- produced_words %>%
  left_join(freqs, by = c("item_clean" = "word")) %>%
  left_join(word_bank_hyper, by = "num_item_id") %>%
  group_by(child_id, session_num) %>%
  summarize(
            mean_hypernyms = mean(hypernyms, na.rm = T),
            mean_freq = mean(log_freq, na.rm = T),
            mean_sem_d = mean(sem_d, na.rm = T),
            mean_log_subt_cd = mean(log_subt_cd, na.rm = T))
#Get time point data
# timepoint data
demographic_data <- cdi_data %>%
  select(-item, -value) %>%
  distinct(child_id, session_num, .keep_all = T)  %>%
  group_by(child_id) %>%
  mutate(subsequent_age = lead(age), by = "session_num",
         subsequent_percentile = lead(percentile), by = "session_num",
         subsequent_words_spoken = lead(words_spoken), by = "session_num",
         delta_age = subsequent_age - age,
         delta_percentile = subsequent_percentile - percentile,
         delta_words_spoken = subsequent_words_spoken - words_spoken) %>%
  select(-by)
 
# join together       
full_df <- hypernyms_score_by_kid %>%
  left_join(demographic_data) 

Models

lmer(delta_words_spoken ~ mean_hypernyms + delta_age +  mean_freq + words_spoken + session_num + (session_num|child_id), full_df) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms + delta_age + mean_freq +  
##     words_spoken + session_num + (session_num | child_id)
##    Data: full_df
## 
## REML criterion at convergence: 12529.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.1168 -0.4896 -0.1355  0.2990  9.4906 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  child_id (Intercept)  244.231 15.628        
##           session_num    6.574  2.564   -1.00
##  Residual             1943.152 44.081        
## Number of obs: 1201, groups:  child_id, 195
## 
## Fixed effects:
##                 Estimate Std. Error t value
## (Intercept)    439.48463   38.29184  11.477
## mean_hypernyms -12.32378    2.15716  -5.713
## delta_age       31.92327    1.91363  16.682
## mean_freq      -38.15424    4.65113  -8.203
## words_spoken    -0.10633    0.01208  -8.806
## session_num      0.90468    0.61017   1.483
## 
## Correlation of Fixed Effects:
##             (Intr) mn_hyp delt_g mn_frq wrds_s
## mn_hyprnyms -0.298                            
## delta_age   -0.082  0.006                     
## mean_freq   -0.848 -0.246  0.031              
## words_spokn -0.625  0.527  0.040  0.336       
## session_num  0.040  0.018 -0.129 -0.083 -0.455
lmer(delta_words_spoken ~ mean_sem_d + delta_age +  mean_freq + + words_spoken +session_num + (session_num|child_id), full_df ) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## delta_words_spoken ~ mean_sem_d + delta_age + mean_freq + +words_spoken +  
##     session_num + (session_num | child_id)
##    Data: full_df
## 
## REML criterion at convergence: 12540.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.1267 -0.4887 -0.1248  0.3042  9.1894 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  child_id (Intercept)  288.437 16.983        
##           session_num    8.454  2.908   -1.00
##  Residual             1959.912 44.271        
## Number of obs: 1201, groups:  child_id, 195
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  160.29958   66.04919   2.427
## mean_sem_d   127.58856   33.32163   3.829
## delta_age     32.10020    1.92512  16.674
## mean_freq    -42.84973    4.55703  -9.403
## words_spoken  -0.08531    0.01124  -7.590
## session_num    0.95470    0.62441   1.529
## 
## Correlation of Fixed Effects:
##             (Intr) mn_sm_ delt_g mn_frq wrds_s
## mean_sem_d  -0.830                            
## delta_age   -0.056  0.012                     
## mean_freq   -0.618  0.077  0.034              
## words_spokn  0.025 -0.388  0.036  0.492       
## session_num  0.018  0.009 -0.131 -0.081 -0.498
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
lmer(delta_words_spoken ~ mean_sem_d + mean_hypernyms + delta_age +  mean_freq +  words_spoken +session_num + (session_num|child_id), full_df ) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_sem_d + mean_hypernyms + delta_age +  
##     mean_freq + words_spoken + session_num + (session_num | child_id)
##    Data: full_df
## 
## REML criterion at convergence: 12518
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.1135 -0.4849 -0.1244  0.3041  9.4596 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  child_id (Intercept)  225.73  15.024        
##           session_num    5.95   2.439   -1.00
##  Residual             1945.14  44.104        
## Number of obs: 1201, groups:  child_id, 195
## 
## Fixed effects:
##                 Estimate Std. Error t value
## (Intercept)    334.17038   76.32705   4.378
## mean_sem_d      59.21143   36.77396   1.610
## mean_hypernyms -10.68012    2.39794  -4.454
## delta_age       31.96669    1.91341  16.707
## mean_freq      -38.49742    4.64703  -8.284
## words_spoken    -0.10935    0.01217  -8.989
## session_num      0.92165    0.60664   1.519
## 
## Correlation of Fixed Effects:
##             (Intr) mn_sm_ mn_hyp delt_g mn_frq wrds_s
## mean_sem_d  -0.866                                   
## mn_hyprnyms -0.515  0.441                            
## delta_age   -0.055  0.016  0.012                     
## mean_freq   -0.392 -0.037 -0.239  0.030              
## words_spokn -0.191 -0.137  0.408  0.038  0.337       
## session_num  0.008  0.014  0.022 -0.129 -0.084 -0.455
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
lmer(delta_words_spoken ~ mean_log_subt_cd + delta_age +  mean_freq + + words_spoken + session_num + (session_num|child_id), full_df) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_log_subt_cd + delta_age + mean_freq +  
##     +words_spoken + session_num + (session_num | child_id)
##    Data: full_df
## 
## REML criterion at convergence: 12555.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.1095 -0.4700 -0.1195  0.3007  9.1973 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  child_id (Intercept)  355.25  18.848        
##           session_num   11.24   3.353   -1.00
##  Residual             1964.78  44.326        
## Number of obs: 1201, groups:  child_id, 195
## 
## Fixed effects:
##                   Estimate Std. Error t value
## (Intercept)      379.25563   39.13249   9.692
## mean_log_subt_cd -15.20996   13.46592  -1.130
## delta_age         32.10693    1.93282  16.611
## mean_freq        -42.38422    4.70243  -9.013
## words_spoken      -0.06734    0.01046  -6.441
## session_num        0.88173    0.64257   1.372
## 
## Correlation of Fixed Effects:
##             (Intr) mn_l__ delt_g mn_frq wrds_s
## mn_lg_sbt_c -0.316                            
## delta_age   -0.071 -0.026                     
## mean_freq   -0.847 -0.230  0.038              
## words_spokn -0.560  0.026  0.042  0.549       
## session_num  0.033  0.026 -0.133 -0.085 -0.523
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling