library(knitr)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = T, tidy = F)

library(tidyverse)
library(langcog)
library(feather)
library(lme4)
library(here)

theme_set(theme_classic(base_size = 10))
item_key <- read_csv("data/item_key.csv")  %>%
  mutate(num_item_id = as.character(num_item_id))

item_data <- read_csv("data/item_data.csv") %>%
  select(1,4) %>%
  select(num_item_id, category) %>%
  mutate(num_item_id = as.character(num_item_id))

word_bank_hyper <- read_csv("data/wordbank_hypernyms.csv") %>%
  filter(uni_lemma != "feet") %>%
  select(uni_lemma, hypernyms)  %>%
  left_join(item_key %>% select(uni_lemma, num_item_id), by = "uni_lemma") %>%
  left_join(item_data, by = "num_item_id")   %>%
  select(num_item_id, uni_lemma, category, hypernyms) %>%
  mutate_if(is.character, as.factor) %>%
  ungroup()
mcdi_path <- "../7_mcdi/data/train_sample_longitud_mcdi.csv"
mcdi_path2 <- "../7_mcdi/data/test_sample_longitud_mcdi.csv"

cdi_data <- read_csv(mcdi_path) %>%
  bind_rows(read_csv(mcdi_path2)) %>%
  select(-study_id, -study, -birthday, -session_date, -total_num_sessions,
         -num_langs, -hard_of_hearing, -mcdi_type, -languages, -extra_categories) %>%
   arrange(child_id, session_num)

# get produced words by kid
#produced_words <- cdi_data %>%
#  filter(value > 0)  %>%
#  select(-value)  %>%
#  left_join(item_key %>% select(item, num_item_id), by = "item")  %>%
#  mutate_if(is.character, as.factor) %>%
#  rowwise() %>% 
#  mutate(item_clean = str_trim(str_split(item, "\\(")[[1]][1])) 

# get produced words by kid
produced_words <- read_csv("data/produced_words_tidy.csv") %>%
    mutate_if(is.character, as.factor)  %>%
    mutate(num_item_id = as.factor(num_item_id))
freqs <- read_csv("../3_kid_vocabs/data/childes_adult_word_freq.csv") %>%
  select(word, log_freq)

HUMAN_RATING_PATH <- here("data/tidy_ratings.csv")
human_ratings <- read_csv(HUMAN_RATING_PATH) %>%
  mutate(num_item_id = as.character(num_item_id))
# get mean hypernyms score by kid
hypernyms_score_by_kid <- produced_words %>%
  left_join(freqs, by = c("item_clean" = "word")) %>%
  left_join(word_bank_hyper, by = "num_item_id") %>%
  left_join(human_ratings,  by = "num_item_id") %>%
  group_by(child_id, session_num) %>%
  summarize(mean_hypernyms = mean(hypernyms, na.rm = TRUE),
            mean_freq = mean(log_freq, na.rm = T),
            mean_human_ratings = mean(mean_rating, na.rm  = T))
#Get time point data

# timepoint data
demographic_data <- cdi_data %>%
  select(-item, -value) %>%
  distinct(child_id, session_num, .keep_all = T)  %>%
  group_by(child_id) %>%
  mutate(subsequent_age = lead(age), by = "session_num",
         subsequent_words_spoken = lead(words_spoken), by = "session_num",
         delta_age = subsequent_age - age,
         delta_words_spoken = subsequent_words_spoken - words_spoken) %>%
  select(child_id, session_num, age, subsequent_age, delta_age, words_spoken, subsequent_words_spoken, delta_words_spoken)
 
# join together       
full_df <- hypernyms_score_by_kid %>%
  left_join(demographic_data)
summary(full_df)
##    child_id          session_num    mean_hypernyms     mean_freq    
##  Length:1398        Min.   : 1.00   Min.   : 7.068   Min.   :6.945  
##  Class :character   1st Qu.: 2.00   1st Qu.: 7.764   1st Qu.:7.517  
##  Mode  :character   Median : 4.00   Median : 8.252   Median :7.644  
##                     Mean   : 5.04   Mean   : 8.541   Mean   :7.756  
##                     3rd Qu.: 7.00   3rd Qu.: 9.036   3rd Qu.:7.871  
##                     Max.   :12.00   Max.   :13.400   Max.   :9.660  
##                                     NA's   :2                       
##  mean_human_ratings      age        subsequent_age    delta_age      
##  Min.   :2.030      Min.   :15.38   Min.   :16.60   Min.   :-5.0625  
##  1st Qu.:2.509      1st Qu.:20.05   1st Qu.:20.78   1st Qu.: 0.8876  
##  Median :2.597      Median :22.12   Median :22.75   Median : 1.0191  
##  Mean   :2.585      Mean   :22.60   Mean   :23.19   Mean   : 1.1138  
##  3rd Qu.:2.673      3rd Qu.:24.94   3rd Qu.:25.41   3rd Qu.: 1.2820  
##  Max.   :3.250      Max.   :33.07   Max.   :33.07   Max.   : 7.1663  
##                                     NA's   :195     NA's   :195      
##   words_spoken   subsequent_words_spoken delta_words_spoken
##  Min.   :  2.0   Min.   :  3.0           Min.   :-286.00   
##  1st Qu.: 83.0   1st Qu.:111.0           1st Qu.:  17.00   
##  Median :249.0   Median :297.0           Median :  36.00   
##  Mean   :285.0   Mean   :312.1           Mean   :  48.46   
##  3rd Qu.:471.8   3rd Qu.:500.0           3rd Qu.:  66.00   
##  Max.   :680.0   Max.   :680.0           Max.   : 545.00   
##                  NA's   :195             NA's   :195
resid_model <- lm(delta_words_spoken ~ delta_age + words_spoken + mean_freq, 
                  data = full_df)


kid_level <- full_df %>%
  modelr::add_residuals(resid_model,"delta_words_spoken_resid") %>%
  select(child_id, delta_words_spoken_resid, mean_hypernyms)  %>%
  group_by(child_id)  %>%
  summarize_all(mean, na.rm = TRUE)

ggplot(kid_level, aes(x = mean_hypernyms,
                      y = delta_words_spoken_resid)) +
         geom_point(alpha = .8) + 
         geom_smooth(alpha = .2) +
         geom_smooth(method = "lm", color = "red", alpha = .2) +
         ylab("N new words at t (residualized)") +
         xlab("Mean hypernym level of vocabulary at t-1") +
  ggtitle("Hypernym level at previous timepoint\npredicts number of new words learned") +
         theme_classic(base_size = 14)
Each point corresponds to a child. Y-axis shows number of new words learned at t relative to t-1, after residualizing out number of words at t-1, mean frequency of words at t-1, and time between t and t-1. X-axis shows mean hypernym level of vocabulary at t-1. Red line shows linear fit; blue line shows loess fit.

Each point corresponds to a child. Y-axis shows number of new words learned at t relative to t-1, after residualizing out number of words at t-1, mean frequency of words at t-1, and time between t and t-1. X-axis shows mean hypernym level of vocabulary at t-1. Red line shows linear fit; blue line shows loess fit.

Wordnet mean hypernyms

## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms + delta_age + words_spoken +  
##     mean_freq + (session_num | child_id)
##    Data: full_df
## Control: lmerControl(optimizer = "bobyqa")
## 
## REML criterion at convergence: 12532.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.0944 -0.4855 -0.1355  0.3020  9.5272 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  child_id (Intercept)  219.515 14.816        
##           session_num    6.959  2.638   -1.00
##  Residual             1948.642 44.143        
## Number of obs: 1201, groups:  child_id, 195
## 
## Fixed effects:
##                 Estimate Std. Error t value
## (Intercept)    431.93255   38.00127  11.366
## mean_hypernyms -12.20446    2.14382  -5.693
## delta_age       32.33158    1.89772  17.037
## words_spoken    -0.09603    0.01078  -8.910
## mean_freq      -37.17925    4.60125  -8.080
## 
## Correlation of Fixed Effects:
##             (Intr) mn_hyp delt_g wrds_s
## mn_hyprnyms -0.299                     
## delta_age   -0.077  0.007              
## words_spokn -0.682  0.600 -0.022       
## mean_freq   -0.848 -0.246  0.020  0.337
## convergence code: 0
## boundary (singular) fit: see ?isSingular
## Linear mixed model fit by REML ['lmerMod']
## Formula: delta_words_spoken ~ mean_hypernyms * words_spoken + delta_age +  
##     mean_freq + (session_num | child_id)
##    Data: full_df
## Control: lmerControl(optimizer = "bobyqa")
## 
## REML criterion at convergence: 12466.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.9779 -0.4766 -0.1440  0.3318  9.6623 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  child_id (Intercept)    6.895  2.626        
##           session_num    3.782  1.945   -1.00
##  Residual             1835.309 42.841        
## Number of obs: 1201, groups:  child_id, 195
## 
## Fixed effects:
##                              Estimate Std. Error t value
## (Intercept)                 269.94535   40.68661   6.635
## mean_hypernyms              -17.47718    2.12860  -8.211
## words_spoken                 -1.34238    0.13767  -9.751
## delta_age                    32.29388    1.83954  17.555
## mean_freq                   -13.52618    5.20383  -2.599
## mean_hypernyms:words_spoken   0.16612    0.01825   9.101
## 
## Correlation of Fixed Effects:
##             (Intr) mn_hyp wrds_s delt_g mn_frq
## mn_hyprnyms -0.114                            
## words_spokn  0.408  0.303                     
## delta_age   -0.070  0.006 -0.005              
## mean_freq   -0.882 -0.362 -0.490  0.018       
## mn_hyprny:_ -0.454 -0.260 -0.997  0.003  0.513
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
## convergence code: 0
## boundary (singular) fit: see ?isSingular

Human hypernym ratings

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## delta_words_spoken ~ mean_human_ratings + delta_age + words_spoken +  
##     mean_freq + (session_num | child_id)
##    Data: full_df
## Control: lmerControl(optimizer = "bobyqa")
## 
## REML criterion at convergence: 12565.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.0949 -0.4738 -0.1249  0.3069  9.3995 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  child_id (Intercept)  299.84  17.316        
##           session_num    9.72   3.118   -1.00
##  Residual             1954.96  44.215        
## Number of obs: 1203, groups:  child_id, 195
## 
## Fixed effects:
##                     Estimate Std. Error t value
## (Intercept)        300.54703   40.26124   7.465
## mean_human_ratings  60.92191   15.62467   3.899
## delta_age           32.20772    1.90643  16.894
## words_spoken        -0.09338    0.01251  -7.467
## mean_freq          -54.05892    5.27929 -10.240
## 
## Correlation of Fixed Effects:
##             (Intr) mn_hm_ delt_g wrds_s
## mn_hmn_rtng -0.411                     
## delta_age   -0.062 -0.020              
## words_spokn -0.130 -0.711 -0.008       
## mean_freq   -0.563 -0.519  0.029  0.744
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
## convergence code: 0
## boundary (singular) fit: see ?isSingular

Both:

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## delta_words_spoken ~ mean_hypernyms + mean_human_ratings + delta_age +  
##     words_spoken + scale(mean_freq) + (session_num | child_id)
##    Data: full_df
## Control: lmerControl(optimizer = "bobyqa")
## 
## REML criterion at convergence: 12517.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.0932 -0.4806 -0.1293  0.3077  9.6416 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  child_id (Intercept)  210.059 14.493        
##           session_num    6.085  2.467   -1.00
##  Residual             1938.999 44.034        
## Number of obs: 1201, groups:  child_id, 195
## 
## Fixed effects:
##                     Estimate Std. Error t value
## (Intercept)         -4.25817   53.40857  -0.080
## mean_hypernyms      -9.99964    2.27482  -4.396
## mean_human_ratings  52.63135   17.39825   3.025
## delta_age           32.23877    1.89272  17.033
## words_spoken        -0.12024    0.01313  -9.160
## scale(mean_freq)   -17.66341    2.11649  -8.346
## 
## Correlation of Fixed Effects:
##             (Intr) mn_hyp mn_hm_ delt_g wrds_s
## mn_hyprnyms -0.668                            
## mn_hmn_rtng -0.924  0.340                     
## delta_age   -0.030  0.003 -0.012              
## words_spokn  0.324  0.263 -0.580 -0.010       
## scl(mn_frq)  0.601 -0.389 -0.594  0.024  0.564
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
## convergence code: 0
## boundary (singular) fit: see ?isSingular