library(knitr)

# Global knitr chunk options: show the code, hide messages and warnings, and
# turn off caching and code tidying. Logical constants are spelled out because
# `T` and `F` are ordinary variables that can be reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)

# Make ggplot2's classic theme (10-point base font) the default for the plots
# drawn after this call.
theme_set(theme_classic(base_size = 10))

Read in data

# Machine-specific location of the 3_kid_vocabs data directory. It is the only
# absolute path in this chunk; re-point it to run from another checkout.
kid_vocabs_data_dir <- file.path(
  "/Users/mollylewis/Documents/research/Projects",
  "1_in_progress", "VOCAB_SEEDS", "analyses", "3_kid_vocabs", "data"
)

# Input tables that are merged onto the group info below.
freq_by_kid <- read_csv(
  file.path(kid_vocabs_data_dir, "frequency_based_on_input_by_kid.csv")
)
pos_by_kid <- read_csv(file.path(kid_vocabs_data_dir, "prop_pos_by_kid_t1.csv"))
hyp_by_kid <- read_csv("data/hypernym_by_kid_childes.csv")

# Keep the first seven columns of the group info and add log(x + 1)
# transforms of `mtld_t1` and `mtld_t2`.
groups_info <- read_csv(
  "../1_mtld_measure/data/groups_info_600_900_corrected.csv"
) %>%
  select(1:7) %>%
  mutate(
    log_mtld_t1 = log(mtld_t1 + 1),
    log_mtld_t2 = log(mtld_t2 + 1)
  )

# Left joins keep all rows of `groups_info`. No `by` is given, so dplyr joins
# on every column name the two tables share.
# NOTE(review): presumably the shared key is `target_child_id` -- confirm.
all_df <- groups_info %>%
  left_join(hyp_by_kid) %>%
  left_join(pos_by_kid %>% select(target_child_id, prop_noun_t1)) %>%
  left_join(freq_by_kid)

MTLD at t2 as a function of mean hypernym at t1

# Scatterplot of `mtld_t2` against `mean_hypernym_t1`, with a linear-model
# smoother drawn on top of the points.
all_df %>%
  ggplot(mapping = aes(x = mean_hypernym_t1, y = mtld_t2)) +
  geom_point() +
  geom_smooth(method = "lm")

Controlling for age, MTLD at t1, and (in the fuller models) proportion of nouns and mean frequency at t1, there's no reliable relationship between a child's mean hypernym score at t1 and MTLD at t2 (this is true even if you exclude the outlier). If anything, it looks like kids who have a higher hypernym score at t1 have greater MTLD at t2.

# Regress `log_mtld_t2` on `mean_hypernym_t1`, with `age_t1`, `age_diff`, and
# `log_mtld_t1` as covariates, and print the model summary.
# Optional row filter (insert before lm): filter(mean_hypernym_t1 > 5) %>%
all_df %>%
  lm(
    formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff + log_mtld_t1,
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff + 
##     log_mtld_t1, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52243 -0.17096  0.01841  0.14875  0.84391 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.235161   1.672016   1.337    0.186    
## mean_hypernym_t1  0.004260   0.014721   0.289    0.773    
## age_t1           -0.001834   0.002177  -0.842    0.402    
## age_diff          0.001106   0.001545   0.716    0.477    
## log_mtld_t1       0.637121   0.061394  10.378 7.09e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2458 on 71 degrees of freedom
##   (25 observations deleted due to missingness)
## Multiple R-squared:  0.6078, Adjusted R-squared:  0.5857 
## F-statistic:  27.5 on 4 and 71 DF,  p-value: 8.403e-14
# Same outcome (`log_mtld_t2`) as the base model, with `prop_noun_t1` and
# `mean_freq_t1` added to the covariates; prints the model summary.
# Optional row filter (insert before lm): filter(mean_hypernym_t1 > 5) %>%
all_df %>%
  lm(
    formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff +
      log_mtld_t1 + prop_noun_t1 + mean_freq_t1,
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff + 
##     log_mtld_t1 + prop_noun_t1 + mean_freq_t1, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59282 -0.13564  0.01264  0.15027  0.67725 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.7377927  1.6712605   0.441  0.66026    
## mean_hypernym_t1  0.0138693  0.0149448   0.928  0.35662    
## age_t1           -0.0004198  0.0021401  -0.196  0.84506    
## age_diff          0.0019734  0.0015192   1.299  0.19828    
## log_mtld_t1       0.6629636  0.0614043  10.797  < 2e-16 ***
## prop_noun_t1      0.5860208  0.2082378   2.814  0.00637 ** 
## mean_freq_t1      0.0069765  0.0455611   0.153  0.87875    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2335 on 69 degrees of freedom
##   (25 observations deleted due to missingness)
## Multiple R-squared:  0.6561, Adjusted R-squared:  0.6262 
## F-statistic: 21.94 on 6 and 69 DF,  p-value: 2.925e-14
# Regress `mtld_diff` on `mean_hypernym_t1`, with `age_t1`, `age_diff`, and
# `log_mtld_t1` as covariates, and print the model summary.
# Optional row filter (insert before lm): filter(mean_hypernym_t1 > 5) %>%
all_df %>%
  lm(
    formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff + log_mtld_t1,
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff + 
##     log_mtld_t1, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.6594  -2.6528  -0.6694   2.2970  17.9063 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      54.05902   34.09451   1.586    0.117    
## mean_hypernym_t1 -0.05920    0.30018  -0.197    0.844    
## age_t1           -0.04747    0.04439  -1.069    0.289    
## age_diff         -0.01519    0.03151  -0.482    0.631    
## log_mtld_t1      -6.51632    1.25191  -5.205  1.8e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.012 on 71 degrees of freedom
##   (25 observations deleted due to missingness)
## Multiple R-squared:  0.3695, Adjusted R-squared:  0.3339 
## F-statistic:  10.4 on 4 and 71 DF,  p-value: 1.096e-06
# Same outcome (`mtld_diff`) as the base model, with `prop_noun_t1` and
# `mean_freq_t1` added to the covariates; prints the model summary.
# Optional row filter (insert before lm): filter(mean_hypernym_t1 > 5) %>%
all_df %>%
  lm(
    formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff +
      log_mtld_t1 + prop_noun_t1 + mean_freq_t1,
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff + 
##     log_mtld_t1 + prop_noun_t1 + mean_freq_t1, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.7559  -2.2066  -0.2581   2.3597  15.5441 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      26.994717  34.850098   0.775   0.4412    
## mean_hypernym_t1  0.164545   0.311638   0.528   0.5992    
## age_t1           -0.030238   0.044627  -0.678   0.5003    
## age_diff         -0.006011   0.031680  -0.190   0.8501    
## log_mtld_t1      -6.395506   1.280438  -4.995 4.26e-06 ***
## prop_noun_t1     10.781386   4.342296   2.483   0.0155 *  
## mean_freq_t1      0.869412   0.950066   0.915   0.3633    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.869 on 69 degrees of freedom
##   (25 observations deleted due to missingness)
## Multiple R-squared:  0.4218, Adjusted R-squared:  0.3715 
## F-statistic:  8.39 on 6 and 69 DF,  p-value: 7.7e-07

Hypernyms as a predictor of word-level t:

# Machine-specific project roots. These are the only absolute locations in
# this chunk; re-point them to run the analysis from another checkout.
projects_dir <- "/Users/mollylewis/Documents/research/Projects"
analyses_dir <- file.path(
  projects_dir, "1_in_progress", "VOCAB_SEEDS", "analyses"
)
kid_vocabs_data_dir <- file.path(analyses_dir, "3_kid_vocabs", "data")

t_scores <- read_csv(
  file.path(
    analyses_dir, "6_by_word_analyses", "data",
    "word_coeffs_log_mtld_t2_600_900.csv"
  )
)

# Keep only words with a positive `hypernyms` value.
hypernyms <- read_csv("data/childes_hypernyms.csv") %>%
  select(word, hypernyms) %>%
  filter(hypernyms > 0)

freq <- read_csv(file.path(kid_vocabs_data_dir, "childes_adult_word_freq.csv"))

density_norms <- read_csv(
  file.path(kid_vocabs_data_dir, "bills_density_norms.csv")
)

# Keep the word and its `Conc.M` value, renaming `Word` to `word` so it
# matches the other tables.
concreteness <- read_csv(
  file.path(
    projects_dir, "2_published", "ref_complex", "corpus",
    "brysbaert_database", "brysbaert_corpus.csv"
  )
) %>%
  select(word = Word, Conc.M)

# Lower-case each concept label and keep only the text before the first "_".
# The final table has just `word` and `Mean_Distinct_No_Tax`, so no
# intermediate column selection is needed.
concepts <- read_tsv(file.path(kid_vocabs_data_dir, "CONCS_brm.txt")) %>%
  mutate(word = str_remove(tolower(Concept), regex("_.*", dotall = TRUE))) %>%
  select(word, Mean_Distinct_No_Tax)

# Attach every word-level predictor to the t scores. Left joins keep all rows
# of `t_scores`.
# NOTE(review): none of the joins names its key, so dplyr matches on every
# column name the two tables share (presumably just `word` -- confirm).
# NOTE(review): stripping the "_..." suffix above can leave several `concepts`
# rows with the same `word`, which would duplicate rows in this join; verify
# before relying on the row counts of the analyses that use this table.
word_coeffs_min5_t2_with_vars <- t_scores %>%
  mutate(word = tolower(word)) %>%
  left_join(density_norms) %>%
  left_join(freq) %>%
  left_join(concepts) %>%
  left_join(concreteness) %>%
  left_join(hypernyms)

# Scatterplot of `t` against `hypernyms`, with a linear-model smoother drawn
# on top of the points.
ggplot(
  data = word_coeffs_min5_t2_with_vars,
  mapping = aes(x = hypernyms, y = t)
) +
  geom_point() +
  geom_smooth(method = "lm")

# Correlation test (default method: Pearson) between `t` and `hypernyms`.
# cor.test() already restricts itself to complete (x, y) pairs; the former
# `na.action = "use.complete"` argument is not part of the default method, was
# absorbed by `...`, and had no effect, so it is dropped.
cor.test(
  word_coeffs_min5_t2_with_vars$t,
  word_coeffs_min5_t2_with_vars$hypernyms
)
## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms
## t = -3.7616, df = 856, p-value = 0.0001803
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.19280380 -0.06111263
## sample estimates:
##        cor 
## -0.1275202

Controlling for centrality, log frequency, and concreteness:

# Regress `t` on `centrality`, `log_freq`, `hypernyms`, and `Conc.M`, and
# print the model summary.
summary(
  lm(
    formula = t ~ centrality + log_freq + hypernyms + Conc.M,
    data = word_coeffs_min5_t2_with_vars
  )
)
## 
## Call:
## lm(formula = t ~ centrality + log_freq + hypernyms + Conc.M, 
##     data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.88110 -0.42339  0.03599  0.43863  1.64878 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.68123    0.38617  -1.764  0.07817 .  
## centrality   1.02839    1.93721   0.531  0.59569    
## log_freq     0.10775    0.01532   7.033 4.88e-12 ***
## hypernyms   -0.02725    0.00853  -3.194  0.00147 ** 
## Conc.M       0.10902    0.04432   2.460  0.01413 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5846 on 691 degrees of freedom
##   (1521 observations deleted due to missingness)
## Multiple R-squared:  0.08511,    Adjusted R-squared:  0.07981 
## F-statistic: 16.07 on 4 and 691 DF,  p-value: 1.369e-12