Mean hypernym at t1 as predictor of mtld/t at t2

Hypernym at t1 as predictor of MTLD at t2
Hypernym as predictor of t
Comparing t vs. hypernym as kid-level predictor
- mtld t2, 600-900
- mtld diff, 600-900

library(knitr)

# Chunk defaults: show code, hide messages/warnings, stop on errors, no
# caching, no code tidying. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)

# Classic ggplot2 theme with a 10 pt base font for all plots.
theme_set(theme_classic(base_size = 10))

Read in data

# Kid-level predictor tables.
# NOTE(review): the first two files are read through absolute,
# machine-specific paths; presumably they could be addressed relative to this
# document like the other inputs -- confirm the project layout.
freq_by_kid <- read_csv(file.path(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS",
  "analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv"
))
pos_by_kid <- read_csv(file.path(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS",
  "analyses/3_kid_vocabs/data/prop_pos_by_kid_t1.csv"
))
hyp_by_kid <- read_csv("data/hypernym_by_kid_childes.csv")

# Keep the first seven columns of the group info and add log(x + 1)
# transforms of MTLD at both time points.
groups_info <- "../1_mtld_measure/data/groups_info_600_900_corrected.csv" %>%
  read_csv() %>%
  select(1:7) %>%
  mutate(
    log_mtld_t1 = log(mtld_t1 + 1),
    log_mtld_t2 = log(mtld_t2 + 1)
  )

# One row per group_info row, with hypernym, proportion-noun and frequency
# measures attached. Each left_join() matches on all shared column names.
all_df <- list(
  groups_info,
  hyp_by_kid,
  select(pos_by_kid, target_child_id, prop_noun_t1),
  freq_by_kid
) %>%
  purrr::reduce(dplyr::left_join)

Hypernym at t1 as predictor of MTLD at t2

600-900 (scaled by CDI category)

# Scatter of mean hypernym score at t1 against MTLD at t2, with a linear fit.
all_df %>%
  ggplot(mapping = aes(x = mean_hypernym_t1, y = mtld_t2)) +
  geom_point() +
  geom_smooth(method = "lm")

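Controlling for age at t1, the age difference, and log MTLD at t1 (and, in the fuller models, proportion of nouns and mean word frequency at t1), there is no reliable relationship between a child's mean hypernym score at t1 and MTLD at t2 (this is true even if you exclude the outlier). If anything, the non-significant trend is that kids with a higher hypernym score at t1 have greater MTLD at t2.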

# Predict log MTLD at t2 from mean hypernym score at t1, with age_t1,
# age_diff and log_mtld_t1 as covariates.
all_df %>%
  # Disabled filter; presumably excludes the outlier mentioned in the text.
  # filter(mean_hypernym_t1 > 5) %>%
  lm(
    formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff +
      log_mtld_t1,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff + 
##     log_mtld_t1, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53355 -0.15790  0.01727  0.13047  0.81582 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.057334   1.717061   1.198    0.235    
## mean_hypernym_t1  0.015402   0.044482   0.346    0.730    
## age_t1           -0.001482   0.002208  -0.671    0.505    
## age_diff          0.001142   0.001569   0.728    0.469    
## log_mtld_t1       0.627730   0.064977   9.661 2.55e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2461 on 67 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.6121, Adjusted R-squared:  0.5889 
## F-statistic: 26.43 on 4 and 67 DF,  p-value: 3.59e-13

# Same model of log MTLD at t2, additionally including prop_noun_t1 and
# mean_freq_t1 as covariates.
all_df %>%
  # Disabled filter; presumably excludes the outlier mentioned in the text.
  # filter(mean_hypernym_t1 > 5) %>%
  lm(
    formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff +
      log_mtld_t1 + prop_noun_t1 + mean_freq_t1,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff + 
##     log_mtld_t1 + prop_noun_t1 + mean_freq_t1, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57104 -0.13520  0.01017  0.14466  0.69648 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.458e-01  1.736e+00   0.314   0.7542    
## mean_hypernym_t1 1.200e-02  4.310e-02   0.278   0.7816    
## age_t1           9.024e-05  2.213e-03   0.041   0.9676    
## age_diff         2.076e-03  1.571e-03   1.321   0.1911    
## log_mtld_t1      6.629e-01  6.647e-02   9.974 9.89e-15 ***
## prop_noun_t1     5.419e-01  2.059e-01   2.632   0.0106 *  
## mean_freq_t1     4.985e-03  4.554e-02   0.109   0.9132    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2358 on 65 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.6543, Adjusted R-squared:  0.6224 
## F-statistic: 20.51 on 6 and 65 DF,  p-value: 2.589e-13

# Predict mtld_diff from mean hypernym score at t1, with age_t1, age_diff
# and log_mtld_t1 as covariates.
all_df %>%
  # Disabled filter; presumably excludes the outlier mentioned in the text.
  # filter(mean_hypernym_t1 > 5) %>%
  lm(
    formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff +
      log_mtld_t1,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff + 
##     log_mtld_t1, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.8239  -2.5596  -0.2942   2.1515  17.1573 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      52.27505   35.43314   1.475    0.145    
## mean_hypernym_t1  0.17020    0.91792   0.185    0.853    
## age_t1           -0.04460    0.04557  -0.979    0.331    
## age_diff         -0.01569    0.03238  -0.484    0.630    
## log_mtld_t1      -6.78291    1.34085  -5.059 3.51e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.078 on 67 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.365,  Adjusted R-squared:  0.3271 
## F-statistic: 9.628 on 4 and 67 DF,  p-value: 3.268e-06

# Same model of mtld_diff, additionally including prop_noun_t1 and
# mean_freq_t1 as covariates.
all_df %>%
  # Disabled filter; presumably excludes the outlier mentioned in the text.
  # filter(mean_hypernym_t1 > 5) %>%
  lm(
    formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff +
      log_mtld_t1 + prop_noun_t1 + mean_freq_t1,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff + 
##     log_mtld_t1 + prop_noun_t1 + mean_freq_t1, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.4601  -2.4195  -0.2728   2.2346  15.7698 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      26.012798  36.496133   0.713   0.4785    
## mean_hypernym_t1  0.235241   0.906329   0.260   0.7960    
## age_t1           -0.026054   0.046531  -0.560   0.5774    
## age_diff         -0.006909   0.033035  -0.209   0.8350    
## log_mtld_t1      -6.600282   1.397606  -4.723 1.29e-05 ***
## prop_noun_t1      9.922289   4.329322   2.292   0.0252 *  
## mean_freq_t1      0.940614   0.957522   0.982   0.3296    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.958 on 65 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.4125, Adjusted R-squared:  0.3583 
## F-statistic: 7.608 on 6 and 65 DF,  p-value: 3.321e-06

Hypernym as predictor of t

mtld t2, 600-900

# Word-level coefficients (t) for log MTLD at t2, 600-900 group.
t_scores <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data/word_coeffs_log_mtld_t2_600_900.csv")

# Item id -> category lookup, and the full item key.
item_data <- read_csv("data/item_data.csv") %>%
  select(num_item_id, category)
item_key <- read_csv("data/item_key.csv")

# Path to the SUBTLEX-US part-of-speech list. The file name contains spaces;
# they are written literally because backslash-space is not one of the escape
# sequences documented for R character constants (see ?Quotes).
POS <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/SUBTLEX-US frequency list with PoS information text version.txt"

# Dominant part of speech per word, with columns renamed to `word` and
# `pos_dom`.
pos_data <- read_tsv(POS) %>%
  select(word = Word, pos_dom = Dom_PoS_SUBTLEX)

# Hypernym scores z-scored within part-of-speech class (n / v / o).
# - `word` is the text of uni_lemma before the first "(", trimmed; the split
#   is vectorised rather than run under rowwise().
# - scale() returns a one-column matrix, so it is converted to a plain
#   numeric vector before being stored as a column.
# - pos_cat is selected explicitly; the previous grouped select() kept it
#   implicitly as the grouping variable.
hypernyms_scaled_pos <- read_csv("data/wordbank_hypernyms.csv") %>%
  left_join(item_key %>% select(num_item_id, uni_lemma)) %>%
  mutate(word = str_trim(map_chr(str_split(uni_lemma, "\\("), 1))) %>%
  left_join(pos_data) %>%
  mutate(
    pos_cat = factor(case_when(
      pos_dom == "Noun" ~ "n",
      pos_dom == "Verb" ~ "v",
      TRUE ~ "o"
    ))
  ) %>%
  group_by(pos_cat) %>%
  mutate(hypernyms_scaled_pos = as.numeric(scale(hypernyms))) %>%
  ungroup() %>%
  select(pos_cat, word, hypernyms_scaled_pos)

# Hypernym scores z-scored within item category; the unscaled `hypernyms`
# column is kept alongside.
# - `word` is the text of uni_lemma before the first "(", trimmed; the split
#   is vectorised rather than run under rowwise().
# - scale() returns a one-column matrix, so it is converted to a plain
#   numeric vector before being stored as a column.
hypernyms_scaled_cat <- read_csv("data/wordbank_hypernyms.csv") %>%
  left_join(item_key %>% select(num_item_id, uni_lemma)) %>%
  mutate(word = str_trim(map_chr(str_split(uni_lemma, "\\("), 1))) %>%
  left_join(item_data) %>%
  group_by(category) %>%
  mutate(hypernyms_scaled_cat = as.numeric(scale(hypernyms))) %>%
  ungroup() %>%
  select(word, category, hypernyms_scaled_cat, hypernyms)

# Word-level covariate tables, all read from absolute local paths.
freq <- read_csv(file.path(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS",
  "analyses/3_kid_vocabs/data/childes_adult_word_freq.csv"
))

density_norms <- read_csv(file.path(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS",
  "analyses/3_kid_vocabs/data/bills_density_norms.csv"
))

# Concreteness table: keep the word and its Conc.M column only.
concreteness <- read_csv(file.path(
  "/Users/mollylewis/Documents/research/Projects/2_published/ref_complex",
  "corpus/brysbaert_database/brysbaert_corpus.csv"
)) %>%
  select(word = Word, Conc.M)

# Concept table: lower-case the concept, keep the part before the first
# underscore as `word`, and retain only Mean_Distinct_No_Tax.
concepts <- read_tsv(file.path(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS",
  "analyses/3_kid_vocabs/data/CONCS_brm.txt"
)) %>%
  select(Concept, Familiarity, Length_Syllables, Bigram, 14:33) %>%
  mutate(Concept = map_chr(str_split(tolower(Concept), "_"), 1)) %>%
  select(word = Concept, Mean_Distinct_No_Tax)

# Attach every word-level covariate table to the lower-cased t-scores, in
# order. Each left_join() matches on all shared column names.
word_coeffs_min5_t2_with_vars <- list(
  mutate(t_scores, word = tolower(word)),
  density_norms,
  freq,
  concepts,
  concreteness,
  hypernyms_scaled_cat,
  hypernyms_scaled_pos
) %>%
  purrr::reduce(dplyr::left_join)


# Pearson correlation between word-level t and the unscaled hypernym score.
# The default cor.test() method already drops incomplete pairs; the former
# `na.action = "use.complete"` is not one of its parameters and was silently
# swallowed by `...`, so it is omitted.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms)

## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms
## t = -3.7694, df = 373, p-value = 0.0001902
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.28725404 -0.09207047
## sample estimates:
##        cor 
## -0.1915554

# Pearson correlation between word-level t and the category-scaled hypernym
# score. The default cor.test() method already drops incomplete pairs; the
# former `na.action = "use.complete"` is not one of its parameters and was
# silently swallowed by `...`, so it is omitted.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat)

## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat
## t = -1.6212, df = 368, p-value = 0.1058
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.18457905  0.01789752
## sample estimates:
##         cor 
## -0.08420995

# Pearson correlation between word-level t and the part-of-speech-scaled
# hypernym score. The default cor.test() method already drops incomplete
# pairs; the former `na.action = "use.complete"` is not one of its parameters
# and was silently swallowed by `...`, so it is omitted.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos)

## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos
## t = -2.8996, df = 373, p-value = 0.003958
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.24604101 -0.04791879
## sample estimates:
##        cor 
## -0.1484693

Controlling for centrality, log frequency, and concreteness:

# Regress word-level t on hypernyms together with centrality, log frequency
# and concreteness (Conc.M), then print the model summary.
summary(
  lm(
    formula = t ~ centrality + log_freq + hypernyms + Conc.M,
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = t ~ centrality + log_freq + hypernyms + Conc.M, 
##     data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.04719 -0.40170  0.00438  0.44715  1.42282 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.339801   0.630823  -2.124   0.0344 *  
## centrality  -2.094417   2.830911  -0.740   0.4599    
## log_freq     0.211301   0.029704   7.114 6.22e-12 ***
## hypernyms   -0.038757   0.009802  -3.954 9.27e-05 ***
## Conc.M       0.230557   0.070626   3.264   0.0012 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5968 on 357 degrees of freedom
##   (1879 observations deleted due to missingness)
## Multiple R-squared:  0.1624, Adjusted R-squared:  0.1531 
## F-statistic: 17.31 on 4 and 357 DF,  p-value: 5.44e-13

diff, 600-900

# Word-level coefficients for the log MTLD difference, 600-900 group.
t_scores <- read_csv(file.path(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS",
  "analyses/6_by_word_analyses/data/word_coeffs_log_mtld_diff_600_900.csv"
))

# Attach every word-level covariate table to the lower-cased t-scores, in
# order. Each left_join() matches on all shared column names.
word_coeffs_min5_t2_with_vars <- list(
  mutate(t_scores, word = tolower(word)),
  density_norms,
  freq,
  concepts,
  concreteness,
  hypernyms_scaled_cat,
  hypernyms_scaled_pos
) %>%
  purrr::reduce(dplyr::left_join)


# Pearson correlation between word-level t and the unscaled hypernym score.
# The default cor.test() method already drops incomplete pairs; the former
# `na.action = "use.complete"` is not one of its parameters and was silently
# swallowed by `...`, so it is omitted.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms)

## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms
## t = -2.2743, df = 373, p-value = 0.02352
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2156655 -0.0158655
## sample estimates:
##        cor 
## -0.1169486

# Pearson correlation between word-level t and the category-scaled hypernym
# score. The default cor.test() method already drops incomplete pairs; the
# former `na.action = "use.complete"` is not one of its parameters and was
# silently swallowed by `...`, so it is omitted.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat)

## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat
## t = -2.3738, df = 368, p-value = 0.01812
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.22198007 -0.02111613
## sample estimates:
##        cor 
## -0.1228056

# Pearson correlation between word-level t and the part-of-speech-scaled
# hypernym score. The default cor.test() method already drops incomplete
# pairs; the former `na.action = "use.complete"` is not one of its parameters
# and was silently swallowed by `...`, so it is omitted.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos)

## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos
## t = -2.2425, df = 373, p-value = 0.02552
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.21410628 -0.01423121
## sample estimates:
##       cor 
## -0.115336

Controlling for centrality, log frequency, and concreteness:

# Regress word-level t on hypernyms together with centrality, log frequency
# and concreteness (Conc.M), then print the model summary.
summary(
  lm(
    formula = t ~ centrality + log_freq + hypernyms + Conc.M,
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = t ~ centrality + log_freq + hypernyms + Conc.M, 
##     data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.33581 -0.42688  0.04234  0.40419  1.51011 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.09580    0.67867  -1.615 0.107275    
## centrality  -6.24629    3.04563  -2.051 0.041006 *  
## log_freq     0.20565    0.03196   6.435 3.97e-10 ***
## hypernyms   -0.03968    0.01055  -3.762 0.000197 ***
## Conc.M       0.33514    0.07598   4.411 1.37e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6421 on 357 degrees of freedom
##   (1879 observations deleted due to missingness)
## Multiple R-squared:  0.1517, Adjusted R-squared:  0.1422 
## F-statistic: 15.96 on 4 and 357 DF,  p-value: 4.948e-12

mtld t2, 900-1200

# Word-level coefficients for log MTLD at t2, 900-1200 group.
t_scores <- read_csv(file.path(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS",
  "analyses/6_by_word_analyses/data/word_coeffs_log_mtld_t2_900_1200.csv"
))

# Attach every word-level covariate table to the lower-cased t-scores, in
# order. Each left_join() matches on all shared column names.
word_coeffs_min5_t2_with_vars <- list(
  mutate(t_scores, word = tolower(word)),
  density_norms,
  freq,
  concepts,
  concreteness,
  hypernyms_scaled_cat,
  hypernyms_scaled_pos
) %>%
  purrr::reduce(dplyr::left_join)


# Pearson correlation between word-level t and the unscaled hypernym score.
# The default cor.test() method already drops incomplete pairs; the former
# `na.action = "use.complete"` is not one of its parameters and was silently
# swallowed by `...`, so it is omitted.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms)

## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms
## t = -3.4454, df = 433, p-value = 0.0006259
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2534773 -0.0704111
## sample estimates:
##      cor 
## -0.16335

# Pearson correlation between word-level t and the category-scaled hypernym
# score. The default cor.test() method already drops incomplete pairs; the
# former `na.action = "use.complete"` is not one of its parameters and was
# silently swallowed by `...`, so it is omitted.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat)

## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat
## t = 1.9346, df = 428, p-value = 0.0537
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.001471881  0.186034871
## sample estimates:
##        cor 
## 0.09310697

# Pearson correlation between word-level t and the part-of-speech-scaled
# hypernym score. The default cor.test() method already drops incomplete
# pairs; the former `na.action = "use.complete"` is not one of its parameters
# and was silently swallowed by `...`, so it is omitted.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos)

## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos
## t = -1.5095, df = 433, p-value = 0.1319
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.16524631  0.02181863
## sample estimates:
##         cor 
## -0.07235008

Controlling for centrality, log frequency, and concreteness:

# Regress word-level t on hypernyms together with centrality, log frequency
# and concreteness (Conc.M), then print the model summary.
summary(
  lm(
    formula = t ~ centrality + log_freq + hypernyms + Conc.M,
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = t ~ centrality + log_freq + hypernyms + Conc.M, 
##     data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.01844 -0.41125 -0.05125  0.37098  2.35790 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.904978   0.587541   1.540   0.1243  
## centrality  -1.529888   2.588257  -0.591   0.5548  
## log_freq    -0.011646   0.025230  -0.462   0.6446  
## hypernyms   -0.021155   0.009228  -2.292   0.0224 *
## Conc.M      -0.066997   0.065052  -1.030   0.3037  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.59 on 412 degrees of freedom
##   (3425 observations deleted due to missingness)
## Multiple R-squared:  0.031,  Adjusted R-squared:  0.02159 
## F-statistic: 3.295 on 4 and 412 DF,  p-value: 0.01125

mtld diff, 900-1200

# Word-level coefficients for the log MTLD difference, 900-1200 group.
t_scores <- read_csv(file.path(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS",
  "analyses/6_by_word_analyses/data/word_coeffs_log_mtld_diff_900_1200.csv"
))

# Attach every word-level covariate table to the lower-cased t-scores, in
# order. Each left_join() matches on all shared column names.
word_coeffs_min5_t2_with_vars <- list(
  mutate(t_scores, word = tolower(word)),
  density_norms,
  freq,
  concepts,
  concreteness,
  hypernyms_scaled_cat,
  hypernyms_scaled_pos
) %>%
  purrr::reduce(dplyr::left_join)


# Pearson correlation between word-level t and the unscaled hypernym score.
# The default cor.test() method already drops incomplete pairs; the former
# `na.action = "use.complete"` is not one of its parameters and was silently
# swallowed by `...`, so it is omitted.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms)

## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms
## t = -2.4841, df = 433, p-value = 0.01337
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2102140 -0.0247925
## sample estimates:
##        cor 
## -0.1185365

# Pearson correlation between word-level t and the category-scaled hypernym
# score. The default cor.test() method already drops incomplete pairs; the
# former `na.action = "use.complete"` is not one of its parameters and was
# silently swallowed by `...`, so it is omitted.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat)

## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat
## t = 1.5697, df = 428, p-value = 0.1172
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01904598  0.16901286
## sample estimates:
##        cor 
## 0.07565617

# Pearson correlation between word-level t and the part-of-speech-scaled
# hypernym score. The default cor.test() method already drops incomplete
# pairs; the former `na.action = "use.complete"` is not one of its parameters
# and was silently swallowed by `...`, so it is omitted.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos)

## 
##  Pearson's product-moment correlation
## 
## data:  word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos
## t = -1.2377, df = 433, p-value = 0.2165
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.15254433  0.03483916
## sample estimates:
##         cor 
## -0.05937562

Controlling for centrality, log frequency, and concreteness:

# Regress word-level t on hypernyms together with centrality, log frequency
# and concreteness (Conc.M), then print the model summary.
summary(
  lm(
    formula = t ~ centrality + log_freq + hypernyms + Conc.M,
    data = word_coeffs_min5_t2_with_vars
  )
)

## 
## Call:
## lm(formula = t ~ centrality + log_freq + hypernyms + Conc.M, 
##     data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.31866 -0.45606  0.01513  0.40766  2.25769 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.709200   0.616989   1.149  0.25104   
## centrality  -6.033367   2.717981  -2.220  0.02698 * 
## log_freq     0.077090   0.026495   2.910  0.00381 **
## hypernyms   -0.023504   0.009691  -2.425  0.01572 * 
## Conc.M       0.070916   0.068313   1.038  0.29983   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6196 on 412 degrees of freedom
##   (3425 observations deleted due to missingness)
## Multiple R-squared:  0.04766,    Adjusted R-squared:  0.03842 
## F-statistic: 5.155 on 4 and 412 DF,  p-value: 0.0004624

Comparing t vs. hypernym as kid-level predictor

mtld t2, 600-900

# Word-level coefficients for log MTLD at t2, 600-900 group.
t_scores <- read_csv(file.path(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS",
  "analyses/6_by_word_analyses/data/word_coeffs_log_mtld_t2_600_900.csv"
))

# Lower-case the words, then attach density norms and frequency. Each
# left_join() matches on all shared column names.
word_coeffs_min5_t2_with_vars <- list(
  mutate(t_scores, word = tolower(word)),
  density_norms,
  freq
) %>%
  purrr::reduce(dplyr::left_join)

# Word types per child for the 600-900 group.
all_types <- read_csv(file.path(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS",
  "analyses/1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv"
))

# Minimum summed count for a word to be kept for a child.
MINWORDSFORVOCAB <- 5

# Per child and lower-cased gloss at t1: total count, keeping only words that
# reach the minimum. The result remains grouped by target_child_id.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)

# Per child: sum and mean of the word-level t values over the words in that
# child's t1 vocabulary. Words without a t value are ignored (na.rm = TRUE);
# TRUE is spelled out because T is an ordinary, reassignable variable.
t1_word_counts_with_ts <- word_counts  %>%
  left_join(word_coeffs_min5_t2_with_vars %>% select(word, t),
            by = c("gloss_clean" = "word")) %>%
  select(-gloss_clean, -count) %>%
  group_by(target_child_id) %>%
  summarize(sum_t_t1 = sum(t, na.rm = TRUE),
            mean_t_t1 = mean(t, na.rm = TRUE))

# Compare mean hypernym score and mean word-level t at t1 as kid-level
# predictors of log MTLD at t2, with mean_freq_t1, age_diff and age_t1 as
# covariates.
left_join(t1_word_counts_with_ts, all_df) %>%
  lm(
    formula = log_mtld_t2 ~ mean_hypernym_t1 + mean_t_t1 + mean_freq_t1 +
      age_diff + age_t1,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = log_mtld_t2 ~ mean_hypernym_t1 + mean_t_t1 + mean_freq_t1 + 
##     age_diff + age_t1, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.63235 -0.15458 -0.01212  0.12091  0.70747 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.488e+00  1.996e+00   0.746    0.459    
## mean_hypernym_t1 4.627e-02  5.132e-02   0.902    0.371    
## mean_t_t1        6.854e-01  9.790e-02   7.001 1.61e-09 ***
## mean_freq_t1     6.338e-02  4.781e-02   1.326    0.190    
## age_diff         2.385e-03  1.908e-03   1.250    0.216    
## age_t1           1.252e-05  2.635e-03   0.005    0.996    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2852 on 66 degrees of freedom
##   (18 observations deleted due to missingness)
## Multiple R-squared:  0.4866, Adjusted R-squared:  0.4477 
## F-statistic: 12.51 on 5 and 66 DF,  p-value: 1.494e-08

mtld diff, 600-900

# Word-level coefficients for the log MTLD difference, 600-900 group.
t_scores <- read_csv(file.path(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS",
  "analyses/6_by_word_analyses/data/word_coeffs_log_mtld_diff_600_900.csv"
))

# Lower-case the words, then attach density norms and frequency. Each
# left_join() matches on all shared column names.
word_coeffs_min5_t2_with_vars <- list(
  mutate(t_scores, word = tolower(word)),
  density_norms,
  freq
) %>%
  purrr::reduce(dplyr::left_join)

# Word types per child for the 600-900 group.
all_types <- read_csv(file.path(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS",
  "analyses/1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv"
))

# Minimum summed count for a word to be kept for a child.
MINWORDSFORVOCAB <- 5

# Per child and lower-cased gloss at t1: total count, keeping only words that
# reach the minimum. The result remains grouped by target_child_id.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)

# Per child: sum and mean of the word-level t values over the words in that
# child's t1 vocabulary. Words without a t value are ignored (na.rm = TRUE);
# TRUE is spelled out because T is an ordinary, reassignable variable.
t1_word_counts_with_ts <- word_counts  %>%
  left_join(word_coeffs_min5_t2_with_vars %>% select(word, t),
            by = c("gloss_clean" = "word")) %>%
  select(-gloss_clean, -count) %>%
  group_by(target_child_id) %>%
  summarize(sum_t_t1 = sum(t, na.rm = TRUE),
            mean_t_t1 = mean(t, na.rm = TRUE))

# Compare mean hypernym score and mean word-level t at t1 as kid-level
# predictors of log(mtld_diff), with mean_freq_t1, age_diff and age_t1 as
# covariates.
# NOTE(review): log(mtld_diff) is NaN for any negative mtld_diff, and lm()
# drops such rows as missing; presumably some children are excluded this
# way -- confirm that dropping them is intended.
left_join(t1_word_counts_with_ts, all_df) %>%
  lm(
    formula = I(log(mtld_diff)) ~ mean_hypernym_t1 + mean_t_t1 +
      mean_freq_t1 + age_diff + age_t1,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = I(log(mtld_diff)) ~ mean_hypernym_t1 + mean_t_t1 + 
##     mean_freq_t1 + age_diff + age_t1, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.41208 -0.28677  0.03776  0.29869  1.75229 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.5942397  4.6261945   0.777    0.440    
## mean_hypernym_t1  0.0371343  0.1187329   0.313    0.756    
## mean_t_t1         1.3278402  0.2813734   4.719 1.54e-05 ***
## mean_freq_t1      0.1905339  0.1218443   1.564    0.123    
## age_diff          0.0004601  0.0044322   0.104    0.918    
## age_t1           -0.0069686  0.0061846  -1.127    0.264    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6436 on 58 degrees of freedom
##   (26 observations deleted due to missingness)
## Multiple R-squared:  0.3467, Adjusted R-squared:  0.2904 
## F-statistic: 6.156 on 5 and 58 DF,  p-value: 0.0001219

Mean hypernym at t1 as predictor of mtld/t at t2

2018-11-21

Hypernym at t1 as predictor of MTLD at t2

Hypernym as predictor of t

mtld t2, 600-900

diff, 600-900

mtld t2, 900-1200

mtld diff, 900-1200

Comparing t vs. hypernym as kid-level predictor

mtld t2, 600-900

mtld diff, 600-900