library(knitr)
# Global knitr chunk options. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
# Make the classic ggplot2 theme (base text size 10) the default for all plots.
theme_set(theme_classic(base_size = 10))

# Read in data
# Per-kid input tables.
# NOTE(review): the first two paths are absolute and machine-specific, while
# the others are relative to the working directory — consider making all of
# them project-relative.
freq_by_kid <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv"
)
pos_by_kid <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/prop_pos_by_kid_t1.csv"
)
hyp_by_kid <- read_csv("data/hypernym_by_kid_childes.csv")

# Keep the first seven columns of the group info (selected by position) and
# add log(x + 1) versions of MTLD at both time points.
groups_info <- read_csv(
  "../1_mtld_measure/data/groups_info_600_900_corrected.csv"
) %>%
  select(1:7) %>%
  mutate(
    log_mtld_t1 = log(mtld_t1 + 1),
    log_mtld_t2 = log(mtld_t2 + 1)
  )

# Attach the hypernym, noun-proportion and frequency tables to the group info.
# No `by` is given, so each left_join() matches on all shared column names.
all_df <- groups_info %>%
  left_join(hyp_by_kid) %>%
  left_join(pos_by_kid %>% select(target_child_id, prop_noun_t1)) %>%
  left_join(freq_by_kid)

# 600-900 (scaled by CDI category)
# Scatterplot of mtld_t2 against mean_hypernym_t1 with a linear fit.
ggplot(all_df, aes(x = mean_hypernym_t1, y = mtld_t2)) +
  geom_point() +
  geom_smooth(method = "lm")

# Controlling for `age_t1`, `age_diff`, and `log_mtld_t1`, there's no reliable
# relationship between a child's mean hypernym score at t1 and MTLD at t2 (this
# is true even if you exclude the outlier). If anything, it looks like kids who
# have a higher hypernym score at t1 have greater MTLD at t2.
# Regress log MTLD at t2 on the t1 hypernym score, with age_t1, age_diff and
# log MTLD at t1 as covariates. The filter is left disabled (presumably the
# outlier exclusion mentioned in the text).
all_df %>%
  # filter(mean_hypernym_t1 > 5) %>%
  lm(
    log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff + log_mtld_t1,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff +
## log_mtld_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53355 -0.15790 0.01727 0.13047 0.81582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.057334 1.717061 1.198 0.235
## mean_hypernym_t1 0.015402 0.044482 0.346 0.730
## age_t1 -0.001482 0.002208 -0.671 0.505
## age_diff 0.001142 0.001569 0.728 0.469
## log_mtld_t1 0.627730 0.064977 9.661 2.55e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2461 on 67 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.6121, Adjusted R-squared: 0.5889
## F-statistic: 26.43 on 4 and 67 DF, p-value: 3.59e-13
# Same model of log MTLD at t2, adding prop_noun_t1 and mean_freq_t1 as
# further predictors. The filter is left disabled.
all_df %>%
  # filter(mean_hypernym_t1 > 5) %>%
  lm(
    log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff + log_mtld_t1 +
      prop_noun_t1 + mean_freq_t1,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff +
## log_mtld_t1 + prop_noun_t1 + mean_freq_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57104 -0.13520 0.01017 0.14466 0.69648
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.458e-01 1.736e+00 0.314 0.7542
## mean_hypernym_t1 1.200e-02 4.310e-02 0.278 0.7816
## age_t1 9.024e-05 2.213e-03 0.041 0.9676
## age_diff 2.076e-03 1.571e-03 1.321 0.1911
## log_mtld_t1 6.629e-01 6.647e-02 9.974 9.89e-15 ***
## prop_noun_t1 5.419e-01 2.059e-01 2.632 0.0106 *
## mean_freq_t1 4.985e-03 4.554e-02 0.109 0.9132
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2358 on 65 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.6543, Adjusted R-squared: 0.6224
## F-statistic: 20.51 on 6 and 65 DF, p-value: 2.589e-13
# Regress mtld_diff (untransformed) on the t1 hypernym score, with age_t1,
# age_diff and log MTLD at t1 as covariates. The filter is left disabled.
all_df %>%
  # filter(mean_hypernym_t1 > 5) %>%
  lm(
    mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff + log_mtld_t1,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff +
## log_mtld_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.8239 -2.5596 -0.2942 2.1515 17.1573
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.27505 35.43314 1.475 0.145
## mean_hypernym_t1 0.17020 0.91792 0.185 0.853
## age_t1 -0.04460 0.04557 -0.979 0.331
## age_diff -0.01569 0.03238 -0.484 0.630
## log_mtld_t1 -6.78291 1.34085 -5.059 3.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.078 on 67 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.365, Adjusted R-squared: 0.3271
## F-statistic: 9.628 on 4 and 67 DF, p-value: 3.268e-06
# Same model of mtld_diff, adding prop_noun_t1 and mean_freq_t1 as further
# predictors. The filter is left disabled.
all_df %>%
  # filter(mean_hypernym_t1 > 5) %>%
  lm(
    mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff + log_mtld_t1 +
      prop_noun_t1 + mean_freq_t1,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff +
## log_mtld_t1 + prop_noun_t1 + mean_freq_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.4601 -2.4195 -0.2728 2.2346 15.7698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.012798 36.496133 0.713 0.4785
## mean_hypernym_t1 0.235241 0.906329 0.260 0.7960
## age_t1 -0.026054 0.046531 -0.560 0.5774
## age_diff -0.006909 0.033035 -0.209 0.8350
## log_mtld_t1 -6.600282 1.397606 -4.723 1.29e-05 ***
## prop_noun_t1 9.922289 4.329322 2.292 0.0252 *
## mean_freq_t1 0.940614 0.957522 0.982 0.3296
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.958 on 65 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.4125, Adjusted R-squared: 0.3583
## F-statistic: 7.608 on 6 and 65 DF, p-value: 3.321e-06
# Per-word coefficient file for the 600-900 log MTLD t2 model.
t_scores <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data/word_coeffs_log_mtld_t2_600_900.csv"
)

# Item id -> category lookup, and the full item key.
item_data <- read_csv("data/item_data.csv") %>%
  select(num_item_id, category)
item_key <- read_csv("data/item_key.csv")

# Path to the SUBTLEX-US part-of-speech list. The file name contains plain
# spaces; no escaping is needed inside an R string.
POS <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/SUBTLEX-US frequency list with PoS information text version.txt"

# Keep only the word and its dominant part of speech, renamed on selection.
pos_data <- read_tsv(POS) %>%
  select(word = Word, pos_dom = Dom_PoS_SUBTLEX)
# Hypernym scores standardised within part-of-speech class (n / v / o).
hypernyms_scaled_pos <- read_csv("data/wordbank_hypernyms.csv") %>%
  left_join(item_key %>% select(num_item_id, uni_lemma)) %>%
  # `word` is the text before the first "(" in uni_lemma, trimmed. The split is
  # vectorised over the column, so no rowwise() pass is needed.
  mutate(word = str_trim(map_chr(str_split(uni_lemma, "\\("), 1))) %>%
  left_join(pos_data) %>%
  # Anything that is not a Noun or Verb (including unmatched words) is "o".
  mutate(
    pos_cat = as.factor(case_when(
      pos_dom == "Noun" ~ "n",
      pos_dom == "Verb" ~ "v",
      TRUE ~ "o"
    ))
  ) %>%
  group_by(pos_cat) %>%
  # scale() returns a one-column matrix; as.numeric() keeps the new column a
  # plain numeric vector.
  mutate(hypernyms_scaled_pos = as.numeric(scale(hypernyms))) %>%
  ungroup() %>%
  # pos_cat is listed explicitly; the grouped select() used to re-add it.
  select(pos_cat, word, hypernyms_scaled_pos)
# Hypernym scores standardised within item category, alongside the raw score.
hypernyms_scaled_cat <- read_csv("data/wordbank_hypernyms.csv") %>%
  left_join(item_key %>% select(num_item_id, uni_lemma)) %>%
  # `word` is the text before the first "(" in uni_lemma, trimmed. The split is
  # vectorised over the column, so no rowwise() pass is needed.
  mutate(word = str_trim(map_chr(str_split(uni_lemma, "\\("), 1))) %>%
  left_join(item_data) %>%
  group_by(category) %>%
  # scale() returns a one-column matrix; as.numeric() keeps the new column a
  # plain numeric vector.
  mutate(hypernyms_scaled_cat = as.numeric(scale(hypernyms))) %>%
  ungroup() %>%
  select(word, category, hypernyms_scaled_cat, hypernyms)
# Word-level covariate tables.
# NOTE(review): all four paths are absolute and machine-specific.
freq <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv"
)
density_norms <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv"
)

# Concreteness ratings: keep the word and its Conc.M value.
concreteness <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv"
) %>%
  select(word = Word, Conc.M)

# Concept norms: only Mean_Distinct_No_Tax is kept, so it is selected by name
# instead of first taking columns 14:33 by position. `word` is the lower-cased
# concept name up to its first underscore.
concepts <- read_tsv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt"
) %>%
  select(word = Concept, Mean_Distinct_No_Tax) %>%
  mutate(word = map_chr(str_split(tolower(word), "_"), 1))
# Lower-case the words in t_scores, then left-join each covariate table in
# turn. No `by` is given, so every join matches on the shared column names.
word_coeffs_min5_t2_with_vars <- reduce(
  list(
    density_norms, freq, concepts, concreteness,
    hypernyms_scaled_cat, hypernyms_scaled_pos
  ),
  left_join,
  .init = mutate(t_scores, word = tolower(word))
)
# Pearson correlation of the per-word t with the raw hypernym score.
# cor.test() on two vectors already uses only complete pairs; `na.action` is
# not an argument of this method, so it has been dropped.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms
## t = -3.7694, df = 373, p-value = 0.0001902
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.28725404 -0.09207047
## sample estimates:
## cor
## -0.1915554
# Pearson correlation of the per-word t with the category-scaled hypernym
# score. cor.test() on two vectors already uses only complete pairs;
# `na.action` is not an argument of this method, so it has been dropped.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat
## t = -1.6212, df = 368, p-value = 0.1058
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.18457905 0.01789752
## sample estimates:
## cor
## -0.08420995
# Pearson correlation of the per-word t with the POS-scaled hypernym score.
# cor.test() on two vectors already uses only complete pairs; `na.action` is
# not an argument of this method, so it has been dropped.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos
## t = -2.8996, df = 373, p-value = 0.003958
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.24604101 -0.04791879
## sample estimates:
## cor
## -0.1484693
# Controlling for centrality, log frequency, and concreteness (Conc.M):
# Per-word t regressed on hypernym score with centrality, log frequency and
# concreteness as covariates.
summary(
  lm(t ~ centrality + log_freq + hypernyms + Conc.M,
     data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ centrality + log_freq + hypernyms + Conc.M,
## data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.04719 -0.40170 0.00438 0.44715 1.42282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.339801 0.630823 -2.124 0.0344 *
## centrality -2.094417 2.830911 -0.740 0.4599
## log_freq 0.211301 0.029704 7.114 6.22e-12 ***
## hypernyms -0.038757 0.009802 -3.954 9.27e-05 ***
## Conc.M 0.230557 0.070626 3.264 0.0012 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5968 on 357 degrees of freedom
## (1879 observations deleted due to missingness)
## Multiple R-squared: 0.1624, Adjusted R-squared: 0.1531
## F-statistic: 17.31 on 4 and 357 DF, p-value: 5.44e-13
# Switch to the per-word coefficient file for the 600-900 log MTLD difference
# model and rebuild the covariate table from it.
t_scores <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data/word_coeffs_log_mtld_diff_600_900.csv"
)

# Lower-case the words, then left-join each covariate table in turn. No `by`
# is given, so every join matches on the shared column names.
word_coeffs_min5_t2_with_vars <- reduce(
  list(
    density_norms, freq, concepts, concreteness,
    hypernyms_scaled_cat, hypernyms_scaled_pos
  ),
  left_join,
  .init = mutate(t_scores, word = tolower(word))
)
# Pearson correlation of the per-word t with the raw hypernym score.
# cor.test() on two vectors already uses only complete pairs; `na.action` is
# not an argument of this method, so it has been dropped.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms
## t = -2.2743, df = 373, p-value = 0.02352
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2156655 -0.0158655
## sample estimates:
## cor
## -0.1169486
# Pearson correlation of the per-word t with the category-scaled hypernym
# score. cor.test() on two vectors already uses only complete pairs;
# `na.action` is not an argument of this method, so it has been dropped.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat
## t = -2.3738, df = 368, p-value = 0.01812
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.22198007 -0.02111613
## sample estimates:
## cor
## -0.1228056
# Pearson correlation of the per-word t with the POS-scaled hypernym score.
# cor.test() on two vectors already uses only complete pairs; `na.action` is
# not an argument of this method, so it has been dropped.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos
## t = -2.2425, df = 373, p-value = 0.02552
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.21410628 -0.01423121
## sample estimates:
## cor
## -0.115336
# Controlling for centrality, log frequency, and concreteness (Conc.M):
# Per-word t regressed on hypernym score with centrality, log frequency and
# concreteness as covariates.
summary(
  lm(t ~ centrality + log_freq + hypernyms + Conc.M,
     data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ centrality + log_freq + hypernyms + Conc.M,
## data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.33581 -0.42688 0.04234 0.40419 1.51011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.09580 0.67867 -1.615 0.107275
## centrality -6.24629 3.04563 -2.051 0.041006 *
## log_freq 0.20565 0.03196 6.435 3.97e-10 ***
## hypernyms -0.03968 0.01055 -3.762 0.000197 ***
## Conc.M 0.33514 0.07598 4.411 1.37e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6421 on 357 degrees of freedom
## (1879 observations deleted due to missingness)
## Multiple R-squared: 0.1517, Adjusted R-squared: 0.1422
## F-statistic: 15.96 on 4 and 357 DF, p-value: 4.948e-12
# Switch to the per-word coefficient file for the 900-1200 log MTLD t2 model
# and rebuild the covariate table from it.
t_scores <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data/word_coeffs_log_mtld_t2_900_1200.csv"
)

# Lower-case the words, then left-join each covariate table in turn. No `by`
# is given, so every join matches on the shared column names.
word_coeffs_min5_t2_with_vars <- reduce(
  list(
    density_norms, freq, concepts, concreteness,
    hypernyms_scaled_cat, hypernyms_scaled_pos
  ),
  left_join,
  .init = mutate(t_scores, word = tolower(word))
)
# Pearson correlation of the per-word t with the raw hypernym score.
# cor.test() on two vectors already uses only complete pairs; `na.action` is
# not an argument of this method, so it has been dropped.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms
## t = -3.4454, df = 433, p-value = 0.0006259
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2534773 -0.0704111
## sample estimates:
## cor
## -0.16335
# Pearson correlation of the per-word t with the category-scaled hypernym
# score. cor.test() on two vectors already uses only complete pairs;
# `na.action` is not an argument of this method, so it has been dropped.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat
## t = 1.9346, df = 428, p-value = 0.0537
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.001471881 0.186034871
## sample estimates:
## cor
## 0.09310697
# Pearson correlation of the per-word t with the POS-scaled hypernym score.
# cor.test() on two vectors already uses only complete pairs; `na.action` is
# not an argument of this method, so it has been dropped.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos
## t = -1.5095, df = 433, p-value = 0.1319
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.16524631 0.02181863
## sample estimates:
## cor
## -0.07235008
# Controlling for centrality, log frequency, and concreteness (Conc.M):
# Per-word t regressed on hypernym score with centrality, log frequency and
# concreteness as covariates.
summary(
  lm(t ~ centrality + log_freq + hypernyms + Conc.M,
     data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ centrality + log_freq + hypernyms + Conc.M,
## data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.01844 -0.41125 -0.05125 0.37098 2.35790
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.904978 0.587541 1.540 0.1243
## centrality -1.529888 2.588257 -0.591 0.5548
## log_freq -0.011646 0.025230 -0.462 0.6446
## hypernyms -0.021155 0.009228 -2.292 0.0224 *
## Conc.M -0.066997 0.065052 -1.030 0.3037
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.59 on 412 degrees of freedom
## (3425 observations deleted due to missingness)
## Multiple R-squared: 0.031, Adjusted R-squared: 0.02159
## F-statistic: 3.295 on 4 and 412 DF, p-value: 0.01125
# Switch to the per-word coefficient file for the 900-1200 log MTLD difference
# model and rebuild the covariate table from it.
t_scores <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data/word_coeffs_log_mtld_diff_900_1200.csv"
)

# Lower-case the words, then left-join each covariate table in turn. No `by`
# is given, so every join matches on the shared column names.
word_coeffs_min5_t2_with_vars <- reduce(
  list(
    density_norms, freq, concepts, concreteness,
    hypernyms_scaled_cat, hypernyms_scaled_pos
  ),
  left_join,
  .init = mutate(t_scores, word = tolower(word))
)
# Pearson correlation of the per-word t with the raw hypernym score.
# cor.test() on two vectors already uses only complete pairs; `na.action` is
# not an argument of this method, so it has been dropped.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms
## t = -2.4841, df = 433, p-value = 0.01337
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2102140 -0.0247925
## sample estimates:
## cor
## -0.1185365
# Pearson correlation of the per-word t with the category-scaled hypernym
# score. cor.test() on two vectors already uses only complete pairs;
# `na.action` is not an argument of this method, so it has been dropped.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_cat
## t = 1.5697, df = 428, p-value = 0.1172
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01904598 0.16901286
## sample estimates:
## cor
## 0.07565617
# Pearson correlation of the per-word t with the POS-scaled hypernym score.
# cor.test() on two vectors already uses only complete pairs; `na.action` is
# not an argument of this method, so it has been dropped.
cor.test(word_coeffs_min5_t2_with_vars$t,
         word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms_scaled_pos
## t = -1.2377, df = 433, p-value = 0.2165
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.15254433 0.03483916
## sample estimates:
## cor
## -0.05937562
# Controlling for centrality, log frequency, and concreteness (Conc.M):
# Per-word t regressed on hypernym score with centrality, log frequency and
# concreteness as covariates.
summary(
  lm(t ~ centrality + log_freq + hypernyms + Conc.M,
     data = word_coeffs_min5_t2_with_vars)
)
##
## Call:
## lm(formula = t ~ centrality + log_freq + hypernyms + Conc.M,
## data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.31866 -0.45606 0.01513 0.40766 2.25769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.709200 0.616989 1.149 0.25104
## centrality -6.033367 2.717981 -2.220 0.02698 *
## log_freq 0.077090 0.026495 2.910 0.00381 **
## hypernyms -0.023504 0.009691 -2.425 0.01572 *
## Conc.M 0.070916 0.068313 1.038 0.29983
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6196 on 412 degrees of freedom
## (3425 observations deleted due to missingness)
## Multiple R-squared: 0.04766, Adjusted R-squared: 0.03842
## F-statistic: 5.155 on 4 and 412 DF, p-value: 0.0004624
# Reload the per-word coefficient file for the 600-900 log MTLD t2 model.
t_scores <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data/word_coeffs_log_mtld_t2_600_900.csv"
)

# Lower-case the words, then left-join the density norms and frequencies
# (no `by`, so the joins match on the shared column names).
word_coeffs_min5_t2_with_vars <- reduce(
  list(density_norms, freq),
  left_join,
  .init = mutate(t_scores, word = tolower(word))
)
all_types <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv"
)

# Minimum summed count for a gloss to be kept for a child.
MINWORDSFORVOCAB <- 5

# For rows in the "t1" bin: sum the counts per child and lower-cased gloss,
# and keep the pairs whose total reaches MINWORDSFORVOCAB.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)
# Look up the per-word t for each retained gloss, then take the sum and the
# mean of those t values per child. Glosses without a matching word contribute
# NA and are ignored by na.rm = TRUE.
t1_word_counts_with_ts <- word_counts %>%
  left_join(
    word_coeffs_min5_t2_with_vars %>% select(word, t),
    by = c("gloss_clean" = "word")
  ) %>%
  select(-gloss_clean, -count) %>%
  group_by(target_child_id) %>%
  summarize(
    sum_t_t1 = sum(t, na.rm = TRUE),
    mean_t_t1 = mean(t, na.rm = TRUE)
  )
# Join the per-child t summaries to all_df and regress log MTLD at t2 on the
# t1 hypernym score, the mean per-word t, mean frequency and the two age terms.
t1_word_counts_with_ts %>%
  left_join(all_df) %>%
  lm(
    log_mtld_t2 ~ mean_hypernym_t1 + mean_t_t1 + mean_freq_t1 +
      age_diff + age_t1,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = log_mtld_t2 ~ mean_hypernym_t1 + mean_t_t1 + mean_freq_t1 +
## age_diff + age_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63235 -0.15458 -0.01212 0.12091 0.70747
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.488e+00 1.996e+00 0.746 0.459
## mean_hypernym_t1 4.627e-02 5.132e-02 0.902 0.371
## mean_t_t1 6.854e-01 9.790e-02 7.001 1.61e-09 ***
## mean_freq_t1 6.338e-02 4.781e-02 1.326 0.190
## age_diff 2.385e-03 1.908e-03 1.250 0.216
## age_t1 1.252e-05 2.635e-03 0.005 0.996
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2852 on 66 degrees of freedom
## (18 observations deleted due to missingness)
## Multiple R-squared: 0.4866, Adjusted R-squared: 0.4477
## F-statistic: 12.51 on 5 and 66 DF, p-value: 1.494e-08
# Reload the per-word coefficient file for the 600-900 log MTLD difference
# model.
t_scores <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data/word_coeffs_log_mtld_diff_600_900.csv"
)

# Lower-case the words, then left-join the density norms and frequencies
# (no `by`, so the joins match on the shared column names).
word_coeffs_min5_t2_with_vars <- reduce(
  list(density_norms, freq),
  left_join,
  .init = mutate(t_scores, word = tolower(word))
)
all_types <- read_csv(
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv"
)

# Minimum summed count for a gloss to be kept for a child.
MINWORDSFORVOCAB <- 5

# For rows in the "t1" bin: sum the counts per child and lower-cased gloss,
# and keep the pairs whose total reaches MINWORDSFORVOCAB.
word_counts <- all_types %>%
  filter(tbin == "t1") %>%
  group_by(target_child_id, gloss_clean = tolower(gloss)) %>%
  summarize(count = sum(count)) %>%
  filter(count >= MINWORDSFORVOCAB)
# Look up the per-word t for each retained gloss, then take the sum and the
# mean of those t values per child. Glosses without a matching word contribute
# NA and are ignored by na.rm = TRUE.
t1_word_counts_with_ts <- word_counts %>%
  left_join(
    word_coeffs_min5_t2_with_vars %>% select(word, t),
    by = c("gloss_clean" = "word")
  ) %>%
  select(-gloss_clean, -count) %>%
  group_by(target_child_id) %>%
  summarize(
    sum_t_t1 = sum(t, na.rm = TRUE),
    mean_t_t1 = mean(t, na.rm = TRUE)
  )
# Join the per-child t summaries to all_df and regress log(mtld_diff) on the
# t1 hypernym score, the mean per-word t, mean frequency and the two age terms.
# NOTE(review): log() of a non-positive mtld_diff is NaN and lm() drops those
# rows. This model reports 26 deleted observations versus 18 for the matching
# log_mtld_t2 model — confirm the extra exclusions are intended.
t1_word_counts_with_ts %>%
  left_join(all_df) %>%
  lm(
    I(log(mtld_diff)) ~ mean_hypernym_t1 + mean_t_t1 + mean_freq_t1 +
      age_diff + age_t1,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = I(log(mtld_diff)) ~ mean_hypernym_t1 + mean_t_t1 +
## mean_freq_t1 + age_diff + age_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.41208 -0.28677 0.03776 0.29869 1.75229
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5942397 4.6261945 0.777 0.440
## mean_hypernym_t1 0.0371343 0.1187329 0.313 0.756
## mean_t_t1 1.3278402 0.2813734 4.719 1.54e-05 ***
## mean_freq_t1 0.1905339 0.1218443 1.564 0.123
## age_diff 0.0004601 0.0044322 0.104 0.918
## age_t1 -0.0069686 0.0061846 -1.127 0.264
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6436 on 58 degrees of freedom
## (26 observations deleted due to missingness)
## Multiple R-squared: 0.3467, Adjusted R-squared: 0.2904
## F-statistic: 6.156 on 5 and 58 DF, p-value: 0.0001219