# Document setup: chunk options, packages, and the default plot theme.
library(knitr)
# Echo code, hide messages and warnings, and do not cache or tidy chunks.
# error = FALSE makes knitting halt at the first error instead of printing it.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
theme_set(theme_classic(base_size = 10))

# Read in data
# By-kid input tables.
# NOTE(review): the absolute paths below only resolve on the original author's
# machine; the last two reads are relative to the working directory.
freq_by_kid <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/frequency_based_on_input_by_kid.csv")
pos_by_kid <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/prop_pos_by_kid_t1.csv")
hyp_by_kid <- read_csv("data/hypernym_by_kid_childes.csv")

# Keep the first seven columns (selected by position, so this depends on the
# column order of the CSV) and add log(x + 1) versions of MTLD at t1 and t2.
groups_info <- read_csv("../1_mtld_measure/data/groups_info_600_900_corrected.csv") %>%
  select(1:7) %>%
  mutate(
    log_mtld_t1 = log(mtld_t1 + 1),
    log_mtld_t2 = log(mtld_t2 + 1)
  )

# Combine everything into one table. No `by` is given, so each left_join
# matches on whatever column names the two tables have in common.
all_df <- groups_info %>%
  left_join(hyp_by_kid) %>%
  left_join(pos_by_kid %>% select(target_child_id, prop_noun_t1)) %>%
  left_join(freq_by_kid)

# MTLD at t2 as a function of mean hypernym at t1
# Scatterplot of mtld_t2 against mean_hypernym_t1 with a linear fit.
ggplot(all_df, aes(x = mean_hypernym_t1, y = mtld_t2)) +
  geom_point() +
  geom_smooth(method = "lm")

# Controlling for stuff, there’s no relationship between a child's mean
# hypernym score at t1 and mtld at t2 (this is true even if you exclude the
# outlier). If anything, it looks like kids who have higher hypernym score at
# t1 have greater mtld at t2.
# Linear model of log_mtld_t2 on mean_hypernym_t1, with age_t1, age_diff and
# log_mtld_t1 as covariates; print the model summary.
all_df %>%
  # Optional restriction, left disabled:
  # filter(mean_hypernym_t1 > 5) %>%
  lm(
    formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff + log_mtld_t1,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff +
## log_mtld_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52243 -0.17096 0.01841 0.14875 0.84391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.235161 1.672016 1.337 0.186
## mean_hypernym_t1 0.004260 0.014721 0.289 0.773
## age_t1 -0.001834 0.002177 -0.842 0.402
## age_diff 0.001106 0.001545 0.716 0.477
## log_mtld_t1 0.637121 0.061394 10.378 7.09e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2458 on 71 degrees of freedom
## (25 observations deleted due to missingness)
## Multiple R-squared: 0.6078, Adjusted R-squared: 0.5857
## F-statistic: 27.5 on 4 and 71 DF, p-value: 8.403e-14
# Linear model of log_mtld_t2 on mean_hypernym_t1, adding prop_noun_t1 and
# mean_freq_t1 to the age and log_mtld_t1 covariates; print the model summary.
all_df %>%
  # Optional restriction, left disabled:
  # filter(mean_hypernym_t1 > 5) %>%
  lm(
    formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff +
      log_mtld_t1 + prop_noun_t1 + mean_freq_t1,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = log_mtld_t2 ~ mean_hypernym_t1 + age_t1 + age_diff +
## log_mtld_t1 + prop_noun_t1 + mean_freq_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59282 -0.13564 0.01264 0.15027 0.67725
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7377927 1.6712605 0.441 0.66026
## mean_hypernym_t1 0.0138693 0.0149448 0.928 0.35662
## age_t1 -0.0004198 0.0021401 -0.196 0.84506
## age_diff 0.0019734 0.0015192 1.299 0.19828
## log_mtld_t1 0.6629636 0.0614043 10.797 < 2e-16 ***
## prop_noun_t1 0.5860208 0.2082378 2.814 0.00637 **
## mean_freq_t1 0.0069765 0.0455611 0.153 0.87875
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2335 on 69 degrees of freedom
## (25 observations deleted due to missingness)
## Multiple R-squared: 0.6561, Adjusted R-squared: 0.6262
## F-statistic: 21.94 on 6 and 69 DF, p-value: 2.925e-14
# Linear model with mtld_diff as the outcome, predicted by mean_hypernym_t1
# plus age_t1, age_diff and log_mtld_t1; print the model summary.
all_df %>%
  # Optional restriction, left disabled:
  # filter(mean_hypernym_t1 > 5) %>%
  lm(
    formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff + log_mtld_t1,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff +
## log_mtld_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.6594 -2.6528 -0.6694 2.2970 17.9063
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.05902 34.09451 1.586 0.117
## mean_hypernym_t1 -0.05920 0.30018 -0.197 0.844
## age_t1 -0.04747 0.04439 -1.069 0.289
## age_diff -0.01519 0.03151 -0.482 0.631
## log_mtld_t1 -6.51632 1.25191 -5.205 1.8e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.012 on 71 degrees of freedom
## (25 observations deleted due to missingness)
## Multiple R-squared: 0.3695, Adjusted R-squared: 0.3339
## F-statistic: 10.4 on 4 and 71 DF, p-value: 1.096e-06
# Linear model with mtld_diff as the outcome, adding prop_noun_t1 and
# mean_freq_t1 to the age and log_mtld_t1 covariates; print the model summary.
all_df %>%
  # Optional restriction, left disabled:
  # filter(mean_hypernym_t1 > 5) %>%
  lm(
    formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff +
      log_mtld_t1 + prop_noun_t1 + mean_freq_t1,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = mtld_diff ~ mean_hypernym_t1 + age_t1 + age_diff +
## log_mtld_t1 + prop_noun_t1 + mean_freq_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.7559 -2.2066 -0.2581 2.3597 15.5441
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.994717 34.850098 0.775 0.4412
## mean_hypernym_t1 0.164545 0.311638 0.528 0.5992
## age_t1 -0.030238 0.044627 -0.678 0.5003
## age_diff -0.006011 0.031680 -0.190 0.8501
## log_mtld_t1 -6.395506 1.280438 -4.995 4.26e-06 ***
## prop_noun_t1 10.781386 4.342296 2.483 0.0155 *
## mean_freq_t1 0.869412 0.950066 0.915 0.3633
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.869 on 69 degrees of freedom
## (25 observations deleted due to missingness)
## Multiple R-squared: 0.4218, Adjusted R-squared: 0.3715
## F-statistic: 8.39 on 6 and 69 DF, p-value: 7.7e-07
# Hypernyms as a predictor of word-level t:
# Word-level table: one t value per word, joined with several word norms.
# NOTE(review): the absolute paths below only resolve on the original author's
# machine.
t_scores <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data/word_coeffs_log_mtld_t2_600_900.csv")

# Hypernym values per word, keeping only rows with a positive value.
hypernyms <- read_csv("data/childes_hypernyms.csv") %>%
  filter(hypernyms > 0) %>%
  select(word, hypernyms)

freq <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/childes_adult_word_freq.csv")

density_norms <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/bills_density_norms.csv")

# Concreteness: keep Conc.M and rename `Word` to `word` so it joins below.
concreteness <- read_csv("/Users/mollylewis/Documents/research/Projects/2_published/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv") %>%
  select(word = Word, Conc.M)

# Concept norms: lower-case each concept name and keep only the text before
# the first underscore, then retain Mean_Distinct_No_Tax.
# NOTE(review): if two concept names share the same text before the
# underscore, `word` is duplicated here and the left_join below would repeat
# the matching rows -- confirm against the data.
concepts <- read_tsv("/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/CONCS_brm.txt") %>%
  select(Concept, Familiarity, Length_Syllables, Bigram, 14:33) %>%
  mutate(word = str_extract(tolower(Concept), "^[^_]*")) %>%
  select(word, Mean_Distinct_No_Tax)

# Lower-case the words in t_scores, then attach each norm table in turn. No
# `by` is given, so each join matches on the column names the tables share.
word_coeffs_min5_t2_with_vars <- t_scores %>%
  mutate(word = tolower(word)) %>%
  left_join(density_norms) %>%
  left_join(freq) %>%
  left_join(concepts) %>%
  left_join(concreteness) %>%
  left_join(hypernyms)
# Scatterplot of t against hypernyms with a linear fit.
word_coeffs_min5_t2_with_vars %>%
  ggplot(aes(x = hypernyms, y = t)) +
  geom_point() +
  geom_smooth(method = "lm")

# Pearson correlation between t and hypernyms. The default cor.test() method
# has no `na.action` argument and already restricts itself to complete pairs,
# so the previous `na.action = "use.complete"` was silently ignored; it is
# dropped here without changing the result.
cor.test(
  word_coeffs_min5_t2_with_vars$t,
  word_coeffs_min5_t2_with_vars$hypernyms
)
##
## Pearson's product-moment correlation
##
## data: word_coeffs_min5_t2_with_vars$t and word_coeffs_min5_t2_with_vars$hypernyms
## t = -3.7616, df = 856, p-value = 0.0001803
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.19280380 -0.06111263
## sample estimates:
## cor
## -0.1275202
# Controlling for centrality, log frequency, and concreteness:
# Linear model of t on hypernyms, with centrality, log_freq and Conc.M as
# covariates; print the model summary.
summary(
  lm(
    formula = t ~ centrality + log_freq + hypernyms + Conc.M,
    data = word_coeffs_min5_t2_with_vars
  )
)
##
## Call:
## lm(formula = t ~ centrality + log_freq + hypernyms + Conc.M,
## data = word_coeffs_min5_t2_with_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88110 -0.42339 0.03599 0.43863 1.64878
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.68123 0.38617 -1.764 0.07817 .
## centrality 1.02839 1.93721 0.531 0.59569
## log_freq 0.10775 0.01532 7.033 4.88e-12 ***
## hypernyms -0.02725 0.00853 -3.194 0.00147 **
## Conc.M 0.10902 0.04432 2.460 0.01413 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5846 on 691 degrees of freedom
## (1521 observations deleted due to missingness)
## Multiple R-squared: 0.08511, Adjusted R-squared: 0.07981
## F-statistic: 16.07 on 4 and 691 DF, p-value: 1.369e-12