# ---- Child-level inputs ----
SES_data <- read_csv("SES_data.csv")

# Wide task data. Column names are overwritten positionally, so the order of
# this vector must match the column order of combined_data_wide.csv.
task_data <- read_csv("combined_data_wide.csv")
names(task_data) <- c(
  "subjCode",
  "childAge_days_t1", "childAge_days_t2",
  "childAge_months_t1", "childAge_months_t2",
  "childAge_years_t1", "childAge_years_t2",
  "subjCode_lab_t2", "ASB5", "Color",
  "ppvt_overlapping_survey", "Shape", "WJ", "WJ_scaled",
  "ppvt_first_set", "ppvt_n_correct", "ppvt_n_attempted",
  "ppvt_standard_score", "ppvt_percentile", "ppvt_prop_correct",
  "total_understands_control_t1", "total_understands_control_t2",
  "total_understands_seed_t1", "total_understands_seed_t2",
  "total_says_control_t1", "total_says_control_t2",
  "total_says_seed_t1", "total_says_seed_t2"
)

non_vocab_survey_data <- read_csv("non_vocab_survey_data.csv")

# Attach SES and non-vocabulary survey answers to every task row.
current_task_data <- left_join(task_data, SES_data, by = "subjCode") %>%
  left_join(non_vocab_survey_data, by = "subjCode")

# ---- Word-level inputs ----
# Survey vocabulary responses; rows without a subject code are discarded.
vocab_survey_data <- filter(
  read_csv("vocab_data_by_word.csv"),
  !is.na(subjCode)
)
vocab_tasks_byword <- read_csv("vocab_task_data_by_word.csv")
# Per-word mean of parents' "importance to teach" ratings.
# The rating labels are mapped to 1-5 by their position in the ordered label
# vector; any other label becomes NA (and makes that word's mean NA).
word_importance <- vocab_survey_data %>%
  select(-seedword) %>%
  distinct() %>%
  filter(!is.na(importance_to_teach)) %>%
  mutate(
    importance_num = as.numeric(match(
      importance_to_teach,
      c(
        "Not at all important",
        "Slightly important",
        "Moderately important",
        "Very important",
        "Extremely important"
      )
    ))
  ) %>%
  group_by(word) %>%
  summarize(
    mean_parent_importance = mean(importance_num),
    n_importance_ratings = n()
  )
# ---- Word norms ----

# Helpfulness ratings, averaged so that each word has a single value.
helpfulness <- read_csv("../word_norms/helpfulness_ratings.csv") %>%
  group_by(word) %>%
  summarise(mean_helpfulness = mean(resp_mean)) %>%
  ungroup()

# "Babiness" / "preschoolness" ratings, renamed to the mean_* convention.
babiness_preschoolness <- read_csv("../word_norms/babiness_ratings.csv") %>%
  select(
    word,
    mean_babiness = babiness_mean,
    mean_preschoolness = preschoolness_mean
  )

defs_synsets <- read_csv("../word_norms/all_words_synsets_defs.csv")

# Words that previously lacked generality ratings; the old (empty) rating
# columns are dropped so the new ratings can be joined in below.
no_generality <- read_csv("../word_norms/old_files/no_generality.csv") %>%
  select(-mean_generality, -num_item_id)

generality_cdi_childes <- read_csv("../word_norms/generality_ratings_byWord_cdi_childes.csv")

# Newer generality ratings: `word2` becomes the join key `word`, and the
# spelled-out part of speech is abbreviated ("noun" -> "n", "verb" -> "v";
# anything else becomes NA).
generality_otherwords <- read_csv("../word_norms/new_generality_ratings.csv") %>%
  rename(pos_fullword = pos, word_descriptive = word, word = word2) %>%
  mutate(
    pos = case_when(
      pos_fullword == "noun" ~ "n",
      pos_fullword == "verb" ~ "v"
    )
  )

# Fill in a missing part of speech from no_generality, then keep one row per
# distinct word/pos/rating combination.
generality_otherwords_getpos <- generality_otherwords %>%
  left_join(no_generality, by = "word") %>%
  mutate(pos_resolved = coalesce(pos.x, pos.y)) %>%
  select(
    word,
    pos = pos_resolved,
    mean_generality,
    n_generality_ratings = n_gen_ratings
  ) %>%
  distinct()

# Concreteness norms (read as tab-separated despite the .csv extension).
concreteness <- read_tsv("../word_norms/concreteness_brysbaert.csv") %>%
  select(word = Word, concreteness = Conc.M)
# Master word-level table: lexical properties joined with every norm set.
# (seedword used to be selected here but is no longer needed - 351 obs.)
word_info <- read_csv("../word_norms/word_info_with_cdi_ids.csv") %>%
  select(
    num_item_id, word, type, pos,
    num_hypernyms, num_hyponyms, aoa, adult_log_freq
  ) %>%
  distinct() %>%
  left_join(defs_synsets, by = c("word", "pos")) %>%
  left_join(concreteness, by = "word") %>%
  left_join(helpfulness, by = "word") %>%
  left_join(babiness_preschoolness, by = "word") %>%
  left_join(word_importance, by = "word") %>%
  # Joined by item id, so both tables contribute a `word` column; keep ours.
  left_join(generality_cdi_childes, by = "num_item_id") %>%
  select(-word.y, word = word.x) %>%
  # Second generality source; prefer the CDI/CHILDES values where both exist.
  left_join(generality_otherwords_getpos, by = c("word", "pos")) %>%
  mutate(
    n_generality_ratings = coalesce(n_generality_ratings.x, n_generality_ratings.y),
    mean_generality = coalesce(mean_generality.x, mean_generality.y)
  ) %>%
  select(
    -n_generality_ratings.x, -n_generality_ratings.y,
    -mean_generality.x, -mean_generality.y
  ) %>%
  # Standardise the semantic measures within each part of speech.
  group_by(pos) %>%
  mutate(
    pos_scale_hypernyms = scale(num_hypernyms),
    pos_scale_log_hyponyms = scale(log(1 + num_hyponyms)),
    pos_scale_synsets = scale(n_synsets),
    pos_scale_defs = scale(n_definitions),
    pos_scale_generality = scale(mean_generality)
  ) %>%
  ungroup()
# Per-child, per-timepoint averages of word properties over the words the
# child is reported to say (says == 1), reshaped to one row per subjCode with
# one column per measure x timepoint (suffix "_t<timepoint>").
#
# NOTE(review): aoa, num_hypernyms and pos_scale_hypernyms are not among the
# columns selected from word_info, so they presumably come from
# vocab_survey_data - confirm.
child_mean_word_info <- word_info %>%
  select(word, mean_helpfulness, mean_generality, pos_scale_generality, adult_log_freq) %>%
  left_join(vocab_survey_data, by = "word") %>%
  select(subjCode, timepoint, childAge_days, word, understands, says, everything()) %>%
  distinct() %>%
  filter(!is.na(subjCode) & says == 1) %>%
  group_by(subjCode, timepoint) %>%
  summarize(
    mean_helpfulness_of_vocab = mean(mean_helpfulness, na.rm = TRUE),
    mean_aoa_of_vocab = mean(aoa, na.rm = TRUE),
    mean_freq_of_vocab = mean(adult_log_freq, na.rm = TRUE),
    mean_hypernymy_of_vocab_unscaled = mean(num_hypernyms, na.rm = TRUE),
    mean_hypernymy_of_vocab = mean(pos_scale_hypernyms, na.rm = TRUE),
    mean_generality_of_vocab_unscaled = mean(mean_generality, na.rm = TRUE),
    mean_generality_of_vocab = mean(pos_scale_generality, na.rm = TRUE),
    # Drop the grouping explicitly so the result is a plain tibble instead of
    # staying grouped by subjCode.
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = timepoint,
    names_prefix = "t",
    values_from = mean_helpfulness_of_vocab:mean_generality_of_vocab
  )
# Same per-child averages as child_mean_word_info, but computed separately for
# words the child does not say (says == 0, "unknown") and all other words
# ("known"); a missing `says` yields an NA category. Output is one row per
# subjCode with one column per measure x timepoint x known/unknown.
#
# NOTE(review): aoa, num_hypernyms and pos_scale_hypernyms are not among the
# columns selected from word_info, so they presumably come from
# vocab_survey_data - confirm.
child_mean_word_info_unknown_known <- word_info %>%
  select(word, mean_helpfulness, mean_generality, pos_scale_generality, adult_log_freq) %>%
  left_join(vocab_survey_data, by = "word") %>%
  select(subjCode, timepoint, childAge_days, word, understands, says, everything()) %>%
  distinct() %>%
  filter(!is.na(subjCode)) %>%
  mutate(says_character = ifelse(says == 0, "unknown", "known")) %>%
  group_by(subjCode, timepoint, says_character) %>%
  summarize(
    mean_helpfulness_of_vocab = mean(mean_helpfulness, na.rm = TRUE),
    mean_aoa_of_vocab = mean(aoa, na.rm = TRUE),
    mean_freq_of_vocab = mean(adult_log_freq, na.rm = TRUE),
    mean_hypernymy_of_vocab_unscaled = mean(num_hypernyms, na.rm = TRUE),
    mean_hypernymy_of_vocab = mean(pos_scale_hypernyms, na.rm = TRUE),
    mean_generality_of_vocab_unscaled = mean(mean_generality, na.rm = TRUE),
    mean_generality_of_vocab = mean(pos_scale_generality, na.rm = TRUE),
    # Drop the grouping explicitly so the result is a plain tibble instead of
    # staying grouped by subjCode and timepoint.
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = c("timepoint", "says_character"),
    names_prefix = "t",
    values_from = mean_helpfulness_of_vocab:mean_generality_of_vocab
  )
Summary of main findings: hypernymy matters. The mean hypernymy of a child’s vocabulary at T1 predicts ASB5 performance at T2 (controlling for age and T1 vocab size). When we instead look at individual word knowledge as a predictor of ASB5 performance, a word’s hypernymy predicts the t-value for that word: p = .049 when controlling for word frequency (in the set of words that have AoA norms), and p = .055 when AoA is also added to the model.
Unresolved potential issues:
- In mean vocabulary models, VIF is high – total vocab, mean_aoa, and mean_frequency are all highly intercorrelated (which makes sense). How to address?
- These models don’t replicate if we look at T2 vocabulary characteristics instead (ASB5 ~ T2 age + T2 total vocab + T2 mean hypernymy, etc.). It doesn’t seem like much is going on if we compare the predictiveness of knowing a word at T1 vs. only knowing it at T2 (see very last analysis) so I don’t think it’s about how long you’ve known the word. Also doesn’t seem to be due to potential outliers. Power issue?
The goal of these analyses is to investigate how 2-4yo’s vocabulary knowledge might predict performance on cognitive tasks. We investigate vocabulary at the composite level (e.g., how many total seed/control words) as well as individual word knowledge to identify which specific words might be most helpful for kids to know.
t1: first timepoint, when parents did online survey (summer 2019)
t2: second timepoint, when parents brought kids into lab (fall 2019/winter 2020)
ASB5: number of items child got correct on our adapted SB-5 – max of 12
mean_aoa_of_vocab: mean age of acquisition of all the words child is reported to know (from Kuperman norms)
mean_freq_of_vocab: mean frequency of all the words child is reported to know (from adult speech in CHILDES, log-transformed)
mean_hypernymy_of_vocab (scaled by pos): mean number of hypernyms of all the words child is reported to know (from Wordnet)
Here we regress age out of ASB5 and then relate the residual to other characteristics of kids’ vocabularies.
ASB5.age: ASB5 performance, controlling for age
# Age-only model of ASB5. na.exclude makes resid() return one value per input
# row (NA where ASB5 or age is missing), so the residuals can be attached to
# the data below without a length mismatch or silent misalignment.
age_sb5 <- lm(ASB5 ~ childAge_days_t2, data = current_task_data, na.action = na.exclude)
#age_wj <- lm(WJ ~ childAge_days_t2, data=current_task_data_forcorr)

# Child-level analysis table: task data + per-child vocabulary averages,
# age-residualised ASB5, and total productive vocabulary at each timepoint.
resid_task_data <- current_task_data %>%
  left_join(child_mean_word_info, by = "subjCode") %>%
  mutate(
    ASB5.age = resid(age_sb5),
    total_says_t1 = total_says_seed_t1 + total_says_control_t1,
    total_says_t2 = total_says_seed_t2 + total_says_control_t2
  ) %>%
  # The second survey is coded as timepoint 3 in the vocabulary data; relabel
  # it as t2 to match the task data.
  # NOTE(review): the *_unscaled_t3 and mean_generality_of_vocab_t3 columns are
  # not renamed here - confirm whether that is intentional.
  rename(
    mean_helpfulness_of_vocab_t2 = mean_helpfulness_of_vocab_t3,
    mean_aoa_of_vocab_t2 = mean_aoa_of_vocab_t3,
    mean_freq_of_vocab_t2 = mean_freq_of_vocab_t3,
    mean_hypernymy_of_vocab_t2 = mean_hypernymy_of_vocab_t3
  )
# Variables entering the correlation plot.
resid_task_data_forcorr <- select(
  resid_task_data,
  childAge_days_t2, ASB5.age, total_says_t1, total_says_t2,
  mean_aoa_of_vocab_t1, mean_freq_of_vocab_t1, mean_hypernymy_of_vocab_t1,
  mean_aoa_of_vocab_t2, mean_freq_of_vocab_t2, mean_hypernymy_of_vocab_t2
)

# Pairwise correlations and their p-values.
resid_data_corrmat <- cor(resid_task_data_forcorr, use = "pairwise.complete.obs")
resid_pmat <- cor.mtest(resid_task_data_forcorr)
resid_sig_matrix <- resid_pmat$p

# Lower-triangle heat map; cells flagged as insignificant are left blank.
corrplot(
  corr = resid_data_corrmat,
  method = "color",
  type = "lower",
  diag = FALSE,
  addCoef.col = "black",
  p.mat = resid_sig_matrix,
  insig = "blank",
  tl.srt = 45,
  tl.col = "black",
  tl.cex = 1.1,
  number.cex = .9
)
Note: Issues with collinearity among predictors. Total vocab, mean_aoa, and mean_freq highly intercorrelated. The model that contains age, T1 vocab size, and mean T1 vocab hypernymy has the highest adjusted R-squared.
# all_t1_characteristics <- lm(ASB5 ~ childAge_days_t2 + total_says_t1 + mean_helpfulness_of_vocab_t1 + mean_aoa_of_vocab_t1 +
# mean_freq_of_vocab_t1 + mean_hypernymy_of_vocab_t1, data = resid_task_data)

# ASB5 predicted from age, T1 vocabulary size and one (or two) T1 vocabulary
# characteristics.
t1voc_hyper <- lm(
  ASB5 ~ childAge_days_t2 + total_says_t1 + mean_hypernymy_of_vocab_t1,
  data = resid_task_data
)
t1voc_aoa <- lm(
  ASB5 ~ childAge_days_t2 + total_says_t1 + mean_aoa_of_vocab_t1,
  data = resid_task_data
)
t1voc_freq <- lm(
  ASB5 ~ childAge_days_t2 + total_says_t1 + mean_freq_of_vocab_t1,
  data = resid_task_data
)
t1voc_aoa_hyper <- lm(
  ASB5 ~ childAge_days_t2 + total_says_t1 + mean_hypernymy_of_vocab_t1 + mean_aoa_of_vocab_t1,
  data = resid_task_data
)
t1voc_freq_hyper <- lm(
  ASB5 ~ childAge_days_t2 + total_says_t1 + mean_hypernymy_of_vocab_t1 + mean_freq_of_vocab_t1,
  data = resid_task_data
)

# Reverse direction: hypernymy as the outcome (not included in the table below).
ASB5_t1voc_hyper <- lm(
  mean_hypernymy_of_vocab_t1 ~ ASB5 + childAge_days_t2 + total_says_t1,
  data = resid_task_data
)

tab_model(t1voc_aoa, t1voc_freq, t1voc_hyper, t1voc_aoa_hyper, t1voc_freq_hyper)
| ASB 5 | ASB 5 | ASB 5 | ASB 5 | ASB 5 | |||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p |
| (Intercept) | 15.49 | -13.56 – 44.53 | 0.285 | -55.15 | -101.67 – -8.62 | 0.022 | -1.80 | -4.84 – 1.25 | 0.238 | -10.74 | -42.69 – 21.22 | 0.498 | -23.06 | -76.59 – 30.46 | 0.386 |
| childAge_days_t2 | 0.01 | 0.00 – 0.01 | <0.001 | 0.01 | 0.00 – 0.01 | <0.001 | 0.00 | 0.00 – 0.01 | <0.001 | 0.00 | 0.00 – 0.01 | 0.001 | 0.00 | 0.00 – 0.01 | 0.001 |
| total_says_t1 | 0.02 | -0.02 – 0.06 | 0.250 | 0.04 | 0.00 – 0.07 | 0.034 | 0.00 | -0.01 – 0.02 | 0.449 | -0.01 | -0.05 – 0.04 | 0.749 | 0.02 | -0.02 – 0.06 | 0.318 |
| mean_aoa_of_vocab_t1 | -4.29 | -11.47 – 2.88 | 0.232 | 2.22 | -5.69 – 10.13 | 0.571 | |||||||||
| mean_freq_of_vocab_t1 | 7.11 | 0.93 – 13.29 | 0.026 | 2.83 | -4.29 – 9.95 | 0.423 | |||||||||
| mean_hypernymy_of_vocab_t1 | -18.57 | -30.39 – -6.74 | 0.003 | -20.94 | -35.58 – -6.30 | 0.007 | -15.31 | -29.76 – -0.85 | 0.039 | ||||||
| Observations | 36 | 36 | 36 | 36 | 36 | ||||||||||
| R2 / R2 adjusted | 0.551 / 0.509 | 0.599 / 0.562 | 0.644 / 0.611 | 0.648 / 0.603 | 0.652 / 0.607 | ||||||||||
# ASB5 with age and T1 vocabulary size partialled out. na.exclude + resid()
# keeps one residual per row of resid_task_data (NA for rows the model could
# not use), so the new column always lines up with the right child.
t1_age_voc <- lm(
  ASB5 ~ childAge_days_t2 + total_says_t1,
  data = resid_task_data,
  na.action = na.exclude
)
residual_plots <- resid_task_data %>% mutate(ASB5.age.voc = resid(t1_age_voc))

# Correlation between the residualised ASB5 score and mean T1 hypernymy.
cor.test(residual_plots$ASB5.age.voc, residual_plots$mean_hypernymy_of_vocab_t1)
##
## Pearson's product-moment correlation
##
## data: residual_plots$ASB5.age.voc and residual_plots$mean_hypernymy_of_vocab_t1
## t = -2.9794, df = 34, p-value = 0.005299
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.6816495 -0.1487060
## sample estimates:
## cor
## -0.455012
# Residualised ASB5 against mean T1 hypernymy, with a linear fit.
# (y-axis label: closing parenthesis added.)
ggplot(residual_plots, aes(x = mean_hypernymy_of_vocab_t1, y = ASB5.age.voc)) +
  geom_point(size = 3) +
  geom_smooth(method = lm) +
  theme_classic() +
  #geom_label()+
  labs(
    x = "Mean hypernymy of vocabulary at T1",
    y = "Induction task performance (controlling for age + vocab size)"
  ) +
  scale_x_continuous(breaks = c(-5, -2.5, 0, 2.5, 5), labels = c(-5, -2.5, 0, 2.5, 5)) +
  theme(text = element_text(size = 18))
## `geom_smooth()` using formula 'y ~ x'
# Residualised ASB5 against mean T1 word frequency, with a linear fit.
# (y-axis label: closing parenthesis added.)
ggplot(residual_plots, aes(x = mean_freq_of_vocab_t1, y = ASB5.age.voc)) +
  geom_point(size = 3) +
  geom_smooth(method = lm) +
  theme_classic() +
  #geom_label()+
  labs(
    x = "Mean frequency of vocabulary at T1",
    y = "Induction task performance (controlling for age + vocab size)"
  ) +
  #scale_x_continuous(breaks=c(-5, -2.5, 0, 2.5, 5), labels=c(-5, -2.5, 0, 2.5, 5))+
  theme(text = element_text(size = 18))
## `geom_smooth()` using formula 'y ~ x'
# Residualised ASB5 against mean T1 age of acquisition, with a linear fit.
# (y-axis label: closing parenthesis added.)
ggplot(residual_plots, aes(x = mean_aoa_of_vocab_t1, y = ASB5.age.voc)) +
  geom_point(size = 3) +
  geom_smooth(method = lm) +
  theme_classic() +
  #geom_label()+
  labs(
    x = "Mean AoA of vocabulary at T1",
    y = "Induction task performance (controlling for age + vocab size)"
  ) +
  #scale_x_continuous(breaks=c(-5, -2.5, 0, 2.5, 5), labels=c(-5, -2.5, 0, 2.5, 5))+
  theme(text = element_text(size = 18))
## `geom_smooth()` using formula 'y ~ x'
# Case diagnostics for t1voc_hyper. Each call labels points with the child's
# subjCode; the plots are meant to be inspected interactively.
# Leverage (hat values):
xHats <- modelCaseAnalysis(t1voc_hyper, Type = "HATVALUES", ID=as.character(resid_task_data$subjCode))
# need to click on cases to investigate when running this
# there's one extreme case - subjCode 217
# Studentized residuals:
xResids <- modelCaseAnalysis(t1voc_hyper, Type="RESIDUALS",ID=as.character(resid_task_data$subjCode))
# studentized residuals look okay (barely) - possible extreme case is subjCode 240
# Cook's distance:
xCooks <- modelCaseAnalysis(t1voc_hyper, Type="COOKSD",ID=as.character(resid_task_data$subjCode))
# cook's distance looks okay - no observations beyond rule-of-thumb threshold
# influence plot
xInfs <- modelCaseAnalysis(t1voc_hyper, Type="INFLUENCEPLOT",ID=as.character(resid_task_data$subjCode))
# subjCode 217 is having a lot of influence and might be an outlier. what happens if we remove it?
# NOTE(review): the case is removed by the hard-coded value 25, not by subjCode;
# presumably row 25 is subjCode 217 - confirm, since this breaks silently if the
# row order of resid_task_data changes.
d2 <- dfRemoveCases(resid_task_data, 25)
# Same model as t1voc_hyper, refit on the reduced data.
t1voc_hyper_2 <- lm(ASB5 ~ childAge_days_t2 + total_says_t1 + mean_hypernymy_of_vocab_t1, data = d2)
# Full-sample fit, printed for comparison with the refit.
summary(t1voc_hyper)
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t2 + total_says_t1 + mean_hypernymy_of_vocab_t1,
## data = resid_task_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5029 -0.6391 0.0266 0.8475 2.4544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.798650 1.494708 -1.203 0.237670
## childAge_days_t2 0.004911 0.001268 3.873 0.000499 ***
## total_says_t1 0.004585 0.005976 0.767 0.448566
## mean_hypernymy_of_vocab_t1 -18.568876 5.805520 -3.198 0.003109 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.646 on 32 degrees of freedom
## Multiple R-squared: 0.6443, Adjusted R-squared: 0.6109
## F-statistic: 19.32 on 3 and 32 DF, p-value: 2.475e-07
# Fit on d2 (one case removed), to compare against the full-sample fit of t1voc_hyper.
summary(t1voc_hyper_2)
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t2 + total_says_t1 + mean_hypernymy_of_vocab_t1,
## data = d2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4989 -0.6802 0.0042 0.9200 2.4826
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.571240 1.702260 -0.923 0.363120
## childAge_days_t2 0.005004 0.001325 3.778 0.000675 ***
## total_says_t1 0.003417 0.007250 0.471 0.640735
## mean_hypernymy_of_vocab_t1 -17.365579 7.171754 -2.421 0.021506 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.67 on 31 degrees of freedom
## Multiple R-squared: 0.5737, Adjusted R-squared: 0.5325
## F-statistic: 13.91 on 3 and 31 DF, p-value: 6.412e-06
# R-squared goes down when 217 is excluded, but hypernymy still matters over and above total vocab
Note: As with the T1 models, there are issues with collinearity among predictors: total vocab, mean_aoa, and mean_freq are highly intercorrelated. When vocabulary characteristics are measured at T2, none of them (AoA, frequency, hypernymy) significantly predicts ASB5.
# all_t2_characteristics <- lm(ASB5 ~ childAge_days_t2 + total_says_t2 + mean_helpfulness_of_vocab_t2 + mean_aoa_of_vocab_t2 +
# mean_freq_of_vocab_t2 + mean_hypernymy_of_vocab_t2, data = resid_task_data)

# Same model family as the T1 analysis, using T2 vocabulary size and T2
# vocabulary characteristics as predictors.
t2voc_hyper <- lm(
  ASB5 ~ childAge_days_t2 + total_says_t2 + mean_hypernymy_of_vocab_t2,
  data = resid_task_data
)
t2voc_aoa <- lm(
  ASB5 ~ childAge_days_t2 + total_says_t2 + mean_aoa_of_vocab_t2,
  data = resid_task_data
)
t2voc_freq <- lm(
  ASB5 ~ childAge_days_t2 + total_says_t2 + mean_freq_of_vocab_t2,
  data = resid_task_data
)
t2voc_aoa_hyper <- lm(
  ASB5 ~ childAge_days_t2 + total_says_t2 + mean_hypernymy_of_vocab_t2 + mean_aoa_of_vocab_t2,
  data = resid_task_data
)
t2voc_freq_hyper <- lm(
  ASB5 ~ childAge_days_t2 + total_says_t2 + mean_hypernymy_of_vocab_t2 + mean_freq_of_vocab_t2,
  data = resid_task_data
)

tab_model(t2voc_aoa, t2voc_freq, t2voc_hyper, t2voc_aoa_hyper, t2voc_freq_hyper)
| ASB 5 | ASB 5 | ASB 5 | ASB 5 | ASB 5 | |||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p |
| (Intercept) | 3.27 | -30.91 – 37.45 | 0.847 | -40.80 | -110.95 – 29.34 | 0.245 | -4.07 | -8.74 – 0.61 | 0.086 | -5.91 | -41.99 – 30.16 | 0.740 | -30.53 | -102.32 – 41.25 | 0.392 |
| childAge_days_t2 | 0.01 | 0.00 – 0.01 | <0.001 | 0.01 | 0.00 – 0.01 | <0.001 | 0.01 | 0.00 – 0.01 | <0.001 | 0.01 | 0.00 – 0.01 | <0.001 | 0.01 | 0.00 – 0.01 | <0.001 |
| total_says_t2 | 0.01 | -0.04 – 0.05 | 0.704 | 0.03 | -0.02 – 0.08 | 0.255 | 0.01 | -0.01 – 0.02 | 0.453 | 0.00 | -0.04 – 0.05 | 0.878 | 0.02 | -0.03 – 0.07 | 0.348 |
| mean_aoa_of_vocab_t2 | -1.34 | -10.05 – 7.36 | 0.755 | 0.46 | -8.48 – 9.41 | 0.917 | |||||||||
| mean_freq_of_vocab_t2 | 5.13 | -4.12 – 14.38 | 0.267 | 3.54 | -6.03 – 13.11 | 0.457 | |||||||||
| mean_hypernymy_of_vocab_t2 | -11.21 | -26.45 – 4.03 | 0.144 | -11.45 | -27.62 – 4.73 | 0.159 | -9.57 | -25.56 – 6.43 | 0.232 | ||||||
| Observations | 36 | 36 | 36 | 36 | 36 | ||||||||||
| R2 / R2 adjusted | 0.533 / 0.489 | 0.549 / 0.507 | 0.562 / 0.521 | 0.562 / 0.506 | 0.570 / 0.515 | ||||||||||
Because the long-term goal is to identify specific words that could be helpful for cognitive/vocabulary development, shift analyses to look at the role of individual words
## put everything together into df
# Child-level columns used by the word-level models: ages, total
# comprehension/production at each timepoint, and task scores.
task_data_wide <- current_task_data %>%
  mutate(
    total_says_t1 = total_says_control_t1 + total_says_seed_t1,
    total_says_t2 = total_says_control_t2 + total_says_seed_t2,
    total_understands_t1 = total_understands_control_t1 + total_understands_seed_t1,
    total_understands_t2 = total_understands_control_t2 + total_understands_seed_t2
  ) %>%
  select(
    subjCode, childAge_days_t1, childAge_days_t2,
    total_understands_t1, total_says_t1,
    total_understands_t2, total_says_t2,
    ASB5, WJ, WJ_scaled,
    ppvt_prop_correct, ppvt_standard_score, ppvt_percentile
  )

##inner join to only include kids who've come into lab
# Long table of word responses combined with the child-level columns; the
# word column is renamed to `item`.
df_to_analyze <- vocab_survey_data %>%
  select(-seedword) %>%
  distinct() %>%
  filter(!is.na(word)) %>%
  inner_join(task_data_wide, by = "subjCode") %>%
  select(-importance_to_teach, -childAge_days) %>%
  rename(item = word)
###code from Molly
#params
# Per-word model formulas, stored as strings and converted with as.formula()
# where the models are fitted. `says` is the predictor of interest; the other
# two terms are the age and total-vocabulary covariates.
# NOTE(review): the T1 formula uses childAge_days_t1 as the age covariate,
# whereas the child-level models of ASB5 use childAge_days_t2 - confirm this is
# intended.
predict_asb5 <- "ASB5 ~ says + childAge_days_t1 + total_says_t1"
predict_asb5_fromt2 <- "ASB5 ~ says + childAge_days_t2 + total_says_t2"
# coefficient function
# Fit `mod_formula` on the rows of `df` for a single word at a single
# timepoint and return the coefficient-table row for the `says` term
# (estimate, SE, t, p), tagged with the word in an `item` column.
#   word        - value matched against df$item
#   mod_formula - model formula passed to lm()
#   time        - value matched against df$timepoint
#   df          - long data with (at least) `item`, `timepoint` and the model
#                 variables
# If the coefficient table has no `says` row, a zero-row data frame is returned.
get_word_beta <- function(word, mod_formula, time, df) {
  word_rows <- filter(df, item == word & timepoint == time)
  fit <- lm(mod_formula, word_rows)
  coef_table <- rownames_to_column(data.frame(summary(fit)$coefficients), "term")
  says_row <- filter(coef_table, term == "says")
  mutate(says_row, item = word)
}
#test case
#get_word_beta("squeak", as.formula(predict_t2_vocab), 3, df_to_analyze)
# Sum of `says` per word among timepoint-3 rows (the second survey).
t2_knowledge <- df_to_analyze %>%
  filter(timepoint == 3) %>%
  group_by(item) %>%
  summarise(n_say_t2 = sum(says))
## `summarise()` ungrouping output (override with `.groups` argument)
# Sum of `says` per word among timepoint-1 rows, with the timepoint-3 count
# attached alongside.
word_knowledge <- df_to_analyze %>%
  filter(timepoint == 1) %>%
  group_by(item) %>%
  summarise(n_say_t1 = sum(says)) %>%
  left_join(t2_knowledge, by = "item")
## `summarise()` ungrouping output (override with `.groups` argument)
DT::datatable(word_knowledge)

# One model per word: does saying the word at T1 (timepoint 1) predict ASB5,
# given the covariates in predict_asb5? Keeps the `says` coefficient for each
# word, sorted from largest to smallest t-value.
# as.character() replaces the original paste("", x, "", sep = ""), which only
# served to coerce the items to character.
word_coeffs_asb5_t1 <- map_df(
  as.character(unique(df_to_analyze$item)),
  get_word_beta,
  as.formula(predict_asb5),
  1,
  df_to_analyze
) %>%
  select(item, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
  arrange(desc(tval))

DT::datatable(word_coeffs_asb5_t1)
# One model per word using the second survey (timepoint 3) and the T2
# covariates in predict_asb5_fromt2. Keeps the `says` coefficient for each
# word, sorted from largest to smallest t-value.
# as.character() replaces the original paste("", x, "", sep = ""), which only
# served to coerce the items to character.
word_coeffs_asb5_t2 <- map_df(
  as.character(unique(df_to_analyze$item)),
  get_word_beta,
  as.formula(predict_asb5_fromt2),
  3,
  df_to_analyze
) %>%
  select(item, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
  arrange(desc(tval))

DT::datatable(word_coeffs_asb5_t2)
The t-values above indicate how strongly and how reliably knowledge of a specific word is associated with better performance on the ASB5 and WJ, controlling for age and total vocabulary knowledge. Now we want to test the hypothesis that lower hypernymy is a characteristic of these more helpful words (i.e., words with higher estimates). So, we look at the relation between each word’s hypernymy and its t-value.
It looks like there is a relationship here:
# Attach word-level properties to the T1 per-word coefficients.
sb5_coeffs_t1 <- left_join(
  rename(word_coeffs_asb5_t1, word = item),
  word_info,
  by = "word"
)

# Per-word t-value against POS-scaled hypernym count, with a linear fit.
ggplot(sb5_coeffs_t1, aes(x = pos_scale_hypernyms, y = tval, label = word)) +
  geom_point(size = 3) +
  geom_smooth(method = lm) +
  theme_classic() +
  #geom_label()+
  labs(x = "Hypernyms (scaled by pos)", y = "t-value of Coefficient Estimate") +
  scale_x_continuous(breaks = c(-5, -2.5, 0, 2.5, 5), labels = c(-5, -2.5, 0, 2.5, 5)) +
  theme(text = element_text(size = 25))
## `geom_smooth()` using formula 'y ~ x'
Does hypernymy predict t-value? Yes
#t-value of word knowledge at T1 (how well it predicts ASB5 at t2) is predicted by hypernymy, controlling for pos
# Keep only words with hypernymy, preschoolness and concreteness values.
sb5_coeffs_t1_alldata <- sb5_coeffs_t1 %>%
  filter(!is.na(pos_scale_hypernyms)) %>%
  filter(!is.na(mean_preschoolness)) %>%
  filter(!is.na(concreteness))

sb5_t1_mod1 <- lm(tval ~ pos_scale_hypernyms, data = sb5_coeffs_t1_alldata)
summary(sb5_t1_mod1)
##
## Call:
## lm(formula = tval ~ pos_scale_hypernyms, data = sb5_coeffs_t1_alldata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0252 -0.7974 -0.0464 0.7499 3.7109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.11113 0.06514 1.706 0.088978 .
## pos_scale_hypernyms -0.24966 0.06685 -3.735 0.000222 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.165 on 318 degrees of freedom
## Multiple R-squared: 0.04202, Adjusted R-squared: 0.03901
## F-statistic: 13.95 on 1 and 318 DF, p-value: 0.0002225
…but the effect weakens when you account for frequency & AoA. We want to account for these things because they are related to hypernymy and we’re interested in the effect of hypernymy alone.
# note: aoa and frequency are correlated at -.62, but VIF isn't too concerning, so okay to have them in same model
# Hypernymy effect on the T1 t-values with frequency and AoA as covariates.
sb5_t1_mod2 <- lm(
  tval ~ pos_scale_hypernyms + adult_log_freq + aoa,
  data = sb5_coeffs_t1_alldata
)
summary(sb5_t1_mod2)
##
## Call:
## lm(formula = tval ~ pos_scale_hypernyms + adult_log_freq + aoa,
## data = sb5_coeffs_t1_alldata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9747 -0.7371 -0.0541 0.7894 3.4922
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.57513 0.51602 -1.115 0.265929
## pos_scale_hypernyms -0.12981 0.06750 -1.923 0.055383 .
## adult_log_freq 0.16378 0.04891 3.349 0.000915 ***
## aoa -0.04774 0.04998 -0.955 0.340312
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.114 on 302 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.11, Adjusted R-squared: 0.1011
## F-statistic: 12.44 on 3 and 302 DF, p-value: 1.087e-07
# Frequency + hypernymy only, fitted on the words that have an AoA value so
# the model can be compared with sb5_t1_mod2 on the same observations.
sb5_t1_mod3 <- lm(
  tval ~ adult_log_freq + pos_scale_hypernyms,
  data = filter(sb5_coeffs_t1_alldata, !is.na(aoa))
)
summary(sb5_t1_mod3)
##
## Call:
## lm(formula = tval ~ adult_log_freq + pos_scale_hypernyms, data = filter(sb5_coeffs_t1_alldata,
## !is.na(aoa)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9150 -0.7693 -0.0453 0.8274 3.4289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.01150 0.23979 -4.218 3.26e-05 ***
## adult_log_freq 0.19161 0.03927 4.879 1.72e-06 ***
## pos_scale_hypernyms -0.13295 0.06741 -1.972 0.0495 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.114 on 303 degrees of freedom
## Multiple R-squared: 0.1073, Adjusted R-squared: 0.1014
## F-statistic: 18.2 on 2 and 303 DF, p-value: 3.424e-08
# Single-predictor models of the T1 t-values (frequency, concreteness,
# hypernymy).
freq.resid <- lm(
  tval ~ adult_log_freq,
  sb5_coeffs_t1_alldata
)
conc.resid <- lm(
  tval ~ concreteness,
  sb5_coeffs_t1_alldata
)
hyper.resid <- lm(
  tval ~ pos_scale_hypernyms,
  sb5_coeffs_t1_alldata
)

# Does adding AoA improve on the frequency + hypernymy model?
anova(sb5_t1_mod2, sb5_t1_mod3)
## Analysis of Variance Table
##
## Model 1: tval ~ pos_scale_hypernyms + adult_log_freq + aoa
## Model 2: tval ~ adult_log_freq + pos_scale_hypernyms
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 302 374.63
## 2 303 375.76 -1 -1.1315 0.9121 0.3403
# t-values with frequency (resp. hypernymy) partialled out.
# The residuals are matched to rows by row name rather than attached
# positionally: lm() silently drops rows with a missing predictor, in which
# case `model$residuals` is shorter than the data. Rows the model did not use
# get NA; when no rows were dropped the values are identical to
# `model$residuals`.
sb5_coeffs_resid <- sb5_coeffs_t1_alldata %>%
  mutate(
    tval_control_for_freq =
      unname(residuals(freq.resid)[as.character(seq_len(n()))]),
    tval_control_for_hyper =
      unname(residuals(hyper.resid)[as.character(seq_len(n()))])
  )
training_words <- c("address", "body", "bunch", "cloth", "end", "land", "list", "meal", "month", "number", "piece", "place", "sound", "vegetable", "way", "work")
# ggplot(data = sb5_coeffs_resid, aes(x = pos_scale_hypernyms, y = tval, label = word))+
# geom_point()+
# geom_label(size=3)+
# geom_smooth(method="lm")+
# theme_classic()
# Hypernymy-residualised t-value against word frequency, one label per word.
ggplot(sb5_coeffs_resid, aes(x = adult_log_freq, y = tval_control_for_hyper, label = word)) +
  geom_label() +
  geom_smooth(method = "lm") +
  theme_classic() +
  labs(x = "Frequency", y = "t-value (controlling for hypernyms)")
## `geom_smooth()` using formula 'y ~ x'
# Frequency-residualised t-value against POS-scaled hypernymy, with points,
# word labels and a linear fit.
ggplot(
  sb5_coeffs_resid,
  aes(x = pos_scale_hypernyms, y = tval_control_for_freq, label = word)
) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm") +
  theme_classic() +
  labs(
    x = "Hypernyms (scaled by pos)",
    y = "t-value (controlling for frequency)"
  )
## `geom_smooth()` using formula 'y ~ x'
# ggplot(sb5_coeffs_resid, aes(x = adult_log_freq, y = Estimate, label = word))+
# geom_point()+
# geom_label(data= filter(sb5_coeffs_resid, word %in% training_words), size=5)+
# geom_smooth(method="lm")+
# theme_classic()

# sb5_t1_mod3 was fitted on the AoA-filtered rows, so both the point labels and
# the lookup of flagged cases must use that same filtered data. Indexing the
# unfiltered sb5_coeffs_t1_alldata with the model's row names would return the
# wrong words.
sb5_t1_mod3_data <- filter(sb5_coeffs_t1_alldata, !is.na(aoa))
xHats_tvals <- modelCaseAnalysis(
  sb5_t1_mod3,
  Type = "HATVALUES",
  ID = as.character(sb5_t1_mod3_data$word)
)
# need to click on cases to investigate when running this
sb5_t1_mod3_data[xHats_tvals$Rownames, ]
## [1] word Estimate SE
## [4] tval p num_item_id
## [7] type pos num_hypernyms
## [10] num_hyponyms aoa adult_log_freq
## [13] n_synsets n_definitions concreteness
## [16] mean_helpfulness mean_babiness mean_preschoolness
## [19] mean_parent_importance n_importance_ratings n_generality_ratings
## [22] mean_generality pos_scale_hypernyms pos_scale_log_hyponyms
## [25] pos_scale_synsets pos_scale_defs pos_scale_generality
## <0 rows> (or 0-length row.names)
# Data that sb5_t1_mod3 was actually fitted on; used for point labels and for
# looking up flagged cases.
sb5_t1_mod3_data <- filter(sb5_coeffs_t1_alldata, !is.na(aoa))
xResids_tvals <- modelCaseAnalysis(
  sb5_t1_mod3,
  Type = "RESIDUALS",
  ID = as.character(sb5_t1_mod3_data$word)
)
# studentized residuals look okay
# Fix: points were previously labelled with child subjCodes from
# resid_task_data, but the observations in this model are words.
xCooks_tvals <- modelCaseAnalysis(
  sb5_t1_mod3,
  Type = "COOKSD",
  ID = as.character(sb5_t1_mod3_data$word)
)
# cook's distance looks okay - a couple observations beyond rule-of-thumb threshold though
# Look the flagged rows up in the filtered data the model was fitted on.
sb5_t1_mod3_data[xCooks_tvals$Rownames, ]
## [1] word Estimate SE
## [4] tval p num_item_id
## [7] type pos num_hypernyms
## [10] num_hyponyms aoa adult_log_freq
## [13] n_synsets n_definitions concreteness
## [16] mean_helpfulness mean_babiness mean_preschoolness
## [19] mean_parent_importance n_importance_ratings n_generality_ratings
## [22] mean_generality pos_scale_hypernyms pos_scale_log_hyponyms
## [25] pos_scale_synsets pos_scale_defs pos_scale_generality
## <0 rows> (or 0-length row.names)
# influence plot
# Points are labelled with the words from the AoA-filtered data that
# sb5_t1_mod3 was fitted on.
xInfs_tvals <- modelCaseAnalysis(
  sb5_t1_mod3,
  Type = "INFLUENCEPLOT",
  ID = as.character(filter(sb5_coeffs_t1_alldata, !is.na(aoa))$word)
)
# none of these words appear to be having undue influence so I think we're okay
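Does hypernymy also predict the t-values when word knowledge is measured at T2? Again, yes on its own (p = .015), but the effect goes away when you account for other word features (frequency and AoA). The R-squared is quite small too.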
# Attach word-level properties to the T2 per-word coefficients.
sb5_coeffs_t2 <- left_join(
  rename(word_coeffs_asb5_t2, word = item),
  word_info,
  by = "word"
)

# Keep only words with both a hypernymy and an AoA value.
sb5_coeffs_t2_alldata <- sb5_coeffs_t2 %>%
  filter(!is.na(pos_scale_hypernyms)) %>%
  filter(!is.na(aoa))

sb5_t2_mod1 <- lm(tval ~ pos_scale_hypernyms, data = sb5_coeffs_t2_alldata)
summary(sb5_t2_mod1)
##
## Call:
## lm(formula = tval ~ pos_scale_hypernyms, data = sb5_coeffs_t2_alldata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6758 -0.9300 -0.0780 0.8429 2.9634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.11753 0.06910 1.701 0.0900 .
## pos_scale_hypernyms -0.16666 0.06822 -2.443 0.0152 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.193 on 296 degrees of freedom
## Multiple R-squared: 0.01976, Adjusted R-squared: 0.01645
## F-statistic: 5.968 on 1 and 296 DF, p-value: 0.01515
# Hypernymy effect on the T2 t-values with frequency and AoA as covariates.
sb5_t2_mod2 <- lm(
  tval ~ pos_scale_hypernyms + adult_log_freq + aoa,
  data = sb5_coeffs_t2_alldata
)
summary(sb5_t2_mod2)
##
## Call:
## lm(formula = tval ~ pos_scale_hypernyms + adult_log_freq + aoa,
## data = sb5_coeffs_t2_alldata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8208 -0.8967 0.0160 0.8863 3.2628
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.48639 0.54640 -0.890 0.37410
## pos_scale_hypernyms -0.08680 0.06881 -1.262 0.20811
## adult_log_freq 0.14728 0.05137 2.867 0.00444 **
## aoa -0.04590 0.05276 -0.870 0.38506
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.16 on 294 degrees of freedom
## Multiple R-squared: 0.07953, Adjusted R-squared: 0.07014
## F-statistic: 8.468 on 3 and 294 DF, p-value: 2.055e-05
# Frequency-only model of the T2 t-values.
sb5_t2_mod3 <- lm(
  tval ~ adult_log_freq,
  data = sb5_coeffs_t2_alldata
)
summary(sb5_t2_mod3)
##
## Call:
## lm(formula = tval ~ adult_log_freq, data = sb5_coeffs_t2_alldata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8430 -0.8740 -0.0041 0.8857 3.3558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.99172 0.24122 -4.111 5.1e-05 ***
## adult_log_freq 0.18840 0.03937 4.785 2.7e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.161 on 296 degrees of freedom
## Multiple R-squared: 0.0718, Adjusted R-squared: 0.06866
## F-statistic: 22.9 on 1 and 296 DF, p-value: 2.704e-06
# T2 t-values with frequency (resp. hypernymy) partialled out.
# na.exclude + resid() yields one residual per row of sb5_coeffs_t2_alldata
# (NA where a predictor is missing), so the new columns cannot be misaligned
# or fail on a length mismatch if lm() drops rows.
freq.resid.t2 <- lm(
  tval ~ adult_log_freq,
  sb5_coeffs_t2_alldata,
  na.action = na.exclude
)
hyper.resid.t2 <- lm(
  tval ~ pos_scale_hypernyms,
  sb5_coeffs_t2_alldata,
  na.action = na.exclude
)
sb5_coeffs_resid_t2 <- sb5_coeffs_t2_alldata %>%
  mutate(
    tval_control_for_freq = resid(freq.resid.t2),
    tval_control_for_hyper = resid(hyper.resid.t2)
  )
training_words <- c(
  "address", "body", "bunch", "cloth", "end", "land", "list", "meal",
  "month", "number", "piece", "place", "sound", "vegetable", "way", "work"
)
#
# ggplot(data = sb5_coeffs_resid_t2, aes(x = pos_scale_hypernyms, y = tval, label = word))+
# geom_point()+
# geom_label(size=3)+
# geom_smooth(method="lm")+
# theme_classic()

# Hypernymy-residualised T2 t-value against word frequency.
ggplot(
  sb5_coeffs_resid_t2,
  aes(x = adult_log_freq, y = tval_control_for_hyper)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic() +
  labs(x = "Frequency", y = "t-value (controlling for hypernyms)")
## `geom_smooth()` using formula 'y ~ x'
# Frequency-residualised T2 t-value against POS-scaled hypernymy.
ggplot(
  sb5_coeffs_resid_t2,
  aes(x = pos_scale_hypernyms, y = tval_control_for_freq)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic() +
  labs(
    x = "Hypernyms (scaled by pos)",
    y = "t-value (controlling for frequency)"
  )
# ggplot(sb5_coeffs_resid, aes(x = adult_log_freq, y = Estimate, label = word))+
# geom_point()+
# geom_label(data= filter(sb5_coeffs_resid, word %in% training_words), size=5)+
# geom_smooth(method="lm")+
# theme_classic()
How are kids’ vocabularies changing between T1 and T2?
# Long format: one row per child x measure x timepoint, restricted to children
# present in task_data. The last three characters of each column name
# ("_t1" / "_t3") are split off as the timepoint, cleaned to "T1" / "T3", and
# the second survey (3) is relabelled as 2.
mean_word_info_plot <- child_mean_word_info %>%
  pivot_longer(
    cols = mean_helpfulness_of_vocab_t1:mean_generality_of_vocab_t3,
    names_to = "measure",
    values_to = "value"
  ) %>%
  separate(measure, into = c("measure", "timepoint"), sep = -3) %>%
  filter(subjCode %in% task_data$subjCode) %>%
  mutate(
    timepoint = toupper(str_replace(timepoint, "_", "")),
    timepoint = str_replace(timepoint, "3", "2")
  )

# Per-child trajectories of mean hypernymy across timepoints.
ggplot(
  filter(mean_word_info_plot, measure == "mean_hypernymy_of_vocab"),
  aes(x = as.factor(timepoint), y = value, group = subjCode)
) +
  geom_point() +
  geom_line() +
  theme_classic() +
  labs(x = "Timepoint", y = "Mean hypernymy of vocab (scaled by PoS)")

# Per-child trajectories of mean word frequency across timepoints.
ggplot(
  filter(mean_word_info_plot, measure == "mean_freq_of_vocab"),
  aes(x = as.factor(timepoint), y = value, group = subjCode)
) +
  geom_point() +
  geom_line() +
  theme_classic() +
  labs(x = "Timepoint", y = "Mean frequency of vocab")
knowledge_factor_coding: never learned = -1, knew at T1 = 0, learned at T2 = 1
# One row per child x word with the `says` value at each timepoint (t1, t3),
# recoded into three indicator columns and a single 3-level code:
#   -1 = said at neither timepoint, 0 = said at t1, 1 = said only at t3.
dummy_coded_data <- vocab_tasks_byword %>%
  filter(subjCode %in% df_to_analyze$subjCode) %>%
  select(subjCode, timepoint, word, says) %>%
  distinct() %>%
  pivot_wider(
    names_from = "timepoint",
    values_from = "says",
    names_prefix = "t"
  ) %>%
  mutate(
    knew_at_t1 = ifelse(t1 == 1, 1, 0),
    learned_at_t2 = ifelse((t1 == 0 & t3 == 1), 1, 0),
    never_learned = ifelse((t1 == 0 & t3 == 0), 1, 0),
    knowledge_factor_coding = case_when(
      never_learned == 1 ~ -1,
      knew_at_t1 == 1 ~ 0,
      learned_at_t2 == 1 ~ 1
    )
  ) %>%
  select(-t1, -t3)

# Number of children in each knowledge category, per word (wide format).
word_knowledge_summary <- dummy_coded_data %>%
  group_by(word, knowledge_factor_coding) %>%
  summarise(n = n()) %>%
  pivot_wider(
    names_from = knowledge_factor_coding,
    names_prefix = "knew",
    values_from = n
  ) %>%
  rename(
    "never_knew" = "knew-1",
    "knew_at_t1" = "knew0",
    "learned_at_t2" = "knew1"
  )
## `summarise()` regrouping output by 'word' (override with `.groups` argument)
# Words with more than 8 children in both the "knew at T1" and the
# "learned at T2" category.
words_to_use <- filter(
  word_knowledge_summary,
  knew_at_t1 > 8 & learned_at_t2 > 8
)

# Child x word knowledge codes joined with the child-level task data.
dummy_coding_analysis <- dummy_coded_data %>%
  left_join(task_data_wide, by = "subjCode") %>%
  rename(item = word)

# Treat the knowledge code as a categorical predictor (levels "-1", "0", "1").
dummy_coding_analysis$knowledge_factor_coding <- factor(
  as.character(dummy_coding_analysis$knowledge_factor_coding)
)

predict_asb5_effect <- "ASB5 ~ knowledge_factor_coding + childAge_days_t2 + total_says_t1"
# new coefficient function
# Fit `mod_formula` on the rows of `df` for a single word and return the
# coefficient-table rows for the two knowledge_factor_coding contrasts
# ("...coding0" and "...coding1"), tagged with the word in an `item` column.
#   word        - value matched against df$item
#   mod_formula - model formula passed to lm()
#   df          - data with (at least) `item` and the model variables
get_word_beta_2 <- function(word, mod_formula, df) {
  word_rows <- filter(df, item == word)
  fit <- lm(mod_formula, word_rows)
  coef_table <- rownames_to_column(data.frame(summary(fit)$coefficients), "term")
  contrast_rows <- filter(
    coef_table,
    term == "knowledge_factor_coding0" | term == "knowledge_factor_coding1"
  )
  mutate(contrast_rows, item = word)
}
# Test case: coefficients for a single word.
vegetable <- get_word_beta_2(
  word = "vegetable",
  mod_formula = predict_asb5_effect,
  df = dummy_coding_analysis
)
# Find words with no variability: for each word, count the distinct knowledge
# categories among rows that have both a category and an ASB5 score.
word_knowledge_variability <- dummy_coding_analysis %>%
  filter(!is.na(knowledge_factor_coding), !is.na(ASB5)) %>%
  group_by(item) %>%
  summarize(factor_variability = n_distinct(knowledge_factor_coding))
## `summarise()` ungrouping output (override with `.groups` argument)
# Words whose usable rows all fall into a single knowledge category.
problem_words <- filter(word_knowledge_variability, factor_variability == 1)
dummy_coding_noproblems <- filter(
  dummy_coding_analysis,
  !item %in% problem_words$item
)
# Alternative: keep only the words listed in `words_to_use`, i.e. drop words
# whose groups are too small to be meaningful.
dummy_coding_words_to_use <- filter(
  dummy_coding_analysis,
  item %in% words_to_use$word
)
# Fit the ASB5 model separately for every retained word and stack the
# knowledge-contrast coefficients into one table.
# The words are passed as a plain character vector: the former
# paste("", x, "", sep = "") only round-tripped them through character, while
# turning NA into the string "NA" and an empty vector into a single "".
word_coeffs_asb5_effect <- map_df(
  as.character(unique(dummy_coding_words_to_use$item)),
  get_word_beta_2,
  as.formula(predict_asb5_effect),
  dummy_coding_words_to_use
) %>%
  select(item, term, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
  # arrange() ignores the grouping by default, so rows are ordered by t-value
  # across all words; the result stays grouped by `item`.
  group_by(item) %>%
  arrange(desc(tval))
DT::datatable(word_coeffs_asb5_effect)
# Split the per-word coefficients by contrast and join `word_info` by word.
# Term "knowledge_factor_coding0" is the knew-at-T1 level and
# "knowledge_factor_coding1" the learned-at-T2 level of the knowledge factor.
word_coeffs_never_vs_t1 <- word_coeffs_asb5_effect %>%
  rename(word = item) %>%
  filter(term == "knowledge_factor_coding0") %>%
  left_join(word_info, by = "word")
word_coeffs_never_vs_t2 <- word_coeffs_asb5_effect %>%
  rename(word = item) %>%
  filter(term == "knowledge_factor_coding1") %>%
  left_join(word_info, by = "word")
# Regress the per-word t-values for the knew-at-T1 term on all word-level
# predictors.
never_vs_t1_mod1 <- lm(
  tval ~ pos_scale_hypernyms + aoa + concreteness + mean_helpfulness +
    mean_preschoolness + adult_log_freq,
  data = word_coeffs_never_vs_t1
)
summary(never_vs_t1_mod1)
##
## Call:
## lm(formula = tval ~ pos_scale_hypernyms + aoa + concreteness +
## mean_helpfulness + mean_preschoolness + adult_log_freq, data = word_coeffs_never_vs_t1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1214 -0.5419 -0.0472 0.5417 3.8334
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.49167 1.87314 0.262 0.794
## pos_scale_hypernyms -0.12109 0.14398 -0.841 0.403
## aoa -0.19821 0.13091 -1.514 0.135
## concreteness 0.01433 0.18025 0.079 0.937
## mean_helpfulness 0.07208 0.33262 0.217 0.829
## mean_preschoolness -0.01157 0.21663 -0.053 0.958
## adult_log_freq 0.05987 0.12859 0.466 0.643
##
## Residual standard error: 1.068 on 64 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1263, Adjusted R-squared: 0.04443
## F-statistic: 1.542 on 6 and 64 DF, p-value: 0.1788
# Reduced model for the knew-at-T1 term: hypernymy and AoA only.
never_vs_t1_mod2 <- lm(
  tval ~ pos_scale_hypernyms + aoa,
  data = word_coeffs_never_vs_t1
)
summary(never_vs_t1_mod2)
##
## Call:
## lm(formula = tval ~ pos_scale_hypernyms + aoa, data = word_coeffs_never_vs_t1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0234 -0.5323 -0.0115 0.5235 3.9279
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3972 0.5680 2.460 0.0164 *
## pos_scale_hypernyms -0.1387 0.1333 -1.040 0.3018
## aoa -0.2476 0.0985 -2.514 0.0143 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.041 on 68 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1183, Adjusted R-squared: 0.09232
## F-statistic: 4.56 on 2 and 68 DF, p-value: 0.01386
# Regress the per-word t-values for the learned-at-T2 term on all word-level
# predictors.
never_vs_t2_mod1 <- lm(
  tval ~ pos_scale_hypernyms + aoa + concreteness + mean_helpfulness +
    mean_preschoolness + adult_log_freq,
  data = word_coeffs_never_vs_t2
)
summary(never_vs_t2_mod1)
##
## Call:
## lm(formula = tval ~ pos_scale_hypernyms + aoa + concreteness +
## mean_helpfulness + mean_preschoolness + adult_log_freq, data = word_coeffs_never_vs_t2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8774 -0.8406 -0.1364 0.6600 3.2891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.852602 1.856967 2.075 0.04198 *
## pos_scale_hypernyms -0.003576 0.142744 -0.025 0.98009
## aoa -0.413546 0.129778 -3.187 0.00221 **
## concreteness -0.147383 0.177399 -0.831 0.40913
## mean_helpfulness -0.330542 0.329537 -1.003 0.31956
## mean_preschoolness -0.030715 0.214317 -0.143 0.88648
## adult_log_freq 0.047544 0.127415 0.373 0.71026
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.059 on 65 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1756, Adjusted R-squared: 0.09949
## F-statistic: 2.307 on 6 and 65 DF, p-value: 0.04422
# Reduced model for the learned-at-T2 term: AoA only.
never_vs_t2_mod2 <- lm(
  tval ~ aoa,
  data = word_coeffs_never_vs_t2
)
summary(never_vs_t2_mod2)
##
## Call:
## lm(formula = tval ~ aoa, data = word_coeffs_never_vs_t2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7794 -0.8126 -0.0837 0.7233 3.1931
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.05154 0.54473 3.766 0.000342 ***
## aoa -0.33912 0.09502 -3.569 0.000653 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.033 on 70 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1539, Adjusted R-squared: 0.1419
## F-statistic: 12.74 on 1 and 70 DF, p-value: 0.0006525
Factor coding: knew at T1 = 0, knew at T2 only = 1. The coefficients generated here therefore show the effect of knowing the word only at T2, relative to children who already knew it at T1. The t-values are generally small and not very reliable.
# Drop the never-learned rows and recode the remaining categories as a
# two-level factor: "0" = knew at T1, "1" = learned at T2.
t1_t2_effect_comparison <- dummy_coding_words_to_use %>%
  filter(knowledge_factor_coding != -1) %>%
  mutate(
    t1_t2_factor_coding = factor(
      ifelse(knowledge_factor_coding == 1, "1", "0")
    )
  )
# Fit `mod_formula` to the rows of `df` for a single word and return the
# coefficient row for the "t1_t2_factor_coding1" term.
#
# word:        value to match against the `item` column of `df`
# mod_formula: model formula (or a string that lm() can coerce to one)
# df:          data with an `item` column and every variable in `mod_formula`
#
# Returns a data frame holding the columns of lm()'s coefficient table for
# that term, plus `term` and `item`; it has zero rows if the term is absent.
get_word_beta_3 <- function(word, mod_formula, df){
  # `.env$word` forces the function argument: a bare `word` would resolve to
  # a `word` column if `df` had one, comparing `item` against that column
  # instead of the requested word.
  relevant_df <- df %>%
    filter(item == .env$word)
  model <- lm(mod_formula, relevant_df)
  summary(model)$coefficients %>%
    data.frame() %>%
    rownames_to_column("term") %>%
    filter(term == "t1_t2_factor_coding1") %>%
    mutate(item = .env$word)
}
# Fit the T1-vs-T2 model separately for every word and stack the
# "t1_t2_factor_coding1" coefficients into one table.
# The words are passed as a plain character vector: the former
# paste("", x, "", sep = "") only round-tripped them through character, while
# turning NA into the string "NA" and an empty vector into a single "".
word_coeffs_asb5_t1_v_t2 <- map_df(
  as.character(unique(t1_t2_effect_comparison$item)),
  get_word_beta_3,
  as.formula("ASB5 ~ t1_t2_factor_coding + childAge_days_t2 + total_says_t1"),
  t1_t2_effect_comparison
) %>%
  select(item, term, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
  # arrange() ignores the grouping by default, so rows are ordered by t-value
  # across all words; the result stays grouped by `item`.
  group_by(item) %>%
  arrange(desc(tval))
DT::datatable(word_coeffs_asb5_t1_v_t2)
# Distribution of the per-word t-values.
hist(word_coeffs_asb5_t1_v_t2$tval)
What does knowing a word earlier (at T1) get you, compared with knowing it only at T2? Absolutely nothing: the adjusted R-squared is negligible.
# Join `word_info` to the T1-vs-T2 coefficients by word, then regress the
# per-word t-values on all word-level predictors.
t1_vs_t2_coeffs <- left_join(
  rename(word_coeffs_asb5_t1_v_t2, word = item),
  word_info,
  by = "word"
)
t1_vs_t2_mod1 <- lm(
  tval ~ pos_scale_hypernyms + aoa + concreteness + mean_helpfulness +
    mean_preschoolness + adult_log_freq,
  data = t1_vs_t2_coeffs
)
summary(t1_vs_t2_mod1)
##
## Call:
## lm(formula = tval ~ pos_scale_hypernyms + aoa + concreteness +
## mean_helpfulness + mean_preschoolness + adult_log_freq, data = t1_vs_t2_coeffs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.75505 -0.45506 -0.00322 0.57751 2.36287
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.08533 1.90584 2.144 0.0358 *
## pos_scale_hypernyms 0.15040 0.14650 1.027 0.3084
## aoa -0.29306 0.13319 -2.200 0.0313 *
## concreteness -0.18006 0.18207 -0.989 0.3263
## mean_helpfulness -0.42828 0.33821 -1.266 0.2099
## mean_preschoolness 0.03702 0.21996 0.168 0.8669
## adult_log_freq -0.06649 0.13077 -0.508 0.6129
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.087 on 65 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.08899, Adjusted R-squared: 0.004893
## F-statistic: 1.058 on 6 and 65 DF, p-value: 0.3967