# Load the per-child SES table (parent education/income and child demographics).
SES_data <- read_csv(file = "SES_data.csv")
## Parsed with column specification:
## cols(
## subjCode = col_double(),
## parentEd = col_character(),
## parentIncome = col_character(),
## childGender = col_character(),
## childRace = col_character(),
## childHispanic = col_character(),
## parentEd_num = col_double(),
## parentIncome_num = col_double(),
## parentEd_z = col_double(),
## parentIncome_z = col_double()
## )
# Load the wide-format task data (every column parses as numeric).
task_data <- read_csv(file = "combined_data_wide.csv")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
# Rename the task_data columns positionally. The vector below must list one
# name per column, in the order the columns appear in the CSV.
task_data <- setNames(
  task_data,
  c(
    "subjCode",
    "childAge_days_t1", "childAge_days_t3",
    "childAge_months_t1", "childAge_months_t3",
    "childAge_years_t1", "childAge_years_t3",
    "subjCode_lab_t3",
    "ASB5", "Color", "ppvt_overlapping_survey", "Shape", "WJ", "WJ_scaled",
    "ppvt_first_set", "ppvt_n_correct", "ppvt_n_attempted",
    "ppvt_standard_score", "ppvt_percentile", "ppvt_prop_correct",
    "total_understands_control_t1", "total_understands_control_t3",
    "total_understands_seed_t1", "total_understands_seed_t3",
    "total_says_control_t1", "total_says_control_t3",
    "total_says_seed_t1", "total_says_seed_t3"
  )
)
# Load the non-vocabulary survey responses.
non_vocab_survey_data <- read_csv(file = "non_vocab_survey_data.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## subjCode = col_double(),
## timepoint = col_double(),
## parentAge = col_logical(),
## childNumSibs = col_double(),
## childBirthOrder = col_double(),
## recordedDate = col_date(format = ""),
## childAge_days = col_double(),
## surgency_score = col_double(),
## LangUse_sum = col_double(),
## child_L2 = col_logical(),
## percent_english = col_double(),
## percent_L2 = col_logical(),
## age_began_daycare = col_logical(),
## age_began_preschool = col_double(),
## age_began_formal_school = col_logical(),
## est_threeyo_vocab = col_double()
## )
## See spec(...) for full column specifications.
# Attach SES and non-vocabulary survey columns to each child's task data.
# NOTE(review): non_vocab_survey_data carries a `timepoint` column; if a child
# has more than one survey row, this join will duplicate task rows -- confirm.
current_task_data <- left_join(
  left_join(task_data, SES_data, by = "subjCode"),
  non_vocab_survey_data,
  by = "subjCode"
)
# Load the word-level vocabulary survey and drop rows without a subject code.
vocab_survey_data <- filter(
  read_csv(file = "vocab_data_by_word.csv"),
  !is.na(subjCode)
)
## Parsed with column specification:
## cols(
## subjCode = col_double(),
## timepoint = col_double(),
## childAge_days = col_double(),
## word = col_character(),
## understands = col_double(),
## says = col_double(),
## importance_to_teach = col_character(),
## seedword = col_character(),
## type = col_character(),
## pos = col_character(),
## num_hypernyms = col_double(),
## num_hyponyms = col_double(),
## aoa = col_double(),
## pos_scale_hypernyms = col_double(),
## pos_scale_log_hyponyms = col_double()
## )
# Load the word-level vocabulary task data.
vocab_tasks_byword <- read_csv(file = "vocab_task_data_by_word.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## word = col_character(),
## importance_to_teach = col_character(),
## seedword = col_character(),
## type = col_character(),
## pos = col_character()
## )
## See spec(...) for full column specifications.
# Per-word mean of parents' "importance to teach" ratings, coded 1-5, plus the
# number of ratings that went into each mean.
word_importance <- vocab_survey_data %>%
  filter(!is.na(importance_to_teach)) %>%
  select(-seedword) %>%
  distinct() %>%
  mutate(
    # The position of the label in this ordered vector is its numeric code;
    # labels not listed here become NA.
    importance_num = as.numeric(match(
      importance_to_teach,
      c(
        "Not at all important",
        "Slightly important",
        "Moderately important",
        "Very important",
        "Extremely important"
      )
    ))
  ) %>%
  group_by(word) %>%
  summarize(
    mean_parent_importance = mean(importance_num),
    n_importance_ratings = n()
  )
# Mean helpfulness rating per word, averaging resp_mean across any rows that
# share the same word.
helpfulness <- read_csv(file = "../word_norms/helpfulness_ratings.csv") %>%
  group_by(word) %>%
  summarise(mean_helpfulness = mean(resp_mean)) %>%
  ungroup()
## Parsed with column specification:
## cols(
## word = col_character(),
## resp_mean = col_double(),
## resp_sd = col_double(),
## num_resp = col_double()
## )
# Keep only the word and its mean babiness / preschoolness ratings.
babiness_preschoolness <- transmute(
  read_csv(file = "../word_norms/babiness_ratings.csv"),
  word,
  mean_babiness = babiness_mean,
  mean_preschoolness = preschoolness_mean
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## word = col_character(),
## babiness_mean = col_double(),
## babiness_sd = col_double(),
## preschoolness_mean = col_double(),
## preschoolness_sd = col_double(),
## num_resp = col_double()
## )
# Load synset and definition counts per word and part of speech.
defs_synsets <- read_csv(file = "../word_norms/all_words_synsets_defs.csv")
## Parsed with column specification:
## cols(
## word = col_character(),
## pos = col_character(),
## n_synsets = col_double(),
## n_definitions = col_double()
## )
# Load the no_generality file, keeping only word and part of speech.
no_generality <- select(
  read_csv(file = "../word_norms/old_files/no_generality.csv"),
  -c(mean_generality, num_item_id)
)
## Parsed with column specification:
## cols(
## num_item_id = col_character(),
## word = col_character(),
## pos = col_character(),
## mean_generality = col_logical()
## )
# Load by-word generality ratings from the CDI/CHILDES ratings file.
generality_cdi_childes <- read_csv(
  file = "../word_norms/generality_ratings_byWord_cdi_childes.csv"
)
## Parsed with column specification:
## cols(
## word = col_character(),
## num_item_id = col_character(),
## mean_generality = col_double(),
## n_generality_ratings = col_double()
## )
# Load the newer generality ratings. `word2` becomes the join key `word`, the
# original `word` is kept as `word_descriptive`, and the spelled-out part of
# speech is abbreviated ("noun" -> "n", "verb" -> "v", anything else -> NA).
generality_otherwords <- read_csv(file = "../word_norms/new_generality_ratings.csv") %>%
  rename(
    word_descriptive = word,
    word = word2,
    pos_fullword = pos
  ) %>%
  mutate(pos = unname(c(noun = "n", verb = "v")[pos_fullword]))
## Parsed with column specification:
## cols(
## word2 = col_character(),
## word = col_character(),
## pos = col_character(),
## mean_generality = col_double(),
## n_gen_ratings = col_double()
## )
# Fill in a missing part of speech from no_generality (matched on word), then
# keep one row per word/pos/rating combination.
generality_otherwords_getpos <- generality_otherwords %>%
  left_join(no_generality, by = "word") %>%
  transmute(
    word,
    pos = coalesce(pos.x, pos.y),
    mean_generality,
    n_generality_ratings = n_gen_ratings
  ) %>%
  distinct()
# Assemble the word-level table: lexical properties from the word-info file,
# joined with definition/synset counts, helpfulness, babiness/preschoolness,
# parent importance and generality ratings. Several measures are then z-scored
# within part of speech.
word_info <- read_csv("../word_norms/word_info_with_cdi_ids.csv") %>%
  select(num_item_id, word, seedword, type, pos, num_hypernyms, num_hyponyms, aoa, adult_log_freq) %>%
  distinct() %>%
  left_join(defs_synsets, by = c("word", "pos")) %>%
  left_join(helpfulness, by = "word") %>%
  left_join(babiness_preschoolness, by = "word") %>%
  left_join(word_importance, by = "word") %>%
  # This join is keyed on num_item_id, so `word` appears on both sides; keep
  # the left-hand copy under its original name.
  left_join(generality_cdi_childes, by = "num_item_id") %>%
  select(-word.y, word = word.x) %>%
  left_join(generality_otherwords_getpos, by = c("word", "pos")) %>%
  # Prefer the CDI/CHILDES generality ratings and fall back to the newer set.
  mutate(
    n_generality_ratings = coalesce(n_generality_ratings.x, n_generality_ratings.y),
    mean_generality = coalesce(mean_generality.x, mean_generality.y)
  ) %>%
  select(-n_generality_ratings.x, -n_generality_ratings.y, -mean_generality.x, -mean_generality.y) %>%
  group_by(pos) %>%
  # scale() returns a one-column matrix with attributes; as.numeric() stores a
  # plain numeric vector instead of a matrix column. The values are unchanged.
  mutate(
    pos_scale_hypernyms = as.numeric(scale(num_hypernyms)),
    pos_scale_log_hyponyms = as.numeric(scale(log(1 + num_hyponyms))),
    pos_scale_synsets = as.numeric(scale(n_synsets)),
    pos_scale_defs = as.numeric(scale(n_definitions)),
    pos_scale_generality = as.numeric(scale(mean_generality))
  ) %>%
  ungroup()
## Parsed with column specification:
## cols(
## word = col_character(),
## num_item_id = col_character(),
## seedword = col_character(),
## pos = col_character(),
## control_def = col_character(),
## child_know = col_double(),
## kid_log_freq = col_double(),
## adult_log_freq = col_double(),
## num_peers = col_double(),
## num_hypernyms = col_double(),
## type = col_character(),
## aoa = col_double(),
## FREQcount = col_double(),
## logFreq_subtlex = col_double(),
## num_hyponyms = col_double(),
## log_hyponyms = col_double()
## )
# For each child and timepoint, average word-level properties over the words
# the child is reported to say (says == 1), then spread the timepoints into
# columns suffixed _t<timepoint>.
child_mean_word_info <- word_info %>%
  select(word, mean_helpfulness, mean_generality, pos_scale_generality, adult_log_freq) %>%
  left_join(vocab_survey_data, by = "word") %>%
  select(subjCode, timepoint, childAge_days, word, understands, says, everything()) %>%
  distinct() %>%
  filter(says == 1) %>%
  group_by(subjCode, timepoint) %>%
  summarize(
    mean_helpfulness_of_vocab = mean(mean_helpfulness, na.rm = TRUE),
    mean_aoa_of_vocab = mean(aoa, na.rm = TRUE),
    mean_freq_of_vocab = mean(adult_log_freq, na.rm = TRUE),
    mean_hypernymy_of_vocab = mean(pos_scale_hypernyms, na.rm = TRUE),
    mean_generality_of_vocab_unscaled = mean(mean_generality, na.rm = TRUE),
    mean_generality_of_vocab = mean(pos_scale_generality, na.rm = TRUE)
  ) %>%
  pivot_wider(
    names_from = timepoint,
    names_prefix = "t",
    # Every summary column created above starts with "mean_".
    values_from = starts_with("mean_")
  )
The goal of these analyses is to investigate how 2-4yo’s vocabulary knowledge might predict performance on cognitive tasks. We investigate vocabulary at the composite level (e.g., how many total seed/control words) as well as individual word knowledge to identify which specific words might be most helpful for kids to know.
t1: first timepoint, when parents did online survey (summer 2019)
t3: third timepoint, when parents brought kids into lab (fall 2019/winter 2020)
(t2 is online language use survey, summer 2019, not included here because we didn’t ask about vocab)
ASB5: number of items child got correct on our adapted SB-5 – max of 12
WJ: number of items child got correct on Woodcock-Johnson Concept Formation test – max of 40
Color: number of items child got correct on productive color test – max of 5
Shape: number of items child got correct on productive shape test – max of 8
ppvt_overlapping_survey: number of items with same response (knows or doesn’t know) on PPVT (child behavioral measure) and parent report (survey) – max of 9
ppvt_first_set: number of starting set when child did PPVT
mean_helpfulness_of_vocab: mean helpfulness rating of all the words child is reported to know (helpfulness ratings provided by adults who were asked “How helpful would it be for a preschooler to know the word ____?”)
mean_aoa_of_vocab: mean age of acquisition of all the words child is reported to know (from Kuperman norms)
mean_freq_of_vocab: mean frequency of all the words child is reported to know (from adult speech in CHILDES, log-transformed)
mean_hypernymy_of_vocab: mean number of hypernyms of all the words child is reported to know (from Wordnet)
mean_generality_of_vocab: mean generality rating of all the words child is reported to know (generality ratings provided by adults on a scale of 1-5, 1 being most specific; currently have ratings for 133 of 371 words)
mean_parent_importance: for words that parents reported their children not knowing, we asked them to rate on a scale of 1-5 how important it would be to teach their child that word. This is used as a predictor in the word-level analyses.
When kids came into the lab at T3, they completed a subset of trials from the Early SB-5, the Concept Formation test of the Woodcock-Johnson, a productive color vocabulary test, productive shape vocabulary test, and the PPVT. 9 items on the PPVT were also on the parent-report vocabulary survey, so we can look at how much overlap there is between children’s parent-reported vs. actual knowledge.
# Distribution of each lab task score (one histogram per measure).
hist(current_task_data$ASB5)
hist(current_task_data$WJ)
hist(current_task_data$Color)
hist(current_task_data$Shape)
hist(current_task_data$ppvt_overlapping_survey)
Unsurprisingly for Madison, parent education and income are both high. This restricted range likely explains the lack of correlation between vocabulary and SES measures below.
# Distribution of the numeric parent education and income codes.
hist(current_task_data$parentEd_num)
hist(current_task_data$parentIncome_num)
# Build the numeric table used for the correlation matrix: ages, a composite
# SES z-score (mean of the education and income z-scores), task scores, and
# vocabulary totals at T1 and T3.
current_task_data_forcorr <- current_task_data %>%
  ungroup() %>%
  left_join(child_mean_word_info, by = "subjCode") %>%
  mutate(avg_SES_z = 0.5 * (parentEd_z + parentIncome_z)) %>%
  select(
    # ages and SES
    childAge_days_t1, childAge_days_t3, avg_SES_z,
    # task scores
    ASB5, WJ, Color, Shape, ppvt_overlapping_survey,
    ppvt_standard_score, ppvt_percentile, ppvt_prop_correct,
    # vocabulary totals at T1
    total_says_seed_t1, total_understands_seed_t1,
    total_says_control_t1, total_understands_control_t1,
    # vocabulary totals at T3
    total_says_seed_t3, total_understands_seed_t3,
    total_says_control_t3, total_understands_control_t3
  )
# Pairwise correlations and their p-values. The plot shows the lower triangle
# only and blanks out cells that corrplot treats as non-significant.
current_data_corrmat <- cor(current_task_data_forcorr, use = "pairwise.complete.obs")
pmat <- cor.mtest(current_task_data_forcorr)
sig_matrix <- pmat[["p"]]
corrplot(
  corr = current_data_corrmat,
  method = "color",
  type = "lower",
  diag = FALSE,
  addCoef.col = "black",
  p.mat = sig_matrix,
  insig = "blank",
  tl.srt = 45,
  tl.col = "black",
  tl.cex = .8,
  number.cex = .5
)
ASB5.age: ASB5 performance, controlling for age
WJ.age: WJ performance, controlling for age
# Age-only models for the two cognitive tasks. na.exclude makes residuals()
# return one value per input row (NA where a row was dropped for missing data),
# so the residual vectors always match the number of rows they are attached to
# below. With complete data the fits and residuals are unchanged.
age_sb5 <- lm(ASB5 ~ childAge_days_t3, data = current_task_data_forcorr, na.action = na.exclude)
age_wj <- lm(WJ ~ childAge_days_t3, data = current_task_data_forcorr, na.action = na.exclude)

# Add the age-residualized task scores and total productive vocabulary
# (seed + control words said) at each timepoint.
# NOTE(review): residuals are attached by row position; this assumes
# current_task_data_forcorr and this joined table have the same rows in the
# same order -- confirm.
resid_task_data <- current_task_data %>%
  left_join(child_mean_word_info, by = "subjCode") %>%
  mutate(
    ASB5.age = residuals(age_sb5),
    WJ.age = residuals(age_wj),
    total_vocab_t1 = total_says_seed_t1 + total_says_control_t1,
    total_vocab_t3 = total_says_seed_t3 + total_says_control_t3
  )
# Correlation matrix for the age-residualized task scores, PPVT scores,
# vocabulary totals, and mean vocabulary properties at T1 and T3.
resid_task_data_forcorr <- select(
  resid_task_data,
  childAge_days_t3, ASB5.age, WJ.age,
  ppvt_prop_correct, ppvt_standard_score, ppvt_percentile,
  total_says_seed_t1, total_says_control_t1,
  total_says_seed_t3, total_says_control_t3,
  mean_helpfulness_of_vocab_t1, mean_aoa_of_vocab_t1, mean_freq_of_vocab_t1,
  mean_hypernymy_of_vocab_t1, mean_generality_of_vocab_t1,
  mean_helpfulness_of_vocab_t3, mean_aoa_of_vocab_t3, mean_freq_of_vocab_t3,
  mean_hypernymy_of_vocab_t3, mean_generality_of_vocab_t3
)
resid_data_corrmat <- cor(resid_task_data_forcorr, use = "pairwise.complete.obs")
resid_pmat <- cor.mtest(resid_task_data_forcorr)
resid_sig_matrix <- resid_pmat[["p"]]
corrplot(
  corr = resid_data_corrmat,
  method = "color",
  type = "lower",
  diag = FALSE,
  addCoef.col = "black",
  p.mat = resid_sig_matrix,
  insig = "blank",
  tl.srt = 45,
  tl.col = "black",
  tl.cex = .8,
  number.cex = .5
)
# Convert the "says" counts to proportions so seed and control words are
# comparable. The divisors are hard-coded list sizes (72 for seed, 251 for
# control) -- TODO confirm they match the current word lists.
resid_task_data <- mutate(
  resid_task_data,
  prop_seed_known_t1 = total_says_seed_t1 / 72,
  prop_control_known_t1 = total_says_control_t1 / 251,
  prop_seed_known_t3 = total_says_seed_t3 / 72,
  prop_control_known_t3 = total_says_control_t3 / 251
)
# Paired t-test: proportion of seed vs. control words said at T1.
t.test(resid_task_data$prop_seed_known_t1, resid_task_data$prop_control_known_t1, paired=TRUE)
##
## Paired t-test
##
## data: resid_task_data$prop_seed_known_t1 and resid_task_data$prop_control_known_t1
## t = 4.2335, df = 35, p-value = 0.0001581
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.03076256 0.08744978
## sample estimates:
## mean of the differences
## 0.05910617
# Paired t-test: proportion of seed vs. control words said at T3.
t.test(resid_task_data$prop_seed_known_t3, resid_task_data$prop_control_known_t3, paired=TRUE)
##
## Paired t-test
##
## data: resid_task_data$prop_seed_known_t3 and resid_task_data$prop_control_known_t3
## t = 2.0213, df = 35, p-value = 0.05095
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.0001465045 0.0674421368
## sample estimates:
## mean of the differences
## 0.03364782
# Paired t-test: mean vocabulary hypernymy at T1 vs. T3.
t.test(resid_task_data$mean_hypernymy_of_vocab_t1, resid_task_data$mean_hypernymy_of_vocab_t3, paired=TRUE)
##
## Paired t-test
##
## data: resid_task_data$mean_hypernymy_of_vocab_t1 and resid_task_data$mean_hypernymy_of_vocab_t3
## t = -0.40011, df = 35, p-value = 0.6915
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02398338 0.01608620
## sample estimates:
## mean of the differences
## -0.003948589
# Paired t-test: mean (within-POS scaled) vocabulary generality at T1 vs. T3.
t.test(resid_task_data$mean_generality_of_vocab_t1, resid_task_data$mean_generality_of_vocab_t3, paired=TRUE)
##
## Paired t-test
##
## data: resid_task_data$mean_generality_of_vocab_t1 and resid_task_data$mean_generality_of_vocab_t3
## t = -1.663, df = 35, p-value = 0.1052
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.048796655 0.004851023
## sample estimates:
## mean of the differences
## -0.02197282
At T1, kids know 72% of seed words and 66% of control words (expressed as proportions to account for the different numbers of seed and control words).
At T3, this gap has closed a bit and the difference is no longer statistically significant (kids know 79% of seed words and 76% of control words).
Vocabulary hypernymy and generality don’t change across timepoints.
# Compare raw and age-residualized ASB5 / WJ scores.
# NOTE(review): the residuals come from models fit on
# current_task_data_forcorr and are attached by row position -- confirm the
# rows line up with current_task_data.
asb5_wj_age <- current_task_data %>%
  mutate(
    ASB5.age = residuals(age_sb5),
    WJ.age = residuals(age_wj),
    log_WJ = log(WJ + 1)
  ) %>%
  select(subjCode, ASB5, ASB5.age, WJ, WJ_scaled, log_WJ, WJ.age, childAge_days_t3)

# Pairs plot of ASB5, ASB5.age, WJ and WJ.age (columns 2, 3, 4 and 7).
ggpairs(asb5_wj_age, columns = c(2, 3, 4, 7))

# Does ASB5 predict WJ once age at T3 is in the model?
predict_wj <- lm(WJ ~ ASB5 + childAge_days_t3, data = asb5_wj_age)
summary(predict_wj)
##
## Call:
## lm(formula = WJ ~ ASB5 + childAge_days_t3, data = asb5_wj_age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7223 -3.0578 -0.0357 1.6218 16.6507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14.279986 4.173201 -3.422 0.00168 **
## ASB5 0.197919 0.426039 0.465 0.64530
## childAge_days_t3 0.011434 0.003805 3.005 0.00505 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.557 on 33 degrees of freedom
## Multiple R-squared: 0.4213, Adjusted R-squared: 0.3862
## F-statistic: 12.01 on 2 and 33 DF, p-value: 0.0001205
# Does WJ predict ASB5 once age at T3 is in the model?
predict_asb5 <- lm(ASB5 ~ WJ + childAge_days_t3, data=asb5_wj_age)
summary(predict_asb5)
##
## Call:
## lm(formula = ASB5 ~ WJ + childAge_days_t3, data = asb5_wj_age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6545 -1.0931 0.3525 1.0771 3.4288
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.280224 1.965688 -0.651 0.519372
## WJ 0.032828 0.070665 0.465 0.645300
## childAge_days_t3 0.006088 0.001391 4.376 0.000115 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.856 on 33 degrees of freedom
## Multiple R-squared: 0.5336, Adjusted R-squared: 0.5053
## F-statistic: 18.87 on 2 and 33 DF, p-value: 3.429e-06
It seems like age is just a very strong predictor of both ASB5 and WJ, so after you control for age, performance on one measure doesn’t account for any additional variance in performance on the other.
There are collinearity issues among the predictors, so the full model isn’t appropriate. The model that contains age, T1 vocab size, and mean T1 vocab hypernymy has the highest adjusted R-squared.
# all_t1_characteristics <- lm(ASB5 ~ childAge_days_t3 + total_vocab_t1 + mean_helpfulness_of_vocab_t1 + mean_aoa_of_vocab_t1 +
# mean_freq_of_vocab_t1 + mean_hypernymy_of_vocab_t1, data = resid_task_data)
# ASB5 ~ age at T3 + T1 vocabulary size + one T1 vocabulary property per model.
t1voc_help <- lm(
  ASB5 ~ childAge_days_t3 + total_vocab_t1 + mean_helpfulness_of_vocab_t1,
  data = resid_task_data
)
t1voc_aoa <- lm(
  ASB5 ~ childAge_days_t3 + total_vocab_t1 + mean_aoa_of_vocab_t1,
  data = resid_task_data
)
t1voc_freq <- lm(
  ASB5 ~ childAge_days_t3 + total_vocab_t1 + mean_freq_of_vocab_t1,
  data = resid_task_data
)
t1voc_hyper <- lm(
  ASB5 ~ childAge_days_t3 + total_vocab_t1 + mean_hypernymy_of_vocab_t1,
  data = resid_task_data
)
t1voc_general <- lm(
  ASB5 ~ childAge_days_t3 + total_vocab_t1 + mean_generality_of_vocab_t1,
  data = resid_task_data
)
# This model replaces age with vocabulary growth from T1 to T3; note it has no
# age term.
t1vocchange_general <- lm(
  ASB5 ~ mean_generality_of_vocab_t1 + total_vocab_t1 + I(total_vocab_t3 - total_vocab_t1),
  data = resid_task_data
)
tab_model(t1voc_help, t1voc_aoa, t1voc_freq, t1voc_hyper, t1voc_general, t1vocchange_general)
| ASB 5 | ASB 5 | ASB 5 | ASB 5 | ASB 5 | ASB 5 | |||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p |
| (Intercept) | -39.10 | -97.00 – 18.79 | 0.178 | 15.49 | -13.56 – 44.53 | 0.285 | -55.15 | -101.67 – -8.62 | 0.022 | -1.80 | -4.84 – 1.25 | 0.238 | -2.10 | -5.86 – 1.66 | 0.263 | 3.95 | -0.90 – 8.80 | 0.107 |
| childAge_days_t3 | 0.01 | 0.00 – 0.01 | <0.001 | 0.01 | 0.00 – 0.01 | <0.001 | 0.01 | 0.00 – 0.01 | <0.001 | 0.00 | 0.00 – 0.01 | <0.001 | 0.01 | 0.00 – 0.01 | <0.001 | |||
| total_vocab_t1 | 0.03 | -0.02 – 0.07 | 0.214 | 0.02 | -0.02 – 0.06 | 0.250 | 0.04 | 0.00 – 0.07 | 0.034 | 0.00 | -0.01 – 0.02 | 0.449 | 0.00 | -0.01 – 0.02 | 0.835 | 0.02 | 0.00 – 0.04 | 0.030 |
| mean_helpfulness_of_vocab_t1 | 9.86 | -5.40 – 25.12 | 0.198 | |||||||||||||||
| mean_aoa_of_vocab_t1 | -4.29 | -11.47 – 2.88 | 0.232 | |||||||||||||||
| mean_freq_of_vocab_t1 | 7.11 | 0.93 – 13.29 | 0.026 | |||||||||||||||
| mean_hypernymy_of_vocab_t1 | -18.57 | -30.39 – -6.74 | 0.003 | |||||||||||||||
| mean_generality_of_vocab_t1 | -2.28 | -12.11 – 7.55 | 0.640 | -2.28 | -15.19 – 10.63 | 0.722 | ||||||||||||
| total_vocab_t3 - total_vocab_t1 | -0.00 | -0.04 – 0.04 | 0.885 | |||||||||||||||
| Observations | 36 | 36 | 36 | 36 | 36 | 36 | ||||||||||||
| R2 / R2 adjusted | 0.555 / 0.513 | 0.551 / 0.509 | 0.599 / 0.562 | 0.644 / 0.611 | 0.534 / 0.490 | 0.197 / 0.122 | ||||||||||||
There are collinearity issues among the predictors, so the full model isn’t appropriate. The models with mean frequency (+ age + vocab) and mean hypernymy (+ age + vocab) have the highest adjusted R-squared.
# all_t3_characteristics <- lm(ASB5 ~ childAge_days_t3 + total_vocab_t3 + mean_helpfulness_of_vocab_t3 + mean_aoa_of_vocab_t3 +
# mean_freq_of_vocab_t3 + mean_hypernymy_of_vocab_t3, data = resid_task_data)
# ASB5 ~ age at T3 + T3 vocabulary size + one T3 vocabulary property per model.
t3voc_help <- lm(
  ASB5 ~ childAge_days_t3 + total_vocab_t3 + mean_helpfulness_of_vocab_t3,
  data = resid_task_data
)
t3voc_aoa <- lm(
  ASB5 ~ childAge_days_t3 + total_vocab_t3 + mean_aoa_of_vocab_t3,
  data = resid_task_data
)
t3voc_freq <- lm(
  ASB5 ~ childAge_days_t3 + total_vocab_t3 + mean_freq_of_vocab_t3,
  data = resid_task_data
)
t3voc_hyper <- lm(
  ASB5 ~ childAge_days_t3 + total_vocab_t3 + mean_hypernymy_of_vocab_t3,
  data = resid_task_data
)
t3voc_general <- lm(
  ASB5 ~ childAge_days_t3 + total_vocab_t3 + mean_generality_of_vocab_t3,
  data = resid_task_data
)
tab_model(t3voc_help, t3voc_aoa, t3voc_freq, t3voc_hyper, t3voc_general)
| ASB 5 | ASB 5 | ASB 5 | ASB 5 | ASB 5 | |||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p |
| (Intercept) | 19.02 | -67.41 – 105.46 | 0.657 | 3.27 | -30.91 – 37.45 | 0.847 | -40.80 | -110.95 – 29.34 | 0.245 | -4.07 | -8.74 – 0.61 | 0.086 | -1.64 | -5.53 – 2.25 | 0.396 |
| childAge_days_t3 | 0.01 | 0.00 – 0.01 | <0.001 | 0.01 | 0.00 – 0.01 | <0.001 | 0.01 | 0.00 – 0.01 | <0.001 | 0.01 | 0.00 – 0.01 | <0.001 | 0.01 | 0.00 – 0.01 | <0.001 |
| total_vocab_t3 | -0.01 | -0.07 – 0.05 | 0.684 | 0.01 | -0.04 – 0.05 | 0.704 | 0.03 | -0.02 – 0.08 | 0.255 | 0.01 | -0.01 – 0.02 | 0.453 | -0.00 | -0.02 – 0.02 | 0.958 |
| mean_helpfulness_of_vocab_t3 | -5.38 | -27.49 – 16.73 | 0.624 | ||||||||||||
| mean_aoa_of_vocab_t3 | -1.34 | -10.05 – 7.36 | 0.755 | ||||||||||||
| mean_freq_of_vocab_t3 | 5.13 | -4.12 – 14.38 | 0.267 | ||||||||||||
| mean_hypernymy_of_vocab_t3 | -11.21 | -26.45 – 4.03 | 0.144 | ||||||||||||
| mean_generality_of_vocab_t3 | -7.23 | -24.45 – 10.00 | 0.399 | ||||||||||||
| Observations | 36 | 36 | 36 | 36 | 36 | ||||||||||
| R2 / R2 adjusted | 0.535 / 0.492 | 0.533 / 0.489 | 0.549 / 0.507 | 0.562 / 0.521 | 0.542 / 0.499 | ||||||||||
Controlling for age in different ways (age at T3, age at T1, age at T3 & amount of time elapsed since T1), and looking at different aspects of vocabulary (e.g. knowledge at T3; growth from T1-T3).
# Share of each child's spoken vocabulary (seed + control) that is seed words,
# at each timepoint.
current_task_data_forcorr <- mutate(
  current_task_data_forcorr,
  prop_seed_t1 = total_says_seed_t1 / (total_says_seed_t1 + total_says_control_t1),
  prop_seed_t3 = total_says_seed_t3 / (total_says_seed_t3 + total_says_control_t3)
)
# seed and control at t1
# ASB5 ~ age at T3 + number of control words said at T1.
current_task_data_forcorr %>% lm(ASB5~childAge_days_t3+total_says_control_t1,data=.) %>% summary
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + total_says_control_t1,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8125 -1.0297 0.3963 0.9910 3.4542
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.7519817 1.6842834 -1.040 0.306
## childAge_days_t3 0.0065471 0.0013050 5.017 1.76e-05 ***
## total_says_control_t1 -0.0004398 0.0079555 -0.055 0.956
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.862 on 33 degrees of freedom
## Multiple R-squared: 0.5305, Adjusted R-squared: 0.5021
## F-statistic: 18.65 on 2 and 33 DF, p-value: 3.813e-06
# ASB5 ~ age at T3 + number of seed words said at T1.
current_task_data_forcorr %>% lm(ASB5~childAge_days_t3+total_says_seed_t1,data=.) %>% summary
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + total_says_seed_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9032 -1.0060 0.1404 0.9911 3.8218
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.014685 1.727209 -1.166 0.252
## childAge_days_t3 0.006022 0.001355 4.444 9.38e-05 ***
## total_says_seed_t1 0.019327 0.033764 0.572 0.571
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.853 on 33 degrees of freedom
## Multiple R-squared: 0.5351, Adjusted R-squared: 0.5069
## F-statistic: 18.99 on 2 and 33 DF, p-value: 3.244e-06
# seed and control at t3
# ASB5 ~ age at T3 + number of control words said at T3.
current_task_data_forcorr %>% lm(ASB5~childAge_days_t3+total_says_control_t3,data=.) %>% summary
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + total_says_control_t3,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8207 -1.0704 0.3529 0.9740 3.5006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.7924172 1.7875491 -1.003 0.323
## childAge_days_t3 0.0064729 0.0012365 5.235 9.23e-06 ***
## total_says_control_t3 0.0004329 0.0083668 0.052 0.959
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.862 on 33 degrees of freedom
## Multiple R-squared: 0.5305, Adjusted R-squared: 0.5021
## F-statistic: 18.65 on 2 and 33 DF, p-value: 3.814e-06
# ASB5 ~ age at T3 + number of seed words said at T3.
current_task_data_forcorr %>% lm(ASB5~childAge_days_t3+total_says_seed_t3,data=.) %>% summary
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + total_says_seed_t3, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7858 -1.1089 0.1156 0.9766 3.6684
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.876035 2.272839 -1.705 0.0975 .
## childAge_days_t3 0.005876 0.001139 5.159 1.15e-05 ***
## total_says_seed_t3 0.054278 0.040542 1.339 0.1898
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.813 on 33 degrees of freedom
## Multiple R-squared: 0.5547, Adjusted R-squared: 0.5277
## F-statistic: 20.55 on 2 and 33 DF, p-value: 1.596e-06
# difference between seed and control
# ASB5 ~ age at T3 + (seed count - control count) at T1. These are raw counts,
# not proportions, so the difference also reflects the unequal list sizes.
current_task_data_forcorr %>% lm(ASB5~childAge_days_t3+I(total_says_seed_t1-total_says_control_t1),data=.) %>% summary
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + I(total_says_seed_t1 -
## total_says_control_t1), data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7946 -0.9722 0.3749 1.0346 3.3980
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -1.746512 1.677072 -1.041
## childAge_days_t3 0.006665 0.001265 5.267
## I(total_says_seed_t1 - total_says_control_t1) 0.002288 0.009810 0.233
## Pr(>|t|)
## (Intercept) 0.305
## childAge_days_t3 8.4e-06 ***
## I(total_says_seed_t1 - total_says_control_t1) 0.817
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.86 on 33 degrees of freedom
## Multiple R-squared: 0.5313, Adjusted R-squared: 0.5029
## F-statistic: 18.7 on 2 and 33 DF, p-value: 3.717e-06
# ASB5 ~ age at T3 + (seed count - control count) at T3.
current_task_data_forcorr %>% lm(ASB5~childAge_days_t3+I(total_says_seed_t3-total_says_control_t3),data=.) %>% summary
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + I(total_says_seed_t3 -
## total_says_control_t3), data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8089 -0.9116 0.3836 1.0327 3.3538
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -1.675047 1.710754 -0.979
## childAge_days_t3 0.006661 0.001235 5.395
## I(total_says_seed_t3 - total_says_control_t3) 0.002448 0.009859 0.248
## Pr(>|t|)
## (Intercept) 0.335
## childAge_days_t3 5.74e-06 ***
## I(total_says_seed_t3 - total_says_control_t3) 0.805
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.86 on 33 degrees of freedom
## Multiple R-squared: 0.5314, Adjusted R-squared: 0.503
## F-statistic: 18.71 on 2 and 33 DF, p-value: 3.703e-06
# seed growth
# ASB5 ~ age at T3 + change in the number of seed words said from T1 to T3.
current_task_data_forcorr %>% lm(ASB5~childAge_days_t3+I(total_says_seed_t3-total_says_seed_t1),data=.) %>% summary
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + I(total_says_seed_t3 -
## total_says_seed_t1), data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6517 -0.9778 0.3544 1.1524 3.1646
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -2.631187 2.072641 -1.269
## childAge_days_t3 0.006957 0.001236 5.627
## I(total_says_seed_t3 - total_says_seed_t1) 0.033720 0.047807 0.705
## Pr(>|t|)
## (Intercept) 0.213
## childAge_days_t3 2.9e-06 ***
## I(total_says_seed_t3 - total_says_seed_t1) 0.486
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.848 on 33 degrees of freedom
## Multiple R-squared: 0.5375, Adjusted R-squared: 0.5094
## F-statistic: 19.17 on 2 and 33 DF, p-value: 2.983e-06
# account for how much time elapsed between t1-t3
# ASB5 ~ age at T1 + days elapsed between T1 and T3 + seed words said at T1.
current_task_data_forcorr %>% lm(ASB5 ~ childAge_days_t1+I(childAge_days_t3-childAge_days_t1)+total_says_seed_t1,data=.) %>% summary
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) +
## total_says_seed_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9965 -1.0941 0.0728 1.0763 3.5832
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -2.955054 2.528700 -1.169
## childAge_days_t1 0.005985 0.001372 4.361
## I(childAge_days_t3 - childAge_days_t1) 0.010928 0.009636 1.134
## total_says_seed_t1 0.019206 0.034148 0.562
## Pr(>|t|)
## (Intercept) 0.251195
## childAge_days_t1 0.000126 ***
## I(childAge_days_t3 - childAge_days_t1) 0.265184
## total_says_seed_t1 0.577734
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.874 on 32 degrees of freedom
## Multiple R-squared: 0.5389, Adjusted R-squared: 0.4957
## F-statistic: 12.47 on 3 and 32 DF, p-value: 1.449e-05
# ASB5 ~ age at T1 + days elapsed between T1 and T3 + seed words said at T3.
current_task_data_forcorr %>% lm(ASB5 ~ childAge_days_t1+I(childAge_days_t3-childAge_days_t1)+total_says_seed_t3,data=.) %>% summary
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) +
## total_says_seed_t3, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8713 -1.1138 0.0911 0.9632 3.4512
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -4.699517 2.878083 -1.633
## childAge_days_t1 0.005849 0.001154 5.068
## I(childAge_days_t3 - childAge_days_t1) 0.010332 0.009432 1.095
## total_says_seed_t3 0.053486 0.041060 1.303
## Pr(>|t|)
## (Intercept) 0.112
## childAge_days_t1 1.63e-05 ***
## I(childAge_days_t3 - childAge_days_t1) 0.282
## total_says_seed_t3 0.202
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.835 on 32 degrees of freedom
## Multiple R-squared: 0.5578, Adjusted R-squared: 0.5164
## F-statistic: 13.46 on 3 and 32 DF, p-value: 7.536e-06
# ASB5 ~ age at T1 + days elapsed between T1 and T3 + control words said at T3.
current_task_data_forcorr %>% lm(ASB5 ~ childAge_days_t1+I(childAge_days_t3-childAge_days_t1)+total_says_control_t3,data=.) %>% summary
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) +
## total_says_control_t3, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9139 -1.1796 0.2551 0.9983 3.2742
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -2.7162429 2.5510073 -1.065
## childAge_days_t1 0.0064563 0.0012510 5.161
## I(childAge_days_t3 - childAge_days_t1) 0.0114262 0.0097309 1.174
## total_says_control_t3 0.0001165 0.0084842 0.014
## Pr(>|t|)
## (Intercept) 0.295
## childAge_days_t1 1.24e-05 ***
## I(childAge_days_t3 - childAge_days_t1) 0.249
## total_says_control_t3 0.989
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.883 on 32 degrees of freedom
## Multiple R-squared: 0.5344, Adjusted R-squared: 0.4907
## F-statistic: 12.24 on 3 and 32 DF, p-value: 1.689e-05
# ASB5 ~ age at T1 + days elapsed between T1 and T3 + control words said at T1.
current_task_data_forcorr %>% lm(ASB5 ~ childAge_days_t1+I(childAge_days_t3-childAge_days_t1)+total_says_control_t1,data=.) %>% summary
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) +
## total_says_control_t1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9050 -1.1567 0.2830 0.9991 3.3005
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -2.7034895 2.5040150 -1.080
## childAge_days_t1 0.0065228 0.0013206 4.939
## I(childAge_days_t3 - childAge_days_t1) 0.0115327 0.0097076 1.188
## total_says_control_t1 -0.0006124 0.0080520 -0.076
## Pr(>|t|)
## (Intercept) 0.288
## childAge_days_t1 2.37e-05 ***
## I(childAge_days_t3 - childAge_days_t1) 0.244
## total_says_control_t1 0.940
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.883 on 32 degrees of freedom
## Multiple R-squared: 0.5345, Adjusted R-squared: 0.4908
## F-statistic: 12.25 on 3 and 32 DF, p-value: 1.685e-05
# ASB5 ~ age at T1 + days elapsed between T1 and T3 + control words said at T1
# + (seed - control) count difference at T1.
current_task_data_forcorr %>% lm(ASB5 ~ childAge_days_t1+I(childAge_days_t3-childAge_days_t1) + total_says_control_t1 + I(total_says_seed_t1-total_says_control_t1),data=.) %>% summary
##
## Call:
## lm(formula = ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) +
## total_says_control_t1 + I(total_says_seed_t1 - total_says_control_t1),
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0201 -0.6222 0.1938 1.0509 3.7444
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -3.539976 2.584524 -1.370
## childAge_days_t1 0.006047 0.001371 4.410
## I(childAge_days_t3 - childAge_days_t1) 0.011672 0.009645 1.210
## total_says_control_t1 0.061193 0.052417 1.167
## I(total_says_seed_t1 - total_says_control_t1) 0.077205 0.064710 1.193
## Pr(>|t|)
## (Intercept) 0.180627
## childAge_days_t1 0.000115 ***
## I(childAge_days_t3 - childAge_days_t1) 0.235344
## total_says_control_t1 0.251940
## I(total_says_seed_t1 - total_says_control_t1) 0.241888
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.87 on 31 degrees of freedom
## Multiple R-squared: 0.5549, Adjusted R-squared: 0.4975
## F-statistic: 9.662 on 4 and 31 DF, p-value: 3.416e-05
Because the long-term goal is to identify specific words that could be helpful for cognitive/vocabulary development, we shift the analyses to look at the role of individual words.
## put everything together into df
# One row per child: ages, total words understood/said at each timepoint
# (seed + control), and the task outcomes used in the word-level models.
task_data_wide <- current_task_data %>%
  mutate(
    total_says_t1 = total_says_control_t1 + total_says_seed_t1,
    total_says_t3 = total_says_control_t3 + total_says_seed_t3,
    total_understands_t1 = total_understands_control_t1 + total_understands_seed_t1,
    total_understands_t3 = total_understands_control_t3 + total_understands_seed_t3
  ) %>%
  select(
    subjCode, childAge_days_t1, childAge_days_t3,
    total_understands_t1, total_says_t1,
    total_understands_t3, total_says_t3,
    ASB5, WJ, WJ_scaled,
    ppvt_prop_correct, ppvt_standard_score, ppvt_percentile
  )
##inner join to only include kids who've come into lab
# Word-level survey rows joined to child-level task data. The inner join keeps
# only children present in task_data_wide. `word` is renamed to `item`.
df_to_analyze <- vocab_survey_data %>%
  filter(!is.na(word)) %>%
  inner_join(task_data_wide, by = "subjCode") %>%
  select(-c(importance_to_teach, childAge_days)) %>%
  rename(item = word)
### code from Molly
# params: model formulas stored as strings. Each one predicts an outcome from
# knowledge of a single word (`says`) plus age and total productive vocabulary.

# -- covariates measured at t1 --
predict_t3_vocab <- "total_says_t3 ~ says + childAge_days_t1 + total_says_t1"
predict_asb5 <- "ASB5 ~ says + childAge_days_t1 + total_says_t1"
predict_wj <- "WJ ~ says + childAge_days_t1 + total_says_t1"
# predict_wj_scaled <- "WJ_scaled ~ says + childAge_days_t1 + total_says_t1"
# ppvt standard doesn't need to control for age because the standardized
# score accounts for age
predict_ppvt_standard <- "ppvt_standard_score ~ says + total_says_t1"

# -- covariates measured at t3 --
predict_asb5_fromt3 <- "ASB5 ~ says + childAge_days_t3 + total_says_t3"
predict_wj_fromt3 <- "WJ ~ says + childAge_days_t3 + total_says_t3"
# predict_wj_scaled_fromt3 <- "WJ_scaled ~ says + childAge_days_t3 + total_says_t3"
# coefficient function
#
# Fit `mod_formula` with lm() on the rows of `df` for one word at one
# timepoint, and return the coefficient-table row for the `says` term.
#
# Args:
#   word:        value of `item` identifying the word to model.
#   mod_formula: formula handed to lm(); expected to contain a `says` term.
#   time:        value of `timepoint` selecting which survey rows to use.
#   df:          data frame with `item`, `timepoint` and the formula's variables.
#
# Returns:
#   A data frame holding the `says` row of summary(lm)$coefficients, with
#   column names as converted by data.frame() (Estimate, Std..Error, t.value,
#   Pr...t..), plus `term` and `item`. It has zero rows when the coefficient
#   table contains no `says` row.
get_word_beta <- function(word, mod_formula, time, df) {
  # `!!` injects the argument values, so a column of `df` that happens to be
  # called `word` or `time` cannot shadow them inside filter().
  relevant_df <- df %>%
    filter(item == !!word, timepoint == !!time)
  model <- lm(mod_formula, data = relevant_df)
  summary(model)$coefficients %>%
    data.frame() %>%
    rownames_to_column("term") %>%
    filter(term == "says") %>%
    mutate(item = !!word)
}
#test case
#get_word_beta("squeak", as.formula(predict_t3_vocab), 3, df_to_analyze)
# Per-word sum of `says` at timepoint 3 ...
t3_knowledge <- df_to_analyze %>%
  filter(timepoint == 3) %>%
  group_by(item) %>%
  summarise(n_say_t3 = sum(says))
# ... and at timepoint 1, with the t3 sums joined alongside for each word.
word_knowledge <- df_to_analyze %>%
  filter(timepoint == 1) %>%
  group_by(item) %>%
  summarise(n_say_t1 = sum(says)) %>%
  left_join(t3_knowledge, by = "item")
DT::datatable(word_knowledge)
# Run get_word_beta() for every distinct word in `df` and collect the `says`
# coefficients into one table, sorted by descending t value.
#
# Args:
#   mod_formula: formula passed through to get_word_beta() / lm().
#   time:        timepoint whose survey rows are used (1 or 3 below).
#   df:          word-level data frame; defaults to df_to_analyze.
#
# Returns: data frame with columns item, Estimate, SE, tval, p.
word_coeff_table <- function(mod_formula, time, df = df_to_analyze) {
  map_df(as.character(unique(df$item)), get_word_beta, mod_formula, time, df) %>%
    select(item, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
    arrange(desc(tval))
}

# Word knowledge at t1 predicting each outcome
word_coeffs_asb5_t1 <- word_coeff_table(as.formula(predict_asb5), time = 1)
DT::datatable(word_coeffs_asb5_t1)
word_coeffs_wj_t1 <- word_coeff_table(as.formula(predict_wj), time = 1)
DT::datatable(word_coeffs_wj_t1)
word_coeffs_ppvt_t1 <- word_coeff_table(as.formula(predict_ppvt_standard), time = 1)
DT::datatable(word_coeffs_ppvt_t1)

# Word knowledge at t3 predicting each outcome
word_coeffs_asb5_t3 <- word_coeff_table(as.formula(predict_asb5_fromt3), time = 3)
DT::datatable(word_coeffs_asb5_t3)
word_coeffs_wj_t3 <- word_coeff_table(as.formula(predict_wj_fromt3), time = 3)
DT::datatable(word_coeffs_wj_t3)
The coefficient estimates above indicate the degree to which knowledge of a specific word boosts performance on the ASB5 and WJ, controlling for age and total vocabulary knowledge. Now we want to test the hypothesis that lower hypernymy is a characteristic of these more helpful words (i.e. words with higher estimates). So, we look at the relation between hypernymy and the estimate for each word.
It looks like there is a relationship here:
# Attach the word-level features in word_info to the per-word ASB5
# coefficients, then plot the coefficient estimate against pos-scaled
# hypernymy with a linear fit.
sb5_coeffs_t1 <- word_coeffs_asb5_t1 %>%
  rename(word = item) %>%
  left_join(word_info, by = "word")
ggplot(
  sb5_coeffs_t1,
  aes(x = pos_scale_hypernyms, y = Estimate, label = word)
) +
  geom_point(size = 3) +
  geom_smooth(method = lm) +
  # geom_label() +
  labs(x = "Hypernyms (scaled by pos)", y = "Coefficient Estimate") +
  scale_x_continuous(
    breaks = c(-5, -2.5, 0, 2.5, 5),
    labels = c(-5, -2.5, 0, 2.5, 5)
  ) +
  theme_classic() +
  theme(text = element_text(size = 25))
Does hypernymy predict coefficients? Yes…
# Is the per-word ASB5 coefficient (from word knowledge at T1) predicted by
# pos-scaled hypernymy? Random intercept for seedword.
#
# Keep only words with complete data on every predictor used in the models
# fitted to this data frame, so that all of them (and the anova() comparison)
# use exactly the same rows. adult_log_freq and aoa are included because the
# full model uses them too.
sb5_coeffs_t1_alldata <- sb5_coeffs_t1 %>%
  drop_na(
    pos_scale_hypernyms, pos_scale_synsets, pos_scale_defs,
    mean_helpfulness, mean_preschoolness, mean_parent_importance,
    adult_log_freq, aoa
  )
sb5_t1_mod1 <- lmer(Estimate ~ pos_scale_hypernyms + (1 | seedword), data = sb5_coeffs_t1_alldata)
summary(sb5_t1_mod1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Estimate ~ pos_scale_hypernyms + (1 | seedword)
## Data: sb5_coeffs_t1_alldata
##
## REML criterion at convergence: 1101.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1294 -0.6303 -0.1774 0.3423 2.9030
##
## Random effects:
## Groups Name Variance Std.Dev.
## seedword (Intercept) 0.1757 0.4192
## Residual 1.7719 1.3311
## Number of obs: 315, groups: seedword, 72
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.21511 0.09695 68.85640 2.219 0.0298 *
## pos_scale_hypernyms -0.20489 0.08050 305.67863 -2.545 0.0114 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## ps_scl_hypr 0.071
…but the effect goes away when you account for other features of the word such as helpfulness, frequency, AoA, etc.
# Full model: hypernymy plus the other word-level features, with a random
# intercept for seedword.
sb5_t1_mod2 <- lmer(
  Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + aoa +
    mean_preschoolness + mean_parent_importance + pos_scale_synsets +
    pos_scale_defs + (1 | seedword),
  data = sb5_coeffs_t1_alldata
)
summary(sb5_t1_mod2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +
## aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets +
## pos_scale_defs + (1 | seedword)
## Data: sb5_coeffs_t1_alldata
##
## REML criterion at convergence: 1072.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1620 -0.6380 -0.1016 0.4450 2.7626
##
## Random effects:
## Groups Name Variance Std.Dev.
## seedword (Intercept) 0.1163 0.3411
## Residual 1.5711 1.2534
## Number of obs: 315, groups: seedword, 72
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.57674 0.78632 289.18159 -0.733 0.46387
## pos_scale_hypernyms -0.10679 0.08029 289.74951 -1.330 0.18455
## mean_helpfulness -0.06045 0.16485 299.46853 -0.367 0.71410
## adult_log_freq 0.20228 0.06659 302.32563 3.038 0.00259 **
## aoa -0.14126 0.06484 299.27826 -2.179 0.03014 *
## mean_preschoolness -0.08885 0.11934 276.47730 -0.744 0.45722
## mean_parent_importance 0.26409 0.12298 293.24176 2.148 0.03257 *
## pos_scale_synsets -0.17202 0.11783 299.47644 -1.460 0.14537
## pos_scale_defs -0.10790 0.11334 302.49073 -0.952 0.34185
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ps_scl_h mn_hlp adlt__ aoa mn_prs mn_pr_ ps_scl_s
## ps_scl_hypr -0.104
## mn_hlpflnss -0.402 0.038
## adlt_lg_frq -0.327 0.110 -0.322
## aoa -0.874 0.000 0.378 0.237
## mn_prschlns -0.112 0.061 -0.447 0.070 0.071
## mn_prnt_mpr -0.252 0.027 -0.214 -0.214 0.041 -0.006
## ps_scl_syns 0.126 0.077 -0.013 -0.202 -0.006 0.063 -0.077
## pos_scl_dfs -0.108 0.074 -0.027 0.027 0.046 0.035 0.171 -0.666
# Reduced model: only frequency, AoA and parent-rated importance.
sb5_t1_mod3 <- lmer(
  Estimate ~ adult_log_freq + aoa + mean_parent_importance + (1 | seedword),
  data = sb5_coeffs_t1_alldata
)
summary(sb5_t1_mod3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Estimate ~ adult_log_freq + aoa + mean_parent_importance + (1 |
## seedword)
## Data: sb5_coeffs_t1_alldata
##
## REML criterion at convergence: 1069
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.3405 -0.6382 -0.1494 0.4368 2.8746
##
## Random effects:
## Groups Name Variance Std.Dev.
## seedword (Intercept) 0.1859 0.4311
## Residual 1.5504 1.2452
## Number of obs: 315, groups: seedword, 72
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.87500 0.67071 299.43635 -1.305 0.19304
## adult_log_freq 0.15846 0.06007 307.41480 2.638 0.00876 **
## aoa -0.11332 0.05772 310.99943 -1.963 0.05050 .
## mean_parent_importance 0.26577 0.11887 302.16653 2.236 0.02610 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) adlt__ aoa
## adlt_lg_frq -0.597
## aoa -0.852 0.501
## mn_prnt_mpr -0.452 -0.331 0.172
# Compare the full (mod2) and reduced (mod3) ASB5 models; anova() refits both
# with ML before the chi-square test.
anova(sb5_t1_mod2, sb5_t1_mod3)
## refitting model(s) with ML (instead of REML)
## Data: sb5_coeffs_t1_alldata
## Models:
## sb5_t1_mod3: Estimate ~ adult_log_freq + aoa + mean_parent_importance + (1 |
## sb5_t1_mod3: seedword)
## sb5_t1_mod2: Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +
## sb5_t1_mod2: aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets +
## sb5_t1_mod2: pos_scale_defs + (1 | seedword)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## sb5_t1_mod3 6 1067.4 1089.9 -527.69 1055.4
## sb5_t1_mod2 11 1067.7 1108.9 -522.83 1045.7 9.7209 5 0.08354 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Does hypernymy predict the WJ coefficients? Yes, but in an unexpected direction (higher hypernymy goes with larger estimates)…
# Attach the word-level features in word_info to the per-word WJ coefficients
# (word knowledge at timepoint 1).
wj_coeffs_t1 <- word_coeffs_wj_t1 %>%
  rename(word = item) %>%
  left_join(word_info, by = "word")
# Keep only words with complete data on every predictor used in the models
# fitted to this data frame, so the simple, full and reduced models all use
# the same rows. adult_log_freq and aoa are included because the full model
# uses them too.
wj_coeffs_t1_alldata <- wj_coeffs_t1 %>%
  drop_na(
    pos_scale_hypernyms, pos_scale_synsets, pos_scale_defs,
    mean_helpfulness, mean_preschoolness, mean_parent_importance,
    adult_log_freq, aoa
  )
summary(lmer(Estimate ~ pos_scale_hypernyms + (1 | seedword), data = wj_coeffs_t1_alldata))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Estimate ~ pos_scale_hypernyms + (1 | seedword)
## Data: wj_coeffs_t1_alldata
##
## REML criterion at convergence: 1413.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.6034 -0.6759 -0.0707 0.5379 3.5593
##
## Random effects:
## Groups Name Variance Std.Dev.
## seedword (Intercept) 0.380 0.6165
## Residual 4.865 2.2056
## Number of obs: 315, groups: seedword, 72
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.1696 0.1545 50.1515 -1.097 0.27774
## pos_scale_hypernyms 0.3606 0.1321 297.7159 2.729 0.00673 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## ps_scl_hypr 0.064
…and goes away when you account for other word features. No difference in fit between full and reduced models
# Full model: hypernymy plus the other word-level features, with a random
# intercept for seedword.
wj_t1_mod1 <- lmer(
  Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + aoa +
    mean_preschoolness + mean_parent_importance + pos_scale_synsets +
    pos_scale_defs + (1 | seedword),
  data = wj_coeffs_t1_alldata
)
summary(wj_t1_mod1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +
## aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets +
## pos_scale_defs + (1 | seedword)
## Data: wj_coeffs_t1_alldata
##
## REML criterion at convergence: 1373.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8702 -0.5834 -0.1273 0.5275 2.9961
##
## Random effects:
## Groups Name Variance Std.Dev.
## seedword (Intercept) 0.2461 0.496
## Residual 4.2419 2.060
## Number of obs: 315, groups: seedword, 72
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.96795 1.28109 275.82813 0.756 0.45056
## pos_scale_hypernyms 0.11520 0.13083 280.12929 0.881 0.37934
## mean_helpfulness -0.61180 0.26891 293.99393 -2.275 0.02362 *
## adult_log_freq -0.11738 0.10868 299.32455 -1.080 0.28099
## aoa 0.23346 0.10575 291.07830 2.208 0.02805 *
## mean_preschoolness 0.03223 0.19426 262.32362 0.166 0.86835
## mean_parent_importance -0.01212 0.20045 284.17077 -0.060 0.95185
## pos_scale_synsets 0.58516 0.19224 296.81809 3.044 0.00254 **
## pos_scale_defs -0.55431 0.18501 301.72177 -2.996 0.00296 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ps_scl_h mn_hlp adlt__ aoa mn_prs mn_pr_ ps_scl_s
## ps_scl_hypr -0.108
## mn_hlpflnss -0.400 0.042
## adlt_lg_frq -0.328 0.108 -0.321
## aoa -0.874 0.002 0.376 0.238
## mn_prschlns -0.114 0.058 -0.448 0.070 0.072
## mn_prnt_mpr -0.251 0.029 -0.218 -0.216 0.040 0.000
## ps_scl_syns 0.128 0.075 -0.011 -0.207 -0.007 0.062 -0.075
## pos_scl_dfs -0.109 0.077 -0.030 0.029 0.046 0.037 0.173 -0.665
# Reduced model: helpfulness, AoA, and the pos-scaled synset and definition
# measures.
wj_t1_mod2 <- lmer(
  Estimate ~ mean_helpfulness + aoa + pos_scale_synsets + pos_scale_defs +
    (1 | seedword),
  data = wj_coeffs_t1_alldata
)
summary(wj_t1_mod2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## Estimate ~ mean_helpfulness + aoa + pos_scale_synsets + pos_scale_defs +
## (1 | seedword)
## Data: wj_coeffs_t1_alldata
##
## REML criterion at convergence: 1368.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9418 -0.5878 -0.1166 0.5295 3.0427
##
## Random effects:
## Groups Name Variance Std.Dev.
## seedword (Intercept) 0.2165 0.4653
## Residual 4.2395 2.0590
## Number of obs: 315, groups: seedword, 72
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.4757 1.1218 267.7837 0.424 0.671871
## mean_helpfulness -0.7318 0.2107 274.8521 -3.473 0.000598 ***
## aoa 0.2664 0.1016 289.6401 2.621 0.009222 **
## pos_scale_synsets 0.5105 0.1843 294.3292 2.770 0.005957 **
## pos_scale_defs -0.5495 0.1807 300.1460 -3.042 0.002560 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) mn_hlp aoa ps_scl_s
## mn_hlpflnss -0.913
## aoa -0.899 0.666
## ps_scl_syns 0.038 -0.122 0.057
## pos_scl_dfs -0.038 0.052 0.022 -0.684
# Compare the full (mod1) and reduced (mod2) WJ models; anova() refits both
# with ML before the chi-square test.
anova(wj_t1_mod1, wj_t1_mod2)
## refitting model(s) with ML (instead of REML)
## Data: wj_coeffs_t1_alldata
## Models:
## wj_t1_mod2: Estimate ~ mean_helpfulness + aoa + pos_scale_synsets + pos_scale_defs +
## wj_t1_mod2: (1 | seedword)
## wj_t1_mod1: Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +
## wj_t1_mod1: aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets +
## wj_t1_mod1: pos_scale_defs + (1 | seedword)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## wj_t1_mod2 7 1371.4 1397.7 -678.72 1357.4
## wj_t1_mod1 11 1377.1 1418.3 -677.53 1355.1 2.3634 4 0.6693
Does hypernymy predict coefficients? Again, yes, but the effect goes away when you account for other word features.
# Attach the word-level features in word_info to the per-word ASB5
# coefficients (word knowledge at timepoint 3).
sb5_coeffs_t3 <- word_coeffs_asb5_t3 %>%
  rename(word = item) %>%
  left_join(word_info, by = "word")
# Keep only words with complete data on every predictor used in the models
# fitted to this data frame, so the simple, full and reduced models all use
# the same rows. adult_log_freq and aoa are included because the full model
# uses them too.
sb5_coeffs_t3_alldata <- sb5_coeffs_t3 %>%
  drop_na(
    pos_scale_hypernyms, pos_scale_synsets, pos_scale_defs,
    mean_helpfulness, mean_preschoolness, mean_parent_importance,
    adult_log_freq, aoa
  )
sb5_t3_mod1 <- lmer(Estimate ~ pos_scale_hypernyms + (1 | seedword), data = sb5_coeffs_t3_alldata)
summary(sb5_t3_mod1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Estimate ~ pos_scale_hypernyms + (1 | seedword)
## Data: sb5_coeffs_t3_alldata
##
## REML criterion at convergence: 1169
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.2263 -0.6432 -0.1484 0.3050 2.6922
##
## Random effects:
## Groups Name Variance Std.Dev.
## seedword (Intercept) 0.1958 0.4425
## Residual 2.2955 1.5151
## Number of obs: 312, groups: seedword, 73
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.33421 0.10804 48.45794 3.094 0.00328 **
## pos_scale_hypernyms -0.19512 0.08977 297.00388 -2.174 0.03053 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## ps_scl_hypr 0.063
# Full model: hypernymy plus the other word-level features, with a random
# intercept for seedword.
sb5_t3_mod2 <- lmer(
  Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + aoa +
    mean_preschoolness + mean_parent_importance + pos_scale_synsets +
    pos_scale_defs + (1 | seedword),
  data = sb5_coeffs_t3_alldata
)
summary(sb5_t3_mod2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +
## aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets +
## pos_scale_defs + (1 | seedword)
## Data: sb5_coeffs_t3_alldata
##
## REML criterion at convergence: 1161.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.3071 -0.5888 -0.1379 0.3462 2.7279
##
## Random effects:
## Groups Name Variance Std.Dev.
## seedword (Intercept) 0.1538 0.3922
## Residual 2.1793 1.4762
## Number of obs: 312, groups: seedword, 73
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.86239 0.94345 277.49245 -0.914 0.3615
## pos_scale_hypernyms -0.09670 0.09248 281.64576 -1.046 0.2966
## mean_helpfulness 0.29368 0.19736 291.53024 1.488 0.1378
## adult_log_freq 0.17937 0.07810 299.76038 2.297 0.0223 *
## aoa -0.04973 0.07623 298.17808 -0.652 0.5147
## mean_preschoolness -0.12274 0.14049 262.69461 -0.874 0.3831
## mean_parent_importance -0.06219 0.15645 282.79343 -0.398 0.6913
## pos_scale_synsets -0.25425 0.13899 293.89989 -1.829 0.0684 .
## pos_scale_defs 0.17298 0.13387 297.30342 1.292 0.1973
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ps_scl_h mn_hlp adlt__ aoa mn_prs mn_pr_ ps_scl_s
## ps_scl_hypr -0.134
## mn_hlpflnss -0.405 0.047
## adlt_lg_frq -0.314 0.059 -0.332
## aoa -0.867 0.030 0.389 0.247
## mn_prschlns -0.119 0.073 -0.424 0.046 0.084
## mn_prnt_mpr -0.281 0.089 -0.214 -0.187 0.020 0.002
## ps_scl_syns 0.141 0.060 -0.009 -0.195 -0.006 0.063 -0.119
## pos_scl_dfs -0.128 0.097 -0.020 0.014 0.053 0.028 0.206 -0.669
# Reduced model: only frequency and the pos-scaled synset measure.
sb5_t3_mod3 <- lmer(
  Estimate ~ adult_log_freq + pos_scale_synsets + (1 | seedword),
  data = sb5_coeffs_t3_alldata
)
summary(sb5_t3_mod3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Estimate ~ adult_log_freq + pos_scale_synsets + (1 | seedword)
## Data: sb5_coeffs_t3_alldata
##
## REML criterion at convergence: 1154.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.5857 -0.5677 -0.1680 0.3554 2.7772
##
## Random effects:
## Groups Name Variance Std.Dev.
## seedword (Intercept) 0.1742 0.4173
## Residual 2.1772 1.4755
## Number of obs: 312, groups: seedword, 73
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -1.26052 0.34991 266.94695 -3.602 0.000376 ***
## adult_log_freq 0.27246 0.05666 302.59162 4.808 2.4e-06 ***
## pos_scale_synsets -0.10065 0.10155 308.83382 -0.991 0.322371
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) adlt__
## adlt_lg_frq -0.955
## ps_scl_syns 0.360 -0.383
# Compare the full (mod2) and reduced (mod3) ASB5 models; anova() refits both
# with ML before the chi-square test.
anova(sb5_t3_mod2, sb5_t3_mod3)
## refitting model(s) with ML (instead of REML)
## Data: sb5_coeffs_t3_alldata
## Models:
## sb5_t3_mod3: Estimate ~ adult_log_freq + pos_scale_synsets + (1 | seedword)
## sb5_t3_mod2: Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +
## sb5_t3_mod2: aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets +
## sb5_t3_mod2: pos_scale_defs + (1 | seedword)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## sb5_t3_mod3 5 1154.8 1173.5 -572.39 1144.8
## sb5_t3_mod2 11 1159.0 1200.2 -568.50 1137.0 7.7788 6 0.2548
Here, hypernymy remains a significant predictor even after accounting for other word characteristics. But why is higher hypernymy predicting better WJ performance? (we would expect lower-hypernym words to be more helpful…)
# Attach the word-level features in word_info to the per-word WJ coefficients
# (word knowledge at timepoint 3).
wj_coeffs_t3 <- word_coeffs_wj_t3 %>%
  rename(word = item) %>%
  left_join(word_info, by = "word")
# Keep only words with complete data on every predictor used in the models
# fitted to this data frame, so the simple, full and reduced models all use
# the same rows. adult_log_freq and aoa are included because the full model
# uses them too.
wj_coeffs_t3_alldata <- wj_coeffs_t3 %>%
  drop_na(
    pos_scale_hypernyms, pos_scale_synsets, pos_scale_defs,
    mean_helpfulness, mean_preschoolness, mean_parent_importance,
    adult_log_freq, aoa
  )
# hypernymy still matters here after accounting for frequency & aoa, and helpfulness
wj_t3_mod1 <- lmer(Estimate ~ pos_scale_hypernyms + (1 | seedword), data = wj_coeffs_t3_alldata)
summary(wj_t3_mod1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Estimate ~ pos_scale_hypernyms + (1 | seedword)
## Data: wj_coeffs_t3_alldata
##
## REML criterion at convergence: 1374.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.2916 -0.6591 0.0058 0.6458 4.4526
##
## Random effects:
## Groups Name Variance Std.Dev.
## seedword (Intercept) 0.6227 0.7891
## Residual 4.3116 2.0764
## Number of obs: 312, groups: seedword, 73
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.2470 0.1629 45.5491 -1.517 0.136
## pos_scale_hypernyms 0.5783 0.1258 306.2179 4.598 6.24e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## ps_scl_hypr 0.079
# Full model: hypernymy plus the other word-level features, with a random
# intercept for seedword.
wj_t3_mod2 <- lmer(
  Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + aoa +
    mean_preschoolness + mean_parent_importance + pos_scale_synsets +
    pos_scale_defs + (1 | seedword),
  data = wj_coeffs_t3_alldata
)
summary(wj_t3_mod2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +
## aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets +
## pos_scale_defs + (1 | seedword)
## Data: wj_coeffs_t3_alldata
##
## REML criterion at convergence: 1350.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9710 -0.6479 -0.0584 0.5909 4.1701
##
## Random effects:
## Groups Name Variance Std.Dev.
## seedword (Intercept) 0.4075 0.6384
## Residual 3.9912 1.9978
## Number of obs: 312, groups: seedword, 73
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -3.72180 1.29527 291.51949 -2.873 0.004359 **
## pos_scale_hypernyms 0.46110 0.12693 291.61947 3.633 0.000331 ***
## mean_helpfulness -0.49188 0.27051 298.07418 -1.818 0.070018 .
## adult_log_freq 0.23923 0.10688 301.81065 2.238 0.025940 *
## aoa 0.42644 0.10433 302.56631 4.087 5.6e-05 ***
## mean_preschoolness 0.06059 0.19323 277.83753 0.314 0.754105
## mean_parent_importance 0.31968 0.21467 293.84116 1.489 0.137517
## pos_scale_synsets -0.11762 0.19051 296.79836 -0.617 0.537425
## pos_scale_defs -0.03673 0.18341 297.90644 -0.200 0.841405
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ps_scl_h mn_hlp adlt__ aoa mn_prs mn_pr_ ps_scl_s
## ps_scl_hypr -0.131
## mn_hlpflnss -0.407 0.043
## adlt_lg_frq -0.313 0.061 -0.334
## aoa -0.866 0.029 0.391 0.246
## mn_prschlns -0.116 0.076 -0.423 0.044 0.084
## mn_prnt_mpr -0.285 0.089 -0.208 -0.182 0.021 -0.006
## ps_scl_syns 0.138 0.064 -0.013 -0.187 -0.005 0.066 -0.121
## pos_scl_dfs -0.127 0.092 -0.016 0.010 0.054 0.025 0.203 -0.671
# Reduced model: hypernymy, helpfulness, frequency and AoA.
wj_t3_mod3 <- lmer(
  Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + aoa +
    (1 | seedword),
  data = wj_coeffs_t3_alldata
)
summary(wj_t3_mod3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +
## aoa + (1 | seedword)
## Data: wj_coeffs_t3_alldata
##
## REML criterion at convergence: 1347.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9986 -0.6868 -0.0379 0.5747 4.2003
##
## Random effects:
## Groups Name Variance Std.Dev.
## seedword (Intercept) 0.3643 0.6036
## Residual 4.0152 2.0038
## Number of obs: 312, groups: seedword, 73
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -3.0414 1.2216 295.1326 -2.490 0.013334 *
## pos_scale_hypernyms 0.4658 0.1238 293.6191 3.762 0.000203 ***
## mean_helpfulness -0.3618 0.2377 291.6385 -1.522 0.129125
## adult_log_freq 0.2401 0.1013 289.9098 2.369 0.018482 *
## aoa 0.4279 0.1037 305.9709 4.126 4.75e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ps_sc_ mn_hlp adlt__
## ps_scl_hypr -0.116
## mn_hlpflnss -0.620 0.103
## adlt_lg_frq -0.373 0.117 -0.419
## aoa -0.905 0.012 0.489 0.268
# Compare the full (mod2) and reduced (mod3) WJ models; anova() refits both
# with ML before the chi-square test.
anova(wj_t3_mod2, wj_t3_mod3)
## refitting model(s) with ML (instead of REML)
## Data: wj_coeffs_t3_alldata
## Models:
## wj_t3_mod3: Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +
## wj_t3_mod3: aoa + (1 | seedword)
## wj_t3_mod2: Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +
## wj_t3_mod2: aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets +
## wj_t3_mod2: pos_scale_defs + (1 | seedword)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## wj_t3_mod3 7 1349.8 1376.0 -667.91 1335.8
## wj_t3_mod2 11 1354.0 1395.2 -666.00 1332.0 3.8106 4 0.4323
# Total words said / understood per child and timepoint, spread to one column
# per measure and timepoint (total_says_1, total_says_3, ...).
child_vocab_summary <- vocab_survey_data %>%
  group_by(subjCode, timepoint) %>%
  summarise(
    total_says = sum(says),
    total_understands = sum(understands)
  ) %>%
  pivot_wider(
    names_from = timepoint,
    values_from = c(total_says, total_understands)
  )

# Survey responses joined to the vocabulary totals, with numeric recodes of
# the parent-belief questions and t3 - t1 vocabulary growth scores.
exp_survey_qs <- non_vocab_survey_data %>%
  left_join(child_vocab_summary, by = "subjCode") %>%
  # numeric recodes of the categorical survey answers
  mutate(
    vocab_predict_academics_num = ifelse(vocab_predict_academics == "No", 0, 1),
    importance_of_reading_num = case_when(
      importance_of_reading == "Very important" ~ 4,
      importance_of_reading == "Extremely important" ~ 5
    ),
    parent_vocab_knowledge_num = case_when(
      parent_vocab_knowledge == "Below average" ~ 0,
      parent_vocab_knowledge == "Average" ~ 1,
      parent_vocab_knowledge == "Above average" ~ 2
    ),
    # 1 = "Above average", 0 = the other two coded answers
    parent_vocab_knowledge_binary = ifelse(parent_vocab_knowledge_num == 2, 1, 0),
    bilingualism_bad_num = ifelse(bilingualism_bad == "No", 0, 1)
  ) %>%
  # growth between timepoints 1 and 3
  mutate(
    productive_vocab_growth = total_says_3 - total_says_1,
    receptive_vocab_growth = total_understands_3 - total_understands_1
  )
parent_vocab_knowledge_num: Do you think your vocabulary is below average, average, or above average? (coded as 0, 1, 2, respectively)
parent_vocab_knowledge_binary: Recoded to group parents into 2 groups (below/average, and above average) as there was only one “below average” response
vocab_predict_academics_num: Do you think early vocabulary predicts later academics? (0 = no, 1 = yes)
bilingualism_bad_num: Do you think the costs of bilingualism outweigh the benefits? (0 = no, 1 = yes)
est_threeyo_vocab: How many words do you think the average 3-year-old knows?
# Distribution of each numeric survey variable described above.
hist(exp_survey_qs$parent_vocab_knowledge_num)
hist(exp_survey_qs$vocab_predict_academics_num)
hist(exp_survey_qs$bilingualism_bad_num)
hist(exp_survey_qs$est_threeyo_vocab)
Unfortunately, the range of these variables is too small to look at any correlations, but we can look at survey responses as categorical predictors:
### Does child vocab size or growth relate to parent self-report vocab?
# Words said at timepoint 1 ~ child age + parent vocab group
# (binary: 1 = "Above average").
summary(lm(total_says_1 ~ childAge_days + parent_vocab_knowledge_binary, data = exp_survey_qs))
##
## Call:
## lm(formula = total_says_1 ~ childAge_days + parent_vocab_knowledge_binary,
## data = exp_survey_qs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -116.55 -44.97 16.55 34.09 102.20
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.71851 40.85160 -1.070 0.2913
## childAge_days 0.15249 0.02627 5.804 1.06e-06 ***
## parent_vocab_knowledge_binary 31.29472 16.35868 1.913 0.0633 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.97 on 38 degrees of freedom
## Multiple R-squared: 0.507, Adjusted R-squared: 0.481
## F-statistic: 19.54 on 2 and 38 DF, p-value: 1.46e-06
# Same predictors, outcome = words said at timepoint 3.
summary(lm(total_says_3 ~ childAge_days + parent_vocab_knowledge_binary, data = exp_survey_qs))
##
## Call:
## lm(formula = total_says_3 ~ childAge_days + parent_vocab_knowledge_binary,
## data = exp_survey_qs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -139.879 -26.389 7.452 30.603 90.400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.93243 38.84877 1.157 0.2547
## childAge_days 0.11508 0.02498 4.606 4.51e-05 ***
## parent_vocab_knowledge_binary 29.57490 15.55666 1.901 0.0649 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48.47 on 38 degrees of freedom
## Multiple R-squared: 0.4078, Adjusted R-squared: 0.3766
## F-statistic: 13.08 on 2 and 38 DF, p-value: 4.751e-05
# Same predictors, outcome = productive vocab growth (total_says_3 - total_says_1).
summary(lm(productive_vocab_growth ~ childAge_days + parent_vocab_knowledge_binary, data = exp_survey_qs))
##
## Call:
## lm(formula = productive_vocab_growth ~ childAge_days + parent_vocab_knowledge_binary,
## data = exp_survey_qs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.314 -13.368 0.255 13.981 52.404
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88.65094 18.00202 4.924 1.68e-05 ***
## childAge_days -0.03742 0.01158 -3.232 0.00254 **
## parent_vocab_knowledge_binary -1.71982 7.20876 -0.239 0.81272
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.46 on 38 degrees of freedom
## Multiple R-squared: 0.2191, Adjusted R-squared: 0.178
## F-statistic: 5.33 on 2 and 38 DF, p-value: 0.009113
# Words said at timepoint 1 ~ child age + parent belief that early vocabulary
# predicts later academics (0 = "No", 1 = otherwise).
summary(lm(total_says_1 ~ childAge_days + vocab_predict_academics_num, data = exp_survey_qs))
##
## Call:
## lm(formula = total_says_1 ~ childAge_days + vocab_predict_academics_num,
## data = exp_survey_qs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -134.07 -34.49 10.08 36.10 114.79
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -28.65394 54.47749 -0.526 0.602
## childAge_days 0.15563 0.02921 5.329 4.74e-06 ***
## vocab_predict_academics_num -0.83306 23.57496 -0.035 0.972
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53.37 on 38 degrees of freedom
## Multiple R-squared: 0.4595, Adjusted R-squared: 0.4311
## F-statistic: 16.15 on 2 and 38 DF, p-value: 8.372e-06
# Same predictors, outcome = words said at timepoint 3.
summary(lm(total_says_3 ~ childAge_days + vocab_predict_academics_num, data = exp_survey_qs))
##
## Call:
## lm(formula = total_says_3 ~ childAge_days + vocab_predict_academics_num,
## data = exp_survey_qs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -156.30 -18.04 7.27 31.18 102.70
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.48180 51.73324 0.956 0.345
## childAge_days 0.12084 0.02773 4.357 9.66e-05 ***
## vocab_predict_academics_num 5.81733 22.38740 0.260 0.796
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.68 on 38 degrees of freedom
## Multiple R-squared: 0.3526, Adjusted R-squared: 0.3186
## F-statistic: 10.35 on 2 and 38 DF, p-value: 0.0002581
# Same predictors, outcome = productive vocab growth (total_says_3 - total_says_1).
summary(lm(productive_vocab_growth ~ childAge_days + vocab_predict_academics_num, data = exp_survey_qs))
##
## Call:
## lm(formula = productive_vocab_growth ~ childAge_days + vocab_predict_academics_num,
## data = exp_survey_qs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.162 -11.870 -0.994 13.728 51.716
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78.13574 22.80954 3.426 0.00149 **
## childAge_days -0.03479 0.01223 -2.845 0.00712 **
## vocab_predict_academics_num 6.65039 9.87076 0.674 0.50455
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.35 on 38 degrees of freedom
## Multiple R-squared: 0.2271, Adjusted R-squared: 0.1865
## F-statistic: 5.584 on 2 and 38 DF, p-value: 0.007483
# Welch two-sample t-test: do parent importance ratings differ between the
# two word types (control vs. seed)?
t.test(mean_parent_importance ~ as.factor(type), data = word_info)
##
## Welch Two Sample t-test
##
## data: mean_parent_importance by as.factor(type)
## t = -3.1043, df = 119.15, p-value = 0.002383
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5176086 -0.1144512
## sample estimates:
## mean in group control mean in group seed
## 3.048542 3.364572
Some nice convergent validity here - e.g. words that Turkers think are helpful are also rated by parents to be important to teach.
# Correlations among the four word-rating measures. Coefficients are printed
# in the lower triangle; cells are blanked (insig = "blank") based on the
# cor.mtest() p-values.
word_info_forcorr <- word_info %>%
  select(
    mean_helpfulness, mean_babiness,
    mean_preschoolness, mean_parent_importance
  )
info_data_corrmat <- cor(word_info_forcorr, use = "pairwise.complete.obs")
info_pmat <- cor.mtest(word_info_forcorr)
info_sig_matrix <- info_pmat[["p"]]
corrplot(
  corr = info_data_corrmat,
  method = "color",
  type = "lower",
  diag = FALSE,
  addCoef.col = "black",
  number.cex = .5,
  p.mat = info_sig_matrix,
  insig = "blank",
  tl.col = "black",
  tl.srt = 45,
  tl.cex = .8
)