# Parent education/income and child demographics, keyed by subjCode.
SES_data <- read_csv(file = "SES_data.csv")
## Parsed with column specification:
## cols(
##   subjCode = col_double(),
##   parentEd = col_character(),
##   parentIncome = col_character(),
##   childGender = col_character(),
##   childRace = col_character(),
##   childHispanic = col_character(),
##   parentEd_num = col_double(),
##   parentIncome_num = col_double(),
##   parentEd_z = col_double(),
##   parentIncome_z = col_double()
## )
# Wide-format task and vocabulary totals; every column is parsed as numeric.
task_data <- read_csv(file = "combined_data_wide.csv")
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
# Replace the column names of task_data positionally: the i-th name below is
# assigned to the i-th column of the file as read.
task_data <- setNames(
  task_data,
  c(
    "subjCode",
    "childAge_days_t1", "childAge_days_t3",
    "childAge_months_t1", "childAge_months_t3",
    "childAge_years_t1", "childAge_years_t3",
    "subjCode_lab_t3",
    "ASB5", "Color", "ppvt_overlapping_survey", "Shape", "WJ", "WJ_scaled",
    "ppvt_first_set", "ppvt_n_correct", "ppvt_n_attempted",
    "ppvt_standard_score", "ppvt_percentile", "ppvt_prop_correct",
    "total_understands_control_t1", "total_understands_control_t3",
    "total_understands_seed_t1", "total_understands_seed_t3",
    "total_says_control_t1", "total_says_control_t3",
    "total_says_seed_t1", "total_says_seed_t3"
  )
)

# Non-vocabulary survey responses, keyed by subjCode (also carries a timepoint column).
non_vocab_survey_data <- read_csv(file = "non_vocab_survey_data.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   subjCode = col_double(),
##   timepoint = col_double(),
##   parentAge = col_logical(),
##   childNumSibs = col_double(),
##   childBirthOrder = col_double(),
##   recordedDate = col_date(format = ""),
##   childAge_days = col_double(),
##   surgency_score = col_double(),
##   LangUse_sum = col_double(),
##   child_L2 = col_logical(),
##   percent_english = col_double(),
##   percent_L2 = col_logical(),
##   age_began_daycare = col_logical(),
##   age_began_preschool = col_double(),
##   age_began_formal_school = col_logical(),
##   est_threeyo_vocab = col_double()
## )
## See spec(...) for full column specifications.
# Attach SES and then non-vocabulary survey columns to the task data,
# left-joining each table in turn on subjCode.
current_task_data <- Reduce(
  function(joined, next_table) left_join(joined, next_table, by = "subjCode"),
  list(SES_data, non_vocab_survey_data),
  task_data
)

# Word-level vocabulary survey responses; rows without a subject code are dropped.
vocab_survey_data <- read_csv(file = "vocab_data_by_word.csv") %>%
  filter(!is.na(subjCode))
## Parsed with column specification:
## cols(
##   subjCode = col_double(),
##   timepoint = col_double(),
##   childAge_days = col_double(),
##   word = col_character(),
##   understands = col_double(),
##   says = col_double(),
##   importance_to_teach = col_character(),
##   seedword = col_character(),
##   type = col_character(),
##   pos = col_character(),
##   num_hypernyms = col_double(),
##   num_hyponyms = col_double(),
##   aoa = col_double(),
##   pos_scale_hypernyms = col_double(),
##   pos_scale_log_hyponyms = col_double()
## )
# Word-level vocabulary data combined with task data.
vocab_tasks_byword <- read_csv(file = "vocab_task_data_by_word.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   word = col_character(),
##   importance_to_teach = col_character(),
##   seedword = col_character(),
##   type = col_character(),
##   pos = col_character()
## )
## See spec(...) for full column specifications.
# Per-word mean of parents' "importance to teach" ratings.
# The five rating labels are mapped to 1-5 by their position in the label
# vector; any label not listed maps to NA.
word_importance <- vocab_survey_data %>%
  select(-seedword) %>%
  distinct() %>%
  filter(!is.na(importance_to_teach)) %>%
  mutate(
    importance_num = as.numeric(match(
      importance_to_teach,
      c(
        "Not at all important",
        "Slightly important",
        "Moderately important",
        "Very important",
        "Extremely important"
      )
    ))
  ) %>%
  group_by(word) %>%
  summarise(
    mean_parent_importance = mean(importance_num),
    n_importance_ratings = n()
  )

# Helpfulness norms: one mean rating per word (rows sharing a word are averaged).
helpfulness <- read_csv(file = "../word_norms/helpfulness_ratings.csv") %>%
  group_by(word) %>%
  summarise(mean_helpfulness = mean(resp_mean))
## Parsed with column specification:
## cols(
##   word = col_character(),
##   resp_mean = col_double(),
##   resp_sd = col_double(),
##   num_resp = col_double()
## )
# Babiness / preschoolness norms: keep only the word and the two mean ratings.
babiness_preschoolness <- read_csv(file = "../word_norms/babiness_ratings.csv") %>%
  transmute(
    word,
    mean_babiness = babiness_mean,
    mean_preschoolness = preschoolness_mean
  )
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   word = col_character(),
##   babiness_mean = col_double(),
##   babiness_sd = col_double(),
##   preschoolness_mean = col_double(),
##   preschoolness_sd = col_double(),
##   num_resp = col_double()
## )
# Number of synsets and definitions per word and part of speech.
defs_synsets <- read_csv(file = "../word_norms/all_words_synsets_defs.csv")
## Parsed with column specification:
## cols(
##   word = col_character(),
##   pos = col_character(),
##   n_synsets = col_double(),
##   n_definitions = col_double()
## )
# Words from the no_generality file; the rating and item-id columns are dropped.
no_generality <- read_csv(file = "../word_norms/old_files/no_generality.csv") %>%
  select(-c(mean_generality, num_item_id))
## Parsed with column specification:
## cols(
##   num_item_id = col_character(),
##   word = col_character(),
##   pos = col_character(),
##   mean_generality = col_logical()
## )
# Generality ratings by word, keyed by num_item_id.
generality_cdi_childes <- read_csv(
  file = "../word_norms/generality_ratings_byWord_cdi_childes.csv"
)
## Parsed with column specification:
## cols(
##   word = col_character(),
##   num_item_id = col_character(),
##   mean_generality = col_double(),
##   n_generality_ratings = col_double()
## )
# Generality ratings from the newer ratings file.
# `word2` becomes the join key `word`; the original `word` and `pos` columns are
# kept under descriptive names, and a one-letter `pos` code is derived
# ("noun" -> "n", "verb" -> "v", anything else -> NA).
generality_otherwords <- read_csv(file = "../word_norms/new_generality_ratings.csv") %>%
  rename(
    word_descriptive = word,
    word = word2,
    pos_fullword = pos
  ) %>%
  mutate(pos = unname(c(noun = "n", verb = "v")[pos_fullword]))
## Parsed with column specification:
## cols(
##   word2 = col_character(),
##   word = col_character(),
##   pos = col_character(),
##   mean_generality = col_double(),
##   n_gen_ratings = col_double()
## )
# Fill in a part of speech for the newer generality ratings: use the code derived
# from the ratings file when present, otherwise the one from no_generality.
generality_otherwords_getpos <- generality_otherwords %>%
  left_join(no_generality, by = "word") %>%
  transmute(
    word,
    pos = coalesce(pos.x, pos.y),
    mean_generality,
    n_generality_ratings = n_gen_ratings
  ) %>%
  distinct()

# Master word-level table: lexical properties from word_info_with_cdi_ids.csv
# joined with the synset/definition counts, helpfulness, babiness/preschoolness,
# parent-importance and generality norms built above.
word_info <- read_csv("../word_norms/word_info_with_cdi_ids.csv") %>%
  select(num_item_id, word, seedword, type, pos, num_hypernyms, num_hyponyms, aoa, adult_log_freq) %>%
  distinct() %>%
  left_join(defs_synsets, by = c("word", "pos")) %>%
  left_join(helpfulness, by = "word") %>%
  left_join(babiness_preschoolness, by = "word") %>%
  left_join(word_importance, by = "word") %>%
  # This join is keyed on num_item_id, so both tables contribute a `word`
  # column; keep the left-hand one under its original name.
  left_join(generality_cdi_childes, by = "num_item_id") %>%
  select(-word.y, word = word.x) %>%
  left_join(generality_otherwords_getpos, by = c("word", "pos")) %>%
  # Prefer the ratings joined by num_item_id; fall back to the newer ratings.
  mutate(n_generality_ratings = coalesce(n_generality_ratings.x, n_generality_ratings.y),
         mean_generality = coalesce(mean_generality.x, mean_generality.y)) %>%
  select(-n_generality_ratings.x, -n_generality_ratings.y, -mean_generality.x, -mean_generality.y) %>%
  # Standardise within part of speech. scale() returns a one-column matrix, so
  # wrap it in as.numeric() to store ordinary numeric columns in the tibble.
  group_by(pos) %>%
  mutate(pos_scale_hypernyms = as.numeric(scale(num_hypernyms)),
         pos_scale_log_hyponyms = as.numeric(scale(log(1 + num_hyponyms))),
         pos_scale_synsets = as.numeric(scale(n_synsets)),
         pos_scale_defs = as.numeric(scale(n_definitions)),
         pos_scale_generality = as.numeric(scale(mean_generality))) %>%
  ungroup()
## Parsed with column specification:
## cols(
##   word = col_character(),
##   num_item_id = col_character(),
##   seedword = col_character(),
##   pos = col_character(),
##   control_def = col_character(),
##   child_know = col_double(),
##   kid_log_freq = col_double(),
##   adult_log_freq = col_double(),
##   num_peers = col_double(),
##   num_hypernyms = col_double(),
##   type = col_character(),
##   aoa = col_double(),
##   FREQcount = col_double(),
##   logFreq_subtlex = col_double(),
##   num_hyponyms = col_double(),
##   log_hyponyms = col_double()
## )
# Per-child, per-timepoint averages of word properties over the words the child
# is reported to say (says == 1), spread to one row per child with _t<timepoint>
# suffixes. aoa and pos_scale_hypernyms come from vocab_survey_data; the other
# properties come from word_info.
child_mean_word_info <- word_info %>%
  select(word, mean_helpfulness, mean_generality, pos_scale_generality, adult_log_freq) %>%
  left_join(vocab_survey_data, by = "word") %>%
  distinct() %>%
  # Rows with NA in `says` (including word_info words absent from the survey)
  # are dropped here as well.
  filter(says == 1) %>%
  group_by(subjCode, timepoint) %>%
  summarize(mean_helpfulness_of_vocab = mean(mean_helpfulness, na.rm = TRUE),
            mean_aoa_of_vocab = mean(aoa, na.rm = TRUE),
            mean_freq_of_vocab = mean(adult_log_freq, na.rm = TRUE),
            mean_hypernymy_of_vocab = mean(pos_scale_hypernyms, na.rm = TRUE),
            mean_generality_of_vocab_unscaled = mean(mean_generality, na.rm = TRUE),
            mean_generality_of_vocab = mean(pos_scale_generality, na.rm = TRUE)) %>%
  # summarize() only peels off the last grouping variable; drop the remaining
  # subjCode grouping so the result is a plain, ungrouped table.
  ungroup() %>%
  pivot_wider(names_from = timepoint, names_prefix = "t",
              values_from = mean_helpfulness_of_vocab:mean_generality_of_vocab)

1 Variables/codebook

The goal of these analyses is to investigate how 2-4yo’s vocabulary knowledge might predict performance on cognitive tasks. We investigate vocabulary at the composite level (e.g., how many total seed/control words) as well as individual word knowledge to identify which specific words might be most helpful for kids to know.

t1: first timepoint, when parents did online survey (summer 2019)
t3: third timepoint, when parents brought kids into lab (fall 2019/winter 2020)
(t2 is online language use survey, summer 2019, not included here because we didn’t ask about vocab)

ASB5: number of items child got correct on our adapted SB-5 – max of 12
WJ: number of items child got correct on Woodcock-Johnson Concept Formation test – max of 40
Color: number of items child got correct on productive color test – max of 5
Shape: number of items child got correct on productive shape test – max of 8
ppvt_overlapping_survey: number of items with same response (knows or doesn’t know) on PPVT (child behavioral measure) and parent report (survey) – max of 9
ppvt_first_set: number of starting set when child did PPVT
mean_helpfulness_of_vocab: mean helpfulness rating of all the words child is reported to know (helpfulness ratings provided by adults who were asked “How helpful would it be for a preschooler to know the word ____?”)
mean_aoa_of_vocab: mean age of acquisition of all the words child is reported to know (from Kuperman norms)
mean_freq_of_vocab: mean frequency of all the words child is reported to know (from adult speech in CHILDES, log-transformed)
mean_hypernymy_of_vocab: mean number of hypernyms (from WordNet, standardized within part of speech) of all the words child is reported to know
mean_generality_of_vocab: mean generality rating (standardized within part of speech) of all the words child is reported to know (generality ratings provided by adults on a scale of 1-5, 1 being most specific; currently have ratings for 133 of 371 words)
mean_parent_importance: for words that parents reported their children not knowing, we asked them to rate on a scale of 1-5 how important it would be to teach their child that word. This is used as a predictor in the word-level analyses.

2 Investigate distributions of behavioral measures

When kids came into the lab at T3, they completed a subset of trials from the Early SB-5, the Concept Formation test of the Woodcock-Johnson, a productive color vocabulary test, productive shape vocabulary test, and the PPVT. 9 items on the PPVT were also on the parent-report vocabulary survey, so we can look at how much overlap there is between children’s parent-reported vs. actual knowledge.

# One histogram per behavioral measure collected in the lab at T3.
hist(x = current_task_data$ASB5)

hist(x = current_task_data$WJ)

hist(x = current_task_data$Color)

hist(x = current_task_data$Shape)

hist(x = current_task_data$ppvt_overlapping_survey)

3 Investigate distributions of SES (Education range 1-7, Income range 1-4)

Unsurprisingly for a Madison sample, parent education and income are both high. This restricted range likely explains the lack of correlation between vocabulary and SES measures below.

# Distributions of the numeric parent education and income codes.
hist(x = current_task_data$parentEd_num)

hist(x = current_task_data$parentIncome_num)

4 Correlations among variables

4.1 Zero-order correlations

# Numeric columns used for the zero-order correlation matrix.
# avg_SES_z is the mean of the education and income z-scores. The join with
# child_mean_word_info contributes no selected columns here, but it keeps the
# rows in step with resid_task_data, which is built with the same join.
current_task_data_forcorr <- current_task_data %>%
  ungroup() %>%
  mutate(avg_SES_z = (parentEd_z + parentIncome_z) / 2) %>%
  left_join(child_mean_word_info, by = "subjCode") %>%
  select(
    # age and SES
    childAge_days_t1, childAge_days_t3, avg_SES_z,
    # behavioral tasks
    ASB5, WJ, Color, Shape,
    # PPVT
    ppvt_overlapping_survey, ppvt_standard_score, ppvt_percentile, ppvt_prop_correct,
    # parent-reported vocabulary totals at T1
    total_says_seed_t1, total_understands_seed_t1,
    total_says_control_t1, total_understands_control_t1,
    # parent-reported vocabulary totals at T3
    total_says_seed_t3, total_understands_seed_t3,
    total_says_control_t3, total_understands_control_t3
  )

# Pairwise-complete correlations and their p-values.
current_data_corrmat <- cor(x = current_task_data_forcorr, use = "pairwise.complete.obs")
pmat <- cor.mtest(current_task_data_forcorr)

sig_matrix <- pmat[["p"]]

# Lower triangle only; cells whose p-value is not significant are left blank.
corrplot(
  corr = current_data_corrmat,
  method = "color",
  type = "lower",
  diag = FALSE,
  addCoef.col = "black",
  p.mat = sig_matrix,
  insig = "blank",
  tl.srt = 45,
  tl.col = "black",
  tl.cex = .8,
  number.cex = .5
)

4.2 Regress age out of ASB5 and WJ; add other characteristics of kids’ vocabularies

ASB5.age: ASB5 performance, controlling for age
WJ.age: WJ performance, controlling for age

# Regress each task score on age at T3. na.exclude records any rows dropped for
# missing values so that residuals() can pad them back in as NA.
age_sb5 <- lm(ASB5 ~ childAge_days_t3, data = current_task_data_forcorr,
              na.action = na.exclude)
age_wj <- lm(WJ ~ childAge_days_t3, data = current_task_data_forcorr,
             na.action = na.exclude)

# Add the age-residualised scores and total productive vocabulary.
# residuals() returns one value per row of the model data (NA where the score or
# age was missing), whereas model$residuals silently omits those rows and would
# no longer line up with the table. The table below is built with the same join
# as current_task_data_forcorr, so the rows correspond one-to-one.
resid_task_data <- current_task_data %>%
  left_join(child_mean_word_info, by = "subjCode") %>%
  mutate(ASB5.age = residuals(age_sb5),
         WJ.age = residuals(age_wj),
         total_vocab_t1 = total_says_seed_t1 + total_says_control_t1,
         total_vocab_t3 = total_says_seed_t3 + total_says_control_t3)

# Columns for the second correlation matrix: age-residualised task scores, PPVT,
# vocabulary totals, and the mean vocabulary characteristics at each timepoint.
resid_task_data_forcorr <- select(
  resid_task_data,
  childAge_days_t3,
  ASB5.age, WJ.age,
  ppvt_prop_correct, ppvt_standard_score, ppvt_percentile,
  total_says_seed_t1, total_says_control_t1,
  total_says_seed_t3, total_says_control_t3,
  mean_helpfulness_of_vocab_t1, mean_aoa_of_vocab_t1, mean_freq_of_vocab_t1,
  mean_hypernymy_of_vocab_t1, mean_generality_of_vocab_t1,
  mean_helpfulness_of_vocab_t3, mean_aoa_of_vocab_t3, mean_freq_of_vocab_t3,
  mean_hypernymy_of_vocab_t3, mean_generality_of_vocab_t3
)

# Pairwise-complete correlations and p-values for the residualised data.
resid_data_corrmat <- cor(x = resid_task_data_forcorr, use = "pairwise.complete.obs")
resid_pmat <- cor.mtest(resid_task_data_forcorr)

resid_sig_matrix <- resid_pmat[["p"]]

# Lower triangle only; cells whose p-value is not significant are left blank.
corrplot(
  corr = resid_data_corrmat,
  method = "color",
  type = "lower",
  diag = FALSE,
  addCoef.col = "black",
  p.mat = resid_sig_matrix,
  insig = "blank",
  tl.srt = 45,
  tl.col = "black",
  tl.cex = .8,
  number.cex = .5
)

4.3 Compare vocabulary characteristics across seed/control words and timepoints

# Convert "says" counts to proportions so seed and control lists are comparable;
# the divisors are 72 for seed totals and 251 for control totals.
resid_task_data <- mutate(
  resid_task_data,
  prop_seed_known_t1 = total_says_seed_t1 / 72,
  prop_control_known_t1 = total_says_control_t1 / 251,
  prop_seed_known_t3 = total_says_seed_t3 / 72,
  prop_control_known_t3 = total_says_control_t3 / 251
)

# Seed vs. control proportion known at T1 (paired within child).
t.test(
  resid_task_data$prop_seed_known_t1,
  resid_task_data$prop_control_known_t1,
  paired = TRUE
)
## 
##  Paired t-test
## 
## data:  resid_task_data$prop_seed_known_t1 and resid_task_data$prop_control_known_t1
## t = 4.2335, df = 35, p-value = 0.0001581
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.03076256 0.08744978
## sample estimates:
## mean of the differences 
##              0.05910617
# Seed vs. control proportion known at T3 (paired within child).
t.test(
  resid_task_data$prop_seed_known_t3,
  resid_task_data$prop_control_known_t3,
  paired = TRUE
)
## 
##  Paired t-test
## 
## data:  resid_task_data$prop_seed_known_t3 and resid_task_data$prop_control_known_t3
## t = 2.0213, df = 35, p-value = 0.05095
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.0001465045  0.0674421368
## sample estimates:
## mean of the differences 
##              0.03364782
# Mean vocabulary hypernymy at T1 vs. T3 (paired within child).
t.test(
  resid_task_data$mean_hypernymy_of_vocab_t1,
  resid_task_data$mean_hypernymy_of_vocab_t3,
  paired = TRUE
)
## 
##  Paired t-test
## 
## data:  resid_task_data$mean_hypernymy_of_vocab_t1 and resid_task_data$mean_hypernymy_of_vocab_t3
## t = -0.40011, df = 35, p-value = 0.6915
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02398338  0.01608620
## sample estimates:
## mean of the differences 
##            -0.003948589
# Mean vocabulary generality at T1 vs. T3 (paired within child).
t.test(
  resid_task_data$mean_generality_of_vocab_t1,
  resid_task_data$mean_generality_of_vocab_t3,
  paired = TRUE
)
## 
##  Paired t-test
## 
## data:  resid_task_data$mean_generality_of_vocab_t1 and resid_task_data$mean_generality_of_vocab_t3
## t = -1.663, df = 35, p-value = 0.1052
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.048796655  0.004851023
## sample estimates:
## mean of the differences 
##             -0.02197282

At T1, kids know 72% of seed words and 66% of control words (expressed as proportions to account for the different numbers of seed and control words).
At T3, this gap has closed a bit and the difference is no longer statistically significant (kids know 79% of seed and 76% of control words).
Vocabulary hypernymy and generality don’t change across timepoints.

4.4 Look more into why ASB5-WJ correlation goes away when we control for age

# Raw and age-residualised ASB5 / WJ scores side by side, plus log(WJ + 1).
# NOTE(review): the residual vectors are attached positionally, so this relies
# on current_task_data having the same rows, in the same order, as the data the
# age models were fitted on, with no rows dropped for missing values.
asb5_wj_age <- current_task_data %>%
  mutate(ASB5.age = age_sb5$residuals,
         WJ.age = age_wj$residuals,
         log_WJ = log(WJ + 1)) %>%
  select(subjCode, ASB5, ASB5.age, WJ, WJ_scaled, log_WJ, WJ.age, childAge_days_t3)

# Select the plotted columns by name rather than by position so the plot does
# not silently change if the select() above is reordered.
ggpairs(asb5_wj_age, columns = c("ASB5", "ASB5.age", "WJ", "WJ.age"))

4.4.1 How do ASB5 and age predict WJ together?

# WJ predicted jointly from ASB5 and age at T3.
predict_wj <- lm(
  formula = WJ ~ ASB5 + childAge_days_t3,
  data = asb5_wj_age
)
summary(predict_wj)
## 
## Call:
## lm(formula = WJ ~ ASB5 + childAge_days_t3, data = asb5_wj_age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7223 -3.0578 -0.0357  1.6218 16.6507 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      -14.279986   4.173201  -3.422  0.00168 **
## ASB5               0.197919   0.426039   0.465  0.64530   
## childAge_days_t3   0.011434   0.003805   3.005  0.00505 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.557 on 33 degrees of freedom
## Multiple R-squared:  0.4213, Adjusted R-squared:  0.3862 
## F-statistic: 12.01 on 2 and 33 DF,  p-value: 0.0001205

4.4.2 How do WJ and age predict ASB5 together?

# ASB5 predicted jointly from WJ and age at T3.
predict_asb5 <- lm(
  formula = ASB5 ~ WJ + childAge_days_t3,
  data = asb5_wj_age
)
summary(predict_asb5)
## 
## Call:
## lm(formula = ASB5 ~ WJ + childAge_days_t3, data = asb5_wj_age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6545 -1.0931  0.3525  1.0771  3.4288 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.280224   1.965688  -0.651 0.519372    
## WJ                0.032828   0.070665   0.465 0.645300    
## childAge_days_t3  0.006088   0.001391   4.376 0.000115 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.856 on 33 degrees of freedom
## Multiple R-squared:  0.5336, Adjusted R-squared:  0.5053 
## F-statistic: 18.87 on 2 and 33 DF,  p-value: 3.429e-06

It seems like age is just a very strong predictor of both ASB5 and WJ, so after you control for age, performance on one measure doesn’t account for any additional variance in performance on the other.

5 Investigate aggregate vocabulary characteristics (mean hypernymy, etc.) as predictor of ASB5 performance

5.1 Characteristics of T1 vocabulary: estimates and model comparison

Issues with collinearity among predictors, so full model isn’t appropriate. The model that contains age, T1 vocab size, and mean T1 vocab hypernymy has the highest adjusted R-squared.

# all_t1_characteristics <- lm(ASB5 ~ childAge_days_t3 + total_vocab_t1 + mean_helpfulness_of_vocab_t1 + mean_aoa_of_vocab_t1 +
#                          mean_freq_of_vocab_t1 + mean_hypernymy_of_vocab_t1, data = resid_task_data)

# One model per T1 vocabulary characteristic, each alongside age at T3 and
# T1 vocabulary size.
t1voc_hyper <- lm(
  formula = ASB5 ~ childAge_days_t3 + total_vocab_t1 + mean_hypernymy_of_vocab_t1,
  data = resid_task_data
)
t1voc_aoa <- lm(
  formula = ASB5 ~ childAge_days_t3 + total_vocab_t1 + mean_aoa_of_vocab_t1,
  data = resid_task_data
)
t1voc_freq <- lm(
  formula = ASB5 ~ childAge_days_t3 + total_vocab_t1 + mean_freq_of_vocab_t1,
  data = resid_task_data
)
t1voc_help <- lm(
  formula = ASB5 ~ childAge_days_t3 + total_vocab_t1 + mean_helpfulness_of_vocab_t1,
  data = resid_task_data
)
t1voc_general <- lm(
  formula = ASB5 ~ childAge_days_t3 + total_vocab_t1 + mean_generality_of_vocab_t1,
  data = resid_task_data
)
# Unlike the models above, this one has no age term; it uses T1 generality,
# T1 vocabulary size, and vocabulary growth from T1 to T3.
t1vocchange_general <- lm(
  formula = ASB5 ~ mean_generality_of_vocab_t1 + total_vocab_t1 + I(total_vocab_t3 - total_vocab_t1),
  data = resid_task_data
)

tab_model(
  t1voc_help, t1voc_aoa, t1voc_freq, t1voc_hyper, t1voc_general,
  t1vocchange_general
)
  ASB 5 ASB 5 ASB 5 ASB 5 ASB 5 ASB 5
Predictors Estimates CI p Estimates CI p Estimates CI p Estimates CI p Estimates CI p Estimates CI p
(Intercept) -39.10 -97.00 – 18.79 0.178 15.49 -13.56 – 44.53 0.285 -55.15 -101.67 – -8.62 0.022 -1.80 -4.84 – 1.25 0.238 -2.10 -5.86 – 1.66 0.263 3.95 -0.90 – 8.80 0.107
childAge_days_t3 0.01 0.00 – 0.01 <0.001 0.01 0.00 – 0.01 <0.001 0.01 0.00 – 0.01 <0.001 0.00 0.00 – 0.01 <0.001 0.01 0.00 – 0.01 <0.001
total_vocab_t1 0.03 -0.02 – 0.07 0.214 0.02 -0.02 – 0.06 0.250 0.04 0.00 – 0.07 0.034 0.00 -0.01 – 0.02 0.449 0.00 -0.01 – 0.02 0.835 0.02 0.00 – 0.04 0.030
mean_helpfulness_of_vocab_t1 9.86 -5.40 – 25.12 0.198
mean_aoa_of_vocab_t1 -4.29 -11.47 – 2.88 0.232
mean_freq_of_vocab_t1 7.11 0.93 – 13.29 0.026
mean_hypernymy_of_vocab_t1 -18.57 -30.39 – -6.74 0.003
mean_generality_of_vocab_t1 -2.28 -12.11 – 7.55 0.640 -2.28 -15.19 – 10.63 0.722
total_vocab_t3 -
total_vocab_t1
-0.00 -0.04 – 0.04 0.885
Observations 36 36 36 36 36 36
R2 / R2 adjusted 0.555 / 0.513 0.551 / 0.509 0.599 / 0.562 0.644 / 0.611 0.534 / 0.490 0.197 / 0.122

5.2 Characteristics of T3 vocabulary: estimates and model comparison

Issues with collinearity among predictors, so full model isn’t appropriate. Models with mean frequency (+age+vocab) and mean hypernymy (+age+vocab) have the highest adjusted R-squared.

# all_t3_characteristics <- lm(ASB5 ~ childAge_days_t3 + total_vocab_t3 + mean_helpfulness_of_vocab_t3 + mean_aoa_of_vocab_t3 +
#                          mean_freq_of_vocab_t3 + mean_hypernymy_of_vocab_t3, data = resid_task_data)

# One model per T3 vocabulary characteristic, each alongside age at T3 and
# T3 vocabulary size.
t3voc_hyper <- lm(
  formula = ASB5 ~ childAge_days_t3 + total_vocab_t3 + mean_hypernymy_of_vocab_t3,
  data = resid_task_data
)
t3voc_aoa <- lm(
  formula = ASB5 ~ childAge_days_t3 + total_vocab_t3 + mean_aoa_of_vocab_t3,
  data = resid_task_data
)
t3voc_freq <- lm(
  formula = ASB5 ~ childAge_days_t3 + total_vocab_t3 + mean_freq_of_vocab_t3,
  data = resid_task_data
)
t3voc_help <- lm(
  formula = ASB5 ~ childAge_days_t3 + total_vocab_t3 + mean_helpfulness_of_vocab_t3,
  data = resid_task_data
)
t3voc_general <- lm(
  formula = ASB5 ~ childAge_days_t3 + total_vocab_t3 + mean_generality_of_vocab_t3,
  data = resid_task_data
)

tab_model(t3voc_help, t3voc_aoa, t3voc_freq, t3voc_hyper, t3voc_general)
  ASB 5 ASB 5 ASB 5 ASB 5 ASB 5
Predictors Estimates CI p Estimates CI p Estimates CI p Estimates CI p Estimates CI p
(Intercept) 19.02 -67.41 – 105.46 0.657 3.27 -30.91 – 37.45 0.847 -40.80 -110.95 – 29.34 0.245 -4.07 -8.74 – 0.61 0.086 -1.64 -5.53 – 2.25 0.396
childAge_days_t3 0.01 0.00 – 0.01 <0.001 0.01 0.00 – 0.01 <0.001 0.01 0.00 – 0.01 <0.001 0.01 0.00 – 0.01 <0.001 0.01 0.00 – 0.01 <0.001
total_vocab_t3 -0.01 -0.07 – 0.05 0.684 0.01 -0.04 – 0.05 0.704 0.03 -0.02 – 0.08 0.255 0.01 -0.01 – 0.02 0.453 -0.00 -0.02 – 0.02 0.958
mean_helpfulness_of_vocab_t3 -5.38 -27.49 – 16.73 0.624
mean_aoa_of_vocab_t3 -1.34 -10.05 – 7.36 0.755
mean_freq_of_vocab_t3 5.13 -4.12 – 14.38 0.267
mean_hypernymy_of_vocab_t3 -11.21 -26.45 – 4.03 0.144
mean_generality_of_vocab_t3 -7.23 -24.45 – 10.00 0.399
Observations 36 36 36 36 36
R2 / R2 adjusted 0.535 / 0.492 0.533 / 0.489 0.549 / 0.507 0.562 / 0.521 0.542 / 0.499

6 Investigate aggregate vocabulary knowledge as predictor of ASB5 performance

6.1 Vocabulary as predictors, controlling for age

Controlling for age in different ways (age at T3, age at T1, age at T3 & amount of time elapsed since T1), and looking at different aspects of vocabulary (e.g. knowledge at T3; growth from T1-T3).

# Seed words as a share of all words said (seed + control) at each timepoint.
current_task_data_forcorr <- mutate(
  current_task_data_forcorr,
  prop_seed_t1 = total_says_seed_t1 / (total_says_seed_t1 + total_says_control_t1),
  prop_seed_t3 = total_says_seed_t3 / (total_says_seed_t3 + total_says_control_t3)
)

# seed and control at t1
# ASB5 on age at T3 plus control words said at T1.
current_task_data_forcorr %>%
  lm(formula = ASB5 ~ childAge_days_t3 + total_says_control_t1, data = .) %>%
  summary()
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + total_says_control_t1, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8125 -1.0297  0.3963  0.9910  3.4542 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -1.7519817  1.6842834  -1.040    0.306    
## childAge_days_t3       0.0065471  0.0013050   5.017 1.76e-05 ***
## total_says_control_t1 -0.0004398  0.0079555  -0.055    0.956    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.862 on 33 degrees of freedom
## Multiple R-squared:  0.5305, Adjusted R-squared:  0.5021 
## F-statistic: 18.65 on 2 and 33 DF,  p-value: 3.813e-06
# ASB5 on age at T3 plus seed words said at T1.
current_task_data_forcorr %>%
  lm(formula = ASB5 ~ childAge_days_t3 + total_says_seed_t1, data = .) %>%
  summary()
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + total_says_seed_t1, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9032 -1.0060  0.1404  0.9911  3.8218 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -2.014685   1.727209  -1.166    0.252    
## childAge_days_t3    0.006022   0.001355   4.444 9.38e-05 ***
## total_says_seed_t1  0.019327   0.033764   0.572    0.571    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.853 on 33 degrees of freedom
## Multiple R-squared:  0.5351, Adjusted R-squared:  0.5069 
## F-statistic: 18.99 on 2 and 33 DF,  p-value: 3.244e-06
# seed and control at t3
# ASB5 on age at T3 plus control words said at T3.
current_task_data_forcorr %>%
  lm(formula = ASB5 ~ childAge_days_t3 + total_says_control_t3, data = .) %>%
  summary()
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + total_says_control_t3, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8207 -1.0704  0.3529  0.9740  3.5006 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -1.7924172  1.7875491  -1.003    0.323    
## childAge_days_t3       0.0064729  0.0012365   5.235 9.23e-06 ***
## total_says_control_t3  0.0004329  0.0083668   0.052    0.959    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.862 on 33 degrees of freedom
## Multiple R-squared:  0.5305, Adjusted R-squared:  0.5021 
## F-statistic: 18.65 on 2 and 33 DF,  p-value: 3.814e-06
# ASB5 on age at T3 plus seed words said at T3.
current_task_data_forcorr %>%
  lm(formula = ASB5 ~ childAge_days_t3 + total_says_seed_t3, data = .) %>%
  summary()
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + total_says_seed_t3, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7858 -1.1089  0.1156  0.9766  3.6684 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -3.876035   2.272839  -1.705   0.0975 .  
## childAge_days_t3    0.005876   0.001139   5.159 1.15e-05 ***
## total_says_seed_t3  0.054278   0.040542   1.339   0.1898    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.813 on 33 degrees of freedom
## Multiple R-squared:  0.5547, Adjusted R-squared:  0.5277 
## F-statistic: 20.55 on 2 and 33 DF,  p-value: 1.596e-06
# difference between seed and control
# ASB5 on age at T3 plus the T1 seed-minus-control count difference.
current_task_data_forcorr %>%
  lm(
    formula = ASB5 ~ childAge_days_t3 + I(total_says_seed_t1 - total_says_control_t1),
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + I(total_says_seed_t1 - 
##     total_says_control_t1), data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7946 -0.9722  0.3749  1.0346  3.3980 
## 
## Coefficients:
##                                                Estimate Std. Error t value
## (Intercept)                                   -1.746512   1.677072  -1.041
## childAge_days_t3                               0.006665   0.001265   5.267
## I(total_says_seed_t1 - total_says_control_t1)  0.002288   0.009810   0.233
##                                               Pr(>|t|)    
## (Intercept)                                      0.305    
## childAge_days_t3                               8.4e-06 ***
## I(total_says_seed_t1 - total_says_control_t1)    0.817    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.86 on 33 degrees of freedom
## Multiple R-squared:  0.5313, Adjusted R-squared:  0.5029 
## F-statistic:  18.7 on 2 and 33 DF,  p-value: 3.717e-06
# ASB5 on age at T3 plus the T3 seed-minus-control count difference.
current_task_data_forcorr %>%
  lm(
    formula = ASB5 ~ childAge_days_t3 + I(total_says_seed_t3 - total_says_control_t3),
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + I(total_says_seed_t3 - 
##     total_says_control_t3), data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8089 -0.9116  0.3836  1.0327  3.3538 
## 
## Coefficients:
##                                                Estimate Std. Error t value
## (Intercept)                                   -1.675047   1.710754  -0.979
## childAge_days_t3                               0.006661   0.001235   5.395
## I(total_says_seed_t3 - total_says_control_t3)  0.002448   0.009859   0.248
##                                               Pr(>|t|)    
## (Intercept)                                      0.335    
## childAge_days_t3                              5.74e-06 ***
## I(total_says_seed_t3 - total_says_control_t3)    0.805    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.86 on 33 degrees of freedom
## Multiple R-squared:  0.5314, Adjusted R-squared:  0.503 
## F-statistic: 18.71 on 2 and 33 DF,  p-value: 3.703e-06
# seed growth
# ASB5 on age at T3 plus the change in seed words said from T1 to T3.
current_task_data_forcorr %>%
  lm(
    formula = ASB5 ~ childAge_days_t3 + I(total_says_seed_t3 - total_says_seed_t1),
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t3 + I(total_says_seed_t3 - 
##     total_says_seed_t1), data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6517 -0.9778  0.3544  1.1524  3.1646 
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                                -2.631187   2.072641  -1.269
## childAge_days_t3                            0.006957   0.001236   5.627
## I(total_says_seed_t3 - total_says_seed_t1)  0.033720   0.047807   0.705
##                                            Pr(>|t|)    
## (Intercept)                                   0.213    
## childAge_days_t3                            2.9e-06 ***
## I(total_says_seed_t3 - total_says_seed_t1)    0.486    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.848 on 33 degrees of freedom
## Multiple R-squared:  0.5375, Adjusted R-squared:  0.5094 
## F-statistic: 19.17 on 2 and 33 DF,  p-value: 2.983e-06
# account for how much time elapsed between t1-t3
# ASB5 on age at T1, days elapsed between T1 and T3, and seed words said at T1.
current_task_data_forcorr %>%
  lm(
    formula = ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) + total_says_seed_t1,
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) + 
##     total_says_seed_t1, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9965 -1.0941  0.0728  1.0763  3.5832 
## 
## Coefficients:
##                                         Estimate Std. Error t value
## (Intercept)                            -2.955054   2.528700  -1.169
## childAge_days_t1                        0.005985   0.001372   4.361
## I(childAge_days_t3 - childAge_days_t1)  0.010928   0.009636   1.134
## total_says_seed_t1                      0.019206   0.034148   0.562
##                                        Pr(>|t|)    
## (Intercept)                            0.251195    
## childAge_days_t1                       0.000126 ***
## I(childAge_days_t3 - childAge_days_t1) 0.265184    
## total_says_seed_t1                     0.577734    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.874 on 32 degrees of freedom
## Multiple R-squared:  0.5389, Adjusted R-squared:  0.4957 
## F-statistic: 12.47 on 3 and 32 DF,  p-value: 1.449e-05
# Same age terms, with seed words said at T3 as the vocabulary predictor.
current_task_data_forcorr %>%
  lm(
    ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) +
      total_says_seed_t3,
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) + 
##     total_says_seed_t3, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8713 -1.1138  0.0911  0.9632  3.4512 
## 
## Coefficients:
##                                         Estimate Std. Error t value
## (Intercept)                            -4.699517   2.878083  -1.633
## childAge_days_t1                        0.005849   0.001154   5.068
## I(childAge_days_t3 - childAge_days_t1)  0.010332   0.009432   1.095
## total_says_seed_t3                      0.053486   0.041060   1.303
##                                        Pr(>|t|)    
## (Intercept)                               0.112    
## childAge_days_t1                       1.63e-05 ***
## I(childAge_days_t3 - childAge_days_t1)    0.282    
## total_says_seed_t3                        0.202    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.835 on 32 degrees of freedom
## Multiple R-squared:  0.5578, Adjusted R-squared:  0.5164 
## F-statistic: 13.46 on 3 and 32 DF,  p-value: 7.536e-06
# Same age terms, with control words said at T3 as the vocabulary predictor.
current_task_data_forcorr %>%
  lm(
    ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) +
      total_says_control_t3,
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) + 
##     total_says_control_t3, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9139 -1.1796  0.2551  0.9983  3.2742 
## 
## Coefficients:
##                                          Estimate Std. Error t value
## (Intercept)                            -2.7162429  2.5510073  -1.065
## childAge_days_t1                        0.0064563  0.0012510   5.161
## I(childAge_days_t3 - childAge_days_t1)  0.0114262  0.0097309   1.174
## total_says_control_t3                   0.0001165  0.0084842   0.014
##                                        Pr(>|t|)    
## (Intercept)                               0.295    
## childAge_days_t1                       1.24e-05 ***
## I(childAge_days_t3 - childAge_days_t1)    0.249    
## total_says_control_t3                     0.989    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.883 on 32 degrees of freedom
## Multiple R-squared:  0.5344, Adjusted R-squared:  0.4907 
## F-statistic: 12.24 on 3 and 32 DF,  p-value: 1.689e-05
# Same age terms, with control words said at T1 as the vocabulary predictor.
current_task_data_forcorr %>%
  lm(
    ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) +
      total_says_control_t1,
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) + 
##     total_says_control_t1, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9050 -1.1567  0.2830  0.9991  3.3005 
## 
## Coefficients:
##                                          Estimate Std. Error t value
## (Intercept)                            -2.7034895  2.5040150  -1.080
## childAge_days_t1                        0.0065228  0.0013206   4.939
## I(childAge_days_t3 - childAge_days_t1)  0.0115327  0.0097076   1.188
## total_says_control_t1                  -0.0006124  0.0080520  -0.076
##                                        Pr(>|t|)    
## (Intercept)                               0.288    
## childAge_days_t1                       2.37e-05 ***
## I(childAge_days_t3 - childAge_days_t1)    0.244    
## total_says_control_t1                     0.940    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.883 on 32 degrees of freedom
## Multiple R-squared:  0.5345, Adjusted R-squared:  0.4908 
## F-statistic: 12.25 on 3 and 32 DF,  p-value: 1.685e-05
# Same age terms, with control words said at T1 plus the T1 difference between
# seed and control words said.
current_task_data_forcorr %>%
  lm(
    ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) +
      total_says_control_t1 + I(total_says_seed_t1 - total_says_control_t1),
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t1 + I(childAge_days_t3 - childAge_days_t1) + 
##     total_says_control_t1 + I(total_says_seed_t1 - total_says_control_t1), 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0201 -0.6222  0.1938  1.0509  3.7444 
## 
## Coefficients:
##                                                Estimate Std. Error t value
## (Intercept)                                   -3.539976   2.584524  -1.370
## childAge_days_t1                               0.006047   0.001371   4.410
## I(childAge_days_t3 - childAge_days_t1)         0.011672   0.009645   1.210
## total_says_control_t1                          0.061193   0.052417   1.167
## I(total_says_seed_t1 - total_says_control_t1)  0.077205   0.064710   1.193
##                                               Pr(>|t|)    
## (Intercept)                                   0.180627    
## childAge_days_t1                              0.000115 ***
## I(childAge_days_t3 - childAge_days_t1)        0.235344    
## total_says_control_t1                         0.251940    
## I(total_says_seed_t1 - total_says_control_t1) 0.241888    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.87 on 31 degrees of freedom
## Multiple R-squared:  0.5549, Adjusted R-squared:  0.4975 
## F-statistic: 9.662 on 4 and 31 DF,  p-value: 3.416e-05

7 Investigate predictiveness of individual word knowledge at T1 and T3 for T3 outcomes

Because the long-term goal is to identify specific words that could be helpful for cognitive/vocabulary development, we now shift the analyses to look at the role of individual words.

## put everything together into df
# Add the control- and seed-word counts into one total per measure
# (understands / says) and timepoint, then keep the child identifier, ages,
# totals, and task scores.
task_data_wide <- current_task_data %>%
  mutate(
    total_understands_t1 = total_understands_control_t1 + total_understands_seed_t1,
    total_says_t1 = total_says_control_t1 + total_says_seed_t1,
    total_understands_t3 = total_understands_control_t3 + total_understands_seed_t3,
    total_says_t3 = total_says_control_t3 + total_says_seed_t3
  ) %>%
  select(
    subjCode,
    childAge_days_t1, childAge_days_t3,
    total_understands_t1, total_says_t1,
    total_understands_t3, total_says_t3,
    ASB5, WJ, WJ_scaled,
    ppvt_prop_correct, ppvt_standard_score, ppvt_percentile
  )
  
##inner join to only include kids who've come into lab
# Survey rows without a word are dropped first; after the join, two survey
# columns that are not used downstream are removed and `word` becomes `item`.
df_to_analyze <- inner_join(
  filter(vocab_survey_data, !is.na(word)),
  task_data_wide,
  by = "subjCode"
) %>%
  select(-importance_to_teach, -childAge_days) %>%
  rename(item = word)

###code from Molly
#params
# Model formulas, stored as strings and converted with as.formula() where the
# per-word models are fit. In every formula `says` is the response for the
# single word being modelled; the remaining terms are covariates.
# Formulas with T1 covariates (age at T1, total words said at T1):
predict_t3_vocab <- "total_says_t3 ~ says + childAge_days_t1 + total_says_t1"
predict_asb5 <- "ASB5 ~ says + childAge_days_t1 + total_says_t1"
predict_wj <- "WJ ~ says + childAge_days_t1 + total_says_t1"
#predict_wj_scaled <- "WJ_scaled ~ says + childAge_days_t1 + total_says_t1"
# Formulas with T3 covariates (age at T3, total words said at T3):
predict_asb5_fromt3 <- "ASB5 ~ says + childAge_days_t3 + total_says_t3"
predict_wj_fromt3 <- "WJ ~ says + childAge_days_t3 + total_says_t3"
#predict_wj_scaled_fromt3 <- "WJ_scaled ~ says + childAge_days_t3 + total_says_t3"

#ppvt standard doesn't need to control for age because the standardized score accounts for age
predict_ppvt_standard <- "ppvt_standard_score ~ says + total_says_t1"

# coefficient function
# Fit `mod_formula` with lm() on the rows of `df` for one word at one
# timepoint and return the coefficient-table row for the `says` predictor.
#
# Arguments:
#   word        the item (word) whose rows are modelled
#   mod_formula model formula handed to lm(); it must contain a `says` term
#   time        value of `timepoint` whose rows are modelled
#   df          long data frame with `item` and `timepoint` columns plus every
#               variable named in `mod_formula`
#
# Returns a data frame with columns term, Estimate, Std..Error, t.value,
# Pr...t.. and item. It has one row when the `says` coefficient is estimable
# and zero rows when it is not (e.g. `says` is constant in the subset, so lm()
# reports it as NA and summary() omits it). Stops with an explicit message
# when no row of `df` matches `word` and `time`.
get_word_beta <- function(word, mod_formula, time, df) {
  # Index with df$... instead of a data-masked filter(): the arguments are
  # named `word` and `time`, so a column of either name in `df` would silently
  # take the argument's place inside filter().
  keep <- which(df$item == word & df$timepoint == time)
  if (length(keep) == 0) {
    stop(
      "get_word_beta: no rows for item '", word, "' at timepoint ", time,
      call. = FALSE
    )
  }
  relevant_df <- df[keep, , drop = FALSE]

  model <- lm(mod_formula, relevant_df)

  # data.frame() rewrites the coefficient-matrix column names ("Std. Error" ->
  # "Std..Error", "Pr(>|t|)" -> "Pr...t.."); callers select on those names.
  coef_table <- data.frame(summary(model)$coefficients)
  coef_table <- data.frame(
    term = rownames(coef_table),
    coef_table,
    row.names = NULL,
    stringsAsFactors = FALSE
  )

  says_row <- coef_table[coef_table$term == "says", , drop = FALSE]
  rownames(says_row) <- NULL
  # rep() keeps the assignment valid when there is no `says` row at all.
  says_row$item <- rep(word, nrow(says_row))
  says_row
}

#test case
#get_word_beta("squeak", as.formula(predict_t3_vocab), 3, df_to_analyze)

7.1 First, how many children know each word at each timepoint?

# Per word, count the children reported to say it at each timepoint.
# na.rm = TRUE: a single missing `says` response would otherwise turn the whole
# count for that word into NA.
t3_knowledge <- df_to_analyze %>%
  filter(timepoint == 3) %>%
  group_by(item) %>%
  summarise(n_say_t3 = sum(says, na.rm = TRUE))

# The left join keeps every word that has timepoint-1 rows; n_say_t3 is NA for
# a word with no timepoint-3 rows.
word_knowledge <- df_to_analyze %>%
  filter(timepoint == 1) %>%
  group_by(item) %>%
  summarise(n_say_t1 = sum(says, na.rm = TRUE)) %>%
  left_join(t3_knowledge, by = "item")

DT::datatable(word_knowledge)

7.2 Longitudinal estimates: T1 knowledge predicting T3 outcomes

7.2.1 ASB5 performance

# For every word, fit `predict_asb5` on the timepoint-1 rows and collect the
# `says` coefficient row; words are then sorted by t value, largest first.
# as.character() replaces the former paste("", x, "", sep = ""), which only
# coerced the items to character.
word_coeffs_asb5_t1 <- map_df(
  as.character(unique(df_to_analyze$item)),
  get_word_beta,
  mod_formula = as.formula(predict_asb5),
  time = 1,
  df = df_to_analyze
) %>%
  # Std..Error, t.value and Pr...t.. are the column names produced when the lm
  # coefficient matrix is converted with data.frame().
  select(item, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
  arrange(desc(tval))

DT::datatable(word_coeffs_asb5_t1)

7.2.2 WJ performance

# For every word, fit `predict_wj` on the timepoint-1 rows and collect the
# `says` coefficient row; words are then sorted by t value, largest first.
# as.character() replaces the former paste("", x, "", sep = ""), which only
# coerced the items to character.
word_coeffs_wj_t1 <- map_df(
  as.character(unique(df_to_analyze$item)),
  get_word_beta,
  mod_formula = as.formula(predict_wj),
  time = 1,
  df = df_to_analyze
) %>%
  # Std..Error, t.value and Pr...t.. are the column names produced when the lm
  # coefficient matrix is converted with data.frame().
  select(item, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
  arrange(desc(tval))

DT::datatable(word_coeffs_wj_t1)

7.2.3 PPVT standard score

# For every word, fit `predict_ppvt_standard` on the timepoint-1 rows and
# collect the `says` coefficient row; words are then sorted by t value,
# largest first.
# as.character() replaces the former paste("", x, "", sep = ""), which only
# coerced the items to character.
word_coeffs_ppvt_t1 <- map_df(
  as.character(unique(df_to_analyze$item)),
  get_word_beta,
  mod_formula = as.formula(predict_ppvt_standard),
  time = 1,
  df = df_to_analyze
) %>%
  # Std..Error, t.value and Pr...t.. are the column names produced when the lm
  # coefficient matrix is converted with data.frame().
  select(item, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
  arrange(desc(tval))

DT::datatable(word_coeffs_ppvt_t1)

7.3 Contemporaneous estimates: T3 knowledge predicting T3 outcomes

7.3.1 ASB5 performance

# For every word, fit `predict_asb5_fromt3` on the timepoint-3 rows and collect
# the `says` coefficient row; words are then sorted by t value, largest first.
# as.character() replaces the former paste("", x, "", sep = ""), which only
# coerced the items to character.
word_coeffs_asb5_t3 <- map_df(
  as.character(unique(df_to_analyze$item)),
  get_word_beta,
  mod_formula = as.formula(predict_asb5_fromt3),
  time = 3,
  df = df_to_analyze
) %>%
  # Std..Error, t.value and Pr...t.. are the column names produced when the lm
  # coefficient matrix is converted with data.frame().
  select(item, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
  arrange(desc(tval))

DT::datatable(word_coeffs_asb5_t3)

7.3.2 WJ performance

# For every word, fit `predict_wj_fromt3` on the timepoint-3 rows and collect
# the `says` coefficient row; words are then sorted by t value, largest first.
# as.character() replaces the former paste("", x, "", sep = ""), which only
# coerced the items to character.
word_coeffs_wj_t3 <- map_df(
  as.character(unique(df_to_analyze$item)),
  get_word_beta,
  mod_formula = as.formula(predict_wj_fromt3),
  time = 3,
  df = df_to_analyze
) %>%
  # Std..Error, t.value and Pr...t.. are the column names produced when the lm
  # coefficient matrix is converted with data.frame().
  select(item, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
  arrange(desc(tval))

DT::datatable(word_coeffs_wj_t3)

8 How does hypernymy predict coefficient estimates?

The coefficient estimates above indicate the degree to which knowledge of a specific word boosts performance on the ASB5 and WJ, controlling for age and total vocabulary knowledge. Now we want to test the hypothesis that lower hypernymy is a characteristic of these more helpful words (i.e. words with higher estimates). So, we look at the relation between hypernymy and the estimate for each word.

8.1 Coefficients generated by T1 word knowledge

8.1.1 ASB5

It looks like there is a relationship here:

# Attach the word-level properties in `word_info` to each word's ASB5
# coefficient from the timepoint-1 models (join key: `word`).
sb5_coeffs_t1 <- left_join(
  rename(word_coeffs_asb5_t1, word = item),
  word_info,
  by = "word"
)

# Scatter of coefficient estimate against pos-scaled hypernymy with a linear
# fit line. theme() must stay after theme_classic() so the text size is kept.
ggplot(
  sb5_coeffs_t1,
  aes(x = pos_scale_hypernyms, y = Estimate, label = word)
) +
  geom_point(size = 3) +
  geom_smooth(method = lm) +
  # geom_label() +
  scale_x_continuous(
    breaks = c(-5, -2.5, 0, 2.5, 5),
    labels = c(-5, -2.5, 0, 2.5, 5)
  ) +
  labs(x = "Hypernyms (scaled by pos)", y = "Coefficient Estimate") +
  theme_classic() +
  theme(text = element_text(size = 25))

Does hypernymy predict coefficients? Yes…

#coefficient of word knowledge at T1 (how well it predicts ASB5 at T3) is predicted by hypernymy, controlling for pos
# Keep only words with complete data on every word-level predictor used by the
# ASB5 T1 models, so that each of them is fit to the same rows (the anova()
# comparison of the fitted models needs that). adult_log_freq and aoa are
# listed because the fuller models include them.
sb5_coeffs_t1_alldata <- sb5_coeffs_t1 %>%
  filter(
    !is.na(pos_scale_hypernyms), !is.na(pos_scale_synsets), !is.na(pos_scale_defs),
    !is.na(mean_helpfulness), !is.na(mean_preschoolness), !is.na(mean_parent_importance),
    !is.na(adult_log_freq), !is.na(aoa)
  )

# Hypernymy as the only fixed effect, with a random intercept per seed word.
sb5_t1_mod1 <- lmer(Estimate ~ pos_scale_hypernyms + (1 | seedword), data = sb5_coeffs_t1_alldata)
summary(sb5_t1_mod1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Estimate ~ pos_scale_hypernyms + (1 | seedword)
##    Data: sb5_coeffs_t1_alldata
## 
## REML criterion at convergence: 1101.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1294 -0.6303 -0.1774  0.3423  2.9030 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  seedword (Intercept) 0.1757   0.4192  
##  Residual             1.7719   1.3311  
## Number of obs: 315, groups:  seedword, 72
## 
## Fixed effects:
##                      Estimate Std. Error        df t value Pr(>|t|)  
## (Intercept)           0.21511    0.09695  68.85640   2.219   0.0298 *
## pos_scale_hypernyms  -0.20489    0.08050 305.67863  -2.545   0.0114 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## ps_scl_hypr 0.071

…but the effect goes away when you account for other features of the word such as helpfulness, frequency, AoA, etc.

# Full model: hypernymy together with the other word-level predictors, with a
# random intercept per seed word.
sb5_t1_mod2 <- lmer(
  Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + aoa +
    mean_preschoolness + mean_parent_importance + pos_scale_synsets +
    pos_scale_defs + (1 | seedword),
  data = sb5_coeffs_t1_alldata
)
summary(sb5_t1_mod2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +  
##     aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets +  
##     pos_scale_defs + (1 | seedword)
##    Data: sb5_coeffs_t1_alldata
## 
## REML criterion at convergence: 1072.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1620 -0.6380 -0.1016  0.4450  2.7626 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  seedword (Intercept) 0.1163   0.3411  
##  Residual             1.5711   1.2534  
## Number of obs: 315, groups:  seedword, 72
## 
## Fixed effects:
##                         Estimate Std. Error        df t value Pr(>|t|)   
## (Intercept)             -0.57674    0.78632 289.18159  -0.733  0.46387   
## pos_scale_hypernyms     -0.10679    0.08029 289.74951  -1.330  0.18455   
## mean_helpfulness        -0.06045    0.16485 299.46853  -0.367  0.71410   
## adult_log_freq           0.20228    0.06659 302.32563   3.038  0.00259 **
## aoa                     -0.14126    0.06484 299.27826  -2.179  0.03014 * 
## mean_preschoolness      -0.08885    0.11934 276.47730  -0.744  0.45722   
## mean_parent_importance   0.26409    0.12298 293.24176   2.148  0.03257 * 
## pos_scale_synsets       -0.17202    0.11783 299.47644  -1.460  0.14537   
## pos_scale_defs          -0.10790    0.11334 302.49073  -0.952  0.34185   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) ps_scl_h mn_hlp adlt__ aoa    mn_prs mn_pr_ ps_scl_s
## ps_scl_hypr -0.104                                                     
## mn_hlpflnss -0.402  0.038                                              
## adlt_lg_frq -0.327  0.110   -0.322                                     
## aoa         -0.874  0.000    0.378  0.237                              
## mn_prschlns -0.112  0.061   -0.447  0.070  0.071                       
## mn_prnt_mpr -0.252  0.027   -0.214 -0.214  0.041 -0.006                
## ps_scl_syns  0.126  0.077   -0.013 -0.202 -0.006  0.063 -0.077         
## pos_scl_dfs -0.108  0.074   -0.027  0.027  0.046  0.035  0.171 -0.666
# Reduced model: only the three predictors with p < .05 in sb5_t1_mod2.
sb5_t1_mod3 <- lmer(
  Estimate ~ adult_log_freq + aoa + mean_parent_importance + (1 | seedword),
  data = sb5_coeffs_t1_alldata
)
summary(sb5_t1_mod3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Estimate ~ adult_log_freq + aoa + mean_parent_importance + (1 |  
##     seedword)
##    Data: sb5_coeffs_t1_alldata
## 
## REML criterion at convergence: 1069
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.3405 -0.6382 -0.1494  0.4368  2.8746 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  seedword (Intercept) 0.1859   0.4311  
##  Residual             1.5504   1.2452  
## Number of obs: 315, groups:  seedword, 72
## 
## Fixed effects:
##                         Estimate Std. Error        df t value Pr(>|t|)   
## (Intercept)             -0.87500    0.67071 299.43635  -1.305  0.19304   
## adult_log_freq           0.15846    0.06007 307.41480   2.638  0.00876 **
## aoa                     -0.11332    0.05772 310.99943  -1.963  0.05050 . 
## mean_parent_importance   0.26577    0.11887 302.16653   2.236  0.02610 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) adlt__ aoa   
## adlt_lg_frq -0.597              
## aoa         -0.852  0.501       
## mn_prnt_mpr -0.452 -0.331  0.172
# Compare the full (mod2) and reduced (mod3) ASB5 T1 models; anova() refits
# both with ML before testing.
anova(sb5_t1_mod2, sb5_t1_mod3)
## refitting model(s) with ML (instead of REML)
## Data: sb5_coeffs_t1_alldata
## Models:
## sb5_t1_mod3: Estimate ~ adult_log_freq + aoa + mean_parent_importance + (1 | 
## sb5_t1_mod3:     seedword)
## sb5_t1_mod2: Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + 
## sb5_t1_mod2:     aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets + 
## sb5_t1_mod2:     pos_scale_defs + (1 | seedword)
##             Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)  
## sb5_t1_mod3  6 1067.4 1089.9 -527.69   1055.4                           
## sb5_t1_mod2 11 1067.7 1108.9 -522.83   1045.7 9.7209      5    0.08354 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

8.1.2 WJ

Does hypernymy predict coefficients? Yes (but in a weird direction)…

# Attach the word-level properties in `word_info` to each word's WJ coefficient
# from the timepoint-1 models (join key: `word`).
wj_coeffs_t1 <- word_coeffs_wj_t1 %>%
  rename(word = item) %>%
  left_join(word_info, by = "word")

# Keep only words with complete data on every word-level predictor used by the
# WJ T1 models, so that each of them is fit to the same rows (the anova()
# comparison of the fitted models needs that). adult_log_freq and aoa are
# listed because the fuller models include them.
wj_coeffs_t1_alldata <- wj_coeffs_t1 %>%
  filter(
    !is.na(pos_scale_hypernyms), !is.na(pos_scale_synsets), !is.na(pos_scale_defs),
    !is.na(mean_helpfulness), !is.na(mean_preschoolness), !is.na(mean_parent_importance),
    !is.na(adult_log_freq), !is.na(aoa)
  )

# Hypernymy as the only fixed effect, with a random intercept per seed word.
summary(lmer(Estimate ~ pos_scale_hypernyms + (1 | seedword), data = wj_coeffs_t1_alldata))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Estimate ~ pos_scale_hypernyms + (1 | seedword)
##    Data: wj_coeffs_t1_alldata
## 
## REML criterion at convergence: 1413.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.6034 -0.6759 -0.0707  0.5379  3.5593 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  seedword (Intercept) 0.380    0.6165  
##  Residual             4.865    2.2056  
## Number of obs: 315, groups:  seedword, 72
## 
## Fixed effects:
##                     Estimate Std. Error       df t value Pr(>|t|)   
## (Intercept)          -0.1696     0.1545  50.1515  -1.097  0.27774   
## pos_scale_hypernyms   0.3606     0.1321 297.7159   2.729  0.00673 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## ps_scl_hypr 0.064

…and goes away when you account for other word features. There is no difference in fit between the full and reduced models.

# Full model: hypernymy together with the other word-level predictors, with a
# random intercept per seed word.
wj_t1_mod1 <- lmer(
  Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + aoa +
    mean_preschoolness + mean_parent_importance + pos_scale_synsets +
    pos_scale_defs + (1 | seedword),
  data = wj_coeffs_t1_alldata
)

summary(wj_t1_mod1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +  
##     aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets +  
##     pos_scale_defs + (1 | seedword)
##    Data: wj_coeffs_t1_alldata
## 
## REML criterion at convergence: 1373.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.8702 -0.5834 -0.1273  0.5275  2.9961 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  seedword (Intercept) 0.2461   0.496   
##  Residual             4.2419   2.060   
## Number of obs: 315, groups:  seedword, 72
## 
## Fixed effects:
##                         Estimate Std. Error        df t value Pr(>|t|)   
## (Intercept)              0.96795    1.28109 275.82813   0.756  0.45056   
## pos_scale_hypernyms      0.11520    0.13083 280.12929   0.881  0.37934   
## mean_helpfulness        -0.61180    0.26891 293.99393  -2.275  0.02362 * 
## adult_log_freq          -0.11738    0.10868 299.32455  -1.080  0.28099   
## aoa                      0.23346    0.10575 291.07830   2.208  0.02805 * 
## mean_preschoolness       0.03223    0.19426 262.32362   0.166  0.86835   
## mean_parent_importance  -0.01212    0.20045 284.17077  -0.060  0.95185   
## pos_scale_synsets        0.58516    0.19224 296.81809   3.044  0.00254 **
## pos_scale_defs          -0.55431    0.18501 301.72177  -2.996  0.00296 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) ps_scl_h mn_hlp adlt__ aoa    mn_prs mn_pr_ ps_scl_s
## ps_scl_hypr -0.108                                                     
## mn_hlpflnss -0.400  0.042                                              
## adlt_lg_frq -0.328  0.108   -0.321                                     
## aoa         -0.874  0.002    0.376  0.238                              
## mn_prschlns -0.114  0.058   -0.448  0.070  0.072                       
## mn_prnt_mpr -0.251  0.029   -0.218 -0.216  0.040  0.000                
## ps_scl_syns  0.128  0.075   -0.011 -0.207 -0.007  0.062 -0.075         
## pos_scl_dfs -0.109  0.077   -0.030  0.029  0.046  0.037  0.173 -0.665
# Reduced model: only the four predictors with p < .05 in wj_t1_mod1.
wj_t1_mod2 <- lmer(
  Estimate ~ mean_helpfulness + aoa + pos_scale_synsets + pos_scale_defs +
    (1 | seedword),
  data = wj_coeffs_t1_alldata
)

summary(wj_t1_mod2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## Estimate ~ mean_helpfulness + aoa + pos_scale_synsets + pos_scale_defs +  
##     (1 | seedword)
##    Data: wj_coeffs_t1_alldata
## 
## REML criterion at convergence: 1368.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9418 -0.5878 -0.1166  0.5295  3.0427 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  seedword (Intercept) 0.2165   0.4653  
##  Residual             4.2395   2.0590  
## Number of obs: 315, groups:  seedword, 72
## 
## Fixed effects:
##                   Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)         0.4757     1.1218 267.7837   0.424 0.671871    
## mean_helpfulness   -0.7318     0.2107 274.8521  -3.473 0.000598 ***
## aoa                 0.2664     0.1016 289.6401   2.621 0.009222 ** 
## pos_scale_synsets   0.5105     0.1843 294.3292   2.770 0.005957 ** 
## pos_scale_defs     -0.5495     0.1807 300.1460  -3.042 0.002560 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) mn_hlp aoa    ps_scl_s
## mn_hlpflnss -0.913                       
## aoa         -0.899  0.666                
## ps_scl_syns  0.038 -0.122  0.057         
## pos_scl_dfs -0.038  0.052  0.022 -0.684
# Compare the full (mod1) and reduced (mod2) WJ T1 models; anova() refits both
# with ML before testing.
anova(wj_t1_mod1, wj_t1_mod2)
## refitting model(s) with ML (instead of REML)
## Data: wj_coeffs_t1_alldata
## Models:
## wj_t1_mod2: Estimate ~ mean_helpfulness + aoa + pos_scale_synsets + pos_scale_defs + 
## wj_t1_mod2:     (1 | seedword)
## wj_t1_mod1: Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + 
## wj_t1_mod1:     aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets + 
## wj_t1_mod1:     pos_scale_defs + (1 | seedword)
##            Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## wj_t1_mod2  7 1371.4 1397.7 -678.72   1357.4                         
## wj_t1_mod1 11 1377.1 1418.3 -677.53   1355.1 2.3634      4     0.6693

8.2 Coefficients generated by T3 word knowledge

8.2.1 ASB5

Does hypernymy predict coefficients? Again, yes, but the effect goes away when you account for other word features.

# Attach the word-level properties in `word_info` to each word's ASB5
# coefficient from the timepoint-3 models (join key: `word`).
sb5_coeffs_t3 <- word_coeffs_asb5_t3 %>%
  rename(word = item) %>%
  left_join(word_info, by = "word")

# Keep only words with complete data on every word-level predictor used by the
# ASB5 T3 models, so that each of them is fit to the same rows (the anova()
# comparison of the fitted models needs that). adult_log_freq and aoa are
# listed because the fuller models include them.
sb5_coeffs_t3_alldata <- sb5_coeffs_t3 %>%
  filter(
    !is.na(pos_scale_hypernyms), !is.na(pos_scale_synsets), !is.na(pos_scale_defs),
    !is.na(mean_helpfulness), !is.na(mean_preschoolness), !is.na(mean_parent_importance),
    !is.na(adult_log_freq), !is.na(aoa)
  )

# Hypernymy as the only fixed effect, with a random intercept per seed word.
sb5_t3_mod1 <- lmer(Estimate ~ pos_scale_hypernyms + (1 | seedword), data = sb5_coeffs_t3_alldata)
summary(sb5_t3_mod1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Estimate ~ pos_scale_hypernyms + (1 | seedword)
##    Data: sb5_coeffs_t3_alldata
## 
## REML criterion at convergence: 1169
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.2263 -0.6432 -0.1484  0.3050  2.6922 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  seedword (Intercept) 0.1958   0.4425  
##  Residual             2.2955   1.5151  
## Number of obs: 312, groups:  seedword, 73
## 
## Fixed effects:
##                      Estimate Std. Error        df t value Pr(>|t|)   
## (Intercept)           0.33421    0.10804  48.45794   3.094  0.00328 **
## pos_scale_hypernyms  -0.19512    0.08977 297.00388  -2.174  0.03053 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## ps_scl_hypr 0.063
# Full model: hypernymy together with the other word-level predictors, with a
# random intercept per seed word.
sb5_t3_mod2 <- lmer(
  Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + aoa +
    mean_preschoolness + mean_parent_importance + pos_scale_synsets +
    pos_scale_defs + (1 | seedword),
  data = sb5_coeffs_t3_alldata
)

summary(sb5_t3_mod2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +  
##     aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets +  
##     pos_scale_defs + (1 | seedword)
##    Data: sb5_coeffs_t3_alldata
## 
## REML criterion at convergence: 1161.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.3071 -0.5888 -0.1379  0.3462  2.7279 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  seedword (Intercept) 0.1538   0.3922  
##  Residual             2.1793   1.4762  
## Number of obs: 312, groups:  seedword, 73
## 
## Fixed effects:
##                         Estimate Std. Error        df t value Pr(>|t|)  
## (Intercept)             -0.86239    0.94345 277.49245  -0.914   0.3615  
## pos_scale_hypernyms     -0.09670    0.09248 281.64576  -1.046   0.2966  
## mean_helpfulness         0.29368    0.19736 291.53024   1.488   0.1378  
## adult_log_freq           0.17937    0.07810 299.76038   2.297   0.0223 *
## aoa                     -0.04973    0.07623 298.17808  -0.652   0.5147  
## mean_preschoolness      -0.12274    0.14049 262.69461  -0.874   0.3831  
## mean_parent_importance  -0.06219    0.15645 282.79343  -0.398   0.6913  
## pos_scale_synsets       -0.25425    0.13899 293.89989  -1.829   0.0684 .
## pos_scale_defs           0.17298    0.13387 297.30342   1.292   0.1973  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) ps_scl_h mn_hlp adlt__ aoa    mn_prs mn_pr_ ps_scl_s
## ps_scl_hypr -0.134                                                     
## mn_hlpflnss -0.405  0.047                                              
## adlt_lg_frq -0.314  0.059   -0.332                                     
## aoa         -0.867  0.030    0.389  0.247                              
## mn_prschlns -0.119  0.073   -0.424  0.046  0.084                       
## mn_prnt_mpr -0.281  0.089   -0.214 -0.187  0.020  0.002                
## ps_scl_syns  0.141  0.060   -0.009 -0.195 -0.006  0.063 -0.119         
## pos_scl_dfs -0.128  0.097   -0.020  0.014  0.053  0.028  0.206 -0.669
# Reduced model: only the two predictors with p < .1 in sb5_t3_mod2.
sb5_t3_mod3 <- lmer(
  Estimate ~ adult_log_freq + pos_scale_synsets + (1 | seedword),
  data = sb5_coeffs_t3_alldata
)
summary(sb5_t3_mod3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Estimate ~ adult_log_freq + pos_scale_synsets + (1 | seedword)
##    Data: sb5_coeffs_t3_alldata
## 
## REML criterion at convergence: 1154.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.5857 -0.5677 -0.1680  0.3554  2.7772 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  seedword (Intercept) 0.1742   0.4173  
##  Residual             2.1772   1.4755  
## Number of obs: 312, groups:  seedword, 73
## 
## Fixed effects:
##                    Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)        -1.26052    0.34991 266.94695  -3.602 0.000376 ***
## adult_log_freq      0.27246    0.05666 302.59162   4.808  2.4e-06 ***
## pos_scale_synsets  -0.10065    0.10155 308.83382  -0.991 0.322371    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) adlt__
## adlt_lg_frq -0.955       
## ps_scl_syns  0.360 -0.383
# Compare the full (mod2) and reduced (mod3) ASB5 T3 models; anova() refits
# both with ML before testing.
anova(sb5_t3_mod2, sb5_t3_mod3)
## refitting model(s) with ML (instead of REML)
## Data: sb5_coeffs_t3_alldata
## Models:
## sb5_t3_mod3: Estimate ~ adult_log_freq + pos_scale_synsets + (1 | seedword)
## sb5_t3_mod2: Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + 
## sb5_t3_mod2:     aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets + 
## sb5_t3_mod2:     pos_scale_defs + (1 | seedword)
##             Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## sb5_t3_mod3  5 1154.8 1173.5 -572.39   1144.8                         
## sb5_t3_mod2 11 1159.0 1200.2 -568.50   1137.0 7.7788      6     0.2548

8.2.2 WJ

Here, hypernymy remains a significant predictor even after accounting for other word characteristics. But why is higher hypernymy predicting better WJ performance? (we would expect lower-hypernym words to be more helpful…)

# Attach the word-level properties in `word_info` to each word's WJ coefficient
# from the timepoint-3 models (join key: `word`).
wj_coeffs_t3 <- word_coeffs_wj_t3 %>%
  rename(word = item) %>%
  left_join(word_info, by = "word")

# Keep only words with complete data on every word-level predictor used by the
# WJ T3 models, so that each of them is fit to the same rows. adult_log_freq
# and aoa are listed because the fuller model includes them.
wj_coeffs_t3_alldata <- wj_coeffs_t3 %>%
  filter(
    !is.na(pos_scale_hypernyms), !is.na(pos_scale_synsets), !is.na(pos_scale_defs),
    !is.na(mean_helpfulness), !is.na(mean_preschoolness), !is.na(mean_parent_importance),
    !is.na(adult_log_freq), !is.na(aoa)
  )

# Baseline: hypernymy as the only fixed effect. Whether it still matters after
# accounting for frequency, aoa, and helpfulness is tested by wj_t3_mod2.
wj_t3_mod1 <- lmer(Estimate ~ pos_scale_hypernyms + (1 | seedword), data = wj_coeffs_t3_alldata)
summary(wj_t3_mod1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Estimate ~ pos_scale_hypernyms + (1 | seedword)
##    Data: wj_coeffs_t3_alldata
## 
## REML criterion at convergence: 1374.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2916 -0.6591  0.0058  0.6458  4.4526 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  seedword (Intercept) 0.6227   0.7891  
##  Residual             4.3116   2.0764  
## Number of obs: 312, groups:  seedword, 73
## 
## Fixed effects:
##                     Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)          -0.2470     0.1629  45.5491  -1.517    0.136    
## pos_scale_hypernyms   0.5783     0.1258 306.2179   4.598 6.24e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## ps_scl_hypr 0.079
# Full model: hypernymy together with the remaining word-level predictors,
# with a random intercept per seedword.
wj_t3_mod2 <- lmer(
  Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + aoa +
    mean_preschoolness + mean_parent_importance + pos_scale_synsets +
    pos_scale_defs + (1 | seedword),
  data = wj_coeffs_t3_alldata
)
summary(wj_t3_mod2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +  
##     aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets +  
##     pos_scale_defs + (1 | seedword)
##    Data: wj_coeffs_t3_alldata
## 
## REML criterion at convergence: 1350.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9710 -0.6479 -0.0584  0.5909  4.1701 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  seedword (Intercept) 0.4075   0.6384  
##  Residual             3.9912   1.9978  
## Number of obs: 312, groups:  seedword, 73
## 
## Fixed effects:
##                         Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)             -3.72180    1.29527 291.51949  -2.873 0.004359 ** 
## pos_scale_hypernyms      0.46110    0.12693 291.61947   3.633 0.000331 ***
## mean_helpfulness        -0.49188    0.27051 298.07418  -1.818 0.070018 .  
## adult_log_freq           0.23923    0.10688 301.81065   2.238 0.025940 *  
## aoa                      0.42644    0.10433 302.56631   4.087  5.6e-05 ***
## mean_preschoolness       0.06059    0.19323 277.83753   0.314 0.754105    
## mean_parent_importance   0.31968    0.21467 293.84116   1.489 0.137517    
## pos_scale_synsets       -0.11762    0.19051 296.79836  -0.617 0.537425    
## pos_scale_defs          -0.03673    0.18341 297.90644  -0.200 0.841405    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) ps_scl_h mn_hlp adlt__ aoa    mn_prs mn_pr_ ps_scl_s
## ps_scl_hypr -0.131                                                     
## mn_hlpflnss -0.407  0.043                                              
## adlt_lg_frq -0.313  0.061   -0.334                                     
## aoa         -0.866  0.029    0.391  0.246                              
## mn_prschlns -0.116  0.076   -0.423  0.044  0.084                       
## mn_prnt_mpr -0.285  0.089   -0.208 -0.182  0.021 -0.006                
## ps_scl_syns  0.138  0.064   -0.013 -0.187 -0.005  0.066 -0.121         
## pos_scl_dfs -0.127  0.092   -0.016  0.010  0.054  0.025  0.203 -0.671
# Reduced model: keeps only hypernymy, helpfulness, adult log frequency, and
# aoa as fixed effects, with a random intercept per seedword.
wj_t3_mod3 <- lmer(
  Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + aoa +
    (1 | seedword),
  data = wj_coeffs_t3_alldata
)
summary(wj_t3_mod3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq +  
##     aoa + (1 | seedword)
##    Data: wj_coeffs_t3_alldata
## 
## REML criterion at convergence: 1347.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9986 -0.6868 -0.0379  0.5747  4.2003 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  seedword (Intercept) 0.3643   0.6036  
##  Residual             4.0152   2.0038  
## Number of obs: 312, groups:  seedword, 73
## 
## Fixed effects:
##                     Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)          -3.0414     1.2216 295.1326  -2.490 0.013334 *  
## pos_scale_hypernyms   0.4658     0.1238 293.6191   3.762 0.000203 ***
## mean_helpfulness     -0.3618     0.2377 291.6385  -1.522 0.129125    
## adult_log_freq        0.2401     0.1013 289.9098   2.369 0.018482 *  
## aoa                   0.4279     0.1037 305.9709   4.126 4.75e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) ps_sc_ mn_hlp adlt__
## ps_scl_hypr -0.116                     
## mn_hlpflnss -0.620  0.103              
## adlt_lg_frq -0.373  0.117 -0.419       
## aoa         -0.905  0.012  0.489  0.268
# Chi-square (likelihood-ratio) comparison of the full model (wj_t3_mod2)
# against the reduced model (wj_t3_mod3); anova() refits both with ML first.
anova(wj_t3_mod2, wj_t3_mod3)
## refitting model(s) with ML (instead of REML)
## Data: wj_coeffs_t3_alldata
## Models:
## wj_t3_mod3: Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + 
## wj_t3_mod3:     aoa + (1 | seedword)
## wj_t3_mod2: Estimate ~ pos_scale_hypernyms + mean_helpfulness + adult_log_freq + 
## wj_t3_mod2:     aoa + mean_preschoolness + mean_parent_importance + pos_scale_synsets + 
## wj_t3_mod2:     pos_scale_defs + (1 | seedword)
##            Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## wj_t3_mod3  7 1349.8 1376.0 -667.91   1335.8                         
## wj_t3_mod2 11 1354.0 1395.2 -666.00   1332.0 3.8106      4     0.4323

9 Exploratory analyses of other variables

9.1 Survey opinion questions

# Per-child vocabulary totals at each timepoint, reshaped to one row per child
# with columns total_says_<timepoint> and total_understands_<timepoint>.
# ungroup() is explicit because summarise() only peels off the last grouping
# variable, which would otherwise leave the result grouped by subjCode.
child_vocab_summary <- vocab_survey_data %>% 
  group_by(subjCode, timepoint) %>% 
  summarise(total_says = sum(says), total_understands = sum(understands)) %>% 
  ungroup() %>% 
  pivot_wider(names_from = timepoint, values_from = total_says:total_understands)

# Survey responses joined with the vocabulary totals, plus numeric recodes of
# the opinion questions and vocabulary growth scores (timepoint 3 minus 1).
exp_survey_qs <- non_vocab_survey_data %>% 
  left_join(child_vocab_summary, by = "subjCode") %>% 
  mutate(
    # "No" -> 0; any other non-missing response -> 1.
    vocab_predict_academics_num = ifelse(vocab_predict_academics == "No", 0, 1),
    # Only these two response levels are mapped; anything else becomes NA.
    importance_of_reading_num = case_when(
      importance_of_reading == "Very important" ~ 4,
      importance_of_reading == "Extremely important" ~ 5
    ),
    parent_vocab_knowledge_num = case_when(
      parent_vocab_knowledge == "Below average" ~ 0,
      parent_vocab_knowledge == "Average" ~ 1,
      parent_vocab_knowledge == "Above average" ~ 2
    ),
    # 1 = above average, 0 = average or below average.
    parent_vocab_knowledge_binary = ifelse(parent_vocab_knowledge_num == 2, 1, 0),
    # "No" -> 0; any other non-missing response -> 1.
    bilingualism_bad_num = ifelse(bilingualism_bad == "No", 0, 1),
    productive_vocab_growth = total_says_3 - total_says_1,
    receptive_vocab_growth = total_understands_3 - total_understands_1
  )

parent_vocab_knowledge_num: Do you think your vocabulary is below average, average, or above average? (coded as 0, 1, 2, respectively)
parent_vocab_knowledge_binary: Recoded to group parents into 2 groups (below/average, and above average) as there was only one “below average” response
vocab_predict_academics_num: Do you think early vocabulary predicts later academics? (0 = no, 1 = yes)
bilingualism_bad_num: Do you think the costs of bilingualism outweigh the benefits? (0 = no, 1 = yes)
est_threeyo_vocab: How many words do you think the average 3-year-old knows?

# Histograms of the numeric survey variables, one plot per variable.
hist(exp_survey_qs$parent_vocab_knowledge_num)

hist(exp_survey_qs$vocab_predict_academics_num)

hist(exp_survey_qs$bilingualism_bad_num)

hist(exp_survey_qs$est_threeyo_vocab)

Unfortunately, the range of these variables is too small to look at any correlations, but we can look at survey responses as categorical predictors:
### Does child vocab size or growth relate to parent self-report vocab?

# Timepoint-1 productive vocabulary regressed on child age (days) and the
# binary parent self-rated vocabulary indicator.
summary(lm(total_says_1 ~ childAge_days + parent_vocab_knowledge_binary, data = exp_survey_qs))
## 
## Call:
## lm(formula = total_says_1 ~ childAge_days + parent_vocab_knowledge_binary, 
##     data = exp_survey_qs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -116.55  -44.97   16.55   34.09  102.20 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -43.71851   40.85160  -1.070   0.2913    
## childAge_days                   0.15249    0.02627   5.804 1.06e-06 ***
## parent_vocab_knowledge_binary  31.29472   16.35868   1.913   0.0633 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.97 on 38 degrees of freedom
## Multiple R-squared:  0.507,  Adjusted R-squared:  0.481 
## F-statistic: 19.54 on 2 and 38 DF,  p-value: 1.46e-06
# Timepoint-3 productive vocabulary regressed on child age (days) and the
# binary parent self-rated vocabulary indicator.
summary(lm(total_says_3 ~ childAge_days + parent_vocab_knowledge_binary, data = exp_survey_qs))
## 
## Call:
## lm(formula = total_says_3 ~ childAge_days + parent_vocab_knowledge_binary, 
##     data = exp_survey_qs)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -139.879  -26.389    7.452   30.603   90.400 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   44.93243   38.84877   1.157   0.2547    
## childAge_days                  0.11508    0.02498   4.606 4.51e-05 ***
## parent_vocab_knowledge_binary 29.57490   15.55666   1.901   0.0649 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.47 on 38 degrees of freedom
## Multiple R-squared:  0.4078, Adjusted R-squared:  0.3766 
## F-statistic: 13.08 on 2 and 38 DF,  p-value: 4.751e-05
# Productive vocabulary growth (total_says_3 - total_says_1) regressed on child
# age (days) and the binary parent self-rated vocabulary indicator.
summary(lm(productive_vocab_growth ~ childAge_days + parent_vocab_knowledge_binary, data = exp_survey_qs))
## 
## Call:
## lm(formula = productive_vocab_growth ~ childAge_days + parent_vocab_knowledge_binary, 
##     data = exp_survey_qs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.314 -13.368   0.255  13.981  52.404 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   88.65094   18.00202   4.924 1.68e-05 ***
## childAge_days                 -0.03742    0.01158  -3.232  0.00254 ** 
## parent_vocab_knowledge_binary -1.71982    7.20876  -0.239  0.81272    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.46 on 38 degrees of freedom
## Multiple R-squared:  0.2191, Adjusted R-squared:  0.178 
## F-statistic:  5.33 on 2 and 38 DF,  p-value: 0.009113

9.1.1 Does child vocab size or growth relate to parent beliefs about vocabulary impacting academics?

# Timepoint-1 productive vocabulary regressed on child age (days) and the 0/1
# recode of whether the parent thinks early vocabulary predicts later academics.
summary(lm(total_says_1 ~ childAge_days + vocab_predict_academics_num, data = exp_survey_qs))
## 
## Call:
## lm(formula = total_says_1 ~ childAge_days + vocab_predict_academics_num, 
##     data = exp_survey_qs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -134.07  -34.49   10.08   36.10  114.79 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -28.65394   54.47749  -0.526    0.602    
## childAge_days                 0.15563    0.02921   5.329 4.74e-06 ***
## vocab_predict_academics_num  -0.83306   23.57496  -0.035    0.972    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53.37 on 38 degrees of freedom
## Multiple R-squared:  0.4595, Adjusted R-squared:  0.4311 
## F-statistic: 16.15 on 2 and 38 DF,  p-value: 8.372e-06
# Timepoint-3 productive vocabulary regressed on child age (days) and the 0/1
# recode of whether the parent thinks early vocabulary predicts later academics.
summary(lm(total_says_3 ~ childAge_days + vocab_predict_academics_num, data = exp_survey_qs))
## 
## Call:
## lm(formula = total_says_3 ~ childAge_days + vocab_predict_academics_num, 
##     data = exp_survey_qs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -156.30  -18.04    7.27   31.18  102.70 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 49.48180   51.73324   0.956    0.345    
## childAge_days                0.12084    0.02773   4.357 9.66e-05 ***
## vocab_predict_academics_num  5.81733   22.38740   0.260    0.796    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.68 on 38 degrees of freedom
## Multiple R-squared:  0.3526, Adjusted R-squared:  0.3186 
## F-statistic: 10.35 on 2 and 38 DF,  p-value: 0.0002581
# Productive vocabulary growth (total_says_3 - total_says_1) regressed on child
# age (days) and the 0/1 recode of the vocabulary-predicts-academics belief.
summary(lm(productive_vocab_growth ~ childAge_days + vocab_predict_academics_num, data = exp_survey_qs))
## 
## Call:
## lm(formula = productive_vocab_growth ~ childAge_days + vocab_predict_academics_num, 
##     data = exp_survey_qs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.162 -11.870  -0.994  13.728  51.716 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                 78.13574   22.80954   3.426  0.00149 **
## childAge_days               -0.03479    0.01223  -2.845  0.00712 **
## vocab_predict_academics_num  6.65039    9.87076   0.674  0.50455   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.35 on 38 degrees of freedom
## Multiple R-squared:  0.2271, Adjusted R-squared:  0.1865 
## F-statistic: 5.584 on 2 and 38 DF,  p-value: 0.007483

9.1.2 How do parents’ importance estimates map onto our designations of words as seed/control?

# Welch two-sample t-test: do mean parent importance ratings differ between
# the two word types (control vs. seed) in word_info?
t.test(mean_parent_importance ~ as.factor(type), data = word_info)
## 
##  Welch Two Sample t-test
## 
## data:  mean_parent_importance by as.factor(type)
## t = -3.1043, df = 119.15, p-value = 0.002383
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5176086 -0.1144512
## sample estimates:
## mean in group control    mean in group seed 
##              3.048542              3.364572

9.2 Correlations among subjective ratings about words

Some nice convergent validity here: for example, words that Turkers rate as helpful are also rated by parents as important to teach.

# Subjective word-rating columns to correlate with one another.
word_info_forcorr <- word_info %>%
  select(
    mean_helpfulness,
    mean_babiness,
    mean_preschoolness,
    mean_parent_importance
  )

# Pairwise correlations (using all complete pairs) and the matching p-values.
info_data_corrmat <- cor(word_info_forcorr, use = "pairwise.complete.obs")
info_pmat <- cor.mtest(word_info_forcorr)

info_sig_matrix <- info_pmat[["p"]]

# Lower-triangle heat map with coefficients printed; cells flagged as
# insignificant via p.mat are left blank.
corrplot(
  corr = info_data_corrmat,
  method = "color",
  type = "lower",
  diag = FALSE,
  addCoef.col = "black",
  p.mat = info_sig_matrix,
  insig = "blank",
  tl.srt = 45,
  tl.col = "black",
  tl.cex = .8,
  number.cex = .5
)