# ---- Load child-level data ----
SES_data <- read_csv("SES_data.csv")

# The header of combined_data_wide.csv is replaced positionally, so this
# relies on the file having exactly these 28 columns in this order.
task_data <- setNames(
  read_csv("combined_data_wide.csv"),
  c(
    "subjCode",
    "childAge_days_t1", "childAge_days_t2",
    "childAge_months_t1", "childAge_months_t2",
    "childAge_years_t1", "childAge_years_t2",
    "subjCode_lab_t2", "ASB5", "Color",
    "ppvt_overlapping_survey", "Shape", "WJ", "WJ_scaled",
    "ppvt_first_set", "ppvt_n_correct", "ppvt_n_attempted",
    "ppvt_standard_score", "ppvt_percentile", "ppvt_prop_correct",
    "total_understands_control_t1", "total_understands_control_t2",
    "total_understands_seed_t1", "total_understands_seed_t2",
    "total_says_control_t1", "total_says_control_t2",
    "total_says_seed_t1", "total_says_seed_t2"
  )
)

non_vocab_survey_data <- read_csv("non_vocab_survey_data.csv")

# One row per task_data row, with SES and non-vocabulary survey columns
# attached by subject code.
current_task_data <- task_data %>%
  left_join(SES_data, by = "subjCode") %>%
  left_join(non_vocab_survey_data, by = "subjCode")

# ---- Load word-level (by-word) data ----
# Rows without a subject code are dropped from the vocabulary survey.
vocab_survey_data <- read_csv("vocab_data_by_word.csv") %>%
  filter(!is.na(subjCode))
vocab_tasks_byword <- read_csv("vocab_task_data_by_word.csv")

# Mean parent-rated "importance to teach" per word.
# The five response labels are mapped to 1-5 by their position in the scale;
# any other label becomes NA (and, as no na.rm is used, makes that word's
# mean NA).
word_importance <- vocab_survey_data %>%
  select(-seedword) %>%
  distinct() %>%
  filter(!is.na(importance_to_teach)) %>%
  mutate(
    importance_num = as.numeric(match(
      importance_to_teach,
      c(
        "Not at all important",
        "Slightly important",
        "Moderately important",
        "Very important",
        "Extremely important"
      )
    ))
  ) %>%
  group_by(word) %>%
  summarize(
    mean_parent_importance = mean(importance_num),
    n_importance_ratings = n()
  )

# Helpfulness norms: one row per word, averaging resp_mean over any
# repeated entries for the same word.
helpfulness <- read_csv("../word_norms/helpfulness_ratings.csv") %>%
  group_by(word) %>%
  summarise(mean_helpfulness = mean(resp_mean)) %>%
  ungroup()

# Babiness / preschoolness norms, keeping only the two mean columns.
babiness_preschoolness <- read_csv("../word_norms/babiness_ratings.csv") %>%
  select(word, babiness_mean, preschoolness_mean) %>%
  rename(
    mean_babiness = babiness_mean,
    mean_preschoolness = preschoolness_mean
  )

# Synset / definition table; joined into word_info on word and pos below.
defs_synsets <- read_csv("../word_norms/all_words_synsets_defs.csv")

# Words from the older file that had no generality rating; only used here to
# recover a part-of-speech code for the newly rated words.
no_generality <- read_csv("../word_norms/old_files/no_generality.csv") %>%
  select(-c(mean_generality, num_item_id))

generality_cdi_childes <- read_csv("../word_norms/generality_ratings_byWord_cdi_childes.csv")

# Newer generality ratings. `word2` becomes the join key `word`, the original
# `word` column is kept as `word_descriptive`, and the spelled-out part of
# speech is recoded to the single-letter codes (anything other than
# noun/verb becomes NA).
generality_otherwords <- read_csv("../word_norms/new_generality_ratings.csv") %>%
  rename(
    pos_fullword = pos,
    word_descriptive = word,
    word = word2
  ) %>%
  mutate(
    pos = case_when(
      pos_fullword == "noun" ~ "n",
      pos_fullword == "verb" ~ "v"
    )
  )

# Prefer the pos recoded from the new ratings file and fall back to the pos
# recorded in no_generality when it is missing.
generality_otherwords_getpos <- generality_otherwords %>%
  left_join(no_generality, by = "word") %>%
  mutate(pos = coalesce(pos.x, pos.y)) %>%
  select(word, pos, mean_generality, n_generality_ratings = n_gen_ratings) %>%
  distinct()

# Concreteness norms (read as tab-separated despite the .csv extension).
concreteness <- read_tsv("../word_norms/concreteness_brysbaert.csv") %>%
  select(word = Word, concreteness = Conc.M)

# Master word-level table: one row per distinct
# (num_item_id, word, type, pos, ...) entry with every norm attached.
# used to select seedword but now we don't care about those so removed it - 351 obs
word_info <- read_csv("../word_norms/word_info_with_cdi_ids.csv") %>%
  select(num_item_id, word, type, pos, num_hypernyms, num_hyponyms, aoa, adult_log_freq) %>%
  distinct() %>%
  left_join(defs_synsets, by = c("word", "pos")) %>%
  left_join(concreteness, by = "word") %>%
  left_join(helpfulness, by = "word") %>%
  left_join(babiness_preschoolness, by = "word") %>%
  left_join(word_importance, by = "word") %>%
  # Joining on num_item_id duplicates `word` (word.x / word.y); keep ours.
  left_join(generality_cdi_childes, by = "num_item_id") %>%
  select(-word.y, word = word.x) %>%
  # Fill in generality for words that only appear in the newer ratings file.
  left_join(generality_otherwords_getpos, by = c("word", "pos")) %>%
  mutate(
    n_generality_ratings = coalesce(n_generality_ratings.x, n_generality_ratings.y),
    mean_generality = coalesce(mean_generality.x, mean_generality.y)
  ) %>%
  select(
    -n_generality_ratings.x, -n_generality_ratings.y,
    -mean_generality.x, -mean_generality.y
  ) %>%
  # z-score each measure within part of speech. scale() returns a one-column
  # matrix; as.numeric() stores a plain numeric vector instead, so later
  # filter()/join/plot code does not have to deal with matrix columns.
  group_by(pos) %>%
  mutate(
    pos_scale_hypernyms = as.numeric(scale(num_hypernyms)),
    pos_scale_log_hyponyms = as.numeric(scale(log(1 + num_hyponyms))),
    pos_scale_synsets = as.numeric(scale(n_synsets)),
    pos_scale_defs = as.numeric(scale(n_definitions)),
    pos_scale_generality = as.numeric(scale(mean_generality))
  ) %>%
  ungroup()

# Attach the word-level norms needed below to every child x timepoint x word
# survey row, dropping exact duplicates and rows without a subject code.
# NOTE(review): aoa, num_hypernyms and pos_scale_hypernyms are summarised
# below but are not among the columns taken from `word_norms` here, so they
# must already be present in `vocab_reports` - confirm.
attach_word_norms <- function(word_norms, vocab_reports) {
  word_norms %>%
    select(word, mean_helpfulness, mean_generality, pos_scale_generality, adult_log_freq) %>%
    left_join(vocab_reports, by = "word") %>%
    select(subjCode, timepoint, childAge_days, word, understands, says, everything()) %>%
    distinct() %>%
    filter(!is.na(subjCode))
}

# Per-group means of the word-level norms (NAs ignored). Expects an already
# grouped data frame; the column order matters because the pivots below
# select the range mean_helpfulness_of_vocab:mean_generality_of_vocab.
summarize_vocab_means <- function(grouped_reports) {
  grouped_reports %>%
    summarize(
      mean_helpfulness_of_vocab = mean(mean_helpfulness, na.rm = TRUE),
      mean_aoa_of_vocab = mean(aoa, na.rm = TRUE),
      mean_freq_of_vocab = mean(adult_log_freq, na.rm = TRUE),
      mean_hypernymy_of_vocab_unscaled = mean(num_hypernyms, na.rm = TRUE),
      mean_hypernymy_of_vocab = mean(pos_scale_hypernyms, na.rm = TRUE),
      mean_generality_of_vocab_unscaled = mean(mean_generality, na.rm = TRUE),
      mean_generality_of_vocab = mean(pos_scale_generality, na.rm = TRUE)
    )
}

# Mean characteristics of the words each child is reported to say
# (says == 1), one row per child with one set of columns per timepoint
# (suffix _t<timepoint>).
child_mean_word_info <- attach_word_norms(word_info, vocab_survey_data) %>%
  filter(says == 1) %>%
  group_by(subjCode, timepoint) %>%
  summarize_vocab_means() %>%
  pivot_wider(names_from = timepoint, names_prefix = "t",
              values_from = mean_helpfulness_of_vocab:mean_generality_of_vocab)

# Same summaries, computed separately for words the child does not say
# ("unknown", says == 0) and all other words ("known"); column suffix is
# _t<timepoint>_<known|unknown>.
child_mean_word_info_unknown_known <- attach_word_norms(word_info, vocab_survey_data) %>%
  mutate(says_character = ifelse(says == 0, "unknown", "known")) %>%
  group_by(subjCode, timepoint, says_character) %>%
  summarize_vocab_means() %>%
  pivot_wider(names_from = c("timepoint", "says_character"), names_prefix = "t",
              values_from = mean_helpfulness_of_vocab:mean_generality_of_vocab)

1 tl;dr summary

Hypernymy appears to matter: the mean hypernymy of kids’ vocabulary at T1 predicts ASB5 performance at T2 (controlling for age and T1 vocab size), and when we look at individual word knowledge as a predictor of ASB5 performance, hypernymy predicts the t-values of those words controlling for word frequency (p = .0495); with AoA also in the model the effect is marginal (p = .055).

Unresolved potential issues:
- In mean vocabulary models, VIF is high – total vocab, mean_aoa, and mean_frequency are all highly intercorrelated (which makes sense). How to address?
- These models don’t replicate if we look at T2 vocabulary characteristics instead (ASB5 ~ T2 age + T2 total vocab + T2 mean hypernymy, etc.). It doesn’t seem like much is going on if we compare the predictiveness of knowing a word at T1 vs. only knowing it at T2 (see very last analysis) so I don’t think it’s about how long you’ve known the word. Also doesn’t seem to be due to potential outliers. Power issue?

2 Variables/codebook

The goal of these analyses is to investigate how 2-4yo’s vocabulary knowledge might predict performance on cognitive tasks. We investigate vocabulary at the composite level (e.g., how many total seed/control words) as well as individual word knowledge to identify which specific words might be most helpful for kids to know.

t1: first timepoint, when parents did online survey (summer 2019)
t2: second timepoint, when parents brought kids into lab (fall 2019/winter 2020)

ASB5: number of items child got correct on our adapted SB-5 – max of 12
mean_aoa_of_vocab: mean age of acquisition of all the words child is reported to know (from Kuperman norms)
mean_freq_of_vocab: mean frequency of all the words child is reported to know (from adult speech in CHILDES, log-transformed)
mean_hypernymy_of_vocab (scaled by pos): mean number of hypernyms of all the words child is reported to know (from Wordnet)

3 Correlations among variables

Regressed age out of ASB5; add other characteristics of kids’ vocabularies
ASB5.age: ASB5 performance, controlling for age

# Regress age out of ASB5. na.exclude makes residuals() return one value per
# input row (NA where ASB5 or age is missing), so the residuals stay aligned
# with the rows of current_task_data instead of silently being shorter.
age_sb5 <- lm(ASB5 ~ childAge_days_t2, data = current_task_data,
              na.action = na.exclude)
#age_wj <- lm(WJ ~ childAge_days_t2, data=current_task_data_forcorr)

# Child-level table with age-residualised ASB5, total productive vocabulary
# at each timepoint, and the mean vocabulary characteristics.
# The second survey timepoint is coded 3 in the vocabulary data, hence the
# *_t3 -> *_t2 renames. The generality columns are not renamed here and keep
# their _t3 suffix.
resid_task_data <- current_task_data %>%
  left_join(child_mean_word_info, by = "subjCode") %>%
  mutate(
    ASB5.age = residuals(age_sb5),
    total_says_t1 = total_says_seed_t1 + total_says_control_t1,
    total_says_t2 = total_says_seed_t2 + total_says_control_t2
  ) %>%
  rename(
    mean_helpfulness_of_vocab_t2 = mean_helpfulness_of_vocab_t3,
    mean_aoa_of_vocab_t2 = mean_aoa_of_vocab_t3,
    mean_freq_of_vocab_t2 = mean_freq_of_vocab_t3,
    mean_hypernymy_of_vocab_t2 = mean_hypernymy_of_vocab_t3
  )

# Variables entering the correlation matrix.
resid_task_data_forcorr <- resid_task_data %>%
  select(childAge_days_t2, ASB5.age, total_says_t1, total_says_t2,
         mean_aoa_of_vocab_t1, mean_freq_of_vocab_t1, mean_hypernymy_of_vocab_t1,
         mean_aoa_of_vocab_t2, mean_freq_of_vocab_t2,
         mean_hypernymy_of_vocab_t2)

# Pairwise correlations and their p-values.
resid_data_corrmat <- cor(resid_task_data_forcorr, use = "pairwise.complete.obs")
resid_pmat <- cor.mtest(resid_task_data_forcorr)

resid_sig_matrix <- resid_pmat$p

# Lower-triangle heat map; cells flagged as insignificant via p.mat are
# left blank.
corrplot(corr = resid_data_corrmat, method = "color", type = "lower",
         diag = FALSE, addCoef.col = "black", p.mat = resid_sig_matrix, insig = "blank",
         tl.srt = 45, tl.col = "black", tl.cex = 1.1, number.cex = .9)

4 Investigate aggregate vocabulary characteristics (mean hypernymy, etc.) as predictor of ASB5 performance

4.1 Characteristics of T1 vocabulary: estimates and model comparison

Note: Issues with collinearity among predictors. Total vocab, mean_aoa, and mean_freq highly intercorrelated. The model that contains age, T1 vocab size, and mean T1 vocab hypernymy has the highest adjusted R-squared.

# Candidate models of ASB5 from T1 vocabulary characteristics. Every model
# includes age at T2 and total T1 productive vocabulary; they differ only in
# which mean vocabulary characteristic(s) are added.
# all_t1_characteristics <- lm(ASB5 ~ childAge_days_t2 + total_says_t1 + mean_helpfulness_of_vocab_t1 + mean_aoa_of_vocab_t1 +
#                          mean_freq_of_vocab_t1 + mean_hypernymy_of_vocab_t1, data = resid_task_data)
t1voc_hyper <- lm(ASB5 ~ childAge_days_t2 + total_says_t1 + mean_hypernymy_of_vocab_t1, data = resid_task_data)
t1voc_aoa <- lm(ASB5 ~ childAge_days_t2 + total_says_t1 + mean_aoa_of_vocab_t1, data = resid_task_data)
t1voc_freq <- lm(ASB5 ~ childAge_days_t2 + total_says_t1 + mean_freq_of_vocab_t1, data = resid_task_data)
t1voc_aoa_hyper <- lm(ASB5 ~ childAge_days_t2 + total_says_t1 + mean_hypernymy_of_vocab_t1 + mean_aoa_of_vocab_t1,
                      data = resid_task_data)
t1voc_freq_hyper <- lm(ASB5 ~ childAge_days_t2 + total_says_t1 + mean_hypernymy_of_vocab_t1 + mean_freq_of_vocab_t1,
                       data = resid_task_data)

# Reverse-direction model (hypernymy as the outcome); it is not part of the
# comparison table below.
ASB5_t1voc_hyper <- lm(mean_hypernymy_of_vocab_t1 ~ ASB5 + childAge_days_t2 + total_says_t1, data = resid_task_data)

# Side-by-side table; column order is aoa, freq, hyper, aoa+hyper, freq+hyper.
tab_model(t1voc_aoa, t1voc_freq, t1voc_hyper, t1voc_aoa_hyper, t1voc_freq_hyper)
  ASB 5 ASB 5 ASB 5 ASB 5 ASB 5
Predictors Estimates CI p Estimates CI p Estimates CI p Estimates CI p Estimates CI p
(Intercept) 15.49 -13.56 – 44.53 0.285 -55.15 -101.67 – -8.62 0.022 -1.80 -4.84 – 1.25 0.238 -10.74 -42.69 – 21.22 0.498 -23.06 -76.59 – 30.46 0.386
childAge_days_t2 0.01 0.00 – 0.01 <0.001 0.01 0.00 – 0.01 <0.001 0.00 0.00 – 0.01 <0.001 0.00 0.00 – 0.01 0.001 0.00 0.00 – 0.01 0.001
total_says_t1 0.02 -0.02 – 0.06 0.250 0.04 0.00 – 0.07 0.034 0.00 -0.01 – 0.02 0.449 -0.01 -0.05 – 0.04 0.749 0.02 -0.02 – 0.06 0.318
mean_aoa_of_vocab_t1 -4.29 -11.47 – 2.88 0.232 2.22 -5.69 – 10.13 0.571
mean_freq_of_vocab_t1 7.11 0.93 – 13.29 0.026 2.83 -4.29 – 9.95 0.423
mean_hypernymy_of_vocab_t1 -18.57 -30.39 – -6.74 0.003 -20.94 -35.58 – -6.30 0.007 -15.31 -29.76 – -0.85 0.039
Observations 36 36 36 36 36
R2 / R2 adjusted 0.551 / 0.509 0.599 / 0.562 0.644 / 0.611 0.648 / 0.603 0.652 / 0.607

4.1.1 Plot vocab characteristics vs. performance (controlling for age and T1 vocab)

# ASB5 with age and T1 vocabulary size partialled out. na.exclude pads the
# residuals with NA for rows lm() drops, so they always line up with
# resid_task_data row for row.
t1_age_voc <- lm(ASB5 ~ childAge_days_t2 + total_says_t1, data = resid_task_data,
                 na.action = na.exclude)
residual_plots <- resid_task_data %>%
  mutate(ASB5.age.voc = residuals(t1_age_voc))
# Zero-order correlation between the residualised score and T1 mean hypernymy.
cor.test(residual_plots$ASB5.age.voc, residual_plots$mean_hypernymy_of_vocab_t1)
## 
##  Pearson's product-moment correlation
## 
## data:  residual_plots$ASB5.age.voc and residual_plots$mean_hypernymy_of_vocab_t1
## t = -2.9794, df = 34, p-value = 0.005299
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6816495 -0.1487060
## sample estimates:
##       cor 
## -0.455012

4.1.1.1 Hypernymy

# Residualised ASB5 against mean T1 vocabulary hypernymy, with a linear fit.
ggplot(residual_plots, aes(x = mean_hypernymy_of_vocab_t1, y = ASB5.age.voc)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  theme_classic() +
  #geom_label()+
  # y label: closing parenthesis added.
  labs(x = "Mean hypernymy of vocabulary at T1",
       y = "Induction task performance (controlling for age + vocab size)") +
  # Fixed breaks at +/-2.5 and +/-5 were written for word-level z-scores; the
  # child-level means plotted here span a much narrower range (the fitted
  # slope is about -18.6 on a 12-point outcome), so default breaks are used,
  # as in the frequency and AoA plots.
  #scale_x_continuous(breaks=c(-5, -2.5, 0, 2.5, 5), labels=c(-5, -2.5, 0, 2.5, 5))+
  theme(text = element_text(size = 18))
## `geom_smooth()` using formula 'y ~ x'

4.1.1.2 Frequency

# Residualised ASB5 against mean T1 vocabulary frequency, with a linear fit.
ggplot(residual_plots, aes(x = mean_freq_of_vocab_t1, y = ASB5.age.voc)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  theme_classic() +
  #geom_label()+
  # y label: closing parenthesis added.
  labs(x = "Mean frequency of vocabulary at T1",
       y = "Induction task performance (controlling for age + vocab size)") +
  #scale_x_continuous(breaks=c(-5, -2.5, 0, 2.5, 5), labels=c(-5, -2.5, 0, 2.5, 5))+
  theme(text = element_text(size = 18))
## `geom_smooth()` using formula 'y ~ x'

4.1.1.3 AoA

# Residualised ASB5 against mean T1 vocabulary AoA, with a linear fit.
ggplot(residual_plots, aes(x = mean_aoa_of_vocab_t1, y = ASB5.age.voc)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  theme_classic() +
  #geom_label()+
  # y label: closing parenthesis added.
  labs(x = "Mean AoA of vocabulary at T1",
       y = "Induction task performance (controlling for age + vocab size)") +
  #scale_x_continuous(breaks=c(-5, -2.5, 0, 2.5, 5), labels=c(-5, -2.5, 0, 2.5, 5))+
  theme(text = element_text(size = 18))
## `geom_smooth()` using formula 'y ~ x'

4.1.2 Check hypernymy model for outliers

# Case analysis for t1voc_hyper. Each call labels points with the child's
# subjCode. Leverage (hat values) first:
xHats <- modelCaseAnalysis(t1voc_hyper, Type = "HATVALUES", ID=as.character(resid_task_data$subjCode))

# need to click on cases to investigate when running this
# there's one extreme case - subjCode 217

# Residuals:
xResids <- modelCaseAnalysis(t1voc_hyper, Type="RESIDUALS",ID=as.character(resid_task_data$subjCode))

# studentized residuals look okay (barely) - possible extreme case is subjCode 240

# Cook's distance:
xCooks <- modelCaseAnalysis(t1voc_hyper, Type="COOKSD",ID=as.character(resid_task_data$subjCode))

# cook's distance looks okay - no observations beyond rule-of-thumb threshold

# influence plot
xInfs <- modelCaseAnalysis(t1voc_hyper, Type="INFLUENCEPLOT",ID=as.character(resid_task_data$subjCode))

# subjCode 217 is having a lot of influence and might be an outlier. what happens if we remove it?
# NOTE(review): the case is removed by hard-coded row number 25, which is
# presumably the row holding subjCode 217 - confirm, since any change in the
# row order of resid_task_data would silently drop a different child.
d2 <- dfRemoveCases(resid_task_data, 25)
# Same model as t1voc_hyper, refit without that case.
t1voc_hyper_2 <- lm(ASB5 ~ childAge_days_t2 + total_says_t1 + mean_hypernymy_of_vocab_t1, data = d2)

# Full-sample fit, for comparison with the reduced-sample fit printed next.
summary(t1voc_hyper)
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t2 + total_says_t1 + mean_hypernymy_of_vocab_t1, 
##     data = resid_task_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5029 -0.6391  0.0266  0.8475  2.4544 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -1.798650   1.494708  -1.203 0.237670    
## childAge_days_t2             0.004911   0.001268   3.873 0.000499 ***
## total_says_t1                0.004585   0.005976   0.767 0.448566    
## mean_hypernymy_of_vocab_t1 -18.568876   5.805520  -3.198 0.003109 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.646 on 32 degrees of freedom
## Multiple R-squared:  0.6443, Adjusted R-squared:  0.6109 
## F-statistic: 19.32 on 3 and 32 DF,  p-value: 2.475e-07
# Fit on d2 (one case removed by row number).
summary(t1voc_hyper_2)
## 
## Call:
## lm(formula = ASB5 ~ childAge_days_t2 + total_says_t1 + mean_hypernymy_of_vocab_t1, 
##     data = d2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4989 -0.6802  0.0042  0.9200  2.4826 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -1.571240   1.702260  -0.923 0.363120    
## childAge_days_t2             0.005004   0.001325   3.778 0.000675 ***
## total_says_t1                0.003417   0.007250   0.471 0.640735    
## mean_hypernymy_of_vocab_t1 -17.365579   7.171754  -2.421 0.021506 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.67 on 31 degrees of freedom
## Multiple R-squared:  0.5737, Adjusted R-squared:  0.5325 
## F-statistic: 13.91 on 3 and 31 DF,  p-value: 6.412e-06
# R-squared goes down when 217 is excluded, but hypernymy still matters over and above total vocab

4.2 Characteristics of T2 vocabulary: estimates and model comparison

Note: Issues with collinearity among predictors. Total vocab, mean_aoa, and mean_freq highly intercorrelated. AoA, freq, hypernymy don’t significantly predict ASB5 at T2.

# Candidate models of ASB5 from T2 vocabulary characteristics. Every model
# includes age at T2 and total T2 productive vocabulary; they differ only in
# which mean vocabulary characteristic(s) are added.
# all_t2_characteristics <- lm(ASB5 ~ childAge_days_t2 + total_says_t2 + mean_helpfulness_of_vocab_t2 + mean_aoa_of_vocab_t2 +
#                          mean_freq_of_vocab_t2 + mean_hypernymy_of_vocab_t2, data = resid_task_data)
t2voc_hyper <- lm(ASB5 ~ childAge_days_t2 + total_says_t2 + mean_hypernymy_of_vocab_t2, data = resid_task_data)
t2voc_aoa <- lm(ASB5 ~ childAge_days_t2 + total_says_t2 + mean_aoa_of_vocab_t2, data = resid_task_data)
t2voc_freq <- lm(ASB5 ~ childAge_days_t2 + total_says_t2 + mean_freq_of_vocab_t2, data = resid_task_data)
t2voc_aoa_hyper <- lm(ASB5 ~ childAge_days_t2 + total_says_t2 + mean_hypernymy_of_vocab_t2 + mean_aoa_of_vocab_t2,
                      data = resid_task_data)
t2voc_freq_hyper <- lm(ASB5 ~ childAge_days_t2 + total_says_t2 + mean_hypernymy_of_vocab_t2 + mean_freq_of_vocab_t2,
                       data = resid_task_data)


# Side-by-side table; column order is aoa, freq, hyper, aoa+hyper, freq+hyper.
tab_model(t2voc_aoa, t2voc_freq, t2voc_hyper, t2voc_aoa_hyper, t2voc_freq_hyper)
  ASB 5 ASB 5 ASB 5 ASB 5 ASB 5
Predictors Estimates CI p Estimates CI p Estimates CI p Estimates CI p Estimates CI p
(Intercept) 3.27 -30.91 – 37.45 0.847 -40.80 -110.95 – 29.34 0.245 -4.07 -8.74 – 0.61 0.086 -5.91 -41.99 – 30.16 0.740 -30.53 -102.32 – 41.25 0.392
childAge_days_t2 0.01 0.00 – 0.01 <0.001 0.01 0.00 – 0.01 <0.001 0.01 0.00 – 0.01 <0.001 0.01 0.00 – 0.01 <0.001 0.01 0.00 – 0.01 <0.001
total_says_t2 0.01 -0.04 – 0.05 0.704 0.03 -0.02 – 0.08 0.255 0.01 -0.01 – 0.02 0.453 0.00 -0.04 – 0.05 0.878 0.02 -0.03 – 0.07 0.348
mean_aoa_of_vocab_t2 -1.34 -10.05 – 7.36 0.755 0.46 -8.48 – 9.41 0.917
mean_freq_of_vocab_t2 5.13 -4.12 – 14.38 0.267 3.54 -6.03 – 13.11 0.457
mean_hypernymy_of_vocab_t2 -11.21 -26.45 – 4.03 0.144 -11.45 -27.62 – 4.73 0.159 -9.57 -25.56 – 6.43 0.232
Observations 36 36 36 36 36
R2 / R2 adjusted 0.533 / 0.489 0.549 / 0.507 0.562 / 0.521 0.562 / 0.506 0.570 / 0.515

5 Investigate predictiveness of individual word knowledge at T1 and T2 for T2 outcomes, separately

Because the long-term goal is to identify specific words that could be helpful for cognitive/vocabulary development, shift analyses to look at the role of individual words

## Child-level columns needed for the word-by-word models: ages, total
## receptive/productive vocabulary at each timepoint (seed + control words),
## and the task outcomes.
task_data_wide <- current_task_data %>%
  transmute(
    subjCode,
    childAge_days_t1,
    childAge_days_t2,
    total_understands_t1 = total_understands_control_t1 + total_understands_seed_t1,
    total_says_t1 = total_says_control_t1 + total_says_seed_t1,
    total_understands_t2 = total_understands_control_t2 + total_understands_seed_t2,
    total_says_t2 = total_says_control_t2 + total_says_seed_t2,
    ASB5,
    WJ,
    WJ_scaled,
    ppvt_prop_correct,
    ppvt_standard_score,
    ppvt_percentile
  )

## Long child x word x timepoint table joined to the child-level columns.
## inner join to only include kids who've come into lab
df_to_analyze <- vocab_survey_data %>%
  select(-seedword) %>%
  filter(!is.na(word)) %>%
  distinct() %>%
  inner_join(task_data_wide, by = "subjCode") %>%
  select(-importance_to_teach, -childAge_days) %>%
  rename(item = word)

###code from Molly
#params
# Model formulas, kept as strings and converted with as.formula() at the
# call sites. `says` is the per-word predictor of interest.
predict_asb5 <- "ASB5 ~ says + childAge_days_t1 + total_says_t1"
predict_asb5_fromt2 <- "ASB5 ~ says + childAge_days_t2 + total_says_t2"


# coefficient function
# Fit `mod_formula` on the rows of `df` for one word at one timepoint and
# return the coefficient row for `says`.
#
# word:        value matched against df$item.
# mod_formula: model formula passed to lm(); must contain the term `says`.
# time:        value matched against df$timepoint.
# df:          data frame with at least `item`, `timepoint` and the
#              variables named in the formula.
#
# Returns a data frame with columns term, Estimate, Std..Error, t.value,
# Pr...t.. (names as produced by data.frame()) and item. It has zero rows
# when the model summary contains no row named "says".
get_word_beta <- function(word, mod_formula, time, df){
  # .data / .env make explicit that `item` and `timepoint` are columns while
  # `word` and `time` are this function's arguments, so the filter cannot be
  # hijacked by a column of `df` that happens to be called `word` or `time`.
  relevant_df <- df %>%
    filter(.data$item == .env$word, .data$timepoint == .env$time)
  model <- lm(mod_formula, relevant_df)
  summary(model)$coefficients %>%
    data.frame() %>%
    rownames_to_column("term") %>%
    filter(term == 'says') %>%
    mutate(item = word)
}

#test case
#get_word_beta("squeak", as.formula(predict_asb5_fromt2), 3, df_to_analyze)

5.1 First, how many children know each word at each timepoint?

# Number of children reported to say each word at the second survey
# timepoint (coded 3).
t2_knowledge <- df_to_analyze %>%
  filter(timepoint == 3) %>%
  group_by(item) %>%
  summarise(n_say_t2 = sum(says))
## `summarise()` ungrouping output (override with `.groups` argument)
# Same count at timepoint 1, with the T2 count attached per word.
word_knowledge <- df_to_analyze %>%
  filter(timepoint == 1) %>%
  group_by(item) %>%
  summarise(n_say_t1 = sum(says)) %>%
  left_join(t2_knowledge, by = "item")
## `summarise()` ungrouping output (override with `.groups` argument)
DT::datatable(word_knowledge)

5.2 Longitudinal estimates: T1 knowledge predicting t2 outcomes

5.2.1 ASB5 performance

# One model per word: ASB5 ~ says (at T1) + age at T1 + total T1 vocabulary.
# Collect the `says` coefficient row for every word, most positive t first.
word_coeffs_asb5_t1 <- map_df(
  paste0(unique(df_to_analyze$item)),
  get_word_beta,
  mod_formula = as.formula(predict_asb5),
  time = 1,
  df = df_to_analyze
) %>%
  select(item, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
  arrange(desc(tval))

DT::datatable(word_coeffs_asb5_t1)

5.3 Contemporaneous estimates: t2 knowledge predicting t2 outcomes

5.3.1 ASB5 performance

# One model per word: ASB5 ~ says (at timepoint 3) + age at T2 + total T2
# vocabulary. Collect the `says` coefficient row for every word, most
# positive t first.
word_coeffs_asb5_t2 <- map_df(
  paste0(unique(df_to_analyze$item)),
  get_word_beta,
  mod_formula = as.formula(predict_asb5_fromt2),
  time = 3,
  df = df_to_analyze
) %>%
  select(item, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
  arrange(desc(tval))

DT::datatable(word_coeffs_asb5_t2)

6 How does hypernymy predict t-values?

The t-values above indicate how strongly and reliably knowledge of a specific word predicts performance on the ASB5 and WJ, controlling for age and total vocabulary knowledge. Now we want to test the hypothesis that lower hypernymy is a characteristic of these more helpful words (i.e., words with higher estimates). So, we look at the relation between hypernymy and the t-value for each word.

6.1 t-values generated by T1 word knowledge

6.1.1 ASB5

It looks like there is a relationship here:

# Per-word T1 coefficients with the word-level norms attached.
sb5_coeffs_t1 <- left_join(
  rename(word_coeffs_asb5_t1, word = item),
  word_info,
  by = "word"
)

# t-value of each word's `says` coefficient against its within-pos scaled
# hypernym count, with a linear fit.
ggplot(sb5_coeffs_t1, aes(x = pos_scale_hypernyms, y = tval, label = word)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  theme_classic() +
  #geom_label()+
  labs(
    x = "Hypernyms (scaled by pos)",
    y = "t-value of Coefficient Estimate"
  ) +
  scale_x_continuous(
    breaks = c(-5, -2.5, 0, 2.5, 5),
    labels = c(-5, -2.5, 0, 2.5, 5)
  ) +
  theme(text = element_text(size = 25))
## `geom_smooth()` using formula 'y ~ x'

Does hypernymy predict t-value? Yes

# t-value of word knowledge at T1 (how well it predicts ASB5 at t2) is predicted by hypernymy.
# Hypernymy is z-scored within part of speech (pos_scale_hypernyms); pos is
# not entered as a separate covariate in this model.
# Keep only words that have hypernymy, preschoolness and concreteness values.
sb5_coeffs_t1_alldata <- sb5_coeffs_t1 %>%
  filter(!is.na(pos_scale_hypernyms),
         !is.na(mean_preschoolness), !is.na(concreteness))

sb5_t1_mod1 <- lm(tval ~ pos_scale_hypernyms, data=sb5_coeffs_t1_alldata)
summary(sb5_t1_mod1)
## 
## Call:
## lm(formula = tval ~ pos_scale_hypernyms, data = sb5_coeffs_t1_alldata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0252 -0.7974 -0.0464  0.7499  3.7109 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.11113    0.06514   1.706 0.088978 .  
## pos_scale_hypernyms -0.24966    0.06685  -3.735 0.000222 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.165 on 318 degrees of freedom
## Multiple R-squared:  0.04202,    Adjusted R-squared:  0.03901 
## F-statistic: 13.95 on 1 and 318 DF,  p-value: 0.0002225

…but the effect weakens when you account for frequency & AoA. We want to account for these things because they are related to hypernymy and we’re interested in the effect of hypernymy alone.

# Hypernymy effect with adult log frequency and AoA added as covariates.
# note: aoa and frequency are correlated at -.62, but VIF isn't too concerning, so okay to have them in same model
sb5_t1_mod2 <- lm(tval ~ pos_scale_hypernyms + adult_log_freq + aoa,
               data=sb5_coeffs_t1_alldata)
summary(sb5_t1_mod2)
## 
## Call:
## lm(formula = tval ~ pos_scale_hypernyms + adult_log_freq + aoa, 
##     data = sb5_coeffs_t1_alldata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9747 -0.7371 -0.0541  0.7894  3.4922 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -0.57513    0.51602  -1.115 0.265929    
## pos_scale_hypernyms -0.12981    0.06750  -1.923 0.055383 .  
## adult_log_freq       0.16378    0.04891   3.349 0.000915 ***
## aoa                 -0.04774    0.04998  -0.955 0.340312    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.114 on 302 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:   0.11,  Adjusted R-squared:  0.1011 
## F-statistic: 12.44 on 3 and 302 DF,  p-value: 1.087e-07
# Same model without the aoa term, fitted only on words that have an aoa
# value so it can be compared with sb5_t1_mod2 in the anova() below.
sb5_t1_mod3 <- lm(tval ~ adult_log_freq + pos_scale_hypernyms, data = filter(sb5_coeffs_t1_alldata, !is.na(aoa)))
summary(sb5_t1_mod3)
## 
## Call:
## lm(formula = tval ~ adult_log_freq + pos_scale_hypernyms, data = filter(sb5_coeffs_t1_alldata, 
##     !is.na(aoa)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9150 -0.7693 -0.0453  0.8274  3.4289 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.01150    0.23979  -4.218 3.26e-05 ***
## adult_log_freq       0.19161    0.03927   4.879 1.72e-06 ***
## pos_scale_hypernyms -0.13295    0.06741  -1.972   0.0495 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.114 on 303 degrees of freedom
## Multiple R-squared:  0.1073, Adjusted R-squared:  0.1014 
## F-statistic:  18.2 on 2 and 303 DF,  p-value: 3.424e-08
# Single-predictor models whose residuals give "tval controlling for X".
# na.exclude makes residuals() return one value per row of
# sb5_coeffs_t1_alldata (NA where the predictor is missing), so the residual
# columns added below stay aligned with the data even if a predictor has
# missing values.
freq.resid <- lm(tval ~ adult_log_freq, sb5_coeffs_t1_alldata, na.action = na.exclude)
conc.resid <- lm(tval ~ concreteness, sb5_coeffs_t1_alldata, na.action = na.exclude)
hyper.resid <- lm(tval ~ pos_scale_hypernyms, sb5_coeffs_t1_alldata, na.action = na.exclude)

# Does adding aoa improve on frequency + hypernymy?
anova(sb5_t1_mod2, sb5_t1_mod3)
## Analysis of Variance Table
## 
## Model 1: tval ~ pos_scale_hypernyms + adult_log_freq + aoa
## Model 2: tval ~ adult_log_freq + pos_scale_hypernyms
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    302 374.63                           
## 2    303 375.76 -1   -1.1315 0.9121 0.3403
# t-values with frequency, and separately hypernymy, partialled out.
sb5_coeffs_resid <- sb5_coeffs_t1_alldata %>%
  mutate(tval_control_for_freq = residuals(freq.resid),
         tval_control_for_hyper = residuals(hyper.resid))

6.1.1.1 Residual plots

# Words that can be highlighted in the (currently commented-out) plots.
training_words <- c(
  "address", "body", "bunch", "cloth", "end", "land", "list", "meal",
  "month", "number", "piece", "place", "sound", "vegetable", "way", "work"
)

# ggplot(data = sb5_coeffs_resid, aes(x = pos_scale_hypernyms, y = tval, label = word))+
#   geom_point()+
#   geom_label(size=3)+
#   geom_smooth(method="lm")+
#   theme_classic()

# Hypernymy-residualised t-values against frequency, each word drawn as a
# label, with a linear fit.
ggplot(
  sb5_coeffs_resid,
  aes(x = adult_log_freq, y = tval_control_for_hyper, label = word)
) +
  geom_label() +
  geom_smooth(method = "lm") +
  theme_classic() +
  labs(
    x = "Frequency",
    y = "t-value (controlling for hypernyms)"
  )
## `geom_smooth()` using formula 'y ~ x'

# Frequency-residualised t-values against within-pos scaled hypernyms;
# points plus word labels, with a linear fit.
ggplot(
  sb5_coeffs_resid,
  aes(x = pos_scale_hypernyms, y = tval_control_for_freq, label = word)
) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm") +
  theme_classic() +
  labs(
    x = "Hypernyms (scaled by pos)",
    y = "t-value (controlling for frequency)"
  )
## `geom_smooth()` using formula 'y ~ x'

# ggplot(sb5_coeffs_resid, aes(x = adult_log_freq, y = Estimate, label = word))+
#   geom_point()+
#   geom_label(data= filter(sb5_coeffs_resid, word %in% training_words), size=5)+
#   geom_smooth(method="lm")+
#   theme_classic()

6.1.2 Check t-value prediction models for outlier words (sb5_t1_mod3)

# Leverage check for sb5_t1_mod3, labelling points by word. The model was
# fitted on the words that have an aoa value, so the IDs come from that
# same subset.
xHats_tvals <- modelCaseAnalysis(sb5_t1_mod3, Type = "HATVALUES",
                                 ID = as.character(filter(sb5_coeffs_t1_alldata, !is.na(aoa))$word))

# need to click on cases to investigate when running this
# Look the flagged cases up in the data the model was actually fitted on
# (the !is.na(aoa) subset). Indexing the unfiltered sb5_coeffs_t1_alldata
# would return different words whenever rows were dropped by the filter.
# Assumes xHats_tvals$Rownames holds row names of the fitted data - confirm.
filter(sb5_coeffs_t1_alldata, !is.na(aoa))[xHats_tvals$Rownames, ]
##  [1] word                   Estimate               SE                    
##  [4] tval                   p                      num_item_id           
##  [7] type                   pos                    num_hypernyms         
## [10] num_hyponyms           aoa                    adult_log_freq        
## [13] n_synsets              n_definitions          concreteness          
## [16] mean_helpfulness       mean_babiness          mean_preschoolness    
## [19] mean_parent_importance n_importance_ratings   n_generality_ratings  
## [22] mean_generality        pos_scale_hypernyms    pos_scale_log_hyponyms
## [25] pos_scale_synsets      pos_scale_defs         pos_scale_generality  
## <0 rows> (or 0-length row.names)
# Residual check for sb5_t1_mod3, labelling points by word.
xResids_tvals <- modelCaseAnalysis(sb5_t1_mod3, Type = "RESIDUALS",
                                   ID = as.character(filter(sb5_coeffs_t1_alldata, !is.na(aoa))$word))

# studentized residuals look okay

# Cook's distance. sb5_t1_mod3 is a word-level model, so the IDs must be the
# words it was fitted on (as in the other three checks), not the child-level
# subject codes from resid_task_data.
xCooks_tvals <- modelCaseAnalysis(sb5_t1_mod3, Type = "COOKSD",
                                  ID = as.character(filter(sb5_coeffs_t1_alldata, !is.na(aoa))$word))

# cook's distance looks okay - a couple observations beyond rule-of-thumb threshold though
# Look flagged cases up in the data the model was fitted on (the !is.na(aoa)
# subset), not the unfiltered data frame.
# Assumes xCooks_tvals$Rownames holds row names of the fitted data - confirm.
filter(sb5_coeffs_t1_alldata, !is.na(aoa))[xCooks_tvals$Rownames, ]
##  [1] word                   Estimate               SE                    
##  [4] tval                   p                      num_item_id           
##  [7] type                   pos                    num_hypernyms         
## [10] num_hyponyms           aoa                    adult_log_freq        
## [13] n_synsets              n_definitions          concreteness          
## [16] mean_helpfulness       mean_babiness          mean_preschoolness    
## [19] mean_parent_importance n_importance_ratings   n_generality_ratings  
## [22] mean_generality        pos_scale_hypernyms    pos_scale_log_hyponyms
## [25] pos_scale_synsets      pos_scale_defs         pos_scale_generality  
## <0 rows> (or 0-length row.names)
# influence plot
xInfs_tvals <- modelCaseAnalysis(sb5_t1_mod3, Type = "INFLUENCEPLOT",
                                 ID = as.character(filter(sb5_coeffs_t1_alldata, !is.na(aoa))$word))

# none of these words appear to be having undue influence so I think we're okay

6.2 Coefficients generated by t2 word knowledge

6.2.1 ASB5

Does hypernymy predict the coefficient t-values? Again, yes, but the effect goes away when you account for other word features. The R-squared is quite small too.

# Per-word coefficients from the T2 models, with the word-level norms attached.
sb5_coeffs_t2 <- left_join(
  rename(word_coeffs_asb5_t2, word = item),
  word_info,
  by = "word"
)

# Keep only words that have both a scaled hypernym count and an AoA value.
sb5_coeffs_t2_alldata <- sb5_coeffs_t2 %>%
  filter(
    !is.na(pos_scale_hypernyms),
    !is.na(aoa)
  )

# Hypernymy alone as a predictor of the T2-based t-values.
sb5_t2_mod1 <- lm(tval ~ pos_scale_hypernyms, data = sb5_coeffs_t2_alldata)
summary(sb5_t2_mod1)
## 
## Call:
## lm(formula = tval ~ pos_scale_hypernyms, data = sb5_coeffs_t2_alldata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6758 -0.9300 -0.0780  0.8429  2.9634 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)          0.11753    0.06910   1.701   0.0900 .
## pos_scale_hypernyms -0.16666    0.06822  -2.443   0.0152 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.193 on 296 degrees of freedom
## Multiple R-squared:  0.01976,    Adjusted R-squared:  0.01645 
## F-statistic: 5.968 on 1 and 296 DF,  p-value: 0.01515
# Same outcome as sb5_t2_mod1, adding adult log frequency and aoa as
# predictors alongside hypernymy.
sb5_t2_mod2 <- lm(tval ~ pos_scale_hypernyms + adult_log_freq + aoa,
               data=sb5_coeffs_t2_alldata)

summary(sb5_t2_mod2)
## 
## Call:
## lm(formula = tval ~ pos_scale_hypernyms + adult_log_freq + aoa, 
##     data = sb5_coeffs_t2_alldata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8208 -0.8967  0.0160  0.8863  3.2628 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         -0.48639    0.54640  -0.890  0.37410   
## pos_scale_hypernyms -0.08680    0.06881  -1.262  0.20811   
## adult_log_freq       0.14728    0.05137   2.867  0.00444 **
## aoa                 -0.04590    0.05276  -0.870  0.38506   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.16 on 294 degrees of freedom
## Multiple R-squared:  0.07953,    Adjusted R-squared:  0.07014 
## F-statistic: 8.468 on 3 and 294 DF,  p-value: 2.055e-05
# Adult log frequency as the only predictor of the word-level t-values.
sb5_t2_mod3 <- lm(tval ~ adult_log_freq, data=sb5_coeffs_t2_alldata)
summary(sb5_t2_mod3)
## 
## Call:
## lm(formula = tval ~ adult_log_freq, data = sb5_coeffs_t2_alldata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8430 -0.8740 -0.0041  0.8857  3.3558 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -0.99172    0.24122  -4.111  5.1e-05 ***
## adult_log_freq  0.18840    0.03937   4.785  2.7e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.161 on 296 degrees of freedom
## Multiple R-squared:  0.0718, Adjusted R-squared:  0.06866 
## F-statistic:  22.9 on 1 and 296 DF,  p-value: 2.704e-06
# Single-predictor models whose residuals are used as the t-values
# "controlling for" frequency and for hypernymy, respectively.
#
# na.exclude + resid() pads the residual vector with NA for any row lm() drops
# because of a missing value, so it always has one entry per row of
# sb5_coeffs_t2_alldata. `model$residuals` only covers the rows actually used,
# which would make mutate() fail on a length mismatch if a predictor
# (e.g. adult_log_freq, which is not NA-filtered) had missing values.
freq.resid.t2 <- lm(tval ~ adult_log_freq, sb5_coeffs_t2_alldata,
                    na.action = na.exclude)
hyper.resid.t2 <- lm(tval ~ pos_scale_hypernyms, sb5_coeffs_t2_alldata,
                     na.action = na.exclude)

sb5_coeffs_resid_t2 <- sb5_coeffs_t2_alldata %>%
  mutate(tval_control_for_freq = resid(freq.resid.t2),
         tval_control_for_hyper = resid(hyper.resid.t2))

6.2.1.1 Residual plots

# Word list referenced by the commented-out labelled plot further down.
training_words <- c(
  "address", "body", "bunch", "cloth",
  "end", "land", "list", "meal",
  "month", "number", "piece", "place",
  "sound", "vegetable", "way", "work"
)
# 
# ggplot(data = sb5_coeffs_resid_t2, aes(x = pos_scale_hypernyms, y = tval, label = word))+
#   geom_point()+
#   geom_label(size=3)+
#   geom_smooth(method="lm")+
#   theme_classic()

# Residuals of the hypernymy-only model plotted against adult log frequency.
ggplot(
  data = sb5_coeffs_resid_t2,
  mapping = aes(x = adult_log_freq, y = tval_control_for_hyper)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Frequency", y = "t-value (controlling for hypernyms)") +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'

# Residuals of the frequency-only model plotted against hypernymy.
ggplot(
  data = sb5_coeffs_resid_t2,
  mapping = aes(x = pos_scale_hypernyms, y = tval_control_for_freq)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Hypernyms (scaled by pos)", y = "t-value (controlling for frequency)") +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'

# ggplot(sb5_coeffs_resid, aes(x = adult_log_freq, y = Estimate, label = word))+
#   geom_point()+
#   geom_label(data= filter(sb5_coeffs_resid, word %in% training_words), size=5)+
#   geom_smooth(method="lm")+
#   theme_classic()

6.3 Why is T1 vocab predictive but contemporaneous vocab isn’t?

How are kids’ vocabularies changing between T1 and T2?

# Long-format version of the per-child vocabulary summaries: one row per
# child, measure and timepoint, restricted to children present in task_data.
mean_word_info_plot <- child_mean_word_info %>%
  pivot_longer(
    cols = mean_helpfulness_of_vocab_t1:mean_generality_of_vocab_t3,
    names_to = "measure",
    values_to = "value"
  ) %>%
  # sep = -3 splits off the last three characters (e.g. "_t1") as the timepoint
  separate(measure, into = c("measure", "timepoint"), sep = -3) %>%
  filter(subjCode %in% task_data$subjCode) %>%
  mutate(
    # "_t1" -> "T1"
    timepoint = toupper(str_replace(timepoint, "_", "")),
    # relabel timepoint 3 as 2 ("T3" -> "T2")
    timepoint = str_replace(timepoint, "3", "2")
  )

6.3.1 Hypernymy

# One line per child: mean hypernymy of vocabulary at each timepoint.
mean_word_info_plot %>%
  filter(measure == "mean_hypernymy_of_vocab") %>%
  ggplot(aes(x = as.factor(timepoint), y = value, group = subjCode)) +
  geom_point() +
  geom_line() +
  labs(x = "Timepoint", y = "Mean hypernymy of vocab (scaled by PoS)") +
  theme_classic()

6.3.2 Frequency

# One line per child: mean frequency of vocabulary at each timepoint.
mean_word_info_plot %>%
  filter(measure == "mean_freq_of_vocab") %>%
  ggplot(aes(x = as.factor(timepoint), y = value, group = subjCode)) +
  geom_point() +
  geom_line() +
  labs(x = "Timepoint", y = "Mean frequency of vocab") +
  theme_classic()

7 Optional reading: New analyses using effect coding for timepoint and word knowledge

knowledge_factor_coding: never learned = -1, knew at T1 = 0, learned at T2 = 1

# For every child x word pair, derive knowledge indicators from the `says`
# values at timepoints 1 and 3, plus a single three-valued code:
#   -1 = never learned, 0 = knew at T1, 1 = learned at T2.
dummy_coded_data <- vocab_tasks_byword %>%
  filter(subjCode %in% df_to_analyze$subjCode) %>%
  select(subjCode, timepoint, word, says) %>%
  distinct() %>%
  # spread `says` into one column per timepoint (t1, t3)
  pivot_wider(
    names_from = "timepoint",
    values_from = "says",
    names_prefix = "t"
  ) %>%
  mutate(
    knew_at_t1 = ifelse(t1 == 1, 1, 0),
    learned_at_t2 = ifelse(t1 == 0 & t3 == 1, 1, 0),
    never_learned = ifelse(t1 == 0 & t3 == 0, 1, 0)
  ) %>%
  mutate(
    # the three indicators are mutually exclusive, so branch order is irrelevant
    knowledge_factor_coding = case_when(
      knew_at_t1 == 1 ~ 0,
      learned_at_t2 == 1 ~ 1,
      never_learned == 1 ~ -1
    )
  ) %>%
  select(-c(t1, t3))

# For each word, count the children in each knowledge category and spread the
# counts into one column per category.
word_knowledge_summary <- dummy_coded_data %>%
  group_by(word, knowledge_factor_coding) %>%
  summarise(n = n()) %>%
  pivot_wider(
    names_from = knowledge_factor_coding,
    values_from = n,
    names_prefix = "knew"
  ) %>%
  rename(
    never_knew = `knew-1`,
    knew_at_t1 = knew0,
    learned_at_t2 = knew1
  )
## `summarise()` regrouping output by 'word' (override with `.groups` argument)
# Words with more than 8 children in both the knew-at-T1 and learned-at-T2 cells.
words_to_use <- filter(word_knowledge_summary, knew_at_t1 > 8, learned_at_t2 > 8)

# Child x word knowledge codes joined to the child-level task data; the
# knowledge code is converted to a factor for use as a categorical predictor.
dummy_coding_analysis <- dummy_coded_data %>%
  left_join(task_data_wide, by = "subjCode") %>%
  rename(item = word) %>%
  mutate(
    knowledge_factor_coding = factor(as.character(knowledge_factor_coding))
  )

# Per-word model formula, kept as a string.
predict_asb5_effect <- "ASB5 ~ knowledge_factor_coding + childAge_days_t2 + total_says_t1"

# new coefficient function
#
# Fits `mod_formula` with lm() on the rows of `df` whose `item` equals `word`
# and returns the coefficient rows for the two knowledge_factor_coding terms.
#
#   word:        value of `item` to select
#   mod_formula: model formula (formula object or string) passed to lm()
#   df:          data frame with an `item` column and the formula's variables
#
# Returns a data frame of the matching rows of the lm coefficient table
# (row names moved into `term`), with `word` added as an `item` column.
get_word_beta_2 <- function(word, mod_formula, df){
  relevant_df <- df %>%
    # .env$word pins the comparison to the function argument; a bare `word`
    # would silently refer to a `word` column if `df` ever contained one.
    filter(item == .env$word)
  model <- lm(mod_formula, relevant_df)
  summary(model)$coefficients %>%
    data.frame() %>%
    rownames_to_column("term") %>%
    filter(term %in% c("knowledge_factor_coding0", "knowledge_factor_coding1")) %>%
    mutate(item = .env$word)
}

# test case
vegetable <- get_word_beta_2(
  word = "vegetable",
  mod_formula = predict_asb5_effect,
  df = dummy_coding_analysis
)

# filter out words with no variability: per word, count the distinct knowledge
# codes among rows where both the code and ASB5 are present
word_knowledge_variability <- dummy_coding_analysis %>%
  filter(!is.na(knowledge_factor_coding), !is.na(ASB5)) %>%
  group_by(item) %>%
  summarize(factor_variability = n_distinct(knowledge_factor_coding))
## `summarise()` ungrouping output (override with `.groups` argument)
# Words with only one observed knowledge code, and the data without them.
problem_words <- filter(word_knowledge_variability, factor_variability == 1)
dummy_coding_noproblems <- filter(
  dummy_coding_analysis,
  !(item %in% problem_words$item)
)

##alternative: filter out words with kind of meaningless groups
dummy_coding_words_to_use <- filter(
  dummy_coding_analysis,
  item %in% words_to_use$word
)

# do the new thing: fit the per-word model for every word in
# dummy_coding_words_to_use and stack the coefficient rows
word_coeffs_asb5_effect <- map_df(
  paste0("", unique(dummy_coding_words_to_use$item), ""),
  get_word_beta_2,
  mod_formula = as.formula(predict_asb5_effect),
  df = dummy_coding_words_to_use
) %>%
  select(
    item,
    term,
    Estimate,
    SE = Std..Error,
    tval = t.value,
    p = Pr...t..
  ) %>%
  # arrange() ignores grouping by default, so rows are sorted by tval overall;
  # the result stays grouped by item
  group_by(item) %>%
  arrange(desc(tval))

DT::datatable(word_coeffs_asb5_effect)

7.1 Predict tval for words: separate never-knew vs. knew at t1 and never-knew vs. knew at t2

# Coefficient rows for the "0" level of knowledge_factor_coding, joined to word_info.
word_coeffs_never_vs_t1 <- word_coeffs_asb5_effect %>%
  filter(term == "knowledge_factor_coding0") %>%
  rename(word = item) %>%
  left_join(word_info, by = "word")

# Coefficient rows for the "1" level of knowledge_factor_coding, joined to word_info.
word_coeffs_never_vs_t2 <- word_coeffs_asb5_effect %>%
  filter(term == "knowledge_factor_coding1") %>%
  rename(word = item) %>%
  left_join(word_info, by = "word")

7.1.1 Never-knew vs. Knew at T1

# Full model: predict the knowledge_factor_coding0 t-values from hypernymy,
# aoa, concreteness, helpfulness, preschoolness and adult log frequency.
never_vs_t1_mod1 <- lm(tval ~ pos_scale_hypernyms + aoa + concreteness + mean_helpfulness + mean_preschoolness+
                         adult_log_freq, data=word_coeffs_never_vs_t1)
summary(never_vs_t1_mod1)
## 
## Call:
## lm(formula = tval ~ pos_scale_hypernyms + aoa + concreteness + 
##     mean_helpfulness + mean_preschoolness + adult_log_freq, data = word_coeffs_never_vs_t1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1214 -0.5419 -0.0472  0.5417  3.8334 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)          0.49167    1.87314   0.262    0.794
## pos_scale_hypernyms -0.12109    0.14398  -0.841    0.403
## aoa                 -0.19821    0.13091  -1.514    0.135
## concreteness         0.01433    0.18025   0.079    0.937
## mean_helpfulness     0.07208    0.33262   0.217    0.829
## mean_preschoolness  -0.01157    0.21663  -0.053    0.958
## adult_log_freq       0.05987    0.12859   0.466    0.643
## 
## Residual standard error: 1.068 on 64 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1263, Adjusted R-squared:  0.04443 
## F-statistic: 1.542 on 6 and 64 DF,  p-value: 0.1788
# Reduced model: hypernymy and aoa only.
never_vs_t1_mod2 <- lm(tval ~ pos_scale_hypernyms + aoa, data=word_coeffs_never_vs_t1)
summary(never_vs_t1_mod2)
## 
## Call:
## lm(formula = tval ~ pos_scale_hypernyms + aoa, data = word_coeffs_never_vs_t1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0234 -0.5323 -0.0115  0.5235  3.9279 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           1.3972     0.5680   2.460   0.0164 *
## pos_scale_hypernyms  -0.1387     0.1333  -1.040   0.3018  
## aoa                  -0.2476     0.0985  -2.514   0.0143 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.041 on 68 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1183, Adjusted R-squared:  0.09232 
## F-statistic:  4.56 on 2 and 68 DF,  p-value: 0.01386

7.1.2 Never-knew vs. Knew at T2

# Full model: predict the knowledge_factor_coding1 t-values from hypernymy,
# aoa, concreteness, helpfulness, preschoolness and adult log frequency.
never_vs_t2_mod1 <- lm(tval ~ pos_scale_hypernyms + aoa + concreteness + mean_helpfulness + mean_preschoolness+
                         adult_log_freq, data=word_coeffs_never_vs_t2)
summary(never_vs_t2_mod1)
## 
## Call:
## lm(formula = tval ~ pos_scale_hypernyms + aoa + concreteness + 
##     mean_helpfulness + mean_preschoolness + adult_log_freq, data = word_coeffs_never_vs_t2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8774 -0.8406 -0.1364  0.6600  3.2891 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          3.852602   1.856967   2.075  0.04198 * 
## pos_scale_hypernyms -0.003576   0.142744  -0.025  0.98009   
## aoa                 -0.413546   0.129778  -3.187  0.00221 **
## concreteness        -0.147383   0.177399  -0.831  0.40913   
## mean_helpfulness    -0.330542   0.329537  -1.003  0.31956   
## mean_preschoolness  -0.030715   0.214317  -0.143  0.88648   
## adult_log_freq       0.047544   0.127415   0.373  0.71026   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.059 on 65 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1756, Adjusted R-squared:  0.09949 
## F-statistic: 2.307 on 6 and 65 DF,  p-value: 0.04422
# Reduced model: aoa as the only predictor.
never_vs_t2_mod2 <- lm(tval ~ aoa, data=word_coeffs_never_vs_t2)
summary(never_vs_t2_mod2)
## 
## Call:
## lm(formula = tval ~ aoa, data = word_coeffs_never_vs_t2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7794 -0.8126 -0.0837  0.7233  3.1931 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.05154    0.54473   3.766 0.000342 ***
## aoa         -0.33912    0.09502  -3.569 0.000653 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.033 on 70 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1539, Adjusted R-squared:  0.1419 
## F-statistic: 12.74 on 1 and 70 DF,  p-value: 0.0006525

7.2 Follow-up comparison: Knew at T1 vs. Knew at T2

Factors: knew at T1 = 0, knew at (only) T2 = 1. So coefficients generated here show the effect of knowing the word at T2, relative to kids who knew it at T1. T-values generally small and not very reliable.

# Drop the never-learned rows and recode the remaining two knowledge levels
# as a two-level factor: "1" where knowledge_factor_coding is 1, else "0".
t1_t2_effect_comparison <- dummy_coding_words_to_use %>%
  filter(knowledge_factor_coding != -1) %>%
  mutate(
    t1_t2_factor_coding = factor(
      ifelse(knowledge_factor_coding == 1, "1", "0")
    )
  )

# Fits `mod_formula` with lm() on the rows of `df` whose `item` equals `word`
# and returns the coefficient row for the "t1_t2_factor_coding1" term.
#
#   word:        value of `item` to select
#   mod_formula: model formula (formula object or string) passed to lm()
#   df:          data frame with an `item` column and the formula's variables
#
# Returns a data frame holding the matching row of the lm coefficient table
# (row names moved into `term`), with `word` added as an `item` column.
get_word_beta_3 <- function(word, mod_formula, df){
  relevant_df <- df %>%
    # .env$word pins the comparison to the function argument; a bare `word`
    # would silently refer to a `word` column if `df` ever contained one.
    filter(item == .env$word)
  model <- lm(mod_formula, relevant_df)
  summary(model)$coefficients %>%
    data.frame() %>%
    rownames_to_column("term") %>%
    filter(term == "t1_t2_factor_coding1") %>%
    mutate(item = .env$word)
}

# Fit the T1-vs-T2 model for every word in t1_t2_effect_comparison and stack
# the resulting coefficient rows.
word_coeffs_asb5_t1_v_t2 <- map_df(
  paste0("", unique(t1_t2_effect_comparison$item), ""),
  get_word_beta_3,
  mod_formula = as.formula("ASB5 ~ t1_t2_factor_coding + childAge_days_t2 + total_says_t1"),
  df = t1_t2_effect_comparison
) %>%
  select(
    item,
    term,
    Estimate,
    SE = Std..Error,
    tval = t.value,
    p = Pr...t..
  ) %>%
  # arrange() ignores grouping by default, so rows are sorted by tval overall;
  # the result stays grouped by item
  group_by(item) %>%
  arrange(desc(tval))

DT::datatable(word_coeffs_asb5_t1_v_t2)
hist(word_coeffs_asb5_t1_v_t2$tval)

7.2.1 Predict t

(what does knowing a word earlier at T1 get you compared to if you only knew it at T2?) (absolutely nothing - adjusted R-squared is negligible)

# Join word_info onto the per-word T1-vs-T2 coefficients (keyed by `word`).
t1_vs_t2_coeffs <- left_join(
  rename(word_coeffs_asb5_t1_v_t2, word = item),
  word_info,
  by = "word"
)

# Predict the T1-vs-T2 t-values from the word-level features.
t1_vs_t2_mod1 <- lm(
  formula = tval ~ pos_scale_hypernyms + aoa + concreteness + mean_helpfulness +
    mean_preschoolness + adult_log_freq,
  data = t1_vs_t2_coeffs
)
summary(t1_vs_t2_mod1)
## 
## Call:
## lm(formula = tval ~ pos_scale_hypernyms + aoa + concreteness + 
##     mean_helpfulness + mean_preschoolness + adult_log_freq, data = t1_vs_t2_coeffs)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.75505 -0.45506 -0.00322  0.57751  2.36287 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)          4.08533    1.90584   2.144   0.0358 *
## pos_scale_hypernyms  0.15040    0.14650   1.027   0.3084  
## aoa                 -0.29306    0.13319  -2.200   0.0313 *
## concreteness        -0.18006    0.18207  -0.989   0.3263  
## mean_helpfulness    -0.42828    0.33821  -1.266   0.2099  
## mean_preschoolness   0.03702    0.21996   0.168   0.8669  
## adult_log_freq      -0.06649    0.13077  -0.508   0.6129  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.087 on 65 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.08899,    Adjusted R-squared:  0.004893 
## F-statistic: 1.058 on 6 and 65 DF,  p-value: 0.3967