Get t-values for each word

Formula: total ~ known + childage_days + prop_known

# Per-word regression formula, kept as a string; it is converted with
# as.formula() at the point where the model is fitted.
MODEL_FORMULA <- "total ~ known + childage_days + prop_known"

# Fit the regression for a single word and return one coefficient row.
#
# Arguments:
#   word        Value of `item` whose rows are used to fit the model.
#   mod_formula Model formula handed to lm().
#   df          Data frame with an `item` column plus the formula's variables.
#   predictor   Name of the coefficient to return (default "known").
#
# Returns a data.frame with the columns predictor, Estimate, Std..Error,
# t.value, Pr...t.. and item. It has one row, or zero rows when the
# coefficient table from summary() has no row named `predictor`.
#
# Stops with an explicit message when `df` has no rows for `word`.
get_word_beta <- function(word, mod_formula, df, predictor = "known") {
  # Index on the column directly: `word` always means the argument here, even
  # when `df` itself has a column called `word`. which() drops NA comparisons.
  relevant_df <- df[which(df[["item"]] == word), , drop = FALSE]
  if (nrow(relevant_df) == 0) {
    stop("No rows found for item '", word, "'", call. = FALSE)
  }

  model <- lm(mod_formula, data = relevant_df)

  # data.frame() turns the matrix column names into syntactic ones
  # ("Std. Error" -> "Std..Error", "Pr(>|t|)" -> "Pr...t..").
  coefs <- data.frame(summary(model)$coefficients)
  coef_table <- data.frame(
    predictor = rownames(coefs),
    coefs,
    row.names = NULL,
    stringsAsFactors = FALSE
  )

  out <- coef_table[coef_table$predictor == predictor, , drop = FALSE]
  rownames(out) <- NULL
  out$item <- rep(word, nrow(out))
  out
}

# Single-word run of the helper, for the word "ache".
ache <- get_word_beta(
  "ache",
  as.formula(MODEL_FORMULA),
  seed_control_words
)

#### Fit the model for every word ####
# Run get_word_beta() once per unique item, stack the resulting rows, keep the
# estimate / SE / t / p columns under shorter names, and sort by t-value.
# as.character() makes sure each word is passed as a plain string even when
# `item` is stored as a factor.
word_coeffs <- map_df(
  as.character(unique(seed_control_words$item)),
  get_word_beta,
  as.formula(MODEL_FORMULA),
  seed_control_words
) %>%
  select(item, Estimate, SE = Std..Error, tval = t.value, p = Pr...t..) %>%
  arrange(tval)

# Attach the columns of `types_only` to each word's coefficients, matching
# rows on `item`.
word_coeffs_by_type <- left_join(word_coeffs, types_only, by = "item")
## Warning: Column `item` joining character vector and factor, coercing into
## character vector

Plot distribution of t-values

# Density of per-word t-values, one filled curve per word type.
ggplot(data = word_coeffs_by_type, mapping = aes(x = tval, fill = type)) +
  geom_density(alpha = 0.2) +
  theme_classic()

Does bimodal seed distribution have to do with category?

# Load the word/category lookup, rename `Word` to `item` so it can be joined
# to the coefficient table, and drop the MCDI_Cat column.
# stringsAsFactors = FALSE keeps `item` as character on every R version, so
# joining it to a character `item` column needs no factor coercion.
# NOTE(review): absolute, machine-specific path - this only runs where that
# Dropbox folder exists.
categories <- read.csv(
  "C:/Users/Christina/Dropbox/UW Madison/Grant/AOA parent survey/survey_words_categories_2.csv",
  stringsAsFactors = FALSE
) %>%
  rename(item = Word) %>%
  select(-MCDI_Cat)

# Add the category columns to each word's coefficients, matching on `item`.
word_coeffs_with_cat <- word_coeffs_by_type %>%
  left_join(categories, by = "item")
## Warning: Column `item` joining character vector and factor, coercing into
## character vector
# Store the category name as a factor, then show the joined table.
word_coeffs_with_cat[["CatName"]] <- as.factor(word_coeffs_with_cat[["CatName"]])

DT::datatable(data = word_coeffs_with_cat)

T values by category (seed words)

# Seed words that have a category: t-value density, one panel per category.
word_coeffs_with_cat %>%
  filter(!is.na(CatName), type == "seed") %>%
  ggplot(aes(x = tval, fill = CatName)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~CatName) +
  theme_classic()
## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

T values by category (control words)

All categories

# Control words that have a category: t-value density, one panel per category.
word_coeffs_with_cat %>%
  filter(type == "control", !is.na(CatName)) %>%
  ggplot(aes(x = tval, fill = CatName)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~CatName) +
  theme_classic()

Categories that looked like high density

# Restrict to three named categories and plot their t-value densities,
# one panel per category.
word_coeffs_with_cat %>%
  filter(CatName %in% c("prepositions and locations", "people", "body parts")) %>%
  ggplot(aes(x = tval, fill = CatName)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~CatName) +
  theme_classic()

  • There are 2 control words in “prepositions and locations” - “whole” and “middle”… I guess that makes sense

Compare seed and control T values by category

# Words with a category other than "prepositions and locations":
# t-value density filled by word type, one panel per category.
word_coeffs_with_cat %>%
  filter(!is.na(CatName), CatName != "prepositions and locations") %>%
  ggplot(aes(x = tval, fill = type)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~CatName) +
  theme_classic()
## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

Get categories where seed > control

# Keep only the words that belong to the three listed categories.
specific_words <- filter(
  word_coeffs_with_cat,
  CatName %in% c(
    "small household items",
    "descriptive words",
    "mental states and attributes"
  )
)

What are the most helpful seed words? (based on category)

# Categories to keep, then the seed words that fall in any of them.
helpfulcats <- c(
  "mental states and attributes",
  "places to go",
  "small household items",
  "words about time"
)
helpful_seed <- word_coeffs_with_cat %>%
  filter(type == "seed", CatName %in% helpfulcats)

DT::datatable(data = helpful_seed)

What are the most helpful seed words, just based on t > 5.5?

# Seed words whose t-value exceeds 5.5, shown as an interactive table.
seed_larger_t <- word_coeffs_with_cat %>%
  filter(type == "seed", tval > 5.5)
DT::datatable(data = seed_larger_t)

What proportion of older (36-42 months) and younger (30-35.9 months) kids know these words?

Understands

Says

Of these words, which might be good to teach?

Understands - known by <50% of younger group

Understands - biggest change between age groups

Says - known by <50% of younger group

Says - biggest change between age groups