Regression analysis for each word

# Document setup: load packages, set knitr chunk defaults, and set the
# default ggplot2 theme used by the figures below.
library(knitr)

# Chunk defaults: show code; hide messages and warnings; stop on errors;
# no caching; no code tidying.
# TRUE/FALSE are spelled out because `T` and `F` are ordinary variables that
# can be reassigned, whereas TRUE and FALSE are reserved words.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(feather)
library(langcog)
library(modelr)
library(broom)
library(corrplot)


# Classic theme with a 10pt base font as the session-wide ggplot2 default.
theme_set(theme_classic(base_size = 10))

Predicting mtld at t2

MODEL: lm(log_mtld_t2 ~ know_word_at_t1 + log_mtld_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)

# Per-word coefficients for the model predicting log MTLD at t2.
# Presumably one row per word; `word` and `t` are the columns used in this
# file.
word_coeffs_min5_t2 <- read_csv("word_coeffs_log_mtld_t2.csv")

# Histogram of the per-word t values with red reference lines at -2 and 2.
word_coeffs_min5_t2 %>%
  ggplot(aes(x = t)) +
  geom_histogram() +
  geom_vline(aes(xintercept = 2), color = "red") +
  geom_vline(aes(xintercept = -2), color = "red") +
  ggtitle("t-distribution ") +
  theme_classic()

# Interactive table of every word, largest t first.
DT::datatable(arrange(word_coeffs_min5_t2, desc(t)))

Predicting t with centrality

# Control variables for the per-word t values: word frequency and semantic
# centrality.

# SUBTLEX-US frequency table; `Word` is renamed to `word` and `Lg10WF` to
# `log_freq`.
# The path is relative, like the other `../1_mtld_measure/...` read in this
# file, instead of an absolute path that only resolves on one machine.
# Assumes the document is knit from its own directory under `analyses/`.
freq <- read_tsv(
  "../1_mtld_measure/data/control_variables/SUBTLEXus_corpus.txt"
) %>%
  rename(word = Word,
         log_freq = Lg10WF)

# Semantic density norms, source:
# https://github.com/billdthompson/semantic-density-norms/tree/master/results
# read_csv() downloads http(s) URLs itself, so the file is read directly
# rather than fetched into a string with RCurl first.
# NOTE(review): the URL carries an access token in its query string. Tokens
# are credentials and can expire -- confirm whether the plain raw URL works
# and drop the token if so.
density_norms <- read_csv("https://raw.githubusercontent.com/billdthompson/semantic-density-norms/master/results/en-semantic-densities-N100000.csv?token=AF32iQ1bVCk5vJYPFGzapTG5b0JoZELhks5bWJNGwA%3D%3D") %>%
  rename(centrality = `global-centrality`) %>%
  select(word, centrality)


# Attach centrality and log frequency to each word's t value.
# The join key is spelled out so the match cannot silently widen to any
# other column name the tables happen to share.
word_coeffs_min5_t2_with_vars <- word_coeffs_min5_t2 %>%
  left_join(density_norms, by = "word") %>%
  left_join(freq, by = "word")

# Regress the per-word t value on centrality and log frequency.
# Words with no match in either lookup table carry NA predictors after the
# left joins; the printed summary reports such rows as deleted due to
# missingness.
lm(t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars) %>%
  summary()

## 
## Call:
## lm(formula = t ~ centrality + log_freq, data = word_coeffs_min5_t2_with_vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98367 -0.42708  0.04979  0.41267  1.83862 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.06692    0.16852   0.397    0.691    
## centrality  -0.59248    1.14546  -0.517    0.605    
## log_freq     0.14598    0.01789   8.158 7.93e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5929 on 1311 degrees of freedom
##   (897 observations deleted due to missingness)
## Multiple R-squared:  0.05215,    Adjusted R-squared:  0.0507 
## F-statistic: 36.06 on 2 and 1311 DF,  p-value: 5.661e-16

Predicting change in mtld

MODEL: lm(mtld_diff ~ know_word_at_t1 + age_t1 + age_diff + log(n_transcripts_t1) + log(n_transcripts_t2), complete_df)

# Per-word coefficients for the model predicting the change in MTLD.
# `t` is the only column used in this block.
word_coeffs_min5 <- read_csv("word_coeffs_log_mtld_diff.csv")

# Histogram of the per-word t values with red reference lines at -2 and 2.
word_coeffs_min5 %>%
  ggplot(aes(x = t)) +
  geom_histogram() +
  geom_vline(aes(xintercept = 2), color = "red") +
  geom_vline(aes(xintercept = -2), color = "red") +
  ggtitle("t-distribution ") +
  theme_classic()

# Interactive table of every word, largest t first.
DT::datatable(arrange(word_coeffs_min5, desc(t)))

Sanity Check

# Word types per child in the t1 bin, keeping only types with a count
# above 5.
word_counts <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv") %>%
  filter(tbin == "t1", count > 5) %>%
  select(target_child_id, gloss, count)

# For every child, sum the t values of the words they produced: unweighted
# (`t`), weighted by count (`weighted_t`), and weighted by log count
# (`weighted_log_t`).
# NOTE(review): sum() runs without na.rm, so a child with any word that has
# no t value after the left join gets NA for all three sums -- confirm this
# is intended.
t1_word_counts_with_ts <- word_counts %>%
  left_join(select(word_coeffs_min5_t2, word, t),
            by = c("gloss" = "word")) %>%
  transmute(target_child_id,
            t,
            weighted_t = t * count,
            weighted_log_t = t * log(count)) %>%
  group_by(target_child_id) %>%
  summarise_all(sum)

# Outcome measure: log MTLD at t2, one value per `target_child_id`
# (presumably one row per child -- verify against the CSV).
mtld <- read_csv("../3_kid_vocabs/semantic_density_df.csv") %>%
  select(target_child_id, log_mtld_t2)

# Add the outcome to the per-child t sums, then drop the id so that only the
# measures to be correlated remain.
# The join key is explicit rather than inferred from shared column names.
t1_word_counts_with_ts_mtld <- t1_word_counts_with_ts %>%
  left_join(mtld, by = "target_child_id") %>%
  select(-target_child_id)


# Correlation matrix over all remaining columns, using every pair of rows
# where both values are present.
corr_mat <- cor(t1_word_counts_with_ts_mtld, use = "pairwise.complete.obs")

library(corrplot)

# Matrix of p-values for the same column pairs.
# NOTE(review): `use` is passed through cor.mtest()'s `...`; confirm the
# underlying test actually honours it.
p.mat <- cor.mtest(
  t1_word_counts_with_ts_mtld,
  conf.level = .95,
  use = "pairwise.complete.obs"
)[["p"]]

# 100-step palette: the red-white-blue ramp reversed, so it runs from blue
# through white to red.
cols <- rev(colorRampPalette(c("red", "white", "blue"))(100))

# Colour-coded correlation plot with coefficients printed in black. Cells
# whose p-value exceeds .05 are blanked, variables are ordered by
# hierarchical clustering, and the diagonal is omitted.
corrplot(
  corr_mat,
  method = "color", col = cols,
  type = "full", order = "hclust", diag = FALSE,
  addCoef.col = "black", number.cex = .7,
  p.mat = p.mat, sig.level = .05, insig = "blank",
  tl.col = "black", tl.srt = 90
)

# Scatter of each child's summed t against log MTLD at t2, with a linear
# fit overlaid.
ggplot(t1_word_counts_with_ts_mtld, aes(x = t, y = log_mtld_t2)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# Same per-child table as above, restricted to children whose summed t is
# below 65. filter() also drops rows where `t` is NA.
# The join key is explicit rather than inferred from shared column names.
# NOTE(review): 65 looks like an outlier cutoff on the summed t, but the
# threshold is not justified anywhere in this file -- confirm its origin.
t1_word_counts_with_ts_mtld_sub <- t1_word_counts_with_ts %>%
  left_join(mtld, by = "target_child_id") %>%
  select(-target_child_id) %>%
  filter(t < 65)


# Correlation matrix for the restricted sample, using every pair of rows
# where both values are present. This overwrites the earlier `corr_mat`.
corr_mat <- cor(t1_word_counts_with_ts_mtld_sub, use = "pairwise.complete.obs")

# Matrix of p-values for the same column pairs. This overwrites the earlier
# `p.mat`.
# NOTE(review): `use` is passed through cor.mtest()'s `...`; confirm the
# underlying test actually honours it.
p.mat <- cor.mtest(
  t1_word_counts_with_ts_mtld_sub,
  conf.level = .95,
  use = "pairwise.complete.obs"
)[["p"]]

# 100-step palette: the red-white-blue ramp reversed, so it runs from blue
# through white to red.
cols <- rev(colorRampPalette(c("red", "white", "blue"))(100))

# Colour-coded correlation plot with coefficients printed in black. Cells
# whose p-value exceeds .05 are blanked, variables are ordered by
# hierarchical clustering, and the diagonal is omitted.
corrplot(
  corr_mat,
  method = "color", col = cols,
  type = "full", order = "hclust", diag = FALSE,
  addCoef.col = "black", number.cex = .7,
  p.mat = p.mat, sig.level = .05, insig = "blank",
  tl.col = "black", tl.srt = 90
)

# Scatter of summed t against log MTLD at t2 for the restricted sample, with
# a linear fit overlaid.
t1_word_counts_with_ts_mtld_sub %>%
  ggplot(aes(x = t, y = log_mtld_t2)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Regression analysis for each word

2018-07-18

Predicting mtld at t2

Predicting t with centrality

Predicting change in mtld

Sanity Check