Read in relevant files & merge

# Wordbank AoA export restricted to American English rows; only the `Word`
# (uni_lemma) column is kept, and it is used below purely as an exclusion list.
mcdi <- read.csv("hypernym_analysis-master/wordbank/wordbank-aoa-data.csv") %>% 
  filter(language == "English (American)") %>% 
  select(uni_lemma, words, aoa) %>% 
  rename(aoa_months = aoa, Word = uni_lemma) %>% 
  select(Word)
# Seed-word table; Conc.M is dropped here and re-attached from the norms below.
# NOTE(review): the bare `conc_z_pos` does not drop that column (only
# `-Conc.M` is negated); it is overwritten by the mutate() further down anyway.
# Confirm whether `-conc_z_pos` was intended.
hypernyms <- read.csv("seedWords_aoa_hypernyms_concreteness_CDI_z.csv") %>% 
  select(-Conc.M, conc_z_pos) %>% 
  rename(num_hypernyms = hypernyms)
# Concreteness ratings: keep rows whose Dom_Pos is not 0, then Word + mean rating.
concreteness <- read.csv("Concreteness_ratings_Brysbaert_et_al_BRM.txt", sep = "") %>% 
  filter(Dom_Pos != 0) %>% 
  select(Word, Conc.M)
# Hyponym and peer counts, keyed by lemma and part of speech.
hyponyms <- read.csv("AOA/aoa_to_wordnet_with_hyponyms1.csv") %>% 
  rename(Word = wordnet_lemma, pos = wordnet_PoS) %>% 
  select(Word, num_hyponyms, pos)
peers <- read.csv("AOA/aoa_to_wordnet_with_peers.csv") %>% 
  rename(Word = wordnet_lemma, pos = wordnet_PoS) %>% 
  select(Word, num_peers, pos)
subtlex <- read.csv("subtlex_full.txt", sep = "")
candidate_words <- left_join(hypernyms, concreteness, by = "Word") %>% 
  # Compare against the Word *vector*: `%in% mcdi` matched against the data
  # frame itself, which never matches a word, so no row was ever excluded.
  # Results computed before this fix should be re-checked.
  filter(!(Word %in% mcdi$Word)) %>% 
  left_join(hyponyms, by = c("Word", "pos")) %>% 
  left_join(peers, by = c("Word", "pos")) %>%
  group_by(pos) %>% 
  # scale() returns a one-column matrix; flatten it to a plain numeric column.
  mutate(conc_z_pos = as.numeric(scale(Conc.M))) %>% 
  ungroup() %>% 
  arrange(Word)
head(candidate_words)

Look up childes frequency

candidate_word_list <- candidate_words$Word
# stop doing this every time you need to run this code, it takes forever
# candidate_word_tokens_raw <- get_tokens(collection = "Eng-NA",
#                                         role_exclude = "Target_Child",
#                                         token = candidate_word_list)
# read csv instead
candidate_word_tokens_raw <- read.csv("candidate_word_tokens_CHILDES.csv")
candidate_word_tokens <- candidate_word_tokens_raw %>% 
  select(gloss, speaker_id, token_order)

# Count every lower-cased gloss once, before restricting to the candidate list,
# so the diagnostic below can actually see unexpected glosses.
childes_gloss_counts <- candidate_word_tokens %>% 
  mutate(gloss = tolower(gloss)) %>%
  group_by(gloss) %>%
  summarise(raw_freq_childes = n()) %>% 
  mutate(logFreq_childes = log(raw_freq_childes + 1))
# Frequencies for the candidate words only (same rows and columns as before).
candidate_word_token_summary <- childes_gloss_counts %>% 
  filter(gloss %in% candidate_word_list)
# Glosses present in the token file but not in the candidate list. This used to
# be derived from the already-filtered summary and was therefore always empty.
weird_words <- filter(childes_gloss_counts, !(gloss %in% candidate_word_list))
# Candidate words with no token at all; they get NA frequencies in the join below.
not_in_childes <- filter(candidate_words, !(Word %in% candidate_word_token_summary$gloss))
# df to merge
childes_frequency <- candidate_word_token_summary %>% 
  rename(Word = gloss)
candidate_words_childes <- left_join(candidate_words, childes_frequency, by = "Word")

Add subtlex frequency

# Attach the SUBTLEX raw count per word and its log(count + 1) transform.
subtlex_merge <- select(subtlex, Word, FREQcount)
candidate_words_all_freqs <- left_join(candidate_words_childes, subtlex_merge, by = "Word") %>% 
  rename(raw_freq_subtlex = FREQcount) %>% 
  mutate(logFreq_subtlex = log(raw_freq_subtlex + 1))
# fix num_hyponyms
# Go through as.character() first: if read.csv produced a factor, a bare
# as.integer() would return the level codes rather than the printed counts.
candidate_words_all_freqs$num_hyponyms <- as.integer(
  as.character(candidate_words_all_freqs$num_hyponyms)
)

Play around with graphing stuff and correlations

Correlation matrix

# Numeric columns only: drop the identifier / categorical columns.
candidate_words_corrmatrix <- select(candidate_words_all_freqs, -Word, -pos, -CatName, -MCDI_Cat)
candidate_words_cor <- cor(candidate_words_corrmatrix, use = "pairwise.complete.obs", method = "pearson")
# cor.mtest() expects the data (rows = words, columns = variables), not the
# correlation matrix; feeding it `candidate_words_cor` tested correlations
# between columns of the correlation matrix and gave meaningless p-values.
p.mat_item <- cor.mtest(candidate_words_corrmatrix)
pMatrix_item <- p.mat_item$p
# Lower triangle with coefficients printed; cells with p >= .05 are left blank.
corrplot(candidate_words_cor, method = 'color', type = 'lower', diag = TRUE, addCoef.col = "black",
         tl.col = "black", number.font = 2, number.cex = 10 / ncol(candidate_words_cor),
         p.mat = pMatrix_item, sig.level = 0.05, insig = "blank")

Look at relations between density & category hierarchy

Hyponyms & Hypernyms

# Hyponym count against hypernym count, with every point labelled by its word.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = num_hypernyms, y = num_hyponyms, label = as.character(Word))
) +
  geom_point() +
  geom_label() +
  theme_classic()

Hypernyms & Peers

# Peer count against hypernym count, with every point labelled by its word.
# (A linear smoother, geom_smooth(method="lm"), is deliberately left out.)
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = num_hypernyms, y = num_peers, label = as.character(Word))
) +
  geom_point() +
  geom_label() +
  theme_classic()

# What are those low-hypernym, highly dense words? (they are all verbs - this makes sense)
# Rows with more than 300 peers and fewer than 2 hypernyms, shown as a table.
lowHyper_highPeer <- candidate_words_all_freqs %>%
  filter(num_peers > 300, num_hypernyms < 2) %>%
  select(Word, pos, aoa, num_peers, num_hyponyms, num_hypernyms, Conc.M, logFreq_childes)
DT::datatable(lowHyper_highPeer)

Hyponyms & Peers

# Peer count against hyponym count, labelled by word; x-axis ticks every 15
# from 0 to 120. (A linear smoother, geom_smooth(method="lm"), is left out.)
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = num_hyponyms, y = num_peers, label = as.character(Word))
) +
  geom_point() +
  geom_label() +
  scale_x_continuous(breaks = seq(0, 120, 15)) +
  theme_classic()

Compare childes and subtlex frequencies (r = .76)

Points

# Log SUBTLEX frequency against log CHILDES frequency, with a linear fit.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = logFreq_subtlex)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Words

# Same axes as the points version, but each point carries its word as a label.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = logFreq_subtlex, label = as.character(Word))
) +
  geom_point() +
  geom_label() +
  theme_classic()

Childes frequency vs. concreteness (r = -.2, ns)

Points

# Mean concreteness rating against log CHILDES frequency, with a linear fit.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = Conc.M)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Words

# Concreteness against log CHILDES frequency, each point labelled by its word.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = Conc.M, label = as.character(Word))
) +
  geom_point() +
  geom_label() +
  theme_classic()

Childes frequency vs. AOA (r = -.45)

Points

# Age of acquisition against log CHILDES frequency, with a linear fit.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = aoa)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Words

# Age of acquisition against log CHILDES frequency, each point labelled by word.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = aoa, label = as.character(Word))
) +
  geom_point() +
  geom_label() +
  theme_classic()

Childes frequency vs. AOA for lower-than-mean concreteness

Points

# AoA against log CHILDES frequency for words with Conc.M below the hard-coded
# 3.90 cutoff, with a linear fit.
candidate_words_all_freqs %>%
  filter(Conc.M < 3.90) %>%
  ggplot(mapping = aes(x = logFreq_childes, y = aoa)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Words

# Same low-concreteness subset (Conc.M below the hard-coded 3.90 cutoff), with
# each point labelled by its word.
candidate_words_all_freqs %>%
  filter(Conc.M < 3.90) %>%
  ggplot(mapping = aes(x = logFreq_childes, y = aoa, label = as.character(Word))) +
  geom_point() +
  geom_label() +
  theme_classic()

What are the low-concreteness (lower than mean), high-frequency (higher than mean), late-aoa (older than 5) words?

# Words below the 3.90 concreteness cutoff, with AoA above 5 and log CHILDES
# frequency above 5.41 (all three thresholds are hard-coded), shown as a table.
hFreq_hAOA_lConc <- candidate_words_all_freqs %>%
  filter(Conc.M < 3.90, aoa > 5, logFreq_childes > 5.41) %>%
  select(
    Word, pos, aoa, num_hypernyms, num_hyponyms, num_peers,
    Conc.M, logFreq_childes, logFreq_subtlex
  )
DT::datatable(hFreq_hAOA_lConc)

Some stats

Does concreteness predict AOA, controlling for frequency? (yes)

# Linear model of AoA on log CHILDES frequency and concreteness; rows with a
# missing value in any model variable are dropped by lm()'s default NA handling.
predAOA <- lm(
  formula = aoa ~ logFreq_childes + Conc.M,
  data = candidate_words_all_freqs
)
summary(predAOA)

Call:
lm(formula = aoa ~ logFreq_childes + Conc.M, data = candidate_words_all_freqs)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.53400 -0.40355  0.05828  0.46345  1.36855 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      6.76833    0.22543  30.025  < 2e-16 ***
logFreq_childes -0.19723    0.02027  -9.728  < 2e-16 ***
Conc.M          -0.19149    0.04408  -4.344  1.9e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.6303 on 307 degrees of freedom
  (9 observations deleted due to missingness)
Multiple R-squared:  0.2465,    Adjusted R-squared:  0.2416 
F-statistic: 50.22 on 2 and 307 DF,  p-value: < 2.2e-16
---
title: "Create df of candidate seed words & poke at it"
output:
  html_notebook:
    code_folding: hide
---

```{r setup, include=FALSE}
# Chunk defaults for the whole notebook: show code, hide warnings and messages.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(tidyverse)  # %>%, the dplyr verbs and ggplot2 used in the chunks below
library(childesr)   # get_tokens(); only referenced in commented-out code here
library(lme4)
library(corrplot)   # cor.mtest() and corrplot()
library(ggrepel)
library(psych)
```

### Read in relevant files & merge
```{r}
# Wordbank AoA export restricted to American English rows; only the `Word`
# (uni_lemma) column is kept, and it is used below purely as an exclusion list.
mcdi <- read.csv("hypernym_analysis-master/wordbank/wordbank-aoa-data.csv") %>% 
  filter(language == "English (American)") %>% 
  select(uni_lemma, words, aoa) %>% 
  rename(aoa_months = aoa, Word = uni_lemma) %>% 
  select(Word)
# Seed-word table; Conc.M is dropped here and re-attached from the norms below.
# NOTE(review): the bare `conc_z_pos` does not drop that column (only
# `-Conc.M` is negated); it is overwritten by the mutate() further down anyway.
# Confirm whether `-conc_z_pos` was intended.
hypernyms <- read.csv("seedWords_aoa_hypernyms_concreteness_CDI_z.csv") %>% 
  select(-Conc.M, conc_z_pos) %>% 
  rename(num_hypernyms = hypernyms)
# Concreteness ratings: keep rows whose Dom_Pos is not 0, then Word + mean rating.
concreteness <- read.csv("Concreteness_ratings_Brysbaert_et_al_BRM.txt", sep = "") %>% 
  filter(Dom_Pos != 0) %>% 
  select(Word, Conc.M)
# Hyponym and peer counts, keyed by lemma and part of speech.
hyponyms <- read.csv("AOA/aoa_to_wordnet_with_hyponyms1.csv") %>% 
  rename(Word = wordnet_lemma, pos = wordnet_PoS) %>% 
  select(Word, num_hyponyms, pos)
peers <- read.csv("AOA/aoa_to_wordnet_with_peers.csv") %>% 
  rename(Word = wordnet_lemma, pos = wordnet_PoS) %>% 
  select(Word, num_peers, pos)
subtlex <- read.csv("subtlex_full.txt", sep = "")

candidate_words <- left_join(hypernyms, concreteness, by = "Word") %>% 
  # Compare against the Word *vector*: `%in% mcdi` matched against the data
  # frame itself, which never matches a word, so no row was ever excluded.
  # Results computed before this fix should be re-checked.
  filter(!(Word %in% mcdi$Word)) %>% 
  left_join(hyponyms, by = c("Word", "pos")) %>% 
  left_join(peers, by = c("Word", "pos")) %>%
  group_by(pos) %>% 
  # scale() returns a one-column matrix; flatten it to a plain numeric column.
  mutate(conc_z_pos = as.numeric(scale(Conc.M))) %>% 
  ungroup() %>% 
  arrange(Word)

head(candidate_words)
```

#### Look up childes frequency
```{r}
candidate_word_list <- candidate_words$Word

# stop doing this every time you need to run this code, it takes forever
# candidate_word_tokens_raw <- get_tokens(collection = "Eng-NA",
#                                         role_exclude = "Target_Child",
#                                         token = candidate_word_list)
# read csv instead
candidate_word_tokens_raw <- read.csv("candidate_word_tokens_CHILDES.csv")
candidate_word_tokens <- candidate_word_tokens_raw %>% 
  select(gloss, speaker_id, token_order)

# Count every lower-cased gloss once, before restricting to the candidate list,
# so the diagnostic below can actually see unexpected glosses.
childes_gloss_counts <- candidate_word_tokens %>% 
  mutate(gloss = tolower(gloss)) %>%
  group_by(gloss) %>%
  summarise(raw_freq_childes = n()) %>% 
  mutate(logFreq_childes = log(raw_freq_childes + 1))

# Frequencies for the candidate words only (same rows and columns as before).
candidate_word_token_summary <- childes_gloss_counts %>% 
  filter(gloss %in% candidate_word_list)

# Glosses present in the token file but not in the candidate list. This used to
# be derived from the already-filtered summary and was therefore always empty.
weird_words <- filter(childes_gloss_counts, !(gloss %in% candidate_word_list))
# Candidate words with no token at all; they get NA frequencies in the join below.
not_in_childes <- filter(candidate_words, !(Word %in% candidate_word_token_summary$gloss))

# df to merge
childes_frequency <- candidate_word_token_summary %>% 
  rename(Word = gloss)

candidate_words_childes <- left_join(candidate_words, childes_frequency, by = "Word")
```

#### Add subtlex frequency
```{r}
# Attach the SUBTLEX raw count per word and its log(count + 1) transform.
subtlex_merge <- select(subtlex, Word, FREQcount)
candidate_words_all_freqs <- left_join(candidate_words_childes, subtlex_merge, by = "Word") %>% 
  rename(raw_freq_subtlex = FREQcount) %>% 
  mutate(logFreq_subtlex = log(raw_freq_subtlex + 1))

# fix num_hyponyms
# Go through as.character() first: if read.csv produced a factor, a bare
# as.integer() would return the level codes rather than the printed counts.
candidate_words_all_freqs$num_hyponyms <- as.integer(
  as.character(candidate_words_all_freqs$num_hyponyms)
)
```

## Play around with graphing stuff and correlations
### Correlation matrix
```{r}
# Numeric columns only: drop the identifier / categorical columns.
candidate_words_corrmatrix <- select(candidate_words_all_freqs, -Word, -pos, -CatName, -MCDI_Cat)

candidate_words_cor <- cor(candidate_words_corrmatrix, use = "pairwise.complete.obs", method = "pearson")

# cor.mtest() expects the data (rows = words, columns = variables), not the
# correlation matrix; feeding it `candidate_words_cor` tested correlations
# between columns of the correlation matrix and gave meaningless p-values.
p.mat_item <- cor.mtest(candidate_words_corrmatrix)
pMatrix_item <- p.mat_item$p
# Lower triangle with coefficients printed; cells with p >= .05 are left blank.
corrplot(candidate_words_cor, method = 'color', type = 'lower', diag = TRUE, addCoef.col = "black",
         tl.col = "black", number.font = 2, number.cex = 10 / ncol(candidate_words_cor),
         p.mat = pMatrix_item, sig.level = 0.05, insig = "blank")
```
### Look at relations between density & category hierarchy {.tabset}
#### Hyponyms & Hypernyms
```{r}
# Hyponym count against hypernym count, with every point labelled by its word.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = num_hypernyms, y = num_hyponyms, label = as.character(Word))
) +
  geom_point() +
  geom_label() +
  theme_classic()
```

#### Hypernyms & Peers
```{r}
# Peer count against hypernym count, with every point labelled by its word.
# (A linear smoother, geom_smooth(method="lm"), is deliberately left out.)
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = num_hypernyms, y = num_peers, label = as.character(Word))
) +
  geom_point() +
  geom_label() +
  theme_classic()

# What are those low-hypernym, highly dense words? (they are all verbs - this makes sense)
# Rows with more than 300 peers and fewer than 2 hypernyms, shown as a table.
lowHyper_highPeer <- candidate_words_all_freqs %>%
  filter(num_peers > 300, num_hypernyms < 2) %>%
  select(Word, pos, aoa, num_peers, num_hyponyms, num_hypernyms, Conc.M, logFreq_childes)
DT::datatable(lowHyper_highPeer)

```
#### Hyponyms & Peers
```{r}
# Peer count against hyponym count, labelled by word; x-axis ticks every 15
# from 0 to 120. (A linear smoother, geom_smooth(method="lm"), is left out.)
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = num_hyponyms, y = num_peers, label = as.character(Word))
) +
  geom_point() +
  geom_label() +
  scale_x_continuous(breaks = seq(0, 120, 15)) +
  theme_classic()
```


### Compare childes and subtlex frequencies (*r* = .76) {.tabset}
#### Points
```{r}
# Log SUBTLEX frequency against log CHILDES frequency, with a linear fit.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = logFreq_subtlex)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```
#### Words
```{r}
# Same axes as the points version, but each point carries its word as a label.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = logFreq_subtlex, label = as.character(Word))
) +
  geom_point() +
  geom_label() +
  theme_classic()
```

### Childes frequency vs. concreteness (*r* = -.2, ns) {.tabset}
#### Points
```{r}
# Mean concreteness rating against log CHILDES frequency, with a linear fit.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = Conc.M)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```
#### Words
```{r}
# Concreteness against log CHILDES frequency, each point labelled by its word.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = Conc.M, label = as.character(Word))
) +
  geom_point() +
  geom_label() +
  theme_classic()
```

### Childes frequency vs. AOA (*r* = -.45) {.tabset}
#### Points
```{r}
# Age of acquisition against log CHILDES frequency, with a linear fit.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = aoa)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```
#### Words
```{r}
# Age of acquisition against log CHILDES frequency, each point labelled by word.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = aoa, label = as.character(Word))
) +
  geom_point() +
  geom_label() +
  theme_classic()
```

### Childes frequency vs. AOA for lower-than-mean concreteness {.tabset}
#### Points
```{r}
# AoA against log CHILDES frequency for words with Conc.M below the hard-coded
# 3.90 cutoff, with a linear fit.
candidate_words_all_freqs %>%
  filter(Conc.M < 3.90) %>%
  ggplot(mapping = aes(x = logFreq_childes, y = aoa)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```
#### Words
```{r}
# Same low-concreteness subset (Conc.M below the hard-coded 3.90 cutoff), with
# each point labelled by its word.
candidate_words_all_freqs %>%
  filter(Conc.M < 3.90) %>%
  ggplot(mapping = aes(x = logFreq_childes, y = aoa, label = as.character(Word))) +
  geom_point() +
  geom_label() +
  theme_classic()

```

### What are the low-concreteness (lower than mean), high-frequency (higher than mean), late-aoa (older than 5) words?
```{r}
# Words below the 3.90 concreteness cutoff, with AoA above 5 and log CHILDES
# frequency above 5.41 (all three thresholds are hard-coded), shown as a table.
hFreq_hAOA_lConc <- candidate_words_all_freqs %>%
  filter(Conc.M < 3.90, aoa > 5, logFreq_childes > 5.41) %>%
  select(
    Word, pos, aoa, num_hypernyms, num_hyponyms, num_peers,
    Conc.M, logFreq_childes, logFreq_subtlex
  )

DT::datatable(hFreq_hAOA_lConc)
```

## Some stats
### Does concreteness predict AOA, controlling for frequency? (yes)
```{r}

# Linear model of AoA on log CHILDES frequency and concreteness; rows with a
# missing value in any model variable are dropped by lm()'s default NA handling.
predAOA <- lm(
  formula = aoa ~ logFreq_childes + Conc.M,
  data = candidate_words_all_freqs
)
summary(predAOA)

```