Read in relevant files & merge
mcdi <- read.csv("hypernym_analysis-master/wordbank/wordbank-aoa-data.csv") %>%
filter(language=="English (American)") %>%
select(uni_lemma, words, aoa) %>%
rename(aoa_months=aoa, Word=uni_lemma) %>%
select(Word)
hypernyms <- read.csv("seedWords_aoa_hypernyms_concreteness_CDI_z.csv") %>%
select(-Conc.M, conc_z_pos) %>%
rename(num_hypernyms=hypernyms)
concreteness <- read.csv("Concreteness_ratings_Brysbaert_et_al_BRM.txt", sep="") %>%
filter(Dom_Pos != 0) %>%
select(Word, Conc.M)
hyponyms <- read.csv("AOA/aoa_to_wordnet_with_hyponyms1.csv") %>%
rename(Word=wordnet_lemma, pos=wordnet_PoS) %>%
select(Word, num_hyponyms, pos)
peers <- read.csv("AOA/aoa_to_wordnet_with_peers.csv") %>%
rename(Word=wordnet_lemma, pos=wordnet_PoS) %>%
select(Word, num_peers, pos)
subtlex <- read.csv("subtlex_full.txt", sep="")
candidate_words <- left_join(hypernyms, concreteness, by="Word") %>%
filter(!(Word %in% mcdi)) %>%
left_join(hyponyms, by=c("Word", "pos")) %>%
left_join(peers, by=c("Word","pos")) %>%
group_by(pos) %>%
mutate(conc_z_pos = scale(Conc.M)) %>%
ungroup() %>%
arrange(Word)
head(candidate_words)
Look up childes frequency
candidate_word_list <- candidate_words$Word
#stop doing this every time you need to run this code, it takes forever
#candidate_word_tokens_raw <- get_tokens(collection = "Eng-NA",
#role_exclude = "Target_Child",
#token=candidate_word_list)
#read csv instead
candidate_word_tokens_raw <- read.csv("candidate_word_tokens_CHILDES.csv")
candidate_word_tokens <- candidate_word_tokens_raw %>%
select(gloss, speaker_id, token_order)
candidate_word_token_summary <- candidate_word_tokens %>%
mutate(gloss = tolower(gloss)) %>%
filter(gloss %in% candidate_word_list) %>%
group_by(gloss) %>%
summarise(raw_freq_childes=n()) %>%
mutate(logFreq_childes=log(raw_freq_childes+1))
weird_words <- filter(candidate_word_token_summary, !(gloss %in% candidate_word_list))
not_in_childes <- filter(candidate_words, !(Word %in% candidate_word_token_summary$gloss))
#df to merge
childes_frequency <- candidate_word_token_summary %>%
rename(Word=gloss)
candidate_words_childes <- left_join(candidate_words, childes_frequency, by="Word")
Add subtlex frequency
subtlex_merge <- select(subtlex, Word, FREQcount)
candidate_words_all_freqs <- left_join(candidate_words_childes, subtlex_merge, by="Word") %>%
rename(raw_freq_subtlex=FREQcount) %>%
mutate(logFreq_subtlex=log(raw_freq_subtlex+1))
#fix num_hyponyms
candidate_words_all_freqs$num_hyponyms <- as.integer(candidate_words_all_freqs$num_hyponyms)
Play around with graphing stuff and correlations
Correlation matrix
candidate_words_corrmatrix <- select(candidate_words_all_freqs, -Word, -pos, -CatName, -MCDI_Cat)
candidate_words_cor <- cor(candidate_words_corrmatrix, use="pairwise.complete.obs", method="pearson")
p.mat_item <- cor.mtest(candidate_words_cor)
pMatrix_item <- p.mat_item$p
corrplot(candidate_words_cor, method = 'color', type='lower', diag = TRUE, addCoef.col = "black",
tl.col = "black", number.font=2, number.cex=10/ncol(candidate_words_cor), p.mat=pMatrix_item, sig.level = 0.05, insig = "blank")

Look at relations between density & category hierarchy
Hyponyms & Hypernyms
ggplot(candidate_words_all_freqs, aes(num_hypernyms, num_hyponyms, label=as.character(Word)))+
geom_point()+
geom_label()+
theme_classic()

Hypernyms & Peers
ggplot(candidate_words_all_freqs, aes(num_hypernyms, num_peers, label=as.character(Word)))+
geom_point()+
#geom_smooth(method="lm")
geom_label()+
theme_classic()

#What are those low-hypernym, highly dense words? (they are all verbs - this makes sense)
lowHyper_highPeer <- filter(candidate_words_all_freqs, (num_peers > 300) & (num_hypernyms < 2)) %>%
select(Word, pos, aoa, num_peers, num_hyponyms, num_hypernyms, Conc.M,logFreq_childes)
DT::datatable(lowHyper_highPeer)
Hyponyms & Peers
ggplot(candidate_words_all_freqs, aes(num_hyponyms, num_peers, label=as.character(Word)))+
geom_point()+
#geom_smooth(method="lm")
geom_label()+
scale_x_continuous(breaks=seq(0,120,15))+
theme_classic()

Compare childes and subtlex frequencies (r = .76)
Points
ggplot(candidate_words_all_freqs, aes(logFreq_childes, logFreq_subtlex))+
geom_point()+
geom_smooth(method="lm")+
theme_classic()

Words
ggplot(candidate_words_all_freqs, aes(logFreq_childes, logFreq_subtlex, label=as.character(Word)))+
geom_point()+
geom_label()+
theme_classic()

Childes frequency vs. concreteness (r = -.2, ns)
Points
ggplot(candidate_words_all_freqs, aes(logFreq_childes, Conc.M))+
geom_point()+
geom_smooth(method="lm")+
theme_classic()

Words
ggplot(candidate_words_all_freqs, aes(logFreq_childes, Conc.M, label=as.character(Word)))+
geom_point()+
geom_label()+
theme_classic()

Childes frequency vs. AOA (r = -.45)
Points
ggplot(candidate_words_all_freqs, aes(logFreq_childes, aoa))+
geom_point()+
geom_smooth(method="lm")+
theme_classic()

Words
ggplot(candidate_words_all_freqs, aes(logFreq_childes, aoa, label=as.character(Word)))+
geom_point()+
geom_label()+
theme_classic()

Childes frequency vs. AOA for lower-than-mean concreteness
Points
ggplot(filter(candidate_words_all_freqs, Conc.M<3.90), aes(logFreq_childes, aoa))+
geom_point()+
geom_smooth(method="lm")+
theme_classic()

Words
ggplot(filter(candidate_words_all_freqs, Conc.M<3.90), aes(logFreq_childes, aoa, label=as.character(Word)))+
geom_point()+
geom_label()+
theme_classic()

What are the low-concreteness (lower than mean), high-frequency (higher than mean), late-aoa (older than 5) words?
hFreq_hAOA_lConc <- filter(candidate_words_all_freqs, (Conc.M < 3.90) & (aoa>5) & (logFreq_childes > 5.41)) %>%
select(Word, pos, aoa, num_hypernyms, num_hyponyms, num_peers, Conc.M, logFreq_childes, logFreq_subtlex)
DT::datatable(hFreq_hAOA_lConc)
Some stats
Does concreteness predict AOA, controlling for frequency? (yes)
predAOA <- lm(aoa ~ logFreq_childes + Conc.M, candidate_words_all_freqs)
summary(predAOA)
Call:
lm(formula = aoa ~ logFreq_childes + Conc.M, data = candidate_words_all_freqs)
Residuals:
Min 1Q Median 3Q Max
-1.53400 -0.40355 0.05828 0.46345 1.36855
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.76833 0.22543 30.025 < 2e-16 ***
logFreq_childes -0.19723 0.02027 -9.728 < 2e-16 ***
Conc.M -0.19149 0.04408 -4.344 1.9e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.6303 on 307 degrees of freedom
(9 observations deleted due to missingness)
Multiple R-squared: 0.2465, Adjusted R-squared: 0.2416
F-statistic: 50.22 on 2 and 307 DF, p-value: < 2.2e-16
---
title: "Create df of candidate seed words & poke at it"
output:
  html_notebook:
    code_folding: hide
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(tidyverse)
library(childesr)
library(lme4)
library(corrplot)
library(ggrepel)
library(psych)
```

### Read in relevant files & merge
```{r}
# American English rows of the Wordbank AoA file. Only the lemma is kept
# (renamed to `Word`), so `mcdi` is a one-column data frame that serves as
# an exclusion list below.
mcdi <- read.csv("hypernym_analysis-master/wordbank/wordbank-aoa-data.csv") %>%
  filter(language == "English (American)") %>%
  select(Word = uni_lemma)

# Seed words with their hypernym counts. `Conc.M` is dropped here and
# re-attached from the concreteness ratings in the join below.
# NOTE(review): `conc_z_pos` is listed without a minus sign, so it is kept
# here and then overwritten by the mutate() further down; write
# `-conc_z_pos` if the intent was to drop it.
hypernyms <- read.csv("seedWords_aoa_hypernyms_concreteness_CDI_z.csv") %>%
  select(-Conc.M, conc_z_pos) %>%
  rename(num_hypernyms = hypernyms)

# Concreteness ratings (whitespace-delimited file); rows whose `Dom_Pos`
# equals 0 are discarded.
concreteness <- read.csv(
  "Concreteness_ratings_Brysbaert_et_al_BRM.txt",
  sep = ""
) %>%
  filter(Dom_Pos != 0) %>%
  select(Word, Conc.M)

# Hyponym and peer counts, keyed by lemma and part of speech.
hyponyms <- read.csv("AOA/aoa_to_wordnet_with_hyponyms1.csv") %>%
  rename(Word = wordnet_lemma, pos = wordnet_PoS) %>%
  select(Word, num_hyponyms, pos)
peers <- read.csv("AOA/aoa_to_wordnet_with_peers.csv") %>%
  rename(Word = wordnet_lemma, pos = wordnet_PoS) %>%
  select(Word, num_peers, pos)

# Frequency table (whitespace-delimited); merged in a later chunk.
subtlex <- read.csv("subtlex_full.txt", sep = "")

# One row per seed word: hypernyms + concreteness + hyponyms + peers, with
# concreteness z-scored within part of speech.
candidate_words <- hypernyms %>%
  left_join(concreteness, by = "Word") %>%
  # Membership must be tested against the `Word` column. `Word %in% mcdi`
  # matched against the data frame itself rather than the words in it, so
  # the exclusion did not work as written.
  # NOTE(review): hard-coded cut-offs (3.90, 5.41) and the r values quoted in
  # later headings may need recomputing now that this filter takes effect.
  filter(!(Word %in% mcdi$Word)) %>%
  left_join(hyponyms, by = c("Word", "pos")) %>%
  left_join(peers, by = c("Word", "pos")) %>%
  group_by(pos) %>%
  # scale() returns a one-column matrix; as.numeric() stores a plain vector
  # so the column behaves like every other numeric column.
  mutate(conc_z_pos = as.numeric(scale(Conc.M))) %>%
  ungroup() %>%
  arrange(Word)

head(candidate_words)
```

#### Look up childes frequency
```{r}
# Words whose CHILDES frequency we want.
candidate_word_list <- candidate_words$Word

#stop doing this every time you need to run this code, it takes forever
#candidate_word_tokens_raw <- get_tokens(collection = "Eng-NA",
                                    #role_exclude = "Target_Child",
                                    #token=candidate_word_list) 
#read csv instead
candidate_word_tokens_raw <- read.csv("candidate_word_tokens_CHILDES.csv")
candidate_word_tokens <- candidate_word_tokens_raw %>%
  select(gloss, speaker_id, token_order)

# Token counts for every lower-cased gloss in the cached file, before any
# filtering, so that unexpected glosses can still be inspected below.
childes_gloss_counts <- candidate_word_tokens %>%
  mutate(gloss = tolower(gloss)) %>%
  group_by(gloss) %>%
  summarise(raw_freq_childes = n()) %>%
  mutate(logFreq_childes = log(raw_freq_childes + 1))

# Counts restricted to the candidate words (same rows as filtering first and
# counting afterwards).
candidate_word_token_summary <- childes_gloss_counts %>%
  filter(gloss %in% candidate_word_list)

# Glosses present in the token file that are NOT candidate words. This used
# to be computed from the already-filtered summary and was therefore always
# empty.
weird_words <- childes_gloss_counts %>%
  filter(!(gloss %in% candidate_word_list))

# Candidate words with no tokens at all in the cached CHILDES file.
not_in_childes <- candidate_words %>%
  filter(!(Word %in% candidate_word_token_summary$gloss))

#df to merge
childes_frequency <- candidate_word_token_summary %>%
  rename(Word = gloss)

# Words absent from CHILDES get NA frequencies from the left join.
candidate_words_childes <- left_join(
  candidate_words,
  childes_frequency,
  by = "Word"
)
```

#### Add subtlex frequency
```{r}
# Keep only the key and the raw count from the SUBTLEX table.
subtlex_merge <- subtlex %>%
  select(Word, FREQcount)

# Attach the SUBTLEX count, add its log(count + 1), and coerce
# num_hyponyms to integer in the same step.
candidate_words_all_freqs <- candidate_words_childes %>%
  left_join(subtlex_merge, by = "Word") %>%
  rename(raw_freq_subtlex = FREQcount) %>%
  mutate(
    logFreq_subtlex = log(raw_freq_subtlex + 1),
    num_hyponyms = as.integer(num_hyponyms)
  )
```

## Play around with graphing stuff and correlations
### Correlation matrix
```{r}
# Numeric columns only: drop the identifier and category columns.
candidate_words_corrmatrix <- candidate_words_all_freqs %>%
  select(-Word, -pos, -CatName, -MCDI_Cat)

# Pairwise Pearson correlations, using every complete pair of observations.
candidate_words_cor <- cor(
  candidate_words_corrmatrix,
  use = "pairwise.complete.obs",
  method = "pearson"
)

# Significance tests must be run on the raw data (rows = words, columns =
# variables). Passing the correlation matrix instead, as before, tests
# correlations between columns of coefficients and gives meaningless
# p-values.
p.mat_item <- cor.mtest(candidate_words_corrmatrix)
pMatrix_item <- p.mat_item$p

# Lower-triangle heat map; coefficients with p >= .05 are left blank.
corrplot(
  candidate_words_cor,
  method = "color",
  type = "lower",
  diag = TRUE,
  addCoef.col = "black",
  tl.col = "black",
  number.font = 2,
  number.cex = 10 / ncol(candidate_words_cor),
  p.mat = pMatrix_item,
  sig.level = 0.05,
  insig = "blank"
)
```
### Look at relations between density & category hierarchy {.tabset}
#### Hyponyms & Hypernyms
```{r}
# Hypernym count (x) against hyponym count (y); every point carries its
# word as a label.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(
    x = num_hypernyms,
    y = num_hyponyms,
    label = as.character(Word)
  )
) +
  geom_point() +
  geom_label() +
  theme_classic()
```

#### Hypernyms & Peers
```{r}
# Hypernym count (x) against peer count (y), labelled by word.
# (A geom_smooth(method = "lm") layer was tried here and is left disabled.)
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(
    x = num_hypernyms,
    y = num_peers,
    label = as.character(Word)
  )
) +
  geom_point() +
  geom_label() +
  theme_classic()

# Tabulate the words with more than 300 peers but fewer than two hypernyms.
# Author's observation: they are all verbs, which makes sense.
lowHyper_highPeer <- candidate_words_all_freqs %>%
  filter(num_peers > 300, num_hypernyms < 2) %>%
  select(
    Word, pos, aoa, num_peers, num_hyponyms, num_hypernyms,
    Conc.M, logFreq_childes
  )
DT::datatable(lowHyper_highPeer)

```
#### Hyponyms & Peers
```{r}
# Hyponym count (x) against peer count (y), labelled by word, with x-axis
# breaks every 15 from 0 to 120.
# (A geom_smooth(method = "lm") layer was tried here and is left disabled.)
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(
    x = num_hyponyms,
    y = num_peers,
    label = as.character(Word)
  )
) +
  geom_point() +
  geom_label() +
  scale_x_continuous(breaks = seq(0, 120, 15)) +
  theme_classic()
```


### Compare childes and subtlex frequencies (*r* = .76) {.tabset}
#### Points
```{r}
# Log CHILDES frequency (x) against log SUBTLEX frequency (y) with a
# linear fit.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = logFreq_subtlex)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```
#### Words
```{r}
# Same axes as the points view, but each observation is labelled with its
# word and no fit line is drawn.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(
    x = logFreq_childes,
    y = logFreq_subtlex,
    label = as.character(Word)
  )
) +
  geom_point() +
  geom_label() +
  theme_classic()
```

### Childes frequency vs. concreteness (*r* = -.2, ns) {.tabset}
#### Points
```{r}
# Log CHILDES frequency (x) against mean concreteness rating (y) with a
# linear fit.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = Conc.M)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```
#### Words
```{r}
# Frequency vs. concreteness again, this time labelling each point with
# its word instead of drawing a fit line.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(
    x = logFreq_childes,
    y = Conc.M,
    label = as.character(Word)
  )
) +
  geom_point() +
  geom_label() +
  theme_classic()
```

### Childes frequency vs. AOA (*r* = -.45) {.tabset}
#### Points
```{r}
# Log CHILDES frequency (x) against age of acquisition (y) with a linear
# fit.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(x = logFreq_childes, y = aoa)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```
#### Words
```{r}
# Frequency vs. age of acquisition, with each point labelled by its word.
ggplot(
  data = candidate_words_all_freqs,
  mapping = aes(
    x = logFreq_childes,
    y = aoa,
    label = as.character(Word)
  )
) +
  geom_point() +
  geom_label() +
  theme_classic()
```

### Childes frequency vs. AOA for lower-than-mean concreteness {.tabset}
#### Points
```{r}
# Frequency vs. age of acquisition, restricted to words rated below 3.90 in
# concreteness (the hard-coded cut-off the section heading calls the mean).
candidate_words_all_freqs %>%
  filter(Conc.M < 3.90) %>%
  ggplot(mapping = aes(x = logFreq_childes, y = aoa)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```
#### Words
```{r}
# Same low-concreteness subset (Conc.M below the hard-coded 3.90 cut-off),
# with each point labelled by its word.
candidate_words_all_freqs %>%
  filter(Conc.M < 3.90) %>%
  ggplot(
    mapping = aes(
      x = logFreq_childes,
      y = aoa,
      label = as.character(Word)
    )
  ) +
  geom_point() +
  geom_label() +
  theme_classic()

```

### What are the low-concreteness (lower than mean), high-frequency (higher than mean), late-aoa (older than 5) words?
```{r}
# Words that satisfy all three hard-coded cut-offs at once: concreteness
# below 3.90, age of acquisition above 5, and log CHILDES frequency above
# 5.41. Comma-separated filter() conditions are combined with AND.
hFreq_hAOA_lConc <- candidate_words_all_freqs %>%
  filter(
    Conc.M < 3.90,
    aoa > 5,
    logFreq_childes > 5.41
  ) %>%
  select(
    Word, pos, aoa, num_hypernyms, num_hyponyms, num_peers,
    Conc.M, logFreq_childes, logFreq_subtlex
  )

# Show the result as an interactive table.
DT::datatable(hFreq_hAOA_lConc)
```

## Some stats
### Does concreteness predict AOA, controlling for frequency? (yes)
```{r}

# Linear model: age of acquisition regressed on log CHILDES frequency and
# concreteness together, so each coefficient is adjusted for the other
# predictor. Rows with missing values are dropped by lm()'s default.
predAOA <- lm(
  formula = aoa ~ logFreq_childes + Conc.M,
  data = candidate_words_all_freqs
)
summary(predAOA)

```