Word selection information
Seed words:
1) Got all Kuperman words with 3 < AOA < 6
2) Filtered out Kuperman words that were on MCDI
3) Got hypernyms for Kuperman words
4) Selected lowest 25% as potential seed words (= most superordinate)
Control words:
1) For each seed word, got all hyponyms
2) Filtered based on CHILDES production frequencies:
a) Child log freq > 0
b) Adult log freq > 2
3) Removed control words that were also in the seed word list
4) Removed ‘beast’, ‘group’, ‘part’ due to disproportionately many hyponyms (>2SDs above mean)
5) For remaining words (~1000), got wordnet definition based on synset
a) important because hyponym synsets don’t map 1-1 with how kids would actually use the word
6) For each control word definition, decided if a child would/could use word in this sense
7) Narrowed down control words to words where child WOULD use word in that sense
8) Currently keeping multiple potential controls for some seed words, as ~500 words is reasonable to ask parents about
Read in relevant files
kuperman <- read.csv("../AoA_ratings_Kuperman_et_al_BRM/AoA_ratings_Kuperman_et_al_BRM.csv") %>%
select(Word, aoa=Rating.Mean)
mcdi <- read.csv("hypernym_analysis-master/wordbank/wordbank-aoa-data.csv") %>%
filter(language=="English (American)") %>%
select(uni_lemma, words, aoa) %>%
rename(aoa_months=aoa, Word=uni_lemma) %>%
select(Word)
hypernyms <- read.csv("seedWords_aoa_hypernyms_concreteness_CDI_z.csv") %>%
select(-Conc.M, conc_z_pos) %>%
rename(num_hypernyms=hypernyms)
concreteness <- read.csv("Concreteness_ratings_Brysbaert_et_al_BRM.txt", sep="") %>%
filter(Dom_Pos != 0) %>%
select(Word, Conc.M)
hyponyms <- read.csv("AOA/aoa_to_wordnet_with_hyponyms.csv") %>%
rename(Word=wordnet_lemma, pos=wordnet_PoS) %>%
select(Word, num_hyponyms, pos)
peers <- read.csv("AOA/aoa_to_wordnet_with_peers.csv") %>%
rename(Word=wordnet_lemma, pos=wordnet_PoS) %>%
select(Word, num_peers, pos)
subtlex <- read.csv("subtlex_full.txt", sep="")
all_childes_counts <- read.csv("hypernym_analysis-master/childes/childes_word_freq_adult_and_kid.txt") %>%
rename(Word=word)
individual_hyponyms_int <- read.csv("AOA/all_hyponyms_per_seed_word.csv")
individual_hyponyms_int$hyponym <- as.character(individual_hyponyms_int$hyponym)
individual_hyponyms <- individual_hyponyms_int %>%
select(-X) %>%
separate(hyponym, into=c("Word","pos","wordnet_sense"), sep='\\.') %>%
rename(SeedWord=wordnet_lemma, num_peers=hyponym_num_peers, num_hypernyms=hyponym_num_hypernyms)
Merge into big df for selecting control words
#use existing childes frequency csv, don't need to look them up
candidate_words <- left_join(hypernyms, concreteness, by="Word") %>%
filter(!(Word %in% mcdi)) %>%
left_join(hyponyms, by=c("Word", "pos")) %>%
left_join(peers, by=c("Word","pos")) %>%
group_by(pos) %>%
mutate(conc_z_pos = scale(Conc.M)) %>%
ungroup() %>%
left_join(all_childes_counts, by="Word") %>%
arrange(Word) %>%
mutate(type="seed", SeedWord=NA)
#get hyponym count for each seed word, from hyponym csv
control_word_count <- individual_hyponyms %>%
filter(SeedWord %in% candidate_words$Word) %>%
group_by(SeedWord) %>%
summarise(N_controls=n())
seedList <- candidate_words$Word
#narrow control word list to match words in candidate_words and remove group, beast, part, as they have disproportionately many hyponyms
control_words <- individual_hyponyms %>%
filter(SeedWord %in% candidate_words$Word) %>%
filter(!(SeedWord %in% c("group","beast","part"))) %>%
left_join(all_childes_counts, by="Word") %>%
select(SeedWord,Word,pos,kid_log_freq,adult_log_freq,
num_peers,num_hypernyms) %>%
filter(!is.na(adult_log_freq)) %>%
mutate(type="control") %>%
left_join(kuperman, by="Word")
candidate_words_smaller <- candidate_words %>%
select(names(control_words))
seed_and_controls <- bind_rows(candidate_words_smaller, control_words)
# choose_control_words <- candidate_words %>%
# select(SeedWord=Word, aoa_seed=aoa, num_hypernyms_seed=num_hypernyms, num_hyponyms_seed=num_hyponyms,
# num_peers_seed=num_peers, seed_kid_freq=kid_freq,seed_kid_log_freq=kid_log_freq,
# seed_adult_freq=adult_freq,seed_adult_log_freq=adult_log_freq,seed_pos=pos) %>%
# left_join(control_words, by="SeedWord") %>%
# select(SeedWord,seed_pos,Word,control_pos,seed_adult_log_freq,control_adult_log_freq,aoa_seed,
# num_hypernyms_seed,num_peers_seed,num_hypernyms_control=control_word_num_hypernyms,
# num_peers_control=control_word_num_peers) %>%
# distinct()
#choose control words based on childes adult log frequency
# choose_control_words_by_freq <- choose_control_words %>%
# filter((seed_adult_log_freq-1 < control_adult_log_freq) & (control_adult_log_freq<seed_adult_log_freq+1)) %>%
# filter(seed_pos==control_pos) %>%
# distinct()
Add subtlex frequency
subtlex_merge <- select(subtlex, Word, FREQcount)
seed_and_controls_subtlex <- left_join(seed_and_controls, subtlex_merge, by="Word") %>%
mutate(logFreq_subtlex=log(FREQcount+1))
#need to add hyponyms (seed words only)
seed_and_controls_full <- left_join(seed_and_controls_subtlex, hyponyms, by=c("Word", "pos")) %>%
distinct() %>%
filter(adult_log_freq>2 & kid_log_freq>0) %>%
filter((type=="control" & !(Word %in% seedList)) | type=="seed") %>%
mutate(log_hyponyms = log(num_hyponyms+1))
seeds_only <- filter(seed_and_controls_full, type=="seed")
controls_only <- filter(seed_and_controls_full, type=="control")
Add info about whether child would know the specific sense of the control word
seeds_with_controls <- filter(filtered_seeds_controls_full, !(SeedWord %in% seeds_with_no_controls_list$SeedWord))
Error: unexpected symbol in:
"
seeds_with_controls"
Make df for analyses
new_hyponyms_controls <- read.csv("controls_actual_hypernyms_hyponyms.csv") %>% select(-X, -wordnet_sense, -num_hypernyms, -num_hyponyms_set)
analysis_controls <- left_join(filter(seeds_with_controls, type=="control"), new_hyponyms_controls, by=c("Word","pos")) %>%
select(-num_hyponyms_old, -log_hyponyms_old)
Column `Word` joining character vector and factor, coercing into character vectorColumn `pos` joining character vector and factor, coercing into character vector
analysis_seeds <- filter(seeds_with_controls, type=="seed") %>%
rename(num_hyponyms=num_hyponyms_old) %>%
select(-log_hyponyms_old)
analysis_seeds_controls <- bind_rows(analysis_controls, analysis_seeds) %>%
mutate(log_hyponyms=log(num_hyponyms+1))
Compare seed and control words
Summary statistics
Filtered by adult_log_freq > 2, kid_log_freq > 0, and definition a child would know
sumTable <- analysis_seeds_controls %>%
group_by(type) %>%
summarise(number=n(),
mean_aoa=mean(aoa, na.rm=TRUE),
mean_kid_log_freq=mean(kid_log_freq, na.rm=TRUE),
mean_adult_log_freq=mean(adult_log_freq, na.rm=TRUE),
mean_hypernyms=mean(num_hypernyms),
mean_hyponyms=mean(num_hyponyms, na.rm=TRUE),
mean_peers=mean(num_peers))
as_tibble(sumTable)
Correlation matrices
Seed words
candidate_words_corrmatrix <- analysis_seeds_controls %>%
filter(type=="seed") %>%
select(-Word, -pos, -SeedWord, -type, -FREQcount, -control_def, -child_know, -num_hypernyms_first)
candidate_words_cor <- cor(candidate_words_corrmatrix, use="pairwise.complete.obs", method="pearson")
p.mat_item <- cor.mtest(candidate_words_cor)
pMatrix_item <- p.mat_item$p
corrplot(candidate_words_cor, method = 'color', type='lower', diag = TRUE, addCoef.col = "black",
tl.col = "black", number.font=2, number.cex=8/ncol(candidate_words_cor), p.mat=pMatrix_item, sig.level = 0.05, insig = "blank")

Control words
control_words_corrmatrix <- analysis_seeds_controls %>%
filter(type=="control") %>%
select(-Word, -pos, -SeedWord, -type, -FREQcount, -control_def, -child_know, -num_hypernyms_first)
control_words_cor <- cor(control_words_corrmatrix, use="pairwise.complete.obs", method="pearson")
p.mat_item <- cor.mtest(control_words_cor)
pMatrix_item <- p.mat_item$p
corrplot(control_words_cor, method = 'color', type='lower', diag = TRUE, addCoef.col = "black",
tl.col = "black", number.font=2, number.cex=8/ncol(control_words_cor), p.mat=pMatrix_item, sig.level = 0.05, insig = "blank")

All words (matrix corrects for multiple comparisons; num_hypernym-log_hyponym correlation p = .002)
all_words_corrmatrix <- analysis_seeds_controls %>%
select(-Word, -pos, -SeedWord, -type, -FREQcount, -control_def, -child_know, -num_hypernyms_first)
all_words_cor <- cor(all_words_corrmatrix, use="pairwise.complete.obs", method="pearson")
p.mat_item <- cor.mtest(all_words_cor)
pMatrix_item <- p.mat_item$p
corrplot(all_words_cor, method = 'color', type='lower', diag = TRUE, addCoef.col = "black",
tl.col = "black", number.font=2, number.cex=8/ncol(all_words_cor), p.mat=pMatrix_item, sig.level = 0.05, insig = "blank")

Look at relations between density & category hierarchy
Hyponyms & Hypernyms
ggplot(analysis_seeds_controls, aes(num_hypernyms,log_hyponyms, label=Word, color=type))+
geom_point()+
# geom_label()+
geom_smooth(method=lm)+
theme_classic()

Hyponyms & Peers
ggplot(analysis_seeds_controls, aes(log_hyponyms, num_peers, color=type, label=as.character(Word)))+
geom_point()+
geom_smooth(method="lm")+
# geom_label()+
#scale_x_continuous(breaks=seq(0,900,50))
theme_classic()

Hypernyms & Peers
ggplot(analysis_seeds_controls, aes(num_hypernyms, num_peers, label=as.character(Word), color=type))+
geom_point()+
geom_smooth(method="lm")+
#geom_label()
theme_classic()

#What are those low-hypernym, highly dense words? (they are all verbs - this makes sense)
# lowHyper_highPeer <- filter(seed_and_controls_full, (num_peers > 100) & (num_hypernyms < 3)) %>%
# select(Word, type, pos, aoa, num_peers, num_hyponyms, num_hypernyms, adult_log_freq)
# DT::datatable(lowHyper_highPeer)
#scale hyponyms and hypernyms by part of speech
seed_and_controls_pos <- analysis_seeds_controls %>%
group_by(pos) %>%
mutate(hyper_z_pos = scale(num_hypernyms), hypo_z_pos = scale(num_hyponyms)) %>%
ungroup() %>%
filter(pos %in% c('n','v'))
How do hyponyms and hypernyms correlate when controlling for pos?
Part of speech (points)
ggplot(seed_and_controls_pos, aes(hyper_z_pos, hypo_z_pos, color=pos, label=as.character(Word)))+
geom_point()+
geom_smooth(method=lm)+
theme_classic()

Part of speech (labels)
ggplot(seed_and_controls_pos, aes(hyper_z_pos, hypo_z_pos, color=pos, label=as.character(Word)))+
geom_point()+
geom_smooth(method=lm)+
geom_label()+
theme_classic()

Correlation looks good
cor.test(seed_and_controls_pos$hyper_z_pos, seed_and_controls_pos$hypo_z_pos)
Pearson's product-moment correlation
data: seed_and_controls_pos$hyper_z_pos and seed_and_controls_pos$hypo_z_pos
t = -6.1789, df = 483, p-value = 1.372e-09
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.3512314 -0.1861055
sample estimates:
cor
-0.2706581
Compare childes adult and subtlex frequencies (r = .76)
Points
ggplot(analysis_seeds_controls, aes(adult_log_freq, logFreq_subtlex, color=type))+
geom_point()+
geom_smooth(method="lm")+
theme_classic()

Words
ggplot(analysis_seeds_controls, aes(adult_log_freq, logFreq_subtlex, color=type, label=as.character(Word)))+
geom_point()+
geom_label()+
theme_classic()

Compare child-produced and adult-produced frequencies (childes)
Points
ggplot(analysis_seeds_controls, aes(adult_log_freq, kid_log_freq, color=type))+
geom_point()+
geom_smooth(method="lm")+
theme_classic()

Words
ggplot(analysis_seeds_controls, aes(adult_log_freq, kid_log_freq, color=type, label=as.character(Word)))+
geom_point()+
geom_label()+
theme_classic()

Difference in Adult-Child production
produceDiff <- analysis_seeds_controls %>%
mutate(adult_log_freq_z=scale(adult_log_freq), kid_log_freq_z=scale(kid_log_freq)) %>%
select(Word, type, pos, aoa, num_peers, num_hyponyms, num_hypernyms, adult_log_freq_z, kid_log_freq_z) %>%
mutate(productionDiff_z = adult_log_freq_z - kid_log_freq_z) %>%
distinct()
DT::datatable(produceDiff)
Childes frequency vs. AOA
Points
ggplot(analysis_seeds_controls, aes(adult_log_freq, aoa, color=type))+
geom_point()+
geom_smooth(method="lm")+
theme_classic()

Words
ggplot(analysis_seeds_controls, aes(adult_log_freq, aoa, color=type,label=as.character(Word)))+
geom_point()+
geom_label()+
theme_classic()

Childes frequency vs. hypernyms
Adults
ggplot(analysis_seeds_controls, aes(adult_log_freq, num_hypernyms, color=type))+
geom_point()+
geom_smooth(method="lm")+
theme_classic()

Kids
ggplot(analysis_seeds_controls, aes(kid_log_freq, num_hypernyms, color=type))+
geom_point()+
geom_smooth(method="lm")+
theme_classic()

What’s the relation between AOA and hypernyms, controlling for frequency?
Adult-produced frequency
summary(lm(aoa ~ num_hypernyms + adult_log_freq, data=analysis_seeds_controls))
Call:
lm(formula = aoa ~ num_hypernyms + adult_log_freq, data = analysis_seeds_controls)
Residuals:
Min 1Q Median 3Q Max
-2.6713 -0.9032 -0.1844 0.7587 6.9039
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.48806 0.25551 37.134 <2e-16 ***
num_hypernyms -0.03268 0.01997 -1.636 0.103
adult_log_freq -0.59737 0.03305 -18.074 <2e-16 ***
---
Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
Residual standard error: 1.266 on 469 degrees of freedom
(19 observations deleted due to missingness)
Multiple R-squared: 0.4118, Adjusted R-squared: 0.4092
F-statistic: 164.1 on 2 and 469 DF, p-value: < 2.2e-16
Child-produced frequency
summary(lm(aoa ~ num_hypernyms + kid_log_freq, data=analysis_seeds_controls))
Call:
lm(formula = aoa ~ num_hypernyms + kid_log_freq, data = analysis_seeds_controls)
Residuals:
Min 1Q Median 3Q Max
-2.5617 -0.8955 -0.1123 0.8463 6.5283
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.3676121 0.1761510 41.826 <2e-16 ***
num_hypernyms -0.0001083 0.0205084 -0.005 0.996
kid_log_freq -0.4832487 0.0293781 -16.449 <2e-16 ***
---
Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
Residual standard error: 1.314 on 469 degrees of freedom
(19 observations deleted due to missingness)
Multiple R-squared: 0.3671, Adjusted R-squared: 0.3644
F-statistic: 136 on 2 and 469 DF, p-value: < 2.2e-16
---
title: "Create df of candidate words & poke at them"
output:
  html_notebook:
    code_folding: hide
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(tidyverse)
library(childesr)
library(lme4)
library(corrplot)
library(ggrepel)
library(psych)
```
_Word selection information_  
Seed words:  
1) Got all Kuperman words with 3 < AOA < 6  
2) Filtered out Kuperman words that were on MCDI  
3) Got hypernyms for Kuperman words  
4) Selected lowest 25% as potential seed words (= most superordinate)  
  
Control words:  
1) For each seed word, got all hyponyms  
2) Filtered based on CHILDES production frequencies:  
	a) Child log freq > 0  
	b) Adult log freq > 2  
3) Removed control words that were also in the seed word list  
4) Removed 'beast', 'group', 'part' due to disproportionately many hyponyms (>2SDs above mean)  
5) For remaining words (~1000), got wordnet definition based on synset  
	a) important because hyponym synsets don't map 1-1 with how kids would actually use the word  
6) For each control word definition, decided if a child would/could use word in this sense  
7) Narrowed down control words to words where child WOULD use word in that sense  
8) Currently keeping multiple potential controls for some seed words, as ~500 words is reasonable to ask parents about  
  
------------------------------------------------------------    
Read in relevant files
```{r}
# Kuperman et al. age-of-acquisition ratings: word plus its mean rating (as `aoa`).
kuperman <- read.csv("../AoA_ratings_Kuperman_et_al_BRM/AoA_ratings_Kuperman_et_al_BRM.csv") %>%
  select(Word, Rating.Mean) %>%
  rename(aoa = Rating.Mean)

# Wordbank (MCDI) rows for American English, reduced to a single `Word` column.
mcdi <- read.csv("hypernym_analysis-master/wordbank/wordbank-aoa-data.csv") %>%
  filter(language == "English (American)") %>%
  select(Word = uni_lemma)

# Seed-word table; `hypernyms` is renamed to `num_hypernyms`.
# NOTE(review): `conc_z_pos` is not negated in the select, so only `Conc.M` is
# dropped here -- confirm whether `conc_z_pos` was meant to be dropped as well.
hypernyms <- read.csv("seedWords_aoa_hypernyms_concreteness_CDI_z.csv") %>%
  select(-Conc.M, conc_z_pos) %>%
  rename(num_hypernyms = hypernyms)

# Brysbaert et al. concreteness ratings, excluding rows whose Dom_Pos is 0.
concreteness <- read.csv("Concreteness_ratings_Brysbaert_et_al_BRM.txt", sep = "") %>%
  filter(Dom_Pos != 0) %>%
  select(Word, Conc.M)

# WordNet hyponym and peer counts, keyed by word and part of speech.
hyponyms <- read.csv("AOA/aoa_to_wordnet_with_hyponyms.csv") %>%
  select(Word = wordnet_lemma, num_hyponyms, pos = wordnet_PoS)
peers <- read.csv("AOA/aoa_to_wordnet_with_peers.csv") %>%
  select(Word = wordnet_lemma, num_peers, pos = wordnet_PoS)

# Frequency sources: SUBTLEX and the CHILDES adult/child counts.
subtlex <- read.csv("subtlex_full.txt", sep = "")
all_childes_counts <- read.csv("hypernym_analysis-master/childes/childes_word_freq_adult_and_kid.txt") %>%
  rename(Word = word)

# One row per (seed word, hyponym); the hyponym id is forced to character so it
# can be split on "." into word, part of speech and sense.
individual_hyponyms_int <- read.csv("AOA/all_hyponyms_per_seed_word.csv") %>%
  mutate(hyponym = as.character(hyponym))
individual_hyponyms <- individual_hyponyms_int %>%
  rename(SeedWord = wordnet_lemma,
         num_peers = hyponym_num_peers,
         num_hypernyms = hyponym_num_hypernyms) %>%
  separate(hyponym, into = c("Word", "pos", "wordnet_sense"), sep = "\\.") %>%
  select(-X)
```

Merge into big df for selecting control words
```{r}
# Build the seed-word table (`candidate_words`) and the pool of control words.
# CHILDES frequencies come from the existing csv, so no lookup is needed.
candidate_words <- left_join(hypernyms, concreteness, by = "Word") %>%
  # `mcdi` is a data frame, so membership must be tested against its `Word`
  # column; `Word %in% mcdi` never matches and silently keeps every MCDI word.
  filter(!(Word %in% mcdi$Word)) %>%
  left_join(hyponyms, by = c("Word", "pos")) %>%
  left_join(peers, by = c("Word", "pos")) %>%
  group_by(pos) %>%
  # scale() returns a one-column matrix; as.numeric() keeps a plain vector column
  mutate(conc_z_pos = as.numeric(scale(Conc.M))) %>%
  ungroup() %>%
  left_join(all_childes_counts, by = "Word") %>%
  arrange(Word) %>%
  # typed NA so the column binds cleanly with the character SeedWord of controls
  mutate(type = "seed", SeedWord = NA_character_)

# get hyponym count for each seed word, from hyponym csv
control_word_count <- individual_hyponyms %>%
  filter(SeedWord %in% candidate_words$Word) %>%
  group_by(SeedWord) %>%
  summarise(N_controls = n())

seedList <- candidate_words$Word

# narrow control word list to match words in candidate_words and remove group,
# beast, part, as they have disproportionately many hyponyms
control_words <- individual_hyponyms %>%
  filter(SeedWord %in% candidate_words$Word) %>%
  filter(!(SeedWord %in% c("group", "beast", "part"))) %>%
  left_join(all_childes_counts, by = "Word") %>%
  select(SeedWord, Word, pos, kid_log_freq, adult_log_freq,
         num_peers, num_hypernyms) %>%
  # keep only hyponyms that have a CHILDES adult frequency
  filter(!is.na(adult_log_freq)) %>%
  mutate(type = "control") %>%
  left_join(kuperman, by = "Word")

# restrict the seed table to the columns the control table has, then stack them
candidate_words_smaller <- candidate_words %>%
  select(names(control_words))

seed_and_controls <- bind_rows(candidate_words_smaller, control_words)

# choose_control_words <- candidate_words %>% 
#   select(SeedWord=Word, aoa_seed=aoa, num_hypernyms_seed=num_hypernyms, num_hyponyms_seed=num_hyponyms,
#          num_peers_seed=num_peers, seed_kid_freq=kid_freq,seed_kid_log_freq=kid_log_freq,
#          seed_adult_freq=adult_freq,seed_adult_log_freq=adult_log_freq,seed_pos=pos) %>% 
#   left_join(control_words, by="SeedWord") %>% 
#   select(SeedWord,seed_pos,Word,control_pos,seed_adult_log_freq,control_adult_log_freq,aoa_seed,
#          num_hypernyms_seed,num_peers_seed,num_hypernyms_control=control_word_num_hypernyms,
#          num_peers_control=control_word_num_peers) %>% 
#   distinct()

#choose control words based on childes adult log frequency 
# choose_control_words_by_freq <- choose_control_words %>%
#   filter((seed_adult_log_freq-1 < control_adult_log_freq) & (control_adult_log_freq<seed_adult_log_freq+1)) %>% 
#   filter(seed_pos==control_pos) %>% 
#   distinct()

```


Add subtlex frequency
```{r}
# SUBTLEX columns needed for the merge
subtlex_merge <- subtlex %>%
  select(Word, FREQcount)

# attach the raw SUBTLEX count and its log(count + 1)
seed_and_controls_subtlex <- seed_and_controls %>%
  left_join(subtlex_merge, by = "Word") %>%
  mutate(logFreq_subtlex = log(FREQcount + 1))

# add hyponym counts, then apply the CHILDES frequency thresholds and drop any
# control word that is itself a seed word
seed_and_controls_full <- seed_and_controls_subtlex %>%
  left_join(hyponyms, by = c("Word", "pos")) %>%
  distinct() %>%
  filter(adult_log_freq > 2, kid_log_freq > 0) %>%
  filter(type == "seed" | (type == "control" & !(Word %in% seedList))) %>%
  mutate(log_hyponyms = log(num_hyponyms + 1))

seeds_only <- seed_and_controls_full %>% filter(type == "seed")
controls_only <- seed_and_controls_full %>% filter(type == "control")
```

Add info about whether child would know the specific sense of the control word
```{r}
# Hand-coded judgements of whether a child would know each control-word sense
kid_knowledge <- read.csv("control_senses_with_defs_childknowledge.csv") %>%
  select(-wordnet_sense) %>%
  rename(control_def = definition)

# Keep rows that came from seed_and_controls_full (non-missing `type`), and of
# those only seeds or controls coded child_know == 1
filtered_seeds_controls <- kid_knowledge %>%
  full_join(seed_and_controls_full, by = c("SeedWord", "Word", "pos")) %>%
  filter(!is.na(type), child_know == 1 | type == "seed")

# A seed word is its own SeedWord, so it groups together with its controls
filtered_seeds <- filtered_seeds_controls %>%
  filter(type == "seed") %>%
  mutate(SeedWord = Word)

filtered_controls <- filter(filtered_seeds_controls, type == "control")
filtered_seeds_controls_full <- bind_rows(filtered_controls, filtered_seeds)

# SeedWord groups containing a single row
seeds_with_no_controls_list <- filtered_seeds_controls_full %>%
  group_by(SeedWord) %>%
  summarise(n_words = n()) %>%
  filter(n_words == 1) %>%
  select(SeedWord)

seeds_with_no_controls <- filtered_seeds_controls_full %>%
  filter(Word %in% seeds_with_no_controls_list$SeedWord)

# Everything else; the hyponym columns are tagged `_old` because the next chunk
# replaces them for control words
seeds_with_controls <- filtered_seeds_controls_full %>%
  filter(!(SeedWord %in% seeds_with_no_controls_list$SeedWord)) %>%
  arrange(SeedWord) %>%
  rename(num_hyponyms_old = num_hyponyms, log_hyponyms_old = log_hyponyms)

```

Make df for analyses
```{r}
# Replacement hyponym data for control words
new_hyponyms_controls <- read.csv("controls_actual_hypernyms_hyponyms.csv") %>%
  select(-X, -wordnet_sense, -num_hypernyms, -num_hyponyms_set)

# Controls: join the replacement columns and discard the old hyponym columns
analysis_controls <- seeds_with_controls %>%
  filter(type == "control") %>%
  left_join(new_hyponyms_controls, by = c("Word", "pos")) %>%
  select(-num_hyponyms_old, -log_hyponyms_old)

# Seeds: keep the original hyponym count under its usual name
analysis_seeds <- seeds_with_controls %>%
  filter(type == "seed") %>%
  select(-log_hyponyms_old) %>%
  rename(num_hyponyms = num_hyponyms_old)

# Stack both and recompute the log hyponym count for every row
analysis_seeds_controls <- analysis_controls %>%
  bind_rows(analysis_seeds) %>%
  mutate(log_hyponyms = log(num_hyponyms + 1))
```

## Compare seed and control words
### Summary statistics
Filtered by `adult_log_freq > 2`, `kid_log_freq > 0`, and definition a child would know
```{r}

# Per-type (seed vs. control) counts and means. Every mean uses na.rm = TRUE so
# that a single missing value cannot turn a whole group summary into NA; the
# hypernym and peer means previously lacked it, unlike the other columns.
sumTable <- analysis_seeds_controls %>%
  group_by(type) %>%
  summarise(number = n(),
            mean_aoa = mean(aoa, na.rm = TRUE),
            mean_kid_log_freq = mean(kid_log_freq, na.rm = TRUE),
            mean_adult_log_freq = mean(adult_log_freq, na.rm = TRUE),
            mean_hypernyms = mean(num_hypernyms, na.rm = TRUE),
            mean_hyponyms = mean(num_hyponyms, na.rm = TRUE),
            mean_peers = mean(num_peers, na.rm = TRUE))
as_tibble(sumTable)
```

### Correlation matrices {.tabset}
#### Seed words
```{r}
# Correlation plot for seed words; all remaining columns are passed to cor().
candidate_words_corrmatrix <- analysis_seeds_controls %>%
  filter(type == "seed") %>%
  select(-Word, -pos, -SeedWord, -type, -FREQcount, -control_def, -child_know, -num_hypernyms_first)

candidate_words_cor <- cor(candidate_words_corrmatrix, use = "pairwise.complete.obs", method = "pearson")

# cor.mtest() expects the raw observations (rows = words, columns = variables).
# Passing the correlation matrix itself tests correlations among its columns,
# which gives meaningless p-values for blanking cells.
p.mat_item <- cor.mtest(candidate_words_corrmatrix)
pMatrix_item <- p.mat_item$p
# cells with p >= sig.level are left blank
corrplot(candidate_words_cor, method = 'color', type = 'lower', diag = TRUE, addCoef.col = "black",
         tl.col = "black", number.font = 2, number.cex = 8/ncol(candidate_words_cor),
         p.mat = pMatrix_item, sig.level = 0.05, insig = "blank")
```
#### Control words
```{r}
# Correlation plot for control words; all remaining columns are passed to cor().
control_words_corrmatrix <- analysis_seeds_controls %>%
  filter(type == "control") %>%
  select(-Word, -pos, -SeedWord, -type, -FREQcount, -control_def, -child_know, -num_hypernyms_first)

control_words_cor <- cor(control_words_corrmatrix, use = "pairwise.complete.obs", method = "pearson")

# cor.mtest() expects the raw observations (rows = words, columns = variables).
# Passing the correlation matrix itself tests correlations among its columns,
# which gives meaningless p-values for blanking cells.
p.mat_item <- cor.mtest(control_words_corrmatrix)
pMatrix_item <- p.mat_item$p
# cells with p >= sig.level are left blank
corrplot(control_words_cor, method = 'color', type = 'lower', diag = TRUE, addCoef.col = "black",
         tl.col = "black", number.font = 2, number.cex = 8/ncol(control_words_cor),
         p.mat = pMatrix_item, sig.level = 0.05, insig = "blank")
```
#### All words (matrix corrects for multiple comparisons; num_hypernym-log_hyponym correlation p = .002)
```{r}
# Correlation plot across seed and control words together.
all_words_corrmatrix <- analysis_seeds_controls %>%
  select(-Word, -pos, -SeedWord, -type, -FREQcount, -control_def, -child_know, -num_hypernyms_first)

all_words_cor <- cor(all_words_corrmatrix, use = "pairwise.complete.obs", method = "pearson")

# cor.mtest() expects the raw observations (rows = words, columns = variables).
# Passing the correlation matrix itself tests correlations among its columns,
# which gives meaningless p-values for blanking cells.
p.mat_item <- cor.mtest(all_words_corrmatrix)
pMatrix_item <- p.mat_item$p
# cells with p >= sig.level are left blank
corrplot(all_words_cor, method = 'color', type = 'lower', diag = TRUE, addCoef.col = "black",
         tl.col = "black", number.font = 2, number.cex = 8/ncol(all_words_cor),
         p.mat = pMatrix_item, sig.level = 0.05, insig = "blank")
```

### Look at relations between density & category hierarchy {.tabset}
#### Hyponyms & Hypernyms
```{r}
# log hyponym count against hypernym count, with one linear fit per word type
analysis_seeds_controls %>%
  ggplot(aes(x = num_hypernyms, y = log_hyponyms, label = Word, color = type)) +
  geom_point() +
  # geom_label()+
  geom_smooth(method = lm) +
  theme_classic()
```
#### Hyponyms & Peers
```{r}
# peer count against log hyponym count, with one linear fit per word type
analysis_seeds_controls %>%
  ggplot(aes(x = log_hyponyms, y = num_peers, color = type, label = as.character(Word))) +
  geom_point() +
  geom_smooth(method = "lm") +
  # geom_label()+
  #scale_x_continuous(breaks=seq(0,900,50))
  theme_classic()
```

#### Hypernyms & Peers
```{r}
# peer count against hypernym count, with one linear fit per word type
analysis_seeds_controls %>%
  ggplot(aes(x = num_hypernyms, y = num_peers, label = as.character(Word), color = type)) +
  geom_point() +
  geom_smooth(method = "lm") +
  #geom_label()
  theme_classic()

#What are those low-hypernym, highly dense words? (they are all verbs - this makes sense)
# lowHyper_highPeer <- filter(seed_and_controls_full, (num_peers > 100) & (num_hypernyms < 3)) %>% 
#   select(Word, type, pos, aoa, num_peers, num_hyponyms, num_hypernyms, adult_log_freq)
# DT::datatable(lowHyper_highPeer)

```

```{r}
# z-score hypernym and hyponym counts within each part of speech, then keep
# nouns and verbs only.
# scale() returns a one-column matrix; as.numeric() stores plain numeric
# vectors so the columns behave normally in cor.test() and ggplot().
seed_and_controls_pos <- analysis_seeds_controls %>%
  group_by(pos) %>%
  mutate(hyper_z_pos = as.numeric(scale(num_hypernyms)),
         hypo_z_pos = as.numeric(scale(num_hyponyms))) %>%
  ungroup() %>%
  filter(pos %in% c('n', 'v'))
```

### How do hyponyms and hypernyms correlate when controlling for pos? {.tabset}
#### Part of speech (points)
```{r}
# within-POS z-scored hyponyms against hypernyms, one linear fit per part of speech
seed_and_controls_pos %>%
  ggplot(aes(x = hyper_z_pos, y = hypo_z_pos, color = pos, label = as.character(Word))) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

```
#### Part of speech (labels)
```{r}
# same plot as the points version, with each word drawn as a label
seed_and_controls_pos %>%
  ggplot(aes(x = hyper_z_pos, y = hypo_z_pos, color = pos, label = as.character(Word))) +
  geom_point() +
  geom_smooth(method = lm) +
  geom_label() +
  theme_classic()

```
#### Correlation looks good
```{r}
# Pearson correlation between the within-POS z-scored hypernym and hyponym counts
cor.test(
  x = seed_and_controls_pos$hyper_z_pos,
  y = seed_and_controls_pos$hypo_z_pos
)
```

#### Is it related to neighborhood density
```{r}
# regress hypernym count on peer count and log hyponym count
summary(
  lm(formula = num_hypernyms ~ num_peers + log_hyponyms,
     data = analysis_seeds_controls)
)

```
### Compare childes adult and subtlex frequencies (*r* = .76) {.tabset}
#### Points
```{r}
# SUBTLEX log frequency against CHILDES adult log frequency, linear fit per type
analysis_seeds_controls %>%
  ggplot(aes(x = adult_log_freq, y = logFreq_subtlex, color = type)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```
#### Words
```{r}
# same axes as the points version, with each word drawn as a label
analysis_seeds_controls %>%
  ggplot(aes(x = adult_log_freq, y = logFreq_subtlex, color = type, label = as.character(Word))) +
  geom_point() +
  geom_label() +
  theme_classic()
```

### Compare child-produced and adult-produced frequencies (childes) {.tabset}
#### Points
```{r}
# CHILDES child log frequency against adult log frequency, linear fit per type
analysis_seeds_controls %>%
  ggplot(aes(x = adult_log_freq, y = kid_log_freq, color = type)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```
#### Words
```{r}
# same axes as the points version, with each word drawn as a label
analysis_seeds_controls %>%
  ggplot(aes(x = adult_log_freq, y = kid_log_freq, color = type, label = as.character(Word))) +
  geom_point() +
  geom_label() +
  theme_classic()
```
<!-- #### Adults say, kids say infrequently -->
<!-- ```{r} -->
<!-- adultSay <- filter(seed_and_controls_full, kid_log_freq<1) %>%  -->
<!--   select(Word, type, pos, aoa, num_peers, num_hyponyms, num_hypernyms, adult_log_freq, kid_log_freq) -->
<!-- DT::datatable(adultSay) -->

<!-- ``` -->
#### Difference in Adult-Child production
```{r}
# Difference between z-scored adult and child CHILDES log frequencies per word.
# scale() returns a one-column matrix; as.numeric() keeps the z-scores (and
# their difference) as plain numeric columns for the table below.
produceDiff <- analysis_seeds_controls %>%
  mutate(adult_log_freq_z = as.numeric(scale(adult_log_freq)),
         kid_log_freq_z = as.numeric(scale(kid_log_freq))) %>%
  select(Word, type, pos, aoa, num_peers, num_hyponyms, num_hypernyms, adult_log_freq_z, kid_log_freq_z) %>%
  mutate(productionDiff_z = adult_log_freq_z - kid_log_freq_z) %>%
  distinct()
DT::datatable(produceDiff)

```

### Childes frequency vs. AOA {.tabset}
#### Points
```{r}
# rated AoA against CHILDES adult log frequency, linear fit per type
analysis_seeds_controls %>%
  ggplot(aes(x = adult_log_freq, y = aoa, color = type)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```
#### Words
```{r}
# same axes as the points version, with each word drawn as a label
analysis_seeds_controls %>%
  ggplot(aes(x = adult_log_freq, y = aoa, color = type, label = as.character(Word))) +
  geom_point() +
  geom_label() +
  theme_classic()
```
### Childes frequency vs. hypernyms {.tabset}
#### Adults
```{r}
# hypernym count against CHILDES adult log frequency, linear fit per type
analysis_seeds_controls %>%
  ggplot(aes(x = adult_log_freq, y = num_hypernyms, color = type)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```
#### Kids
```{r}
# hypernym count against CHILDES child log frequency, linear fit per type
analysis_seeds_controls %>%
  ggplot(aes(x = kid_log_freq, y = num_hypernyms, color = type)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
```

### What's the relation between AOA and hypernyms, controlling for frequency? {.tabset}
#### Adult-produced frequency
```{r}
# AoA regressed on hypernym count, with adult-produced frequency as a covariate
summary(
  lm(formula = aoa ~ num_hypernyms + adult_log_freq,
     data = analysis_seeds_controls)
)

```
#### Child-produced frequency
```{r}
# AoA regressed on hypernym count, with child-produced frequency as a covariate
summary(
  lm(formula = aoa ~ num_hypernyms + kid_log_freq,
     data = analysis_seeds_controls)
)

```


<!-- ### Childes frequency vs. concreteness (*r* = -.2, ns; seed words only) {.tabset} -->
<!-- #### Points -->
<!-- ```{r} -->
<!-- ggplot(candidate_words_all_freqs, aes(adult_log_freq, Conc.M))+ -->
<!--   geom_point()+ -->
<!--   geom_smooth(method="lm")+ -->
<!--   theme_classic() -->
<!-- ``` -->
<!-- #### Words -->
<!-- ```{r} -->
<!-- ggplot(candidate_words_all_freqs, aes(adult_log_freq, Conc.M, label=as.character(Word)))+ -->
<!--   geom_point()+ -->
<!--   geom_label()+ -->
<!--   theme_classic() -->
<!-- ``` -->

<!-- ### Childes frequency vs. AOA for lower-than-mean concreteness {.tabset} -->
<!-- #### Points -->
<!-- ```{r} -->
<!-- ggplot(filter(candidate_words_all_freqs, Conc.M<3.90), aes(adult_log_freq, aoa))+ -->
<!--   geom_point()+ -->
<!--   geom_smooth(method="lm")+ -->
<!--   theme_classic() -->
<!-- ``` -->
<!-- #### Words -->
<!-- ```{r} -->
<!-- ggplot(filter(candidate_words_all_freqs, Conc.M<3.90), aes(adult_log_freq, aoa, label=as.character(Word)))+ -->
<!--   geom_point()+ -->
<!--   geom_label()+ -->
<!--   theme_classic() -->

<!-- ``` -->

<!-- ### What are the low-concreteness (lower than mean), high-frequency (higher than mean), late-aoa (older than 5) words? -->
<!-- ```{r} -->
<!-- hFreq_hAOA_lConc <- filter(candidate_words_all_freqs, (Conc.M < 3.90) & (aoa>5) & (adult_log_freq > 5.41)) %>%  -->
<!--   select(Word, pos, aoa, num_hypernyms, num_hyponyms, num_peers, Conc.M, adult_log_freq, logFreq_subtlex) -->

<!-- DT::datatable(hFreq_hAOA_lConc) -->
<!-- ``` -->

<!-- ### Range of control word frequency (per seed word) -->
<!-- ```{r} -->

<!-- seedWord_hists <- function(df, na.rm=TRUE, ...){ -->
<!--   seedword_list <- unique(df$SeedWord) -->
<!--   for (i in seq_along(seedword_list)) { -->
<!--     plot <- -->
<!--       ggplot(subset(df, df$SeedWord==seedword_list[i]), -->
<!--              aes(control_adult_log_freq))+ -->
<!--       geom_bar(width=.2)+ -->
<!--       ggtitle(paste("seed word: ",seedword_list[i])) -->
<!--     print(plot) -->
<!--   } -->
<!-- } -->

<!-- hist_df <- filter(choose_control_words_subtlex, (SeedWord %in% words_with_many_controls$SeedWord) & control_adult_log_freq>0) -->

<!-- seedWord_hists(hist_df) -->

<!-- ``` -->
