SWOW_MODEL_INPATH <- here("analyses/15_validate_distance/data/sample_model_swow_distances.csv")
all_distances <- read_csv(SWOW_MODEL_INPATH)

SWOW_PATH <- here("analyses/15_validate_distance/data/strength.SWOW-EN.R1.csv")

swow1 <- read_tsv(SWOW_PATH) %>%
  clean_names()

Small world of words - people vs. model

swow_wiki <- inner_join(swow1, all_distances,
           by= c(cue = "target_word", 
                 response = "comparision_word")) %>%
  mutate(log_r1_strength = log(r1_strength)) 
hist(swow_wiki$log_r1_strength, main = "human similarity")

hist(swow_wiki$cosine, main = "model similarity for human pairs")

Mean model similarity is 0.34.

ggplot(swow_wiki, aes(x = cosine,
                      y = log_r1_strength)) +
  geom_point() +
  geom_smooth(method = "lm")

cors <- swow_wiki %>%
  filter(cosine != 1) %>%

  mutate(cosine_tile = ntile(cosine, n = 10)) %>% 
  group_by(cosine_tile) %>%
  nest() %>%
  mutate(d = map(data, ~tidy(cor.test(.$cosine,.$log_r1_strength)))) %>%
  select(-data) %>%
  unnest() %>%
  arrange(cosine_tile) %>%
  select(-method, -alternative) 

kable(cors)
cosine_tile estimate statistic p.value parameter conf.low conf.high
1 0.0489745 3.4644028 0.0005359 4992 0.0212674 0.0766063
2 0.0056090 0.3963032 0.6918983 4992 -0.0221304 0.0333397
3 0.0201653 1.4250506 0.1542051 4992 -0.0075749 0.0478745
4 0.0126763 0.8957016 0.3704553 4992 -0.0150650 0.0403980
5 0.0060421 0.4269076 0.6694651 4992 -0.0216975 0.0337724
6 0.0338590 2.3936513 0.0167183 4992 0.0061288 0.0615372
7 0.0145134 1.0255396 0.3051584 4992 -0.0132279 0.0422323
8 0.0056056 0.3960265 0.6921024 4991 -0.0221365 0.0333392
9 0.0782133 5.5425109 0.0000000 4991 0.0505843 0.1057226
10 0.1142098 8.1217273 0.0000000 4991 0.0867459 0.1415003
ggplot(cors, aes(x = cosine_tile, y = estimate)) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))

cors <- swow_wiki %>%
  filter(cosine != 1) %>%
  mutate(cosine_breaks = cut(cosine, breaks = c(0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1)))%>%
  filter(!is.na(cosine_breaks)) %>%
  group_by(cosine_breaks) %>%
  nest() %>%
  mutate(d = map(data, ~tidy(cor.test(.$cosine,.$log_r1_strength)))) %>%
  select(-data) %>%
  unnest() %>%
  arrange(cosine_breaks) %>%
  select(-method, -alternative) 

kable(cors)
cosine_breaks estimate statistic p.value parameter conf.low conf.high
(0,0.1] 0.0285939 0.7942894 0.4272714 771 -0.0420057 0.0989095
(0.1,0.2] 0.0514844 4.6600409 0.0000032 8171 0.0298372 0.0730833
(0.2,0.3] 0.0827355 9.7264794 0.0000000 13726 0.0660987 0.0993264
(0.3,0.4] 0.0671836 7.4830560 0.0000000 12350 0.0496069 0.0847187
(0.4,0.5] 0.0101085 0.9078447 0.3639874 8065 -0.0117165 0.0319239
(0.5,0.6] 0.0446113 2.9543737 0.0031497 4377 0.0150113 0.0741331
(0.6,0.7] 0.0138835 0.5884291 0.5563183 1796 -0.0323654 0.0600731
(0.7,0.8] 0.0274967 0.6397996 0.5225738 541 -0.0567786 0.1113830
(0.8,0.9] 0.1006786 1.0804428 0.2822255 114 -0.0831645 0.2778943
(0.9,1] -0.1418997 -0.2867004 0.7885791 4 -0.8549990 0.7568169
ggplot(cors, aes(x = cosine_breaks, y = estimate)) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))

cors <- swow_wiki %>%
  filter(cosine != 1) %>%
  mutate(cosine_breaks = cut(cosine, breaks = c(0,  .2,  .4,  .6,  .8, 1 )))%>%
  filter(!is.na(cosine_breaks)) %>%
  group_by(cosine_breaks) %>%
  nest() %>%
  mutate(d = map(data, ~tidy(cor.test(.$cosine,.$log_r1_strength)))) %>%
  select(-data) %>%
  unnest() %>%
  arrange(cosine_breaks) %>%
  select(-method, -alternative) 

kable(cors)
cosine_breaks estimate statistic p.value parameter conf.low conf.high
(0,0.2] 0.0640350 6.068416 0.0000000 8944 0.0433699 0.0846452
(0.2,0.4] 0.1261656 20.538206 0.0000000 26078 0.1142039 0.1380908
(0.4,0.6] 0.0895384 10.028530 0.0000000 12444 0.0720831 0.1069389
(0.6,0.8] 0.0749515 3.635121 0.0002838 2339 0.0345440 0.1151144
(0.8,1] 0.1706677 1.897409 0.0601770 120 -0.0073152 0.3381692
ggplot(cors, aes(x = cosine_breaks, y = estimate)) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))

Correlation for full range:

cor.test(swow_wiki$log_r1_strength, swow_wiki$cosine)
## 
##  Pearson's product-moment correlation
## 
## data:  swow_wiki$log_r1_strength and swow_wiki$cosine
## t = 64.656, df = 49998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2696699 0.2858478
## sample estimates:
##       cor 
## 0.2777785

Correlation for upper range only:

swow_wiki_sub <- swow_wiki %>%
  filter(cosine > .5)

cor.test(swow_wiki_sub$log_r1_strength, swow_wiki_sub$cosine)
## 
##  Pearson's product-moment correlation
## 
## data:  swow_wiki_sub$log_r1_strength and swow_wiki_sub$cosine
## t = 6.39, df = 6903, p-value = 1.767e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.05319243 0.10008997
## sample estimates:
##        cor 
## 0.07668361

Correlation for lower range only:

swow_wiki_sub <- swow_wiki %>%
  filter(cosine < .5)

cor.test(swow_wiki_sub$log_r1_strength, swow_wiki_sub$cosine)
## 
##  Pearson's product-moment correlation
## 
## data:  swow_wiki_sub$log_r1_strength and swow_wiki_sub$cosine
## t = 45.508, df = 43093, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2051104 0.2231274
## sample estimates:
##       cor 
## 0.2141371

PNAS unrelated words

PNAS paper: “Naming unrelated words predicts creativity” https://www.pnas.org/doi/pdf/10.1073/pnas.2022340118

PNAS_DISTANCES <- here("analyses/15_validate_distance/data/pnas_model_distances.csv")

#pnas_data <- read_tsv("/Users/mollylewis/Downloads/study1a.tsv") %>%
 # clean_names()

#all_words <- pnas_data %>%
  #select(id, contains("word")) %>%
  #pivot_longer(cols = contains("word")) %>%
  #mutate(value = tolower(value)) 

unique_pairs <- read_csv(PNAS_DISTANCES)

hist(unique_pairs$cos_dist, main = "model similarity for unrelated human pairs")

Mean model similarity is 0.21.

RANDOM_WORD_PAIR_PATH <-"/Users/mollylewis/Downloads/RUS_common_word_dists.feather"
word_pairs <- read_feather(RANDOM_WORD_PAIR_PATH) %>%
  filter(cos_dist != 1) %>%
  filter(w1 < w2)

hist(word_pairs$cos_dist)