Validating pairwise cosine distances - using human judgements (SWOW/PNAS data)

Small world of words - people vs. model

swow_wiki <- inner_join(swow1, all_distances,
           by= c(cue = "target_word", 
                 response = "comparision_word")) %>%
  mutate(log_r1_strength = log(r1_strength))

hist(swow_wiki$log_r1_strength, main = "human similarity")

hist(swow_wiki$cosine, main = "model similarity for human pairs")

Mean model similarity is 0.34.

ggplot(swow_wiki, aes(x = cosine,
                      y = log_r1_strength)) +
  geom_point() +
  geom_smooth(method = "lm")

cors <- swow_wiki %>%
  filter(cosine != 1) %>%

  mutate(cosine_tile = ntile(cosine, n = 10)) %>% 
  group_by(cosine_tile) %>%
  nest() %>%
  mutate(d = map(data, ~tidy(cor.test(.$cosine,.$log_r1_strength)))) %>%
  select(-data) %>%
  unnest() %>%
  arrange(cosine_tile) %>%
  select(-method, -alternative) 

kable(cors)

cosine_tile	estimate	statistic	p.value	parameter	conf.low	conf.high
1	0.0489745	3.4644028	0.0005359	4992	0.0212674	0.0766063
2	0.0056090	0.3963032	0.6918983	4992	-0.0221304	0.0333397
3	0.0201653	1.4250506	0.1542051	4992	-0.0075749	0.0478745
4	0.0126763	0.8957016	0.3704553	4992	-0.0150650	0.0403980
5	0.0060421	0.4269076	0.6694651	4992	-0.0216975	0.0337724
6	0.0338590	2.3936513	0.0167183	4992	0.0061288	0.0615372
7	0.0145134	1.0255396	0.3051584	4992	-0.0132279	0.0422323
8	0.0056056	0.3960265	0.6921024	4991	-0.0221365	0.0333392
9	0.0782133	5.5425109	0.0000000	4991	0.0505843	0.1057226
10	0.1142098	8.1217273	0.0000000	4991	0.0867459	0.1415003

ggplot(cors, aes(x = cosine_tile, y = estimate)) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))

cors <- swow_wiki %>%
  filter(cosine != 1) %>%
  mutate(cosine_breaks = cut(cosine, breaks = c(0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1)))%>%
  filter(!is.na(cosine_breaks)) %>%
  group_by(cosine_breaks) %>%
  nest() %>%
  mutate(d = map(data, ~tidy(cor.test(.$cosine,.$log_r1_strength)))) %>%
  select(-data) %>%
  unnest() %>%
  arrange(cosine_breaks) %>%
  select(-method, -alternative) 

kable(cors)

cosine_breaks	estimate	statistic	p.value	parameter	conf.low	conf.high
(0,0.1]	0.0285939	0.7942894	0.4272714	771	-0.0420057	0.0989095
(0.1,0.2]	0.0514844	4.6600409	0.0000032	8171	0.0298372	0.0730833
(0.2,0.3]	0.0827355	9.7264794	0.0000000	13726	0.0660987	0.0993264
(0.3,0.4]	0.0671836	7.4830560	0.0000000	12350	0.0496069	0.0847187
(0.4,0.5]	0.0101085	0.9078447	0.3639874	8065	-0.0117165	0.0319239
(0.5,0.6]	0.0446113	2.9543737	0.0031497	4377	0.0150113	0.0741331
(0.6,0.7]	0.0138835	0.5884291	0.5563183	1796	-0.0323654	0.0600731
(0.7,0.8]	0.0274967	0.6397996	0.5225738	541	-0.0567786	0.1113830
(0.8,0.9]	0.1006786	1.0804428	0.2822255	114	-0.0831645	0.2778943
(0.9,1]	-0.1418997	-0.2867004	0.7885791	4	-0.8549990	0.7568169

ggplot(cors, aes(x = cosine_breaks, y = estimate)) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))

cors <- swow_wiki %>%
  filter(cosine != 1) %>%
  mutate(cosine_breaks = cut(cosine, breaks = c(0,  .2,  .4,  .6,  .8, 1 )))%>%
  filter(!is.na(cosine_breaks)) %>%
  group_by(cosine_breaks) %>%
  nest() %>%
  mutate(d = map(data, ~tidy(cor.test(.$cosine,.$log_r1_strength)))) %>%
  select(-data) %>%
  unnest() %>%
  arrange(cosine_breaks) %>%
  select(-method, -alternative) 

kable(cors)

cosine_breaks	estimate	statistic	p.value	parameter	conf.low	conf.high
(0,0.2]	0.0640350	6.068416	0.0000000	8944	0.0433699	0.0846452
(0.2,0.4]	0.1261656	20.538206	0.0000000	26078	0.1142039	0.1380908
(0.4,0.6]	0.0895384	10.028530	0.0000000	12444	0.0720831	0.1069389
(0.6,0.8]	0.0749515	3.635121	0.0002838	2339	0.0345440	0.1151144
(0.8,1]	0.1706677	1.897409	0.0601770	120	-0.0073152	0.3381692

ggplot(cors, aes(x = cosine_breaks, y = estimate)) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))

Correlation for full range:

cor.test(swow_wiki$log_r1_strength, swow_wiki$cosine)

## 
##  Pearson's product-moment correlation
## 
## data:  swow_wiki$log_r1_strength and swow_wiki$cosine
## t = 64.656, df = 49998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2696699 0.2858478
## sample estimates:
##       cor 
## 0.2777785

Correlation for upper range only:

swow_wiki_sub <- swow_wiki %>%
  filter(cosine > .5)

cor.test(swow_wiki_sub$log_r1_strength, swow_wiki_sub$cosine)

## 
##  Pearson's product-moment correlation
## 
## data:  swow_wiki_sub$log_r1_strength and swow_wiki_sub$cosine
## t = 6.39, df = 6903, p-value = 1.767e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.05319243 0.10008997
## sample estimates:
##        cor 
## 0.07668361

Correlation for lower range only:

swow_wiki_sub <- swow_wiki %>%
  filter(cosine < .5)

cor.test(swow_wiki_sub$log_r1_strength, swow_wiki_sub$cosine)

## 
##  Pearson's product-moment correlation
## 
## data:  swow_wiki_sub$log_r1_strength and swow_wiki_sub$cosine
## t = 45.508, df = 43093, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2051104 0.2231274
## sample estimates:
##       cor 
## 0.2141371

Validating pairwise cosine distances - using human judgements (SWOW/PNAS data)

Molly Lewis

2022-11-29

Small world of words - people vs. model

PNAS unrelated words