SWOW_MODEL_INPATH <- here("analyses/15_validate_distance/data/sample_model_swow_distances.csv")
all_distances <- read_csv(SWOW_MODEL_INPATH)
SWOW_PATH <- here("analyses/15_validate_distance/data/strength.SWOW-EN.R1.csv")
swow1 <- read_tsv(SWOW_PATH) %>%
clean_names()
swow_wiki <- inner_join(swow1, all_distances,
by= c(cue = "target_word",
response = "comparision_word")) %>%
mutate(log_r1_strength = log(r1_strength))
hist(swow_wiki$log_r1_strength, main = "human similarity")
hist(swow_wiki$cosine, main = "model similarity for human pairs")
Mean model similarity is 0.34.
ggplot(swow_wiki, aes(x = cosine,
y = log_r1_strength)) +
geom_point() +
geom_smooth(method = "lm")
cors <- swow_wiki %>%
filter(cosine != 1) %>%
mutate(cosine_tile = ntile(cosine, n = 10)) %>%
group_by(cosine_tile) %>%
nest() %>%
mutate(d = map(data, ~tidy(cor.test(.$cosine,.$log_r1_strength)))) %>%
select(-data) %>%
unnest() %>%
arrange(cosine_tile) %>%
select(-method, -alternative)
kable(cors)
| cosine_tile | estimate | statistic | p.value | parameter | conf.low | conf.high |
|---|---|---|---|---|---|---|
| 1 | 0.0489745 | 3.4644028 | 0.0005359 | 4992 | 0.0212674 | 0.0766063 |
| 2 | 0.0056090 | 0.3963032 | 0.6918983 | 4992 | -0.0221304 | 0.0333397 |
| 3 | 0.0201653 | 1.4250506 | 0.1542051 | 4992 | -0.0075749 | 0.0478745 |
| 4 | 0.0126763 | 0.8957016 | 0.3704553 | 4992 | -0.0150650 | 0.0403980 |
| 5 | 0.0060421 | 0.4269076 | 0.6694651 | 4992 | -0.0216975 | 0.0337724 |
| 6 | 0.0338590 | 2.3936513 | 0.0167183 | 4992 | 0.0061288 | 0.0615372 |
| 7 | 0.0145134 | 1.0255396 | 0.3051584 | 4992 | -0.0132279 | 0.0422323 |
| 8 | 0.0056056 | 0.3960265 | 0.6921024 | 4991 | -0.0221365 | 0.0333392 |
| 9 | 0.0782133 | 5.5425109 | 0.0000000 | 4991 | 0.0505843 | 0.1057226 |
| 10 | 0.1142098 | 8.1217273 | 0.0000000 | 4991 | 0.0867459 | 0.1415003 |
ggplot(cors, aes(x = cosine_tile, y = estimate)) +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high))
cors <- swow_wiki %>%
filter(cosine != 1) %>%
mutate(cosine_breaks = cut(cosine, breaks = c(0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1)))%>%
filter(!is.na(cosine_breaks)) %>%
group_by(cosine_breaks) %>%
nest() %>%
mutate(d = map(data, ~tidy(cor.test(.$cosine,.$log_r1_strength)))) %>%
select(-data) %>%
unnest() %>%
arrange(cosine_breaks) %>%
select(-method, -alternative)
kable(cors)
| cosine_breaks | estimate | statistic | p.value | parameter | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (0,0.1] | 0.0285939 | 0.7942894 | 0.4272714 | 771 | -0.0420057 | 0.0989095 |
| (0.1,0.2] | 0.0514844 | 4.6600409 | 0.0000032 | 8171 | 0.0298372 | 0.0730833 |
| (0.2,0.3] | 0.0827355 | 9.7264794 | 0.0000000 | 13726 | 0.0660987 | 0.0993264 |
| (0.3,0.4] | 0.0671836 | 7.4830560 | 0.0000000 | 12350 | 0.0496069 | 0.0847187 |
| (0.4,0.5] | 0.0101085 | 0.9078447 | 0.3639874 | 8065 | -0.0117165 | 0.0319239 |
| (0.5,0.6] | 0.0446113 | 2.9543737 | 0.0031497 | 4377 | 0.0150113 | 0.0741331 |
| (0.6,0.7] | 0.0138835 | 0.5884291 | 0.5563183 | 1796 | -0.0323654 | 0.0600731 |
| (0.7,0.8] | 0.0274967 | 0.6397996 | 0.5225738 | 541 | -0.0567786 | 0.1113830 |
| (0.8,0.9] | 0.1006786 | 1.0804428 | 0.2822255 | 114 | -0.0831645 | 0.2778943 |
| (0.9,1] | -0.1418997 | -0.2867004 | 0.7885791 | 4 | -0.8549990 | 0.7568169 |
ggplot(cors, aes(x = cosine_breaks, y = estimate)) +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high))
cors <- swow_wiki %>%
filter(cosine != 1) %>%
mutate(cosine_breaks = cut(cosine, breaks = c(0, .2, .4, .6, .8, 1 )))%>%
filter(!is.na(cosine_breaks)) %>%
group_by(cosine_breaks) %>%
nest() %>%
mutate(d = map(data, ~tidy(cor.test(.$cosine,.$log_r1_strength)))) %>%
select(-data) %>%
unnest() %>%
arrange(cosine_breaks) %>%
select(-method, -alternative)
kable(cors)
| cosine_breaks | estimate | statistic | p.value | parameter | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (0,0.2] | 0.0640350 | 6.068416 | 0.0000000 | 8944 | 0.0433699 | 0.0846452 |
| (0.2,0.4] | 0.1261656 | 20.538206 | 0.0000000 | 26078 | 0.1142039 | 0.1380908 |
| (0.4,0.6] | 0.0895384 | 10.028530 | 0.0000000 | 12444 | 0.0720831 | 0.1069389 |
| (0.6,0.8] | 0.0749515 | 3.635121 | 0.0002838 | 2339 | 0.0345440 | 0.1151144 |
| (0.8,1] | 0.1706677 | 1.897409 | 0.0601770 | 120 | -0.0073152 | 0.3381692 |
ggplot(cors, aes(x = cosine_breaks, y = estimate)) +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high))
Correlation for full range:
cor.test(swow_wiki$log_r1_strength, swow_wiki$cosine)
##
## Pearson's product-moment correlation
##
## data: swow_wiki$log_r1_strength and swow_wiki$cosine
## t = 64.656, df = 49998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2696699 0.2858478
## sample estimates:
## cor
## 0.2777785
Correlation for upper range only:
swow_wiki_sub <- swow_wiki %>%
filter(cosine > .5)
cor.test(swow_wiki_sub$log_r1_strength, swow_wiki_sub$cosine)
##
## Pearson's product-moment correlation
##
## data: swow_wiki_sub$log_r1_strength and swow_wiki_sub$cosine
## t = 6.39, df = 6903, p-value = 1.767e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.05319243 0.10008997
## sample estimates:
## cor
## 0.07668361
Correlation for lower range only:
swow_wiki_sub <- swow_wiki %>%
filter(cosine < .5)
cor.test(swow_wiki_sub$log_r1_strength, swow_wiki_sub$cosine)
##
## Pearson's product-moment correlation
##
## data: swow_wiki_sub$log_r1_strength and swow_wiki_sub$cosine
## t = 45.508, df = 43093, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2051104 0.2231274
## sample estimates:
## cor
## 0.2141371