Light-light word pairs only.

# Project-relative location of the raw ratings file.
BEDNY_DATA <- here("data/raw/datalong_CBSAMT.csv")

# Load the ratings table. The steps that follow use its C1/C2 columns for
# filtering and its V1/V2 columns as the two words of each pair.
bedny_data <- read_csv(file = BEDNY_DATA)

# Min-max rescale a numeric vector to [0, 1], ignoring missing values.
# A vector with no usable values is returned unchanged (all NA/NaN) instead
# of triggering min()/max() warnings.
rescale_01 <- function(x) {
  if (all(is.na(x))) {
    return(x)
  }
  rng <- range(x, na.rm = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}

# Long-format similarity ratings for word pairs where both C1 and C2 are
# "Light". One row per word pair x subject column, holding:
#   raw_similarity   - the rating as read from the file
#   scale_similarity - the rating z-scored within subject
#   norm_similarity  - the z-score rescaled to [0, 1] within subject
#   group_type       - "CB", "S" or "AMT", taken from the subject id
sim_ratings <- bedny_data %>%
  filter(C1 == "Light", C2 == "Light") %>%
  # NOTE(review): contains() ignores case by default, so contains("V") keeps
  # every column with a "v"/"V" in its name - confirm only V1/V2 match.
  select(contains("V"), contains("_")) %>%
  rename(word1 = V1, word2 = V2) %>%
  gather("subject_id", "raw_similarity", -word1, -word2) %>%
  group_by(subject_id) %>%
  mutate(
    # scale() returns a one-column matrix; as.numeric() stores a plain vector.
    scale_similarity = as.numeric(scale(raw_similarity)),
    # Missing ratings are ignored when finding the subject's min and max, so
    # a single NA no longer turns all of that subject's values into NA.
    norm_similarity = rescale_01(scale_similarity)
  ) %>%
  # Drop the per-subject grouping so it does not leak into later steps.
  ungroup() %>%
  mutate(group_type = case_when(
    str_detect(subject_id, "CB_") ~ "CB",
    str_detect(subject_id, "S_") ~ "S",
    str_detect(subject_id, "AMT_") ~ "AMT"
  ))

# Mean normalized similarity per participant group and word pair, computed
# over the ratings that are not missing.
# summarise() removes only the last grouping level, so the result stays
# grouped by group_type and word1.
mean_ratings <- sim_ratings %>%
  filter(!is.na(norm_similarity)) %>%
  group_by(group_type, word1, word2) %>%
  summarise(similarity = mean(norm_similarity))

# Load the pre-computed language-based word-pair distances.
LANGUAGE_DISTANCES <- here("data/processed/bedny_2019_lang_distances.csv")

long_word_word_dists <- read_csv(file = LANGUAGE_DISTANCES)

All words.

# Attach the language distances to each group's mean ratings. No `by` is
# given, so the join matches on every column name the two tables share
# (presumably word1 and word2 - confirm against the distances file).
all_data <- left_join(x = mean_ratings, y = long_word_word_dists)

# Mean human similarity against language similarity with a linear fit,
# one panel per participant group.
# To label the points, add: geom_text(aes(label = word1))
ggplot(
  data = all_data,
  mapping = aes(x = language_similarity, y = similarity)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_grid(. ~ group_type) +
  labs(y = "human similarity") +
  theme_classic()

# Spearman rank correlation between mean human similarity and language
# similarity, computed separately for each participant group. Returns one
# tidy row of test results per group.
all_data %>%
  group_by(group_type) %>%
  nest() %>%
  mutate(temp = map(
    data,
    ~ tidy(cor.test(
      .x$similarity,
      .x$language_similarity,
      method = "spearman"
    ))
  )) %>%
  # Newer tidyr keeps the nested frame grouped; ungroup so the printed
  # result is the same across versions.
  ungroup() %>%
  select(-data) %>%
  # Name the list-column explicitly: bare unnest() is deprecated.
  unnest(temp)
## # A tibble: 3 x 6
##   group_type estimate statistic     p.value method              alternative
##   <chr>         <dbl>     <dbl>       <dbl> <chr>               <chr>      
## 1 AMT           0.451   105997.     1.41e-6 Spearman's rank co… two.sided  
## 2 CB            0.530    90588.     5.89e-9 Spearman's rank co… two.sided  
## 3 S             0.470   102340.     4.36e-7 Spearman's rank co… two.sided
# Wide table with one row per word pair and one similarity column per
# remaining group (CB and S); the AMT rows are left out.
cb_s <- all_data %>%
  filter(group_type != "AMT") %>%
  # Drop the language measure so only the ratings are spread into columns.
  select(-language_similarity) %>%
  spread(key = group_type, value = similarity)

# Mean similarity in the S group against the CB group, one point per word
# pair, with a linear fit.
ggplot(data = cb_s, mapping = aes(x = CB, y = S)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Sighted vs. Blind") +
  theme_classic()

# Correlation between the two groups' mean ratings (cor.test() defaults to
# Pearson's product-moment correlation).
cor.test(x = cb_s$S, y = cb_s$CB)
## 
##  Pearson's product-moment correlation
## 
## data:  cb_s$S and cb_s$CB
## t = 21.231, df = 103, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8591094 0.9326100
## sample estimates:
##       cor 
## 0.9022147