V5

Each language has an equal number of words (200), but the words differ across languages.

# Load the pairwise tile distances for the V5 analysis.
cos_dist <- read_csv("data/lang_pairwise_decile_distance_10x200.csv")

# Summarise `mean_cos_dist` with multi_boot_standard() for every
# (concreteness_tile1, concreteness_tile2) pair; the plot below uses the
# resulting `mean` column.
conc_corr_ms <- ungroup(
  multi_boot_standard(
    group_by(cos_dist, concreteness_tile1, concreteness_tile2),
    col = "mean_cos_dist"
  )
)

# tile1 goes on the x axis, so treat it as a discrete factor.
conc_corr_ms$concreteness_tile1 <- as.factor(conc_corr_ms$concreteness_tile1)

# One line per tile2 value, traced across the tile1 levels.
ggplot(
  conc_corr_ms,
  aes(x = concreteness_tile1, y = mean, group = 1)
) +
  geom_line(
    aes(
      group = concreteness_tile2,
      color = as.factor(concreteness_tile2)
    )
  ) +
  theme_classic()

# Re-run the per-pair summary of `mean_cos_dist`, this time making tile2 the
# discrete x-axis variable.
conc_corr_ms <- ungroup(
  multi_boot_standard(
    group_by(cos_dist, concreteness_tile1, concreteness_tile2),
    col = "mean_cos_dist"
  )
)
conc_corr_ms$concreteness_tile2 <- as.factor(conc_corr_ms$concreteness_tile2)

# Mirror image of the tile1 plot: one line per tile1 value, traced across the
# tile2 levels.
ggplot(
  conc_corr_ms,
  aes(x = concreteness_tile2, y = mean, group = 1)
) +
  geom_line(
    aes(
      group = concreteness_tile1,
      color = as.factor(concreteness_tile1)
    )
  ) +
  theme_classic()

V4

Each language has a sample of the same words (1000), but a different number of words. I got these 10,000 words by looking at the words that were most common across models.

# Load the V4 pairwise tile correlations. Column names are supplied explicitly,
# so read_csv() treats the first row of the file as data rather than a header.
# NOTE(review): this is an absolute, machine-specific path; consider a
# project-relative path so the analysis runs on other machines.
conc_corr <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/L2ETS/studies/study2/analyses/5_concreteness/analysis_v4/data/lang_pairwise_tile_correlations.csv",
                      col_names = c("tile1", "tile2", "corr", "lang1", "lang2"))

# Summarise `corr` with multi_boot_standard() for every (tile1, tile2) pair.
# The summary is computed exactly once: a previous unfiltered copy was assigned
# to the same name and overwritten before use, costing an extra bootstrap run.
conc_corr_ms <- conc_corr %>%
  group_by(tile1, tile2) %>%
  multi_boot_standard(col = "corr") %>%
  ungroup() %>%
  # Keep a single orientation of each pair (tile2 at or below tile1).
  filter(tile2 <= tile1) %>%
  # Label same-tile vs. cross-tile pairs while both columns are still numeric,
  # then turn tile1 into the x-axis factor with level "10" forced to the end.
  # The "quintile" wording is kept as-is; the tiles appear to run 1-10.
  mutate(group_type = ifelse(tile1 == tile2, "same quintile", "different quintile"),
         tile1 = fct_relevel(as.factor(tile1), "10", after = Inf)) %>%
  # Only cross-tile pairs are plotted.
  filter(group_type == "different quintile")

# One line per tile2 value, traced across the tile1 levels.
ggplot(conc_corr_ms, aes(x = tile1, y = mean, group = 1)) +
  geom_line(aes(group = tile2,
                color = as.factor(tile2))) +
  theme_classic()

# Summarise `corr` with multi_boot_standard() for every (tile1, tile2) pair,
# this time with tile2 as the discrete x-axis variable. The summary is computed
# exactly once: a previous unfiltered copy was assigned to the same name and
# overwritten before use, costing an extra bootstrap run.
conc_corr_ms <- conc_corr %>%
  group_by(tile1, tile2) %>%
  multi_boot_standard(col = "corr") %>%
  ungroup() %>%
  # Keep a single orientation of each pair (tile2 at or below tile1).
  filter(tile2 <= tile1) %>%
  # Label same-tile vs. cross-tile pairs while both columns are still numeric,
  # then turn tile2 into the x-axis factor with level "10" forced to the end.
  # The "quintile" wording is kept as-is; the tiles appear to run 1-10.
  mutate(group_type = ifelse(tile1 == tile2, "same quintile", "different quintile"),
         tile2 = fct_relevel(as.factor(tile2), "10", after = Inf)) %>%
  # Only cross-tile pairs are plotted; group_type is retained as a column in
  # case later code reads it.
  filter(group_type == "different quintile")

# One line per tile1 value, traced across the tile2 levels.
ggplot(conc_corr_ms, aes(x = tile2, y = mean, group = 1)) +
  geom_line(aes(group = tile1,
                color = as.factor(tile1))) +
  theme_classic()

N words per language/decile actually in the analysis in V4: