| title: Concreteness decile analysis - ETS fasttext |
| author: Molly Lewis |
| date: “2022-07-11” |
| output: |
| html_document: |
| toc_float: no |
| code_folding: hide |
| number_sections: no |
| toc: yes |
Concreteness and similarity
ETS_FASTTEXT_PATH1 <- here("analyses/02_concreteness_semantics/data/ets_fasttext/lang_pairwise_tile_correlations_ets_decile_fasttext.csv")
ETS_PATH <- here("analyses/02_concreteness_semantics/data/ets/lang_pairwise_tile_correlations_ets_decile.csv")
MODEL_SIZE <- here("analyses/02_concreteness_semantics/data/ets_fasttext/compare_model_dims.csv")
conc_corr_ets_ft1 <- read_csv(ETS_FASTTEXT_PATH1,
col_names = c("tile1", "tile2", "corr", "lang1", "lang2")) %>%
mutate(lang1temp = case_when(lang1 < lang2 ~ lang1,
TRUE ~ lang2),
lang2temp = case_when(lang1 < lang2 ~ lang2,
TRUE ~ lang1)) %>%
distinct(lang1temp, lang2temp, tile1, tile2, .keep_all = T) %>%
select(-lang1, -lang2) %>%
rename(lang1 = lang1temp, lang2 = lang2temp)
conc_corr_ets_ft <- conc_corr_ets_ft1 %>%
mutate(corpus = "ets_ft")
conc_corr_ets_ft %>%
count(lang1,lang2) %>%
nrow()
## [1] 595
conc_corr_ets <- read_csv(ETS_PATH,
col_names = c("tile1", "tile2", "corr", "lang1", "lang2")) %>%
mutate(corpus = "ets")
conc_corr_ets %>%
count(lang1,lang2) %>%
nrow()
## [1] 595
conc_corr <- bind_rows(conc_corr_ets_ft, conc_corr_ets) %>%
filter(tile1 >= tile2)
## Within deciles
conc_corr_ms_same <- conc_corr %>%
group_by(tile1, tile2, corpus) %>%
multi_boot_standard(col = "corr") %>%
ungroup() %>%
filter(tile2 == tile1) %>%
mutate(tile1 = as.factor(tile1))
#pdf("figs/concreteness_plot_ets_ft_vs_doc2vec.pdf", width = 5, height = 4.7)
ggplot(conc_corr_ms_same, aes(x = tile2, y = mean,
line_type = corpus, group = corpus, color = tile2)) +
geom_smooth(method = "lm", alpha = .2, color = "black") +
geom_pointrange(aes(ymin = ci_lower, max = ci_upper, shape = corpus), size = .7, alpha = .7) +
viridis::scale_color_viridis(option = "plasma", begin = 1, end = 0,
#guide = guide_legend(nrow = 0)) +
guide =FALSE) +
ylab("Cross-linguistic\nWord Distance Correlation") +
#geom_dl(aes(label=corpus), method = list('last.bumpup', cex = 1.3, hjust = 1)) +
# ggtitle("Within Concreteness Deciles") +
#scale_y_continuous(position = "right", limits = c(.14, .52)) +
scale_x_continuous(breaks = 1:10,
label = c("1\n(abstract)", "2" ,"3", "4", "5", "6", "7", "8", "9", "10\n(concrete)")) +
xlab("Word Concreteness") +
theme_classic(base_size = 18) +
guides(shape=guide_legend(title="Corpus")) +
theme(axis.line = element_line(size = 1.2),
axis.ticks = element_line(size = 1),
legend.position=c(0.25,0.77),
legend.text = element_text(size = 8),
legend.title = element_text(size = 10),
legend.background = element_rect(linetype = 1, size = 0.5,
colour = 1),
plot.margin = margin(10, 20, 10, 20))

conc_corr_ets_separate <- conc_corr %>%
filter(tile1 >= tile2) %>%
group_by(lang1, lang2, tile1, tile2, corpus) %>%
summarize(mean = mean(corr)) %>%
ungroup() %>%
filter(tile2 == tile1) %>%
mutate(tile1 = as.factor(tile1),
langpair = paste0(lang1, lang2, corpus))
ggplot(conc_corr_ets_separate, aes(x = tile2, y = mean, group = langpair, color = corpus)) +
geom_line(alpha = .05) +
theme_classic()

#geom_smooth(method = "lm", se = F)
Concreteness and similarity
ETS_FASTTEXT_PATH1 <- here("analyses/02_concreteness_semantics/data/ets_fasttext/lang_pairwise_tile_correlations_ets_decile_fasttext.csv")
ETS_PATH <- here("analyses/02_concreteness_semantics/data/ets/lang_pairwise_tile_correlations_ets_decile.csv")
MODEL_SIZE <- here("analyses/02_concreteness_semantics/data/ets_fasttext/compare_model_dims.csv")
conc_corr_ets_ft1 <- read_csv(ETS_FASTTEXT_PATH1,
col_names = c("tile1", "tile2", "corr", "lang1", "lang2")) %>%
mutate(lang1temp = case_when(lang1 < lang2 ~ lang1,
TRUE ~ lang2),
lang2temp = case_when(lang1 < lang2 ~ lang2,
TRUE ~ lang1)) %>%
distinct(lang1temp, lang2temp, tile1, tile2, .keep_all = T) %>%
select(-lang1, -lang2) %>%
rename(lang1 = lang1temp, lang2 = lang2temp)
conc_corr_ets_ft <- conc_corr_ets_ft1 %>%
mutate(corpus = "ets_ft")
conc_corr_ets_ft %>%
count(lang1,lang2) %>%
nrow()
## [1] 595
conc_corr_ets <- read_csv(ETS_PATH,
col_names = c("tile1", "tile2", "corr", "lang1", "lang2")) %>%
mutate(corpus = "ets")
conc_corr_ets %>%
count(lang1,lang2) %>%
nrow()
## [1] 595
conc_corr <- bind_rows(conc_corr_ets_ft, conc_corr_ets) %>%
filter(tile1 >= tile2)
## Within deciles
conc_corr_ms_same <- conc_corr %>%
group_by(tile1, tile2, corpus) %>%
multi_boot_standard(col = "corr") %>%
ungroup() %>%
filter(tile2 == tile1) %>%
mutate(tile1 = as.factor(tile1))
#pdf("figs/concreteness_plot_ets_ft_vs_doc2vec.pdf", width = 5, height = 4.7)
ggplot(conc_corr_ms_same, aes(x = tile2, y = mean,
line_type = corpus, group = corpus, color = tile2)) +
geom_smooth(method = "lm", alpha = .2, color = "black") +
geom_pointrange(aes(ymin = ci_lower, max = ci_upper, shape = corpus), size = .7, alpha = .7) +
viridis::scale_color_viridis(option = "plasma", begin = 1, end = 0,
#guide = guide_legend(nrow = 0)) +
guide =FALSE) +
ylab("Cross-linguistic\nWord Distance Correlation") +
#geom_dl(aes(label=corpus), method = list('last.bumpup', cex = 1.3, hjust = 1)) +
# ggtitle("Within Concreteness Deciles") +
#scale_y_continuous(position = "right", limits = c(.14, .52)) +
scale_x_continuous(breaks = 1:10,
label = c("1\n(abstract)", "2" ,"3", "4", "5", "6", "7", "8", "9", "10\n(concrete)")) +
xlab("Word Concreteness") +
theme_classic(base_size = 18) +
guides(shape=guide_legend(title="Corpus")) +
theme(axis.line = element_line(size = 1.2),
axis.ticks = element_line(size = 1),
legend.position=c(0.25,0.77),
legend.text = element_text(size = 8),
legend.title = element_text(size = 10),
legend.background = element_rect(linetype = 1, size = 0.5,
colour = 1),
plot.margin = margin(10, 20, 10, 20))

conc_corr_ets_separate <- conc_corr %>%
filter(tile1 >= tile2) %>%
group_by(lang1, lang2, tile1, tile2, corpus) %>%
summarize(mean = mean(corr)) %>%
ungroup() %>%
filter(tile2 == tile1) %>%
mutate(tile1 = as.factor(tile1),
langpair = paste0(lang1, lang2, corpus))
ggplot(conc_corr_ets_separate, aes(x = tile2, y = mean, group = langpair, color = corpus)) +
geom_line(alpha = .05) +
theme_classic()

#geom_smooth(method = "lm", se = F)
Local-global
conc_in_out <- conc_corr %>%
filter(tile1 >= tile2) %>%
mutate(local_global = case_when(tile1 == tile2 ~ "local", TRUE ~ "global")) %>%
group_by(corpus, lang1, lang2, local_global) %>%
summarize(mean_corr = mean(corr)) %>%
mutate(lang_pair = paste0(lang1, lang2)) %>%
ungroup() %>%
select(corpus, local_global, lang_pair, lang1, lang2, mean_corr)
conc_in_out_summary <- conc_in_out %>%
group_by(corpus, local_global) %>%
multi_boot_standard(col = "mean_corr") %>%
arrange(corpus)
ggplot(conc_in_out_summary, aes(x = local_global, y = mean,
color = corpus)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
geom_line(aes(group = corpus)) +
ylab("Mean Correlation") +
#ggtitle("Local-global for concreteness deciles") +
theme_classic()

stats
conc_in_out_wide <- conc_in_out %>%
filter(corpus == "ets_ft") %>%
spread(local_global, mean_corr) %>%
mutate(dif = local- global) %>%
select(lang1, lang2, local, global, dif)
regression
lm(dif ~ 1, conc_in_out_wide) %>%
summary() %>%
tidy() %>%
kable()
| (Intercept) |
0.0141709 |
0.0001759 |
80.58142 |
0 |
paired t-test
paired_t_ets <- t.test(conc_in_out_wide$local,
conc_in_out_wide$global,
paired = T) %>%
tidy() %>%
mutate_at(vars(estimate, statistic), round, 2)
kable(paired_t_ets)
| 0.01 |
80.58 |
0 |
594 |
0.0138255 |
0.0145163 |
Paired t-test |
two.sided |
glue("TOEFL-FT: t({paired_t_ets$parameter}) = {paired_t_ets$statistic}; p < .0001")
## TOEFL-FT: t(594) = 80.58; p < .0001
wilcox_paired_ets <- wilcox.test(conc_in_out_wide$local, conc_in_out_wide$global, paired = TRUE)
wilcox_paired_ets
##
## Wilcoxon signed rank test with continuity correction
##
## data: conc_in_out_wide$local and conc_in_out_wide$global
## V = 177310, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
glue("$W$ = {wilcox_paired_ets$statistic}, $p$ $<$ .0001")
## $W$ = 177310, $p$ $<$ .0001
Effect size
es_data_ets <- conc_in_out_wide %>%
mutate(id = 1:n()) %>%
select(id, local, global) %>%
pivot_longer(cols = 2:3) %>%
group_by(name) %>%
summarize(m = mean(value, na.rm = T),
sd = sd(value, na.rm = T),
n= n())
ets_es <- mes(es_data_ets %>% filter(name == "local") %>% pull(m),
es_data_ets %>% filter(name == "global") %>% pull(m),
es_data_ets %>% filter(name == "local") %>% pull(sd),
es_data_ets %>% filter(name == "global") %>% pull(sd),
es_data_ets %>% filter(name == "local") %>% pull(n),
es_data_ets %>% filter(name == "global") %>% pull(n),
verbose = F)
glue("TOEFL: $d$ = {ets_es$d} [{ets_es$l.d}, {ets_es$u.d}]")
## TOEFL: $d$ = 0.36 [0.24, 0.47]
ETS_CLUSTER_CORR_PATH <- here("analyses/02_concreteness_semantics/data/ets_fasttext/lang_pairwise_semantics_correlations_ets_by_cluster.csv")
cluster_ets <- read_csv(ETS_CLUSTER_CORR_PATH,
col_names = c("cluster1", "cluster2", "cor", "lang1", "lang2")) %>%
mutate(corpus = "TOEFL_FT")
Plot
cluster_pair_means <- cluster_ets %>%
group_by(corpus, cluster1, cluster2) %>% # aggregate across languages
summarize(cor = mean(cor, na.rm = T))
full_cluster_pair_means <- cluster_pair_means %>%
bind_rows(data.frame(corpus = cluster_pair_means$corpus,
cluster2 = cluster_pair_means$cluster1,
cluster1 = cluster_pair_means$cluster2,
cor = cluster_pair_means$cor)) %>%
mutate(same = case_when(cluster1 == cluster2 ~ "Local", TRUE ~ "Global")) %>%
distinct()
df <- full_cluster_pair_means %>%
group_by(corpus, cluster1, same) %>%
multi_boot_standard(col = "cor", na.rm = T) %>%
ungroup() %>%
mutate(same = fct_rev(same))
df_segment <- df %>%
select(corpus, cluster1, same, mean) %>%
spread(same, mean) %>%
mutate(dif = Local-Global) %>%
arrange(corpus, -dif) %>%
group_by(corpus) %>%
mutate(cluster_plotting = 1:n())
#pdf("figs/local_global_plot.pdf", width = 10, height = 4.4)
scale_label <- "Semantic\nRelation"
ggplot() +
facet_grid(corpus ~ ., switch = "y") +
# theme(strip.placement = "outside") +
geom_segment(data = df_segment, aes(y = Global, yend = Local,
x = cluster1, xend = cluster1),
linetype = 1, size = .6) +
geom_linerange(data = filter(df, same == "Global"),
aes(x = cluster1, ymin = ci_lower, ymax = ci_upper), color = "#377EB8", size = 1.5) +
geom_point(data = df, size = 5, aes(x = cluster1, y = mean, color = same, shape = same)) +
ylab("Cross-linguistic\nWord Distance Correlation") +
scale_x_continuous(breaks = 1:10, name = "Cluster") +
scale_shape_manual(scale_label, values = c(19, 15)) +
scale_fill_manual(scale_label, values = c( "#E41A1C", "#377EB8" )) +
scale_color_manual(scale_label, values = c("#E41A1C", "#377EB8" )) +
theme_classic(base_size = 20) +
theme(axis.line = element_line(size = 1.2),
axis.ticks = element_line(size = 1),
legend.position = "none",
legend.text = element_text(size = 8),
legend.title = element_text(size = 10),
legend.background = element_rect(linetype = 1, size = 0.5, colour = 1),
strip.placement = "outside")

#dev.off()
Stats
Summary Stats
cluster_corr <- cluster_ets %>%
mutate(local_global = case_when(cluster1 == cluster2 ~ "local", TRUE ~ "global"),
lang_pair = glue("{lang1}_{lang2}")) %>%
group_by(corpus, lang_pair, local_global) %>%
summarize(mean_corr = mean(cor, na.rm = T)) %>% # aggregate across cluster pairs
ungroup() %>%
as.data.frame()
cluster_dif_wide_ets <- cluster_corr %>%
spread(local_global, mean_corr) %>%
mutate(dif = local- global)
summary_stats <- cluster_dif_wide_ets %>%
group_by(corpus) %>%
summarize(mean_dif = mean(dif),
sd_dif = sd(dif)) %>%
mutate_if(is.numeric, round, 3)
glue("TOEF_FT: $M$ = {summary_stats %>% filter(corpus == 'TOEFL_FT') %>% pull(mean_dif)}, $SD$ = {summary_stats %>% filter(corpus == 'TOEFL_FT') %>% pull(sd_dif)}}")
## TOEF_FT: $M$ = 0.011, $SD$ = 0.008}
paired_t_ets_df <- t.test(cluster_dif_wide_ets$local, cluster_dif_wide_ets$global,
paired = T) %>%
tidy() %>%
mutate_at(vars(estimate, statistic), round, 2)
kable(paired_t_ets_df)
| 0.01 |
34.5 |
0 |
594 |
0.0105465 |
0.01182 |
Paired t-test |
two.sided |
wilcox_paired_ets <- wilcox.test(cluster_dif_wide_ets$local, cluster_dif_wide_ets$global, paired = TRUE)
wilcox_paired_ets
##
## Wilcoxon signed rank test with continuity correction
##
## data: cluster_dif_wide_ets$local and cluster_dif_wide_ets$global
## V = 174196, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
glue("$W$ = {wilcox_paired_ets$statistic}, $p$ $<$ .0001")
## $W$ = 174196, $p$ $<$ .0001
Effect size
es_data_ets <- cluster_dif_wide_ets %>%
mutate(id = 1:n()) %>%
select(id, local, global) %>%
pivot_longer(cols = 2:3) %>%
group_by(name) %>%
summarize(m = mean(value),
sd = sd(value),
n= n())
toefl_es <- mes(es_data_ets %>% filter(name == "local") %>% pull(m),
es_data_ets %>% filter(name == "global") %>% pull(m),
es_data_ets %>% filter(name == "local") %>% pull(sd),
es_data_ets %>% filter(name == "global") %>% pull(sd),
es_data_ets %>% filter(name == "local") %>% pull(n),
es_data_ets %>% filter(name == "global") %>% pull(n),
verbose = F)
glue("TEOFL: $d$ = {toefl_es$d} [{toefl_es$l.d}, {toefl_es$u.d}]")
## TEOFL: $d$ = 0.28 [0.17, 0.39]
Controling for language distance
LANGUAGE_DISTANCES <- here("analyses/04_predicting_semantic_sim/data/lang_distance_metrics/linguistic/data/wals_language_distance")
lang_dists <- read_csv(LANGUAGE_DISTANCES)
lang_dists_tidy_ets <- lang_dists %>%
mutate(lang_pair = paste0(toupper(lang1_ETS), "_", toupper(lang2_ETS))) %>%
select(lang_pair, wals_lang_dist)
diff_with_language_distance_ets <- cluster_dif_wide_ets %>%
left_join(lang_dists_tidy_ets)
cor.test(diff_with_language_distance_ets$dif,
diff_with_language_distance_ets$wals_lang_dist)
##
## Pearson's product-moment correlation
##
## data: diff_with_language_distance_ets$dif and diff_with_language_distance_ets$wals_lang_dist
## t = 5.1888, df = 593, p-value = 2.911e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1302001 0.2840218
## sample estimates:
## cor
## 0.2083993
lm(dif ~ 1 + wals_lang_dist, data = diff_with_language_distance_ets) %>%
summary()
##
## Call:
## lm(formula = dif ~ 1 + wals_lang_dist, data = diff_with_language_distance_ets)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0224401 -0.0057311 -0.0003354 0.0055772 0.0215181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.063e-03 1.221e-03 4.145 3.89e-05 ***
## wals_lang_dist 4.688e-05 9.034e-06 5.189 2.91e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007741 on 593 degrees of freedom
## Multiple R-squared: 0.04343, Adjusted R-squared: 0.04182
## F-statistic: 26.92 on 1 and 593 DF, p-value: 2.911e-07