CLUSTER_ETS <- here("analyses/02_concreteness_semantics/data/ets/target_word_cluster_assignments_ets.csv")
CLUSTER_WIKI <- here("analyses/02_concreteness_semantics/data/wiki/target_word_cluster_assignments.csv")
CONC_ETS <- here("analyses/02_concreteness_semantics/data/ets/concreteness_deciles.csv")
CONC_WIKI <- here("analyses/02_concreteness_semantics/data/wiki/target_translations_xling_words.csv")
cluster_assignments_raw_ets <- read_csv(CLUSTER_ETS)
cluster_assignments_raw_wiki <- read_csv(CLUSTER_WIKI)
conc_assignments_ets <- read_csv(CONC_ETS)
conc_assignments_wiki <- read_csv(CONC_WIKI) %>%
distinct(word, concreteness_tile)
all_assignments_ets <- full_join(cluster_assignments_raw_ets, conc_assignments_ets) %>%
rename(concreteness_tile = conc_tile) %>%
mutate(type = "ets",
cluster = as.factor(cluster), # reorder by means
cluster2 = fct_recode(cluster, "1" = "9", "2" = "1", "3" = "2", "4" = "8", "5" = "4",
"6"= "3", "7" = "10", "8" = "7", "9" = "5", "10" = "6"))
all_assignments_wiki <- full_join(conc_assignments_wiki, cluster_assignments_raw_wiki) %>%
mutate(type = "wiki",
cluster = as.factor(cluster),
cluster2 = fct_recode(cluster, "1" = "9", "2"= "3", "3" = "8", "4"= "1",
"6"= "7", "7"= "4", "8" = "6", "9"= "2"))
all_assignments <- bind_rows(all_assignments_ets, all_assignments_wiki) %>%
mutate(cluster2 = fct_relevel(cluster2, "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"),
type = fct_recode(type, "Second-Language TOEFL Corpus" = "ets",
"Multilingual Wikipedia Corpus" = "wiki"))
#pdf("figs/concreteness_semantics.pdf", width = 10, height = 5.5)
ggplot(all_assignments, aes(x = concreteness_tile, y = cluster2, fill = cluster2)) +
scale_fill_viridis_d(option = "plasma", begin = 1, end = 0,
guide = guide_legend(nrow = 1)) +
ggridges::geom_density_ridges(scale = 3, rel_min_height = .01, bandwidth = .6) +
facet_wrap(.~type) +
ylab("Semantic Cluster") +
scale_x_continuous(breaks = 1:10) +
xlab("Concreteness Decile") +
theme_classic(base_size = 15) +
theme(legend.position = "none",
axis.line = element_line(size = 1.2),
axis.ticks = element_line(size = 1),
strip.background = element_rect(colour = "white", fill="white"),
strip.text = element_text(size = 16))
#dev.off()
Are semantics and concreteness overlaping? Yes.
ETS:
ets_count_mat <- all_assignments %>%
filter(type == "Second-Language TOEFL Corpus") %>%
count(type, concreteness_tile, cluster2) %>%
select(-type) %>%
spread(cluster2, n, -1, fill = 0) %>%
select(-1) %>%
as.matrix()
chisq.test(ets_count_mat)
##
## Pearson's Chi-squared test
##
## data: ets_count_mat
## X-squared = 1538.1, df = 81, p-value < 2.2e-16
WIKI:
wiki_count_mat <- all_assignments %>%
filter(type == "Multilingual Wikipedia Corpus") %>%
count(type, concreteness_tile, cluster2) %>%
select(-type) %>%
spread(cluster2, n, -1, fill = 0) %>%
select(-1) %>%
as.matrix()
chisq.test(wiki_count_mat)
##
## Pearson's Chi-squared test
##
## data: wiki_count_mat
## X-squared = 5144.1, df = 81, p-value < 2.2e-16
WIKI_PATH <- here("analyses/02_concreteness_semantics/data/wiki/lang_pairwise_tile_correlations.csv")
ETS_PATH <- here("analyses/02_concreteness_semantics/data/ets/lang_pairwise_tile_correlations_ets_decile.csv")
conc_corr_wiki <- read_csv(WIKI_PATH,
col_names = c("tile1", "tile2", "corr", "lang1", "lang2")) %>%
mutate(corpus = "wiki")
conc_corr_ets <- read_csv(ETS_PATH,
col_names = c("tile1", "tile2", "corr", "lang1", "lang2")) %>%
mutate(corpus = "ets")
conc_corr <- bind_rows(conc_corr_wiki, conc_corr_ets)
## Within deciles
conc_corr_ms_same <- conc_corr %>%
filter(tile1 >= tile2) %>%
group_by(tile1, tile2, corpus) %>%
multi_boot_standard(col = "corr") %>%
ungroup() %>%
filter(tile2 == tile1) %>%
mutate(tile1 = as.factor(tile1),
lab = case_when(tile2 == 9 & corpus == "ets" ~ "Second-\nLanguage TOEFL Corpus",
tile2 == 9 & corpus == "wiki" ~ "Multilingual\nWikipedia Corpus",
TRUE ~ "")) %>%
mutate(corpus = fct_recode(corpus,
"TOEFL" = "ets",
"Wikipedia" = "wiki"))
#pdf("figs/concreteness_plot.pdf", width = 5, height = 4.7)
ggplot(conc_corr_ms_same, aes(x = tile2, y = mean,
line_type = corpus, group = corpus, color = tile2)) +
geom_smooth(method = "lm", alpha = .2, color = "black") +
geom_pointrange(aes(ymin = ci_lower, max = ci_upper, shape = rev(corpus)), size = .7, alpha = .7) +
scale_x_continuous(breaks = 1:10) +
viridis::scale_color_viridis(option = "plasma", begin = 1, end = 0,
#guide = guide_legend(nrow = 0)) +
guide =FALSE) +
ylab("Cross-linguistic\nWord Distance Correlation") +
#geom_dl(aes(label=corpus), method = list('last.bumpup', cex = 1.3, hjust = 1)) +
# ggtitle("Within Concreteness Deciles") +
#scale_y_continuous(position = "right", limits = c(.14, .52)) +
xlab("Concreteness Decile") +
theme_classic(base_size = 18) +
guides(shape=guide_legend(title="Corpus")) +
theme(axis.line = element_line(size = 1.2),
axis.ticks = element_line(size = 1),
legend.position=c(0.25,0.77),
legend.text = element_text(size = 8),
legend.title = element_text(size = 10),
legend.background = element_rect(linetype = 1, size = 0.5,
colour = 1))
#hcludev.off()
conc_corr_ms_same %>%
group_by(corpus) %>%
do(tidy(cor.test(.$mean,.$tile2))) %>%
mutate(estimate = round(estimate,2)) %>%
kable()
| corpus | estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|---|
| TOEFL | 0.78 | 3.514126 | 0.0079136 | 8 | 0.2931959 | 0.9450867 | Pearson’s product-moment correlation | two.sided |
| Wikipedia | 0.81 | 3.895982 | 0.0045705 | 8 | 0.3661770 | 0.9531930 | Pearson’s product-moment correlation | two.sided |
conc_in_out <- conc_corr %>%
filter(tile1 >= tile2) %>%
mutate(local_global = case_when(tile1 == tile2 ~ "local", TRUE ~ "global")) %>%
group_by(corpus, lang1, lang2, local_global) %>%
summarize(mean_corr = mean(corr)) %>%
mutate(lang_pair = paste0(lang1, lang2)) %>%
ungroup() %>%
select(corpus, local_global, lang_pair, lang1, lang2, mean_corr)
conc_in_out_summary <- conc_in_out %>%
group_by(corpus, local_global) %>%
multi_boot_standard(col = "mean_corr") %>%
arrange(corpus)
ggplot(conc_in_out_summary, aes(x = local_global, y = mean,
color = corpus)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
ylab("Mean Correlation") +
ggtitle("Local-global for concreteness deciles") +
theme_classic()
conc_in_out_wide <- conc_in_out %>%
filter(corpus == "wiki") %>%
spread(local_global, mean_corr) %>%
mutate(dif = local- global) %>%
select(lang1, lang2, local, global, dif)
This is what the data look like:
head(conc_in_out_wide) %>%
kable()
| lang1 | lang2 | local | global | dif |
|---|---|---|---|---|
| ar | bg | 0.3303223 | 0.2830508 | 0.0472715 |
| ar | bn | 0.2480982 | 0.2057305 | 0.0423677 |
| ar | de | 0.3248045 | 0.2796395 | 0.0451650 |
| ar | el | 0.3134378 | 0.2688484 | 0.0445894 |
| ar | en | 0.3697269 | 0.3181968 | 0.0515301 |
| ar | es | 0.3474667 | 0.3001870 | 0.0472796 |
lm(dif ~ 1, conc_in_out_wide) %>%
summary() %>%
tidy() %>%
kable()
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.035533 | 0.0006627 | 53.6193 | 0 |
t.test(conc_in_out_wide$global, conc_in_out_wide$local, paired = T) %>%
tidy() %>%
mutate_at(vars(estimate, statistic), round, 2) %>%
kable()
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| -0.04 | -53.62 | 0 | 515 | -0.0368349 | -0.0342311 | Paired t-test | two.sided |
iv_mat <- conc_in_out_wide %>%
select(-local, -global) %>%
bind_rows(conc_in_out_wide %>%
select(-local, -global) %>% rename(lang1 = lang2,
lang2 = lang1)) %>%
spread(lang2, dif) %>%
select(-lang1) %>%
as.matrix()
head(iv_mat)
## ar bg bn de el en
## [1,] NA 0.04727148 0.04236768 0.04516497 0.04458942 0.05153010
## [2,] 0.04727148 NA 0.04604494 0.05497622 0.05930866 0.05559946
## [3,] 0.04236768 0.04604494 NA 0.04278500 0.05108388 0.04716660
## [4,] 0.04516497 0.05497622 0.04278500 NA 0.05562776 0.05409559
## [5,] 0.04458942 0.05930866 0.05108388 0.05562776 NA 0.05824733
## [6,] 0.05153010 0.05559946 0.04716660 0.05409559 0.05824733 NA
## es fa fr gu hi id
## [1,] 0.04727962 0.04920370 0.04385525 0.03312881 0.04379451 0.04911387
## [2,] 0.05231979 0.04920048 0.05054834 0.03473511 0.04789857 0.05041657
## [3,] 0.04674872 0.04336345 0.04487898 0.03180056 0.04402548 0.04284389
## [4,] 0.05192894 0.04488979 0.04889500 0.03670042 0.04524603 0.05133440
## [5,] NA 0.05186800 0.05215153 0.03586838 0.04980898 NA
## [6,] 0.05024244 NA 0.05142425 0.04088537 0.04910850 0.05538531
## ig it ja kn ko ml
## [1,] 0.013980309 0.04767846 0.008634183 0.04073757 0.03912732 0.03276816
## [2,] 0.009760160 0.05223729 0.006775421 0.04519056 0.03707190 0.03499538
## [3,] 0.003296772 0.04229685 0.001829571 0.04122112 0.03241991 0.03776234
## [4,] 0.028359157 0.05477206 0.008267010 0.04291152 0.03695615 0.03088596
## [5,] 0.014681674 NA NA NA NA NA
## [6,] 0.020987937 0.05209206 0.010588612 0.04738328 0.04079088 0.03347353
## mr ne nl pa pl pt
## [1,] 0.03441889 0.02608345 0.04559855 0.03187010 0.04946336 0.04648018
## [2,] 0.04016692 0.03163316 0.05143233 0.04158374 0.05480756 0.05249924
## [3,] 0.03494670 0.04144799 0.04112400 0.03030946 0.04702599 0.04636616
## [4,] 0.03881699 0.02897634 0.04837117 0.03165009 0.05598211 0.05090981
## [5,] NA NA 0.05279372 NA NA NA
## [6,] 0.04145008 0.03235927 0.05283536 0.03973876 0.05923973 0.04947635
## ro ru ta te th tl
## [1,] 0.04751700 0.04582059 0.04457064 0.03785080 0.03987087 0.04130156
## [2,] 0.05293860 0.05545659 0.04513277 0.04409530 0.04422659 0.04256032
## [3,] 0.04703937 0.04165780 0.04096068 0.03861215 0.03884191 0.03997498
## [4,] 0.05332001 0.05620587 0.04261433 0.04196960 0.04032039 0.04362830
## [5,] NA NA NA NA 0.05029354 NA
## [6,] 0.05460251 NA 0.04557751 NA NA 0.04960849
## tr ur vi yo zh
## [1,] 0.04899807 0.02673111 0.02475849 0.017404883 0.02238931
## [2,] 0.05453382 0.01934906 0.03701805 0.015197086 0.02164418
## [3,] 0.04868182 0.02188865 0.04877261 0.011909379 0.01783975
## [4,] 0.05809892 0.01345640 0.04806513 0.004466861 0.02063652
## [5,] 0.05733103 0.02202312 0.05548075 0.013389275 0.02471565
## [6,] NA NA NA NA 0.02258322
dv_mat <- matrix(1, nrow = nrow(iv_mat), ncol = ncol(iv_mat))
head(dv_mat)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [4,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [5,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [6,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
## [1,] 1 1 1 1 1 1 1 1 1 1 1 1
## [2,] 1 1 1 1 1 1 1 1 1 1 1 1
## [3,] 1 1 1 1 1 1 1 1 1 1 1 1
## [4,] 1 1 1 1 1 1 1 1 1 1 1 1
## [5,] 1 1 1 1 1 1 1 1 1 1 1 1
## [6,] 1 1 1 1 1 1 1 1 1 1 1 1
## [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35]
## [1,] 1 1 1 1 1 1 1 1 1
## [2,] 1 1 1 1 1 1 1 1 1
## [3,] 1 1 1 1 1 1 1 1 1
## [4,] 1 1 1 1 1 1 1 1 1
## [5,] 1 1 1 1 1 1 1 1 1
## [6,] 1 1 1 1 1 1 1 1 1
qap_model <- netlm(iv_mat,
dv_mat,
intercept = F,
mode = "graph",
nullhyp = "qap",
reps = 1000)
summary(qap_model)
##
## OLS Network Model
##
## Residuals:
## 0% 25% 50% 75% 100%
## -0.040817070 -0.009421389 0.003589803 0.010805290 0.044959197
##
## Coefficients:
## Estimate Pr(<=b) Pr(>=b) Pr(>=|b|)
## x1 0.03553297 0.047 0.976 0.976
##
## Residual standard error: 0.01505 on 515 degrees of freedom
## Multiple R-squared: 0.8481 Adjusted R-squared: 0.8478
## F-statistic: 2875 on 1 and 515 degrees of freedom, p-value: 0
##
##
## Test Diagnostics:
##
## Null Hypothesis: qapy
## Replications: 1000
## Coefficient Distribution Summary:
##
## x1
## Min 53.62
## 1stQ 53.62
## Median 53.62
## Mean 53.62
## 3rdQ 53.62
## Max 53.62
data.frame(beta = round(summary(qap_model)$coefficients,2),
t_tstat = round(summary(qap_model)$tstat,2),
p_stat = round(summary(qap_model)$pgreqabs,2)) %>%
kable()
| beta | t_tstat | p_stat |
|---|---|---|
| 0.04 | 53.62 | 0.98 |
conc_in_out_wide_full <- conc_in_out_wide %>%
select(-local, -global) %>%
bind_rows(conc_in_out_wide %>%
select(-local, -global) %>% rename(lang1 = lang2,
lang2 = lang1))
ggplot(conc_in_out_wide_full, aes(x = dif, fill = lang1)) +
geom_histogram() +
geom_vline(aes(xintercept = 0), linetype = 2) +
facet_wrap(~lang1)
conc_in_out_wide <- conc_in_out %>%
filter(corpus == "ets") %>%
spread(local_global, mean_corr) %>%
mutate(dif = local- global) %>%
select(lang1, lang2, local, global, dif)
lm(dif ~ 1, conc_in_out_wide) %>%
summary() %>%
tidy() %>%
kable()
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.0254055 | 0.0001975 | 128.6386 | 0 |
t.test(conc_in_out_wide$global, conc_in_out_wide$local, paired = T) %>%
tidy() %>%
mutate_at(vars(estimate, statistic), round, 2) %>%
kable()
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| -0.03 | -128.64 | 0 | 594 | -0.0257934 | -0.0250176 | Paired t-test | two.sided |
iv_mat <- conc_in_out_wide %>%
select(-local, -global) %>%
bind_rows(conc_in_out_wide %>%
select(-local, -global) %>% rename(lang1 = lang2,
lang2 = lang1)) %>%
spread(lang2, dif) %>%
select(-lang1) %>%
as.matrix()
dv_mat <- matrix(1, nrow = nrow(iv_mat), ncol = ncol(iv_mat))
#networkkw
#ucinet
qap_model <- netlm(iv_mat,
dv_mat,
intercept = FALSE,
mode = "graph",
nullhyp = "classical",
test.statistic = "t-value",
reps = 1000)
summary(qap_model)
##
## OLS Network Model
##
## Residuals:
## 0% 25% 50% 75% 100%
## -0.0150909782 -0.0035497070 0.0003625696 0.0032915254 0.0139446294
##
## Coefficients:
## Estimate Pr(<=b) Pr(>=b) Pr(>=|b|)
## x1 0.02540552 1 0 0
##
## Residual standard error: 0.004817 on 594 degrees of freedom
## Multiple R-squared: 0.9653 Adjusted R-squared: 0.9653
## F-statistic: 1.655e+04 on 1 and 594 degrees of freedom, p-value: 0
##
##
## Test Diagnostics:
##
## Null Hypothesis: classical
data.frame(beta = round(summary(qap_model)$coefficients,2),
t_tstat = round(summary(qap_model)$tstat,2),
p_stat = round(summary(qap_model)$pgreqabs,2)) %>%
kable()
| beta | t_tstat | p_stat |
|---|---|---|
| 0.03 | 128.64 | 0 |
conc_in_out_wide_full <- conc_in_out_wide %>%
select(-local, -global) %>%
bind_rows(conc_in_out_wide %>%
select(-local, -global) %>% rename(lang1 = lang2,
lang2 = lang1))
ggplot(conc_in_out_wide_full, aes(x = dif, fill = lang1)) +
geom_histogram() +
geom_vline(aes(xintercept = 0), linetype = 2) +
facet_wrap(~lang1)