Semantics overlap

CLUSTER_ETS <- here("analyses/02_concreteness_semantics/data/ets/target_word_cluster_assignments_ets.csv")
CLUSTER_WIKI <- here("analyses/02_concreteness_semantics/data/wiki/target_word_cluster_assignments.csv")
CONC_ETS <- here("analyses/02_concreteness_semantics/data/ets/concreteness_deciles.csv")
CONC_WIKI <-  here("analyses/02_concreteness_semantics/data/wiki/target_translations_xling_words.csv") 
cluster_assignments_raw_ets <- read_csv(CLUSTER_ETS)
cluster_assignments_raw_wiki <- read_csv(CLUSTER_WIKI)
conc_assignments_ets <- read_csv(CONC_ETS) 
conc_assignments_wiki <- read_csv(CONC_WIKI) %>%
  distinct(word, concreteness_tile)

all_assignments_ets <- full_join(cluster_assignments_raw_ets, conc_assignments_ets) %>%
  rename(concreteness_tile = conc_tile) %>%
  mutate(type = "ets",
         cluster = as.factor(cluster), # reorder by means
         cluster2 = fct_recode(cluster, "1" = "9", "2" = "1", "3" = "2", "4" = "8", "5" = "4",
                              "6"= "3", "7" = "10", "8" = "7", "9" = "5", "10" = "6"))

all_assignments_wiki <- full_join(conc_assignments_wiki, cluster_assignments_raw_wiki) %>%
  mutate(type = "wiki",
        cluster = as.factor(cluster),
        cluster2 = fct_recode(cluster, "1" = "9", "2"=  "3", "3" = "8", "4"= "1",
                              "6"= "7", "7"= "4", "8" = "6", "9"= "2"))

all_assignments <- bind_rows(all_assignments_ets, all_assignments_wiki) %>%
        mutate(cluster2 = fct_relevel(cluster2, "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), 
               type = fct_recode(type, "Second-Language TOEFL Corpus" = "ets",
                                        "Multilingual Wikipedia Corpus" = "wiki"))

#pdf("figs/concreteness_semantics.pdf", width = 10, height = 5.5)
ggplot(all_assignments, aes(x = concreteness_tile, y = cluster2, fill = cluster2)) + 
  scale_fill_viridis_d(option = "plasma", begin = 1, end = 0, 
                       guide = guide_legend(nrow = 1)) +
  ggridges::geom_density_ridges(scale = 3, rel_min_height = .01, bandwidth = .6) +
  facet_wrap(.~type) +
  ylab("Semantic Cluster") +
  scale_x_continuous(breaks = 1:10) +
  xlab("Concreteness Decile") +
  theme_classic(base_size = 15) +
  theme(legend.position = "none",
        axis.line = element_line(size = 1.2),
        axis.ticks = element_line(size = 1),
        strip.background = element_rect(colour = "white", fill="white"),
        strip.text = element_text(size = 16))

#dev.off()

Are semantics and concreteness overlaping? Yes.

ETS:

ets_count_mat <- all_assignments %>% 
  filter(type == "Second-Language TOEFL Corpus") %>%
  count(type, concreteness_tile, cluster2) %>%
  select(-type) %>%
  spread(cluster2, n, -1, fill = 0) %>%
  select(-1) %>%
  as.matrix()

chisq.test(ets_count_mat)
## 
##  Pearson's Chi-squared test
## 
## data:  ets_count_mat
## X-squared = 1538.1, df = 81, p-value < 2.2e-16

WIKI:

wiki_count_mat <- all_assignments %>% 
  filter(type == "Multilingual Wikipedia Corpus") %>%
  count(type, concreteness_tile, cluster2) %>%
  select(-type) %>%
  spread(cluster2, n, -1, fill = 0) %>%
  select(-1) %>%
  as.matrix()

chisq.test(wiki_count_mat)
## 
##  Pearson's Chi-squared test
## 
## data:  wiki_count_mat
## X-squared = 5144.1, df = 81, p-value < 2.2e-16

Concrentess and similarity

WIKI_PATH <- here("analyses/02_concreteness_semantics/data/wiki/lang_pairwise_tile_correlations.csv")
ETS_PATH <- here("analyses/02_concreteness_semantics/data/ets/lang_pairwise_tile_correlations_ets_decile.csv")
conc_corr_wiki <- read_csv(WIKI_PATH,
                      col_names = c("tile1", "tile2", "corr", "lang1", "lang2"))  %>%
  mutate(corpus = "wiki")
conc_corr_ets <- read_csv(ETS_PATH,
                      col_names = c("tile1", "tile2", "corr",  "lang1", "lang2"))  %>%
  mutate(corpus = "ets")

conc_corr <- bind_rows(conc_corr_wiki, conc_corr_ets)
## Within deciles
conc_corr_ms_same <- conc_corr %>%
  filter(tile1 >= tile2) %>%
  group_by(tile1, tile2, corpus) %>%
  multi_boot_standard(col = "corr") %>%
  ungroup() %>%
  filter(tile2 == tile1) %>%
  mutate(tile1 = as.factor(tile1),
         lab = case_when(tile2 == 9 & corpus == "ets" ~ "Second-\nLanguage TOEFL Corpus",
                         tile2 == 9 & corpus == "wiki" ~ "Multilingual\nWikipedia Corpus",
                         TRUE ~ "")) %>%
    mutate(corpus = fct_recode(corpus, 
                             "TOEFL" = "ets",
"Wikipedia" = "wiki"))

#pdf("figs/concreteness_plot.pdf", width = 5, height = 4.7)
ggplot(conc_corr_ms_same, aes(x = tile2, y = mean,
                         line_type = corpus, group = corpus, color = tile2)) +
  geom_smooth(method = "lm", alpha = .2, color = "black") +
  geom_pointrange(aes(ymin = ci_lower, max = ci_upper, shape = rev(corpus)), size = .7, alpha = .7)  +
  scale_x_continuous(breaks = 1:10) +
  viridis::scale_color_viridis(option = "plasma", begin = 1, end = 0, 
                       #guide = guide_legend(nrow = 0)) +
                       guide =FALSE) +
  
  ylab("Cross-linguistic\nWord Distance Correlation") +
  #geom_dl(aes(label=corpus), method = list('last.bumpup', cex = 1.3, hjust = 1)) +
 # ggtitle("Within Concreteness Deciles") + 
  #scale_y_continuous(position = "right", limits = c(.14, .52)) +
  xlab("Concreteness Decile") +
  theme_classic(base_size = 18)  +
  guides(shape=guide_legend(title="Corpus")) +
  theme(axis.line = element_line(size = 1.2),
        axis.ticks = element_line(size = 1),
        legend.position=c(0.25,0.77),
        legend.text = element_text(size = 8),
        legend.title = element_text(size = 10),
        legend.background = element_rect(linetype = 1, size = 0.5, 
                                         colour = 1))

#hcludev.off()
conc_corr_ms_same %>%
  group_by(corpus) %>%
  do(tidy(cor.test(.$mean,.$tile2))) %>%
  mutate(estimate = round(estimate,2)) %>%
  kable()
corpus estimate statistic p.value parameter conf.low conf.high method alternative
TOEFL 0.78 3.514126 0.0079136 8 0.2931959 0.9450867 Pearson’s product-moment correlation two.sided
Wikipedia 0.81 3.895982 0.0045705 8 0.3661770 0.9531930 Pearson’s product-moment correlation two.sided

Discrete local-global concreteness analysis

conc_in_out <- conc_corr %>%
  filter(tile1 >= tile2) %>%
  mutate(local_global = case_when(tile1 == tile2 ~ "local", TRUE ~ "global")) %>%
  group_by(corpus, lang1, lang2, local_global) %>%
  summarize(mean_corr = mean(corr))  %>%
  mutate(lang_pair = paste0(lang1, lang2)) %>%
  ungroup() %>%
  select(corpus, local_global, lang_pair, lang1, lang2, mean_corr)

conc_in_out_summary <- conc_in_out %>%
  group_by(corpus, local_global) %>%
  multi_boot_standard(col = "mean_corr") %>%
  arrange(corpus)

ggplot(conc_in_out_summary, aes(x = local_global, y = mean,
                        color = corpus)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ylab("Mean Correlation") +
  ggtitle("Local-global for concreteness deciles") +
  theme_classic()

Stats

Wiki

conc_in_out_wide <- conc_in_out %>%
  filter(corpus == "wiki") %>%
  spread(local_global, mean_corr) %>%
  mutate(dif = local- global) %>%
  select(lang1, lang2, local, global, dif) 

This is what the data look like:

head(conc_in_out_wide) %>%
  kable()
lang1 lang2 local global dif
ar bg 0.3303223 0.2830508 0.0472715
ar bn 0.2480982 0.2057305 0.0423677
ar de 0.3248045 0.2796395 0.0451650
ar el 0.3134378 0.2688484 0.0445894
ar en 0.3697269 0.3181968 0.0515301
ar es 0.3474667 0.3001870 0.0472796

regression

lm(dif ~ 1, conc_in_out_wide) %>%
  summary() %>%
  tidy() %>%
  kable()
term estimate std.error statistic p.value
(Intercept) 0.035533 0.0006627 53.6193 0

paired t-test

t.test(conc_in_out_wide$global,  conc_in_out_wide$local, paired = T) %>%
  tidy() %>%
  mutate_at(vars(estimate, statistic), round, 2) %>%
  kable()
estimate statistic p.value parameter conf.low conf.high method alternative
-0.04 -53.62 0 515 -0.0368349 -0.0342311 Paired t-test two.sided

qap regression

iv_mat <- conc_in_out_wide %>%
  select(-local, -global) %>%
   bind_rows(conc_in_out_wide %>%  
  select(-local, -global) %>% rename(lang1 = lang2, 
                                      lang2 = lang1)) %>%
  spread(lang2, dif) %>%
  select(-lang1) %>%
  as.matrix()

head(iv_mat)
##              ar         bg         bn         de         el         en
## [1,]         NA 0.04727148 0.04236768 0.04516497 0.04458942 0.05153010
## [2,] 0.04727148         NA 0.04604494 0.05497622 0.05930866 0.05559946
## [3,] 0.04236768 0.04604494         NA 0.04278500 0.05108388 0.04716660
## [4,] 0.04516497 0.05497622 0.04278500         NA 0.05562776 0.05409559
## [5,] 0.04458942 0.05930866 0.05108388 0.05562776         NA 0.05824733
## [6,] 0.05153010 0.05559946 0.04716660 0.05409559 0.05824733         NA
##              es         fa         fr         gu         hi         id
## [1,] 0.04727962 0.04920370 0.04385525 0.03312881 0.04379451 0.04911387
## [2,] 0.05231979 0.04920048 0.05054834 0.03473511 0.04789857 0.05041657
## [3,] 0.04674872 0.04336345 0.04487898 0.03180056 0.04402548 0.04284389
## [4,] 0.05192894 0.04488979 0.04889500 0.03670042 0.04524603 0.05133440
## [5,]         NA 0.05186800 0.05215153 0.03586838 0.04980898         NA
## [6,] 0.05024244         NA 0.05142425 0.04088537 0.04910850 0.05538531
##               ig         it          ja         kn         ko         ml
## [1,] 0.013980309 0.04767846 0.008634183 0.04073757 0.03912732 0.03276816
## [2,] 0.009760160 0.05223729 0.006775421 0.04519056 0.03707190 0.03499538
## [3,] 0.003296772 0.04229685 0.001829571 0.04122112 0.03241991 0.03776234
## [4,] 0.028359157 0.05477206 0.008267010 0.04291152 0.03695615 0.03088596
## [5,] 0.014681674         NA          NA         NA         NA         NA
## [6,] 0.020987937 0.05209206 0.010588612 0.04738328 0.04079088 0.03347353
##              mr         ne         nl         pa         pl         pt
## [1,] 0.03441889 0.02608345 0.04559855 0.03187010 0.04946336 0.04648018
## [2,] 0.04016692 0.03163316 0.05143233 0.04158374 0.05480756 0.05249924
## [3,] 0.03494670 0.04144799 0.04112400 0.03030946 0.04702599 0.04636616
## [4,] 0.03881699 0.02897634 0.04837117 0.03165009 0.05598211 0.05090981
## [5,]         NA         NA 0.05279372         NA         NA         NA
## [6,] 0.04145008 0.03235927 0.05283536 0.03973876 0.05923973 0.04947635
##              ro         ru         ta         te         th         tl
## [1,] 0.04751700 0.04582059 0.04457064 0.03785080 0.03987087 0.04130156
## [2,] 0.05293860 0.05545659 0.04513277 0.04409530 0.04422659 0.04256032
## [3,] 0.04703937 0.04165780 0.04096068 0.03861215 0.03884191 0.03997498
## [4,] 0.05332001 0.05620587 0.04261433 0.04196960 0.04032039 0.04362830
## [5,]         NA         NA         NA         NA 0.05029354         NA
## [6,] 0.05460251         NA 0.04557751         NA         NA 0.04960849
##              tr         ur         vi          yo         zh
## [1,] 0.04899807 0.02673111 0.02475849 0.017404883 0.02238931
## [2,] 0.05453382 0.01934906 0.03701805 0.015197086 0.02164418
## [3,] 0.04868182 0.02188865 0.04877261 0.011909379 0.01783975
## [4,] 0.05809892 0.01345640 0.04806513 0.004466861 0.02063652
## [5,] 0.05733103 0.02202312 0.05548075 0.013389275 0.02471565
## [6,]         NA         NA         NA          NA 0.02258322
dv_mat <- matrix(1, nrow = nrow(iv_mat),  ncol = ncol(iv_mat))

head(dv_mat)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
## [2,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
## [3,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
## [4,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
## [5,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
## [6,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
##      [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
## [1,]     1     1     1     1     1     1     1     1     1     1     1     1
## [2,]     1     1     1     1     1     1     1     1     1     1     1     1
## [3,]     1     1     1     1     1     1     1     1     1     1     1     1
## [4,]     1     1     1     1     1     1     1     1     1     1     1     1
## [5,]     1     1     1     1     1     1     1     1     1     1     1     1
## [6,]     1     1     1     1     1     1     1     1     1     1     1     1
##      [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35]
## [1,]     1     1     1     1     1     1     1     1     1
## [2,]     1     1     1     1     1     1     1     1     1
## [3,]     1     1     1     1     1     1     1     1     1
## [4,]     1     1     1     1     1     1     1     1     1
## [5,]     1     1     1     1     1     1     1     1     1
## [6,]     1     1     1     1     1     1     1     1     1
qap_model <- netlm(iv_mat, 
                   dv_mat, 
                   intercept = F,
                   mode = "graph",
                   nullhyp = "qap",
                   reps = 1000) 

summary(qap_model)
## 
## OLS Network Model
## 
## Residuals:
##           0%          25%          50%          75%         100% 
## -0.040817070 -0.009421389  0.003589803  0.010805290  0.044959197 
## 
## Coefficients:
##    Estimate   Pr(<=b) Pr(>=b) Pr(>=|b|)
## x1 0.03553297 0.047   0.976   0.976    
## 
## Residual standard error: 0.01505 on 515 degrees of freedom
## Multiple R-squared: 0.8481   Adjusted R-squared: 0.8478 
## F-statistic:  2875 on 1 and 515 degrees of freedom, p-value:     0 
## 
## 
## Test Diagnostics:
## 
##  Null Hypothesis: qapy 
##  Replications: 1000 
##  Coefficient Distribution Summary:
## 
##           x1
## Min    53.62
## 1stQ   53.62
## Median 53.62
## Mean   53.62
## 3rdQ   53.62
## Max    53.62
data.frame(beta = round(summary(qap_model)$coefficients,2),
           t_tstat = round(summary(qap_model)$tstat,2),
           p_stat = round(summary(qap_model)$pgreqabs,2)) %>%
  kable()
beta t_tstat p_stat
0.04 53.62 0.98
conc_in_out_wide_full <- conc_in_out_wide %>%
  select(-local, -global) %>%
  bind_rows(conc_in_out_wide %>%  
  select(-local, -global) %>% rename(lang1 = lang2, 
                                      lang2 = lang1)) 
  
ggplot(conc_in_out_wide_full, aes(x = dif, fill = lang1)) +
  geom_histogram() +
  geom_vline(aes(xintercept = 0), linetype = 2) + 
  facet_wrap(~lang1)

ETS

conc_in_out_wide <- conc_in_out %>%
  filter(corpus == "ets") %>%
  spread(local_global, mean_corr) %>%
  mutate(dif = local- global) %>%
  select(lang1, lang2, local, global, dif) 

regression

lm(dif ~ 1, conc_in_out_wide) %>%
  summary() %>%
  tidy() %>%
  kable()
term estimate std.error statistic p.value
(Intercept) 0.0254055 0.0001975 128.6386 0

paired t-test

t.test(conc_in_out_wide$global,  conc_in_out_wide$local, paired = T) %>%
  tidy() %>%
  mutate_at(vars(estimate, statistic), round, 2) %>%
  kable()
estimate statistic p.value parameter conf.low conf.high method alternative
-0.03 -128.64 0 594 -0.0257934 -0.0250176 Paired t-test two.sided

qap regression

iv_mat <- conc_in_out_wide %>%
  select(-local, -global) %>%
  bind_rows(conc_in_out_wide %>%  
  select(-local, -global) %>% rename(lang1 = lang2, 
                                      lang2 = lang1)) %>%
  spread(lang2, dif) %>%
  select(-lang1) %>%
  as.matrix()

dv_mat <- matrix(1, nrow = nrow(iv_mat),  ncol = ncol(iv_mat))

#networkkw
#ucinet
qap_model <- netlm(iv_mat, 
                   dv_mat, 
                   intercept = FALSE,
                   mode = "graph",
                   nullhyp = "classical",
                   test.statistic = "t-value",
                   reps = 1000) 

summary(qap_model)
## 
## OLS Network Model
## 
## Residuals:
##            0%           25%           50%           75%          100% 
## -0.0150909782 -0.0035497070  0.0003625696  0.0032915254  0.0139446294 
## 
## Coefficients:
##    Estimate   Pr(<=b) Pr(>=b) Pr(>=|b|)
## x1 0.02540552 1       0       0        
## 
## Residual standard error: 0.004817 on 594 degrees of freedom
## Multiple R-squared: 0.9653   Adjusted R-squared: 0.9653 
## F-statistic: 1.655e+04 on 1 and 594 degrees of freedom, p-value:     0 
## 
## 
## Test Diagnostics:
## 
##  Null Hypothesis: classical
data.frame(beta = round(summary(qap_model)$coefficients,2),
           t_tstat = round(summary(qap_model)$tstat,2),
           p_stat = round(summary(qap_model)$pgreqabs,2)) %>%
  kable()
beta t_tstat p_stat
0.03 128.64 0
conc_in_out_wide_full <- conc_in_out_wide %>%
  select(-local, -global) %>%
  bind_rows(conc_in_out_wide %>%  
  select(-local, -global) %>% rename(lang1 = lang2, 
                                      lang2 = lang1)) 
  
ggplot(conc_in_out_wide_full, aes(x = dif, fill = lang1)) +
  geom_histogram() +
  geom_vline(aes(xintercept = 0), linetype = 2) + 
  facet_wrap(~lang1)