Semantics overlap
Concrentess and similarity
Discrete local-global concreteness analysis
- Stats
  - Wiki
  - ETS

Semantics overlap

CLUSTER_ETS <- here("analyses/02_concreteness_semantics/data/ets/target_word_cluster_assignments_ets.csv")
CLUSTER_WIKI <- here("analyses/02_concreteness_semantics/data/wiki/target_word_cluster_assignments.csv")
CONC_ETS <- here("analyses/02_concreteness_semantics/data/ets/concreteness_deciles.csv")
CONC_WIKI <-  here("analyses/02_concreteness_semantics/data/wiki/target_translations_xling_words.csv")

cluster_assignments_raw_ets <- read_csv(CLUSTER_ETS)
cluster_assignments_raw_wiki <- read_csv(CLUSTER_WIKI)
conc_assignments_ets <- read_csv(CONC_ETS) 
conc_assignments_wiki <- read_csv(CONC_WIKI) %>%
  distinct(word, concreteness_tile)

all_assignments_ets <- full_join(cluster_assignments_raw_ets, conc_assignments_ets) %>%
  rename(concreteness_tile = conc_tile) %>%
  mutate(type = "ets",
         cluster = as.factor(cluster), # reorder by means
         cluster2 = fct_recode(cluster, "1" = "9", "2" = "1", "3" = "2", "4" = "8", "5" = "4",
                              "6"= "3", "7" = "10", "8" = "7", "9" = "5", "10" = "6"))

all_assignments_wiki <- full_join(conc_assignments_wiki, cluster_assignments_raw_wiki) %>%
  mutate(type = "wiki",
        cluster = as.factor(cluster),
        cluster2 = fct_recode(cluster, "1" = "9", "2"=  "3", "3" = "8", "4"= "1",
                              "6"= "7", "7"= "4", "8" = "6", "9"= "2"))

all_assignments <- bind_rows(all_assignments_ets, all_assignments_wiki) %>%
        mutate(cluster2 = fct_relevel(cluster2, "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), 
               type = fct_recode(type, "Second-Language TOEFL Corpus" = "ets",
                                        "Multilingual Wikipedia Corpus" = "wiki"))

#pdf("figs/concreteness_semantics.pdf", width = 10, height = 5.5)
ggplot(all_assignments, aes(x = concreteness_tile, y = cluster2, fill = cluster2)) + 
  scale_fill_viridis_d(option = "plasma", begin = 1, end = 0, 
                       guide = guide_legend(nrow = 1)) +
  ggridges::geom_density_ridges(scale = 3, rel_min_height = .01, bandwidth = .6) +
  facet_wrap(.~type) +
  ylab("Semantic Cluster") +
  scale_x_continuous(breaks = 1:10) +
  xlab("Concreteness Decile") +
  theme_classic(base_size = 15) +
  theme(legend.position = "none",
        axis.line = element_line(size = 1.2),
        axis.ticks = element_line(size = 1),
        strip.background = element_rect(colour = "white", fill="white"),
        strip.text = element_text(size = 16))

#dev.off()

Are semantics and concreteness overlaping? Yes.

ETS:

ets_count_mat <- all_assignments %>% 
  filter(type == "Second-Language TOEFL Corpus") %>%
  count(type, concreteness_tile, cluster2) %>%
  select(-type) %>%
  spread(cluster2, n, -1, fill = 0) %>%
  select(-1) %>%
  as.matrix()

chisq.test(ets_count_mat)

## 
##  Pearson's Chi-squared test
## 
## data:  ets_count_mat
## X-squared = 1538.1, df = 81, p-value < 2.2e-16

WIKI:

wiki_count_mat <- all_assignments %>% 
  filter(type == "Multilingual Wikipedia Corpus") %>%
  count(type, concreteness_tile, cluster2) %>%
  select(-type) %>%
  spread(cluster2, n, -1, fill = 0) %>%
  select(-1) %>%
  as.matrix()

chisq.test(wiki_count_mat)

## 
##  Pearson's Chi-squared test
## 
## data:  wiki_count_mat
## X-squared = 5144.1, df = 81, p-value < 2.2e-16

Concrentess and similarity

WIKI_PATH <- here("analyses/02_concreteness_semantics/data/wiki/lang_pairwise_tile_correlations.csv")
ETS_PATH <- here("analyses/02_concreteness_semantics/data/ets/lang_pairwise_tile_correlations_ets_decile.csv")

conc_corr_wiki <- read_csv(WIKI_PATH,
                      col_names = c("tile1", "tile2", "corr", "lang1", "lang2"))  %>%
  mutate(corpus = "wiki")
conc_corr_ets <- read_csv(ETS_PATH,
                      col_names = c("tile1", "tile2", "corr",  "lang1", "lang2"))  %>%
  mutate(corpus = "ets")

conc_corr <- bind_rows(conc_corr_wiki, conc_corr_ets)

## Within deciles
conc_corr_ms_same <- conc_corr %>%
  filter(tile1 >= tile2) %>%
  group_by(tile1, tile2, corpus) %>%
  multi_boot_standard(col = "corr") %>%
  ungroup() %>%
  filter(tile2 == tile1) %>%
  mutate(tile1 = as.factor(tile1),
         lab = case_when(tile2 == 9 & corpus == "ets" ~ "Second-\nLanguage TOEFL Corpus",
                         tile2 == 9 & corpus == "wiki" ~ "Multilingual\nWikipedia Corpus",
                         TRUE ~ "")) %>%
    mutate(corpus = fct_recode(corpus, 
                             "TOEFL" = "ets",
"Wikipedia" = "wiki"))

#pdf("figs/concreteness_plot.pdf", width = 5, height = 4.7)
ggplot(conc_corr_ms_same, aes(x = tile2, y = mean,
                         line_type = corpus, group = corpus, color = tile2)) +
  geom_smooth(method = "lm", alpha = .2, color = "black") +
  geom_pointrange(aes(ymin = ci_lower, max = ci_upper, shape = rev(corpus)), size = .7, alpha = .7)  +
  scale_x_continuous(breaks = 1:10) +
  viridis::scale_color_viridis(option = "plasma", begin = 1, end = 0, 
                       #guide = guide_legend(nrow = 0)) +
                       guide =FALSE) +
  
  ylab("Cross-linguistic\nWord Distance Correlation") +
  #geom_dl(aes(label=corpus), method = list('last.bumpup', cex = 1.3, hjust = 1)) +
 # ggtitle("Within Concreteness Deciles") + 
  #scale_y_continuous(position = "right", limits = c(.14, .52)) +
  xlab("Concreteness Decile") +
  theme_classic(base_size = 18)  +
  guides(shape=guide_legend(title="Corpus")) +
  theme(axis.line = element_line(size = 1.2),
        axis.ticks = element_line(size = 1),
        legend.position=c(0.25,0.77),
        legend.text = element_text(size = 8),
        legend.title = element_text(size = 10),
        legend.background = element_rect(linetype = 1, size = 0.5, 
                                         colour = 1))

#hcludev.off()

conc_corr_ms_same %>%
  group_by(corpus) %>%
  do(tidy(cor.test(.$mean,.$tile2))) %>%
  mutate(estimate = round(estimate,2)) %>%
  kable()

corpus	estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
TOEFL	0.78	3.514126	0.0079136	8	0.2931959	0.9450867	Pearson’s product-moment correlation	two.sided
Wikipedia	0.81	3.895982	0.0045705	8	0.3661770	0.9531930	Pearson’s product-moment correlation	two.sided

Discrete local-global concreteness analysis

conc_in_out <- conc_corr %>%
  filter(tile1 >= tile2) %>%
  mutate(local_global = case_when(tile1 == tile2 ~ "local", TRUE ~ "global")) %>%
  group_by(corpus, lang1, lang2, local_global) %>%
  summarize(mean_corr = mean(corr))  %>%
  mutate(lang_pair = paste0(lang1, lang2)) %>%
  ungroup() %>%
  select(corpus, local_global, lang_pair, lang1, lang2, mean_corr)

conc_in_out_summary <- conc_in_out %>%
  group_by(corpus, local_global) %>%
  multi_boot_standard(col = "mean_corr") %>%
  arrange(corpus)

ggplot(conc_in_out_summary, aes(x = local_global, y = mean,
                        color = corpus)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ylab("Mean Correlation") +
  ggtitle("Local-global for concreteness deciles") +
  theme_classic()

Stats

Wiki

conc_in_out_wide <- conc_in_out %>%
  filter(corpus == "wiki") %>%
  spread(local_global, mean_corr) %>%
  mutate(dif = local- global) %>%
  select(lang1, lang2, local, global, dif)

This is what the data look like:

head(conc_in_out_wide) %>%
  kable()

lang1	lang2	local	global	dif
ar	bg	0.3303223	0.2830508	0.0472715
ar	bn	0.2480982	0.2057305	0.0423677
ar	de	0.3248045	0.2796395	0.0451650
ar	el	0.3134378	0.2688484	0.0445894
ar	en	0.3697269	0.3181968	0.0515301
ar	es	0.3474667	0.3001870	0.0472796

regression

lm(dif ~ 1, conc_in_out_wide) %>%
  summary() %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic	p.value
(Intercept)	0.035533	0.0006627	53.6193	0

paired t-test

t.test(conc_in_out_wide$global,  conc_in_out_wide$local, paired = T) %>%
  tidy() %>%
  mutate_at(vars(estimate, statistic), round, 2) %>%
  kable()

estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
-0.04	-53.62	0	515	-0.0368349	-0.0342311	Paired t-test	two.sided

qap regression

iv_mat <- conc_in_out_wide %>%
  select(-local, -global) %>%
   bind_rows(conc_in_out_wide %>%  
  select(-local, -global) %>% rename(lang1 = lang2, 
                                      lang2 = lang1)) %>%
  spread(lang2, dif) %>%
  select(-lang1) %>%
  as.matrix()

head(iv_mat)

##              ar         bg         bn         de         el         en
## [1,]         NA 0.04727148 0.04236768 0.04516497 0.04458942 0.05153010
## [2,] 0.04727148         NA 0.04604494 0.05497622 0.05930866 0.05559946
## [3,] 0.04236768 0.04604494         NA 0.04278500 0.05108388 0.04716660
## [4,] 0.04516497 0.05497622 0.04278500         NA 0.05562776 0.05409559
## [5,] 0.04458942 0.05930866 0.05108388 0.05562776         NA 0.05824733
## [6,] 0.05153010 0.05559946 0.04716660 0.05409559 0.05824733         NA
##              es         fa         fr         gu         hi         id
## [1,] 0.04727962 0.04920370 0.04385525 0.03312881 0.04379451 0.04911387
## [2,] 0.05231979 0.04920048 0.05054834 0.03473511 0.04789857 0.05041657
## [3,] 0.04674872 0.04336345 0.04487898 0.03180056 0.04402548 0.04284389
## [4,] 0.05192894 0.04488979 0.04889500 0.03670042 0.04524603 0.05133440
## [5,]         NA 0.05186800 0.05215153 0.03586838 0.04980898         NA
## [6,] 0.05024244         NA 0.05142425 0.04088537 0.04910850 0.05538531
##               ig         it          ja         kn         ko         ml
## [1,] 0.013980309 0.04767846 0.008634183 0.04073757 0.03912732 0.03276816
## [2,] 0.009760160 0.05223729 0.006775421 0.04519056 0.03707190 0.03499538
## [3,] 0.003296772 0.04229685 0.001829571 0.04122112 0.03241991 0.03776234
## [4,] 0.028359157 0.05477206 0.008267010 0.04291152 0.03695615 0.03088596
## [5,] 0.014681674         NA          NA         NA         NA         NA
## [6,] 0.020987937 0.05209206 0.010588612 0.04738328 0.04079088 0.03347353
##              mr         ne         nl         pa         pl         pt
## [1,] 0.03441889 0.02608345 0.04559855 0.03187010 0.04946336 0.04648018
## [2,] 0.04016692 0.03163316 0.05143233 0.04158374 0.05480756 0.05249924
## [3,] 0.03494670 0.04144799 0.04112400 0.03030946 0.04702599 0.04636616
## [4,] 0.03881699 0.02897634 0.04837117 0.03165009 0.05598211 0.05090981
## [5,]         NA         NA 0.05279372         NA         NA         NA
## [6,] 0.04145008 0.03235927 0.05283536 0.03973876 0.05923973 0.04947635
##              ro         ru         ta         te         th         tl
## [1,] 0.04751700 0.04582059 0.04457064 0.03785080 0.03987087 0.04130156
## [2,] 0.05293860 0.05545659 0.04513277 0.04409530 0.04422659 0.04256032
## [3,] 0.04703937 0.04165780 0.04096068 0.03861215 0.03884191 0.03997498
## [4,] 0.05332001 0.05620587 0.04261433 0.04196960 0.04032039 0.04362830
## [5,]         NA         NA         NA         NA 0.05029354         NA
## [6,] 0.05460251         NA 0.04557751         NA         NA 0.04960849
##              tr         ur         vi          yo         zh
## [1,] 0.04899807 0.02673111 0.02475849 0.017404883 0.02238931
## [2,] 0.05453382 0.01934906 0.03701805 0.015197086 0.02164418
## [3,] 0.04868182 0.02188865 0.04877261 0.011909379 0.01783975
## [4,] 0.05809892 0.01345640 0.04806513 0.004466861 0.02063652
## [5,] 0.05733103 0.02202312 0.05548075 0.013389275 0.02471565
## [6,]         NA         NA         NA          NA 0.02258322

dv_mat <- matrix(1, nrow = nrow(iv_mat),  ncol = ncol(iv_mat))

head(dv_mat)

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
## [2,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
## [3,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
## [4,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
## [5,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
## [6,]    1    1    1    1    1    1    1    1    1     1     1     1     1     1
##      [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
## [1,]     1     1     1     1     1     1     1     1     1     1     1     1
## [2,]     1     1     1     1     1     1     1     1     1     1     1     1
## [3,]     1     1     1     1     1     1     1     1     1     1     1     1
## [4,]     1     1     1     1     1     1     1     1     1     1     1     1
## [5,]     1     1     1     1     1     1     1     1     1     1     1     1
## [6,]     1     1     1     1     1     1     1     1     1     1     1     1
##      [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35]
## [1,]     1     1     1     1     1     1     1     1     1
## [2,]     1     1     1     1     1     1     1     1     1
## [3,]     1     1     1     1     1     1     1     1     1
## [4,]     1     1     1     1     1     1     1     1     1
## [5,]     1     1     1     1     1     1     1     1     1
## [6,]     1     1     1     1     1     1     1     1     1

qap_model <- netlm(iv_mat, 
                   dv_mat, 
                   intercept = F,
                   mode = "graph",
                   nullhyp = "qap",
                   reps = 1000) 

summary(qap_model)

## 
## OLS Network Model
## 
## Residuals:
##           0%          25%          50%          75%         100% 
## -0.040817070 -0.009421389  0.003589803  0.010805290  0.044959197 
## 
## Coefficients:
##    Estimate   Pr(<=b) Pr(>=b) Pr(>=|b|)
## x1 0.03553297 0.047   0.976   0.976    
## 
## Residual standard error: 0.01505 on 515 degrees of freedom
## Multiple R-squared: 0.8481   Adjusted R-squared: 0.8478 
## F-statistic:  2875 on 1 and 515 degrees of freedom, p-value:     0 
## 
## 
## Test Diagnostics:
## 
##  Null Hypothesis: qapy 
##  Replications: 1000 
##  Coefficient Distribution Summary:
## 
##           x1
## Min    53.62
## 1stQ   53.62
## Median 53.62
## Mean   53.62
## 3rdQ   53.62
## Max    53.62

data.frame(beta = round(summary(qap_model)$coefficients,2),
           t_tstat = round(summary(qap_model)$tstat,2),
           p_stat = round(summary(qap_model)$pgreqabs,2)) %>%
  kable()

beta	t_tstat	p_stat
0.04	53.62	0.98

conc_in_out_wide_full <- conc_in_out_wide %>%
  select(-local, -global) %>%
  bind_rows(conc_in_out_wide %>%  
  select(-local, -global) %>% rename(lang1 = lang2, 
                                      lang2 = lang1)) 
  
ggplot(conc_in_out_wide_full, aes(x = dif, fill = lang1)) +
  geom_histogram() +
  geom_vline(aes(xintercept = 0), linetype = 2) + 
  facet_wrap(~lang1)

ETS

conc_in_out_wide <- conc_in_out %>%
  filter(corpus == "ets") %>%
  spread(local_global, mean_corr) %>%
  mutate(dif = local- global) %>%
  select(lang1, lang2, local, global, dif)

regression

lm(dif ~ 1, conc_in_out_wide) %>%
  summary() %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic	p.value
(Intercept)	0.0254055	0.0001975	128.6386	0

paired t-test

t.test(conc_in_out_wide$global,  conc_in_out_wide$local, paired = T) %>%
  tidy() %>%
  mutate_at(vars(estimate, statistic), round, 2) %>%
  kable()

estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
-0.03	-128.64	0	594	-0.0257934	-0.0250176	Paired t-test	two.sided

qap regression

iv_mat <- conc_in_out_wide %>%
  select(-local, -global) %>%
  bind_rows(conc_in_out_wide %>%  
  select(-local, -global) %>% rename(lang1 = lang2, 
                                      lang2 = lang1)) %>%
  spread(lang2, dif) %>%
  select(-lang1) %>%
  as.matrix()

dv_mat <- matrix(1, nrow = nrow(iv_mat),  ncol = ncol(iv_mat))

#networkkw
#ucinet

qap_model <- netlm(iv_mat, 
                   dv_mat, 
                   intercept = FALSE,
                   mode = "graph",
                   nullhyp = "classical",
                   test.statistic = "t-value",
                   reps = 1000) 

summary(qap_model)

## 
## OLS Network Model
## 
## Residuals:
##            0%           25%           50%           75%          100% 
## -0.0150909782 -0.0035497070  0.0003625696  0.0032915254  0.0139446294 
## 
## Coefficients:
##    Estimate   Pr(<=b) Pr(>=b) Pr(>=|b|)
## x1 0.02540552 1       0       0        
## 
## Residual standard error: 0.004817 on 594 degrees of freedom
## Multiple R-squared: 0.9653   Adjusted R-squared: 0.9653 
## F-statistic: 1.655e+04 on 1 and 594 degrees of freedom, p-value:     0 
## 
## 
## Test Diagnostics:
## 
##  Null Hypothesis: classical

data.frame(beta = round(summary(qap_model)$coefficients,2),
           t_tstat = round(summary(qap_model)$tstat,2),
           p_stat = round(summary(qap_model)$pgreqabs,2)) %>%
  kable()

beta	t_tstat	p_stat
0.03	128.64	0

conc_in_out_wide_full <- conc_in_out_wide %>%
  select(-local, -global) %>%
  bind_rows(conc_in_out_wide %>%  
  select(-local, -global) %>% rename(lang1 = lang2, 
                                      lang2 = lang1)) 
  
ggplot(conc_in_out_wide_full, aes(x = dif, fill = lang1)) +
  geom_histogram() +
  geom_vline(aes(xintercept = 0), linetype = 2) + 
  facet_wrap(~lang1)

Concreteness decile analysis - ETS and WIKI

Molly Lewis

2019-11-26

Semantics overlap

Concrentess and similarity

Discrete local-global concreteness analysis

Stats

Wiki

regression

paired t-test

qap regression

ETS

regression

paired t-test

qap regression