Z-transformed correlations

INPATH_W <- "wiki/data/lang_pairwise_semantics_correlations.csv"

wiki_in_out_w <- read_csv(INPATH_W,
                        col_names = c("cluster", "comparision",
                                      "correlation_coefficient", "lang1", "lang2")) %>%
  mutate(corpus = "wiki") %>%
  distinct()

INPATH_E <- "ets/data/lang_pairwise_semantics_correlations_ets.csv"

wiki_in_out_e <- read_csv(INPATH_E,
                        col_names = c("cluster", "comparision",
                                      "correlation_coefficient", "lang1", "lang2")) %>%
  mutate(corpus = "ets")

in_out_mean <- wiki_in_out_w %>%
  bind_rows(wiki_in_out_e)  %>%
  mutate(correlation_coefficient_z = fisherz(correlation_coefficient - .0000001)) %>% # deal with case where r = 1
  select(-correlation_coefficient)
in_out_mean %>%
  group_by(corpus, comparision) %>%
  multi_boot_standard(col = "correlation_coefficient_z")  %>%
  mutate_if(is.numeric, fisherz2r) %>%
  ggplot(aes(x = comparision, y = mean, color = corpus, group = corpus)) +
  geom_line() +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ylab("mean correlation coefficient") +
  theme_classic()

in_out_mean %>%
  group_by(corpus, comparision, cluster) %>%
  summarize(correlation_coefficient_z = mean(correlation_coefficient_z)) %>%
  group_by(corpus, comparision) %>%
  summarize(sd_z = sd(correlation_coefficient_z),
            correlation_coefficient_z = mean(correlation_coefficient_z)) %>%
  mutate(sd_r = fisherz2r(sd_z),
         correlation_coefficient_r = fisherz2r(correlation_coefficient_z)) %>%
  select(-contains("_z")) %>%
  mutate_if(is.numeric, round, 2) %>%
  kable()
corpus comparision sd_r correlation_coefficient_r
ets in 0.11 0.45
ets out 0.02 0.27
wiki in 0.08 0.36
wiki out 0.03 0.24

Inference

ETS

Simple regression

paired_df <- in_out_mean %>%
  filter(corpus == "ets") %>%
  spread(comparision, correlation_coefficient_z) %>%
  rename(in_dists = `in`,
         out_dists = out) %>%
  group_by(lang1, lang2) %>%
  summarize(in_dists = mean(in_dists),
            out_dists = mean(out_dists))

paired_diff_df <- paired_df %>%
  mutate(correlation_coefficient_diff_z = in_dists - out_dists) %>%
  select(lang1, lang2, correlation_coefficient_diff_z) %>%
  group_by(lang1, lang2) %>%
  summarize(correlation_coefficient_diff_z = mean(correlation_coefficient_diff_z)) %>%
  ungroup() 

lm(correlation_coefficient_diff_z ~ 1, paired_diff_df) %>%
  summary()
## 
## Call:
## lm(formula = correlation_coefficient_diff_z ~ 1, data = paired_diff_df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.036598 -0.009268  0.000325  0.010740  0.041418 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.2053710  0.0005956   344.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01453 on 594 degrees of freedom

QAP regression

# qap regression
iv_mat <- paired_diff_df %>%
   bind_rows(paired_diff_df %>% rename(lang1 = lang2, 
                                       lang2 = lang1)) %>%
  spread(lang2, correlation_coefficient_diff_z) %>%
  select(-lang1) %>%
  as.matrix()

iv_mat[lower.tri(iv_mat, diag = TRUE)] <- NA 

dv_mat <- matrix(1, nrow = nrow(iv_mat),  ncol = ncol(iv_mat))
dv_mat[lower.tri(dv_mat, diag = TRUE)] <- NA 

qap_model <- netlm(iv_mat, 
                   dv_mat, 
                   intercept = F,
                   nullhyp = "qap",
                   test.statistic = "t-value",
                   reps = 10000) 

summary(qap_model)
## 
## OLS Network Model
## 
## Residuals:
##            0%           25%           50%           75%          100% 
## -0.0365982725 -0.0092675064  0.0003253035  0.0107400686  0.0414177808 
## 
## Coefficients:
##    Estimate Pr(<=b) Pr(>=b) Pr(>=|b|)
## x1 0.205371 1       0       0        
## 
## Residual standard error: 0.01453 on 594 degrees of freedom
## Multiple R-squared: 0.995    Adjusted R-squared: 0.995 
## F-statistic: 1.189e+05 on 1 and 594 degrees of freedom, p-value:     0 
## 
## 
## Test Diagnostics:
## 
##  Null Hypothesis: qapy 
##  Replications: 10000 
##  Coefficient Distribution Summary:
## 
##           x1
## Min    170.2
## 1stQ   233.1
## Median 245.1
## Mean   244.5
## 3rdQ   256.5
## Max    309.4
data.frame(beta = round(summary(qap_model)$coefficients,2),
           t_tstat = round(summary(qap_model)$tstat,2),
           p_stat = round(summary(qap_model)$pgreqabs,2)) %>%
  kable()
beta t_tstat p_stat
0.21 344.8 0

Wikipedia

Simple regression

paired_df <- in_out_mean %>%
  filter(corpus == "wiki") %>%
  spread(comparision, correlation_coefficient_z) %>%
  rename(in_dists = `in`,
         out_dists = out) %>%
  group_by(lang1, lang2) %>%
  summarize(in_dists = mean(in_dists),
            out_dists = mean(out_dists))

paired_diff_df <- paired_df %>%
  mutate(correlation_coefficient_diff_z = in_dists - out_dists) %>%
  select(lang1, lang2, correlation_coefficient_diff_z) %>%
  group_by(lang1, lang2) %>%
  summarize(correlation_coefficient_diff_z = mean(correlation_coefficient_diff_z)) %>%
  ungroup() 

lm(correlation_coefficient_diff_z ~ 1, paired_diff_df) %>%
  summary()
## 
## Call:
## lm(formula = correlation_coefficient_diff_z ~ 1, data = paired_diff_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.13165 -0.04305 -0.02244  0.00211  1.08311 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.129872   0.004723    27.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1152 on 594 degrees of freedom

QAP regression

iv_mat <- paired_diff_df %>%
   bind_rows(paired_diff_df %>% rename(lang1 = lang2, 
                                       lang2 = lang1)) %>%
  spread(lang2, correlation_coefficient_diff_z) %>%
  select(-lang1) %>%
  as.matrix()

iv_mat[lower.tri(iv_mat, diag = TRUE)] <- NA 

dv_mat <- matrix(1, nrow = nrow(iv_mat),  ncol = ncol(iv_mat))
dv_mat[lower.tri(dv_mat, diag = TRUE)] <- NA 

qap_model <- netlm(iv_mat, 
                   dv_mat, 
                   intercept = F,
                   nullhyp = "qap",
                   test.statistic = "t-value",
                   reps = 10000) 

summary(qap_model)
## 
## OLS Network Model
## 
## Residuals:
##           0%          25%          50%          75%         100% 
## -0.131650821 -0.043045559 -0.022440829  0.002109621  1.083113764 
## 
## Coefficients:
##    Estimate  Pr(<=b) Pr(>=b) Pr(>=|b|)
## x1 0.1298721 0.8547  0.1453  0.1453   
## 
## Residual standard error: 0.1152 on 594 degrees of freedom
## Multiple R-squared: 0.56     Adjusted R-squared: 0.5593 
## F-statistic: 756.1 on 1 and 594 degrees of freedom, p-value:     0 
## 
## 
## Test Diagnostics:
## 
##  Null Hypothesis: qapy 
##  Replications: 10000 
##  Coefficient Distribution Summary:
## 
##           x1
## Min    12.49
## 1stQ   17.83
## Median 19.74
## Mean   21.66
## 3rdQ   23.79
## Max    47.91
data.frame(beta = round(summary(qap_model)$coefficients,2),
           t_tstat = round(summary(qap_model)$tstat,2),
           p_stat = round(summary(qap_model)$pgreqabs,2)) %>%
  kable()
beta t_tstat p_stat
0.13 27.5 0.15

Note this is controling for language-level pairs, but not word pairs.