Z-transformed correlations
INPATH_W <- "wiki/data/lang_pairwise_semantics_correlations.csv"
wiki_in_out_w <- read_csv(INPATH_W,
col_names = c("cluster", "comparision",
"correlation_coefficient", "lang1", "lang2")) %>%
mutate(corpus = "wiki") %>%
distinct()
INPATH_E <- "ets/data/lang_pairwise_semantics_correlations_ets.csv"
wiki_in_out_e <- read_csv(INPATH_E,
col_names = c("cluster", "comparision",
"correlation_coefficient", "lang1", "lang2")) %>%
mutate(corpus = "ets")
in_out_mean <- wiki_in_out_w %>%
bind_rows(wiki_in_out_e) %>%
mutate(correlation_coefficient_z = fisherz(correlation_coefficient - .0000001)) %>% # deal with case where r = 1
select(-correlation_coefficient)
in_out_mean %>%
group_by(corpus, comparision) %>%
multi_boot_standard(col = "correlation_coefficient_z") %>%
mutate_if(is.numeric, fisherz2r) %>%
ggplot(aes(x = comparision, y = mean, color = corpus, group = corpus)) +
geom_line() +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
ylab("mean correlation coefficient") +
theme_classic()

in_out_mean %>%
group_by(corpus, comparision, cluster) %>%
summarize(correlation_coefficient_z = mean(correlation_coefficient_z)) %>%
group_by(corpus, comparision) %>%
summarize(sd_z = sd(correlation_coefficient_z),
correlation_coefficient_z = mean(correlation_coefficient_z)) %>%
mutate(sd_r = fisherz2r(sd_z),
correlation_coefficient_r = fisherz2r(correlation_coefficient_z)) %>%
select(-contains("_z")) %>%
mutate_if(is.numeric, round, 2) %>%
kable()
ets |
in |
0.11 |
0.45 |
ets |
out |
0.02 |
0.27 |
wiki |
in |
0.08 |
0.36 |
wiki |
out |
0.03 |
0.24 |
Inference
ETS
Simple regression
paired_df <- in_out_mean %>%
filter(corpus == "ets") %>%
spread(comparision, correlation_coefficient_z) %>%
rename(in_dists = `in`,
out_dists = out) %>%
group_by(lang1, lang2) %>%
summarize(in_dists = mean(in_dists),
out_dists = mean(out_dists))
paired_diff_df <- paired_df %>%
mutate(correlation_coefficient_diff_z = in_dists - out_dists) %>%
select(lang1, lang2, correlation_coefficient_diff_z) %>%
group_by(lang1, lang2) %>%
summarize(correlation_coefficient_diff_z = mean(correlation_coefficient_diff_z)) %>%
ungroup()
lm(correlation_coefficient_diff_z ~ 1, paired_diff_df) %>%
summary()
##
## Call:
## lm(formula = correlation_coefficient_diff_z ~ 1, data = paired_diff_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.036598 -0.009268 0.000325 0.010740 0.041418
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2053710 0.0005956 344.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01453 on 594 degrees of freedom
QAP regression
# qap regression
iv_mat <- paired_diff_df %>%
bind_rows(paired_diff_df %>% rename(lang1 = lang2,
lang2 = lang1)) %>%
spread(lang2, correlation_coefficient_diff_z) %>%
select(-lang1) %>%
as.matrix()
iv_mat[lower.tri(iv_mat, diag = TRUE)] <- NA
dv_mat <- matrix(1, nrow = nrow(iv_mat), ncol = ncol(iv_mat))
dv_mat[lower.tri(dv_mat, diag = TRUE)] <- NA
qap_model <- netlm(iv_mat,
dv_mat,
intercept = F,
nullhyp = "qap",
test.statistic = "t-value",
reps = 10000)
summary(qap_model)
##
## OLS Network Model
##
## Residuals:
## 0% 25% 50% 75% 100%
## -0.0365982725 -0.0092675064 0.0003253035 0.0107400686 0.0414177808
##
## Coefficients:
## Estimate Pr(<=b) Pr(>=b) Pr(>=|b|)
## x1 0.205371 1 0 0
##
## Residual standard error: 0.01453 on 594 degrees of freedom
## Multiple R-squared: 0.995 Adjusted R-squared: 0.995
## F-statistic: 1.189e+05 on 1 and 594 degrees of freedom, p-value: 0
##
##
## Test Diagnostics:
##
## Null Hypothesis: qapy
## Replications: 10000
## Coefficient Distribution Summary:
##
## x1
## Min 170.2
## 1stQ 233.1
## Median 245.1
## Mean 244.5
## 3rdQ 256.5
## Max 309.4
data.frame(beta = round(summary(qap_model)$coefficients,2),
t_tstat = round(summary(qap_model)$tstat,2),
p_stat = round(summary(qap_model)$pgreqabs,2)) %>%
kable()
Wikipedia
Simple regression
paired_df <- in_out_mean %>%
filter(corpus == "wiki") %>%
spread(comparision, correlation_coefficient_z) %>%
rename(in_dists = `in`,
out_dists = out) %>%
group_by(lang1, lang2) %>%
summarize(in_dists = mean(in_dists),
out_dists = mean(out_dists))
paired_diff_df <- paired_df %>%
mutate(correlation_coefficient_diff_z = in_dists - out_dists) %>%
select(lang1, lang2, correlation_coefficient_diff_z) %>%
group_by(lang1, lang2) %>%
summarize(correlation_coefficient_diff_z = mean(correlation_coefficient_diff_z)) %>%
ungroup()
lm(correlation_coefficient_diff_z ~ 1, paired_diff_df) %>%
summary()
##
## Call:
## lm(formula = correlation_coefficient_diff_z ~ 1, data = paired_diff_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.13165 -0.04305 -0.02244 0.00211 1.08311
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.129872 0.004723 27.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1152 on 594 degrees of freedom
QAP regression
iv_mat <- paired_diff_df %>%
bind_rows(paired_diff_df %>% rename(lang1 = lang2,
lang2 = lang1)) %>%
spread(lang2, correlation_coefficient_diff_z) %>%
select(-lang1) %>%
as.matrix()
iv_mat[lower.tri(iv_mat, diag = TRUE)] <- NA
dv_mat <- matrix(1, nrow = nrow(iv_mat), ncol = ncol(iv_mat))
dv_mat[lower.tri(dv_mat, diag = TRUE)] <- NA
qap_model <- netlm(iv_mat,
dv_mat,
intercept = F,
nullhyp = "qap",
test.statistic = "t-value",
reps = 10000)
summary(qap_model)
##
## OLS Network Model
##
## Residuals:
## 0% 25% 50% 75% 100%
## -0.131650821 -0.043045559 -0.022440829 0.002109621 1.083113764
##
## Coefficients:
## Estimate Pr(<=b) Pr(>=b) Pr(>=|b|)
## x1 0.1298721 0.8547 0.1453 0.1453
##
## Residual standard error: 0.1152 on 594 degrees of freedom
## Multiple R-squared: 0.56 Adjusted R-squared: 0.5593
## F-statistic: 756.1 on 1 and 594 degrees of freedom, p-value: 0
##
##
## Test Diagnostics:
##
## Null Hypothesis: qapy
## Replications: 10000
## Coefficient Distribution Summary:
##
## x1
## Min 12.49
## 1stQ 17.83
## Median 19.74
## Mean 21.66
## 3rdQ 23.79
## Max 47.91
data.frame(beta = round(summary(qap_model)$coefficients,2),
t_tstat = round(summary(qap_model)$tstat,2),
p_stat = round(summary(qap_model)$pgreqabs,2)) %>%
kable()
Note this is controling for language-level pairs, but not word pairs.