Within vs. across distance correlations based on clusters

corpus	comparision	sd_r	correlation_coefficient_r
ets	in	0.11	0.45
ets	out	0.02	0.27
wiki	in	0.08	0.36
wiki	out	0.03	0.24

Inference

ETS

Simple regression

paired_df <- in_out_mean %>%
  filter(corpus == "ets") %>%
  spread(comparision, correlation_coefficient_z) %>%
  rename(in_dists = `in`,
         out_dists = out) %>%
  group_by(lang1, lang2) %>%
  summarize(in_dists = mean(in_dists),
            out_dists = mean(out_dists))

paired_diff_df <- paired_df %>%
  mutate(correlation_coefficient_diff_z = in_dists - out_dists) %>%
  select(lang1, lang2, correlation_coefficient_diff_z) %>%
  group_by(lang1, lang2) %>%
  summarize(correlation_coefficient_diff_z = mean(correlation_coefficient_diff_z)) %>%
  ungroup() 

lm(correlation_coefficient_diff_z ~ 1, paired_diff_df) %>%
  summary()

## 
## Call:
## lm(formula = correlation_coefficient_diff_z ~ 1, data = paired_diff_df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.036598 -0.009268  0.000325  0.010740  0.041418 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.2053710  0.0005956   344.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01453 on 594 degrees of freedom

QAP regression

# qap regression
iv_mat <- paired_diff_df %>%
   bind_rows(paired_diff_df %>% rename(lang1 = lang2, 
                                       lang2 = lang1)) %>%
  spread(lang2, correlation_coefficient_diff_z) %>%
  select(-lang1) %>%
  as.matrix()

iv_mat[lower.tri(iv_mat, diag = TRUE)] <- NA 

dv_mat <- matrix(1, nrow = nrow(iv_mat),  ncol = ncol(iv_mat))
dv_mat[lower.tri(dv_mat, diag = TRUE)] <- NA 

qap_model <- netlm(iv_mat, 
                   dv_mat, 
                   intercept = F,
                   nullhyp = "qap",
                   test.statistic = "t-value",
                   reps = 10000) 

summary(qap_model)

## 
## OLS Network Model
## 
## Residuals:
##            0%           25%           50%           75%          100% 
## -0.0365982725 -0.0092675064  0.0003253035  0.0107400686  0.0414177808 
## 
## Coefficients:
##    Estimate Pr(<=b) Pr(>=b) Pr(>=|b|)
## x1 0.205371 1       0       0        
## 
## Residual standard error: 0.01453 on 594 degrees of freedom
## Multiple R-squared: 0.995    Adjusted R-squared: 0.995 
## F-statistic: 1.189e+05 on 1 and 594 degrees of freedom, p-value:     0 
## 
## 
## Test Diagnostics:
## 
##  Null Hypothesis: qapy 
##  Replications: 10000 
##  Coefficient Distribution Summary:
## 
##           x1
## Min    170.2
## 1stQ   233.1
## Median 245.1
## Mean   244.5
## 3rdQ   256.5
## Max    309.4

data.frame(beta = round(summary(qap_model)$coefficients,2),
           t_tstat = round(summary(qap_model)$tstat,2),
           p_stat = round(summary(qap_model)$pgreqabs,2)) %>%
  kable()

beta	t_tstat	p_stat
0.21	344.8	0

Wikipedia

Simple regression

paired_df <- in_out_mean %>%
  filter(corpus == "wiki") %>%
  spread(comparision, correlation_coefficient_z) %>%
  rename(in_dists = `in`,
         out_dists = out) %>%
  group_by(lang1, lang2) %>%
  summarize(in_dists = mean(in_dists),
            out_dists = mean(out_dists))

paired_diff_df <- paired_df %>%
  mutate(correlation_coefficient_diff_z = in_dists - out_dists) %>%
  select(lang1, lang2, correlation_coefficient_diff_z) %>%
  group_by(lang1, lang2) %>%
  summarize(correlation_coefficient_diff_z = mean(correlation_coefficient_diff_z)) %>%
  ungroup() 

lm(correlation_coefficient_diff_z ~ 1, paired_diff_df) %>%
  summary()

## 
## Call:
## lm(formula = correlation_coefficient_diff_z ~ 1, data = paired_diff_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.13165 -0.04305 -0.02244  0.00211  1.08311 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.129872   0.004723    27.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1152 on 594 degrees of freedom

QAP regression

iv_mat <- paired_diff_df %>%
   bind_rows(paired_diff_df %>% rename(lang1 = lang2, 
                                       lang2 = lang1)) %>%
  spread(lang2, correlation_coefficient_diff_z) %>%
  select(-lang1) %>%
  as.matrix()

iv_mat[lower.tri(iv_mat, diag = TRUE)] <- NA 

dv_mat <- matrix(1, nrow = nrow(iv_mat),  ncol = ncol(iv_mat))
dv_mat[lower.tri(dv_mat, diag = TRUE)] <- NA 

qap_model <- netlm(iv_mat, 
                   dv_mat, 
                   intercept = F,
                   nullhyp = "qap",
                   test.statistic = "t-value",
                   reps = 10000) 

summary(qap_model)

## 
## OLS Network Model
## 
## Residuals:
##           0%          25%          50%          75%         100% 
## -0.131650821 -0.043045559 -0.022440829  0.002109621  1.083113764 
## 
## Coefficients:
##    Estimate  Pr(<=b) Pr(>=b) Pr(>=|b|)
## x1 0.1298721 0.8547  0.1453  0.1453   
## 
## Residual standard error: 0.1152 on 594 degrees of freedom
## Multiple R-squared: 0.56     Adjusted R-squared: 0.5593 
## F-statistic: 756.1 on 1 and 594 degrees of freedom, p-value:     0 
## 
## 
## Test Diagnostics:
## 
##  Null Hypothesis: qapy 
##  Replications: 10000 
##  Coefficient Distribution Summary:
## 
##           x1
## Min    12.49
## 1stQ   17.83
## Median 19.74
## Mean   21.66
## 3rdQ   23.79
## Max    47.91

data.frame(beta = round(summary(qap_model)$coefficients,2),
           t_tstat = round(summary(qap_model)$tstat,2),
           p_stat = round(summary(qap_model)$pgreqabs,2)) %>%
  kable()

beta	t_tstat	p_stat
0.13	27.5	0.15

Note this is controling for language-level pairs, but not word pairs.

Within vs. across distance correlations based on clusters

Molly Lewis

2019-06-17

Inference

ETS

Simple regression

QAP regression

Wikipedia

Simple regression

QAP regression