Does d-place data explain variability in semantic space similarity? Here we look at distances between words as measure of similarity in semantic space across languages. These are for the low scoring models only.


Read in language semantic distances from ETS.

raw_dists <- read_csv("../../data/processed/pairwise_word_dists/low_lang_distances.csv") 

dists <- raw_dists %>%
        mutate(lang1 = colnames(raw_dists)) %>%
        select(lang1, everything()) %>%
        gather("lang2", "dist", -1) %>%
        filter(lang1 != lang2,
               !is.na(dist)) %>%
        rename(semantic_dist = dist)

ggplot(dists, aes(x = semantic_dist)) +
  geom_histogram() +
  ggtitle("Distribution of essay distances") +
  theme_bw()

Cultural measures

DOMAINS <- c("CulturalDistances","Agriculture_and_vegetation", "Basic_actions_and_technology",  "Emotions_and_values", "Kinship", "Law", "Possession", "Religion_and_belief", "Social_and_political_relations", "The_house", "The_physical_world")

There are a 10 different cultural domain distances measures, and one overall measure: CulturalDistances, Agriculture_and_vegetation, Basic_actions_and_technology, Emotions_and_values, Kinship, Law, Possession, Religion_and_belief, Social_and_political_relations, The_house, The_physical_world. The names are a little misleading, however, because they are mapped onto concepticon categories. We have cultural data for 30/36 langauges. The ones missing are: “GER”, “GRE”, “ITA”, “POR”, “RUM”, “FAS”. For several there were not exact matches between the two datasets, and but I mapped them to very similar groups. Below is the correlation between categories.

abbrevs <- read_csv("../../data/processed/ETS_abbrev_to_names.csv")  

cult_dist <- read_csv("../../data/processed/ImputeEACulturalDifferences/results/EA_distances/CulturalDistances_Long.csv")  %>%
  left_join(abbrevs %>% 
              select(ETS_code, EA_language_Name), 
                     by = c("Var1" = "EA_language_Name")) %>%
  rename(lang1 = ETS_code) %>%
   left_join(abbrevs %>% 
              select(ETS_code, EA_language_Name,), 
                     by = c("Var2" = "EA_language_Name")) %>%
  rename(lang2 = ETS_code) %>%
  select(lang1, lang2, value) %>%
  filter(!is.na(lang1) & !is.na(lang2))
get_domain_data <- function(domain, dists){
  path <- paste0("../../data/processed/ImputeEACulturalDifferences/results/EA_distances/", domain, "_long.csv")
  cult_dists_raw <- read_csv(path) %>%
    mutate(domain = domain)
  
  all_cult_dists <- cult_dists_raw  %>%
  left_join(abbrevs %>% 
              select(ETS_code, EA_language_Name), 
                     by = c("Var1" = "EA_language_Name")) %>%
  rename(lang1 = ETS_code) %>%
   left_join(abbrevs %>% 
              select(ETS_code, EA_language_Name,), 
                     by = c("Var2" = "EA_language_Name")) %>%
  rename(lang2 = ETS_code,
         cult_dist = value) %>%
  select(lang1, lang2, cult_dist, domain) %>%
  filter(!is.na(lang1) & !is.na(lang2))
  
    inner_join(dists, all_cult_dists)  %>%
      select(-semantic_dist)
  }

all_cultural_values <- map_df(DOMAINS, get_domain_data, dists)

all_cultural_values_wide <- all_cultural_values %>%
  distinct(lang1, lang2, domain, .keep_all = T) %>%
  spread(domain, cult_dist) 

cultural_corrs <- all_cultural_values_wide %>%
  select(-lang1, -lang2) %>%
  correlate() %>%
  shave(upper = TRUE)

kable(cultural_corrs, digits = 2)
rowname Agriculture_and_vegetation Basic_actions_and_technology CulturalDistances Emotions_and_values Kinship Law Possession Religion_and_belief Social_and_political_relations The_house The_physical_world
Agriculture_and_vegetation NA NA NA NA NA NA NA NA NA NA NA
Basic_actions_and_technology 0.37 NA NA NA NA NA NA NA NA NA NA
CulturalDistances 0.48 0.81 NA NA NA NA NA NA NA NA NA
Emotions_and_values 0.40 0.89 0.80 NA NA NA NA NA NA NA NA
Kinship 0.07 0.50 0.81 0.54 NA NA NA NA NA NA NA
Law 0.55 0.46 0.66 0.48 0.39 NA NA NA NA NA NA
Possession 0.59 0.92 0.89 0.85 0.52 0.60 NA NA NA NA NA
Religion_and_belief 0.47 0.44 0.65 0.59 0.52 0.59 0.54 NA NA NA NA
Social_and_political_relations 0.49 0.59 0.83 0.60 0.55 0.85 0.70 0.55 NA NA NA
The_house 0.17 0.48 0.66 0.48 0.49 0.33 0.51 0.25 0.74 NA NA
The_physical_world 0.35 0.40 0.55 0.41 0.37 0.35 0.43 0.38 0.59 0.67 NA
rplot(cultural_corrs, legend = T) +    
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Semantic distance and cultural distance correlations

get_domain_corr <- function(domain, dists){
  path <- paste0("../../data/processed/ImputeEACulturalDifferences/results/EA_distances/", domain, "_long.csv")
  cult_dists_raw <- read_csv(path) %>%
    mutate(domain = domain)
  
  all_cult_dists <- cult_dists_raw  %>%
  left_join(abbrevs %>% 
              select(ETS_code, EA_language_Name), 
                     by = c("Var1" = "EA_language_Name")) %>%
  rename(lang1 = ETS_code) %>%
   left_join(abbrevs %>% 
              select(ETS_code, EA_language_Name,), 
                     by = c("Var2" = "EA_language_Name")) %>%
  rename(lang2 = ETS_code,
         cult_dist = value) %>%
  select(lang1, lang2, cult_dist, domain) %>%
  filter(!is.na(lang1) & !is.na(lang2))
  
  #all_cult_dists
  full <- left_join(dists, all_cult_dists) 
  
  tidy(cor.test(full$semantic_dist, full$cult_dist)) %>% 
    mutate(domain = domain) %>%
    select(domain, estimate, statistic, p.value, conf.low, conf.high) 
}

map_df(DOMAINS, get_domain_corr, dists) %>%
  arrange(-statistic) %>%
  kable()
domain estimate statistic p.value conf.low conf.high
Kinship 0.1463315 4.2099855 0.0000284 0.0783202 0.2129870
Religion_and_belief 0.0951810 2.7212539 0.0066430 0.0265551 0.1629140
CulturalDistances 0.0921328 2.6333467 0.0086160 0.0234819 0.1599190
Basic_actions_and_technology 0.0682379 1.9466229 0.0519253 -0.0005645 0.1363974
Possession 0.0450438 1.2832723 0.1997637 -0.0238298 0.1134919
The_house 0.0273437 0.7785051 0.4364986 -0.0415343 0.0959629
The_physical_world 0.0228137 0.6494596 0.5162255 -0.0460584 0.0914700
Social_and_political_relations -0.0043298 -0.1232302 0.9019554 -0.0731078 0.0644892
Law -0.0069356 -0.1973943 0.8435685 -0.0756992 0.0618938
Emotions_and_values -0.0084077 -0.2392955 0.8109370 -0.0771629 0.0604271
Agriculture_and_vegetation -0.0764189 -2.1812990 0.0294482 -0.1444592 -0.0076594

Overall cultural distance correlation with semantic distance

all_cultural_values_wide_with_dists <- all_cultural_values_wide %>%
  left_join(dists)

ggplot(all_cultural_values_wide_with_dists, aes(x = semantic_dist, y = CulturalDistances)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Overall cultural measure") +
  theme_bw()

Linear models

Predicting semantic distance with cultural distance measures.

“Kinship” (family) accounts for 2% of the varaiance between word distances

mod1 <- lm(semantic_dist ~ CulturalDistances, all_cultural_values_wide_with_dists)
summary(mod1)
## 
## Call:
## lm(formula = semantic_dist ~ CulturalDistances, data = all_cultural_values_wide_with_dists)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.174459 -0.067943 -0.009127  0.067216  0.220173 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.18608    0.01489  12.496  < 2e-16 ***
## CulturalDistances  0.07779    0.02954   2.633  0.00862 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0927 on 810 degrees of freedom
## Multiple R-squared:  0.008488,   Adjusted R-squared:  0.007264 
## F-statistic: 6.935 on 1 and 810 DF,  p-value: 0.008616
mod2 <- lm(semantic_dist ~ Kinship, all_cultural_values_wide_with_dists)
summary(mod2)
## 
## Call:
## lm(formula = semantic_dist ~ Kinship, data = all_cultural_values_wide_with_dists)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.180621 -0.065027 -0.008313  0.065940  0.224675 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.17841    0.01138   15.68  < 2e-16 ***
## Kinship      0.08898    0.02113    4.21 2.84e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09209 on 810 degrees of freedom
## Multiple R-squared:  0.02141,    Adjusted R-squared:  0.0202 
## F-statistic: 17.72 on 1 and 810 DF,  p-value: 2.84e-05
mod3 <- lm(semantic_dist ~ Kinship  + Religion_and_belief, all_cultural_values_wide_with_dists)
summary(mod3)
## 
## Call:
## lm(formula = semantic_dist ~ Kinship + Religion_and_belief, data = all_cultural_values_wide_with_dists)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.181934 -0.066148 -0.008594  0.066823  0.225538 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.17443    0.01303  13.384  < 2e-16 ***
## Kinship              0.08084    0.02482   3.257  0.00117 ** 
## Religion_and_belief  0.01510    0.02409   0.627  0.53112    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09213 on 809 degrees of freedom
## Multiple R-squared:  0.02189,    Adjusted R-squared:  0.01947 
## F-statistic: 9.052 on 2 and 809 DF,  p-value: 0.0001295

Linguistic distance

We only have wals distances and cultuyral distances for 136 pairs (and even fewer for asjp). But, of those, there’s not much of a correlation betweeb culture and language.

ling_dist <- read_csv("../../data/processed/all_pairwise_dists.csv")  %>%
  select(-hd_dist) %>%
  filter(lang1 != lang2)

all <- all_cultural_values_wide_with_dists %>%
  left_join(ling_dist %>% rename(lang1 = lang2, lang2 = lang1))

ggplot(all, aes(x = CulturalDistances, y = wals_euclidean_dist)) +
  geom_point() +
  geom_smooth(method = "lm")

ggplot(filter(all, CulturalDistances > .2), aes(x = CulturalDistances, y = wals_euclidean_dist)) +
  geom_point() +
  geom_smooth(method = "lm")

ggplot(all, aes(x = CulturalDistances, y = asjp_dist)) +
  geom_point() +
  geom_smooth(method = "lm")

ggplot(filter(all, CulturalDistances > .2), aes(x = CulturalDistances, y = asjp_dist)) +
  geom_point() +
  geom_smooth(method = "lm")

With this subset, cultural difference, but not linguistic difference, is a predictor of semantic distance.

lm(semantic_dist ~ CulturalDistances + wals_euclidean_dist, all) %>%
  summary()
## 
## Call:
## lm(formula = semantic_dist ~ CulturalDistances + wals_euclidean_dist, 
##     data = all)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.179620 -0.094258 -0.009744  0.076930  0.218200 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.2227964  0.0660121   3.375 0.000967 ***
## CulturalDistances   -0.0178678  0.1142435  -0.156 0.875954    
## wals_euclidean_dist  0.0000475  0.0003049   0.156 0.876432    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1104 on 133 degrees of freedom
##   (676 observations deleted due to missingness)
## Multiple R-squared:  0.0003249,  Adjusted R-squared:  -0.01471 
## F-statistic: 0.02162 on 2 and 133 DF,  p-value: 0.9786