Here we’re just looking at one item: tree

Read in drawing pairwise-sims file.

tree <- read_csv("../../data/keras_similarities/pairwise_country/tree_pairwise_sims.csv") %>%
    mutate(key_id_1 = as.character(key_id_1),
           key_id_2 = as.character(key_id_2))

Number of unique participants per each country (sampled 50). We take the 34 biggest countries. (Note that for even smaller countries, some duplicates)

tree_big = tree %>%
  group_by(country_code_1, key_id_1)%>%
  slice(1) %>%
  ungroup()%>%
  count(country_code_1)%>%
  arrange(-n)%>%
  as.data.frame() %>%
  slice(1:34)

kable(tree_big)
country_code_1 n
AE 50
AU 50
CA 50
CZ 50
GB 50
MY 50
PL 50
SE 50
TR 50
US 50
BG 49
BR 49
ES 49
FI 49
FR 49
HK 49
HR 49
HU 49
ID 49
IN 49
NL 49
PH 49
RU 49
SA 49
SG 49
TH 49
VN 49
DE 48
IE 48
IT 48
KR 48
RS 48
SK 48
TW 48

Remove NAs- very few, but not sure where they come from. Need to look into this. Subset to only big countries.

tree_clean <- tree %>%
  filter(!is.na(cosine)) %>%
  filter(country_code_1 %in% tree_big$country_code_1 & country_code_2 %in%   tree_big$country_code_1)
get_unique_relation_id <- function (x, y){
  pairs = c(x, y)
  ordered = order(pairs)
  paste0(pairs[ordered[1]], pairs[ordered[2]])
}

tree_means <- tree_clean %>%
  group_by(country_code_1, country_code_2) %>%
  summarize(mean_cosine = mean(cosine)) %>%
  rowwise() %>%
  mutate(all_codes = get_unique_relation_id(country_code_1, country_code_2)) %>%
  group_by(all_codes)%>%
  slice(1) %>%
  ungroup()%>%
  mutate(country_name_1 = countrycode(country_code_1, "iso2c","country.name"),
         country_name_2 = countrycode(country_code_2, "iso2c","country.name"),
         country_names = paste0(country_name_1,"_", country_name_2),
         continent_name_1 = as.factor(countrycode(country_code_1, 'iso2c', 'continent')),
         continent_name_2 = as.factor(countrycode(country_code_2, 'iso2c', 'continent')),
         cont_order_1 = unclass(continent_name_1) %>% as.numeric,
         cont_order_2 = unclass(continent_name_2) %>% as.numeric) %>%
  select(-country_code_1, -country_code_2, -continent_name_1, -continent_name_2) %>%
  arrange(-mean_cosine)

Similarity distributions by country

Distribution of mean country-wise similarities: Looks to be correlation between mean and varaince.

tree_means %>%
  ungroup() %>%
  select(mean_cosine, country_name_1, country_name_2) %>%
  gather("country", "country_name", 2:3) %>%
  select(-country) %>%
  ggplot(aes(x = mean_cosine, y = reorder(country_name, mean_cosine))) + 
  geom_joy(scale = 2, aes(color = country_name), alpha = .3, size = 1) +
  theme_joy(font_size = 13, grid = T) + 
  theme(axis.title.y = element_blank(),
        legend.position = "none")

Pairwise similarity:

tree_means_2 = tree_means %>%
  mutate(country_name_3 = country_name_1,
                   country_name_1 = country_name_2, 
                   country_name_2 = country_name_3) %>%
  select(-country_name_3)    

tree_means %>%
  bind_rows(tree_means_2) %>%
    # mutate(country_name_1 = as.factor(country_name_1),
    #       country_name_2 = as.factor(country_name_2),
    #       country_name_1 = fct_reorder(country_name_1, cont_order_1), 
    #       country_name_2 = fct_reorder(country_name_2, cont_order_2)) %>%
  ggplot(aes(x = country_name_1, 
             y = country_name_2)) +
  geom_raster(aes(fill = mean_cosine)) +
  scale_fill_gradientn(colours = rev(terrain.colors(10))) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, 
                                   hjust = 1, vjust =.25, size = 4),
        axis.text.y = element_text(size = 4),
        axis.title = element_blank())

Within country similarity is not necessarily bigger than across country. Note that when within country similarity is low, all across country variability also low.

Dyadic predictive measures

Centroid geographic distance and linguistic distance. Continuous linguistic measures come from here: https://github.com/ddediu/lgfam-newick/blob/master/paper/family-trees-with-brlength.pdf.

dyadic <- read_csv("../../data/supplementary_data/cultural_sim_measures/all_dyadic_vars.csv") 
#dyadic2 <- dyadic %>%
#  rowwise() %>%
#  mutate(all_codes = get_unique_relation_id(country_code_1, country_code_2)) %>%
#  select(-country_code_1, -country_code_2) %>%
#  ungroup()
# write_csv(dyadic2, "../../data/supplementary_data/cultural_sim_measures/all_dyadic_vars2.csv")
dyadic2 <- read_csv("../../data/supplementary_data/cultural_sim_measures/all_dyadic_vars2.csv") %>%
  select(all_codes, everything()) 

is.na(dyadic2) <- do.call(cbind,lapply(dyadic2, is.infinite))
dyadic_clean <- dyadic2 %>%
  mutate(all_codes = as.factor(all_codes)) %>%
  group_by(all_codes) %>%
  summarize(wals_euclidean_dist = mean(wals_euclidean_dist, na.rm = TRUE),
            asjp_dist = mean(asjp_dist, na.rm = TRUE),
            centroid_dist_meters = mean(centroid_dist_meters, na.rm = TRUE))

Summary of dyadic measures:

summary(dyadic_clean)
##    all_codes    wals_euclidean_dist   asjp_dist      centroid_dist_meters
##  AEAE   :   1   Min.   :  0.00      Min.   :0.0000   Min.   :       0    
##  AEAR   :   1   1st Qu.: 82.39      1st Qu.:0.7687   1st Qu.: 2108497    
##  AEAT   :   1   Median :141.73      Median :0.8321   Median : 5328696    
##  AEAU   :   1   Mean   :117.52      Mean   :0.7175   Mean   : 6286402    
##  AEBA   :   1   3rd Qu.:156.59      3rd Qu.:0.8755   3rd Qu.: 9655098    
##  AEBE   :   1   Max.   :216.48      Max.   :0.9458   Max.   :19758598    
##  (Other):2550   NA's   :712         NA's   :835      NA's   :71
dyadic_clean %>%
  gather("measure", "value" , 2:4)  %>%
  ggplot(aes(x = value, fill = measure)) +
  geom_histogram() +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  theme(legend.position = "none")

Correlations:

correlate(dyadic_clean %>% select(-1),
                   use = "complete.obs")  %>%
  shave() %>%
  kable()
rowname wals_euclidean_dist asjp_dist centroid_dist_meters
wals_euclidean_dist NA NA NA
asjp_dist 0.8062047 NA NA
centroid_dist_meters 0.4637033 0.3450888 NA

Dyadic predicting pairwise country variance

Distance

all <- left_join(tree_means, dyadic_clean)

all_2 = all %>%
  mutate(country_name_3 = country_name_1,
                   country_name_1 = country_name_2, 
                   country_name_2 = country_name_3) %>%
  select(-country_name_3) 


all %>%
  bind_rows(all_2)%>%
  filter(country_name_1 != country_name_2) %>%
  ggplot(aes(x = country_name_1,
             y = country_name_2)) +
  geom_raster(aes(fill = centroid_dist_meters)) +
  scale_fill_gradientn(colours = terrain.colors(10)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, 
                                   hjust = 1, vjust =.25, size = 4),
        axis.text.y = element_text(size = 4),
        axis.title = element_blank()) 

all %>%
  filter(country_name_1 != country_name_2) %>%
  ggplot(aes(x = centroid_dist_meters, y = mean_cosine)) +
  geom_label(aes(label = all_codes)) +
  geom_smooth(method = "lm")

ASJP language distance

all %>%
  bind_rows(all_2)%>%
  filter(country_name_1 != country_name_2) %>%
  ggplot(aes(x = country_name_1,
             y = country_name_2)) +
  geom_raster(aes(fill = asjp_dist)) +
  scale_fill_gradientn(colours = terrain.colors(10)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, 
                                   hjust = 1, vjust =.25, size = 4),
        axis.text.y = element_text(size = 4),
        axis.title = element_blank()) 

all %>%
  filter(country_name_1 != country_name_2) %>%
  ggplot(aes(x = asjp_dist, y = mean_cosine)) +
  geom_label(aes(label = all_codes)) +
  geom_smooth(method = "lm")

WALS language distance

all %>%
  bind_rows(all_2)%>%
  filter(country_name_1 != country_name_2) %>%
  ggplot(aes(x = country_name_1,
             y = country_name_2)) +
  geom_raster(aes(fill = wals_euclidean_dist)) +
  scale_fill_gradientn(colours = terrain.colors(10)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, 
                                   hjust = 1, vjust =.25, size = 4),
        axis.text.y = element_text(size = 4),
        axis.title = element_blank()) 

all %>%
  filter(country_name_1 != country_name_2) %>%
  ggplot(aes(x = wals_euclidean_dist, y = mean_cosine)) +
  geom_label(aes(label = all_codes)) +
  geom_smooth(method = "lm")