Here we’re just looking at one item: tree
Read in drawing pairwise-sims file.
tree <- read_csv("../../data/keras_similarities/pairwise_country/tree_pairwise_sims.csv") %>%
mutate(key_id_1 = as.character(key_id_1),
key_id_2 = as.character(key_id_2))
Number of unique participants per each country (sampled 50). We take the 34 biggest countries. (Note that for even smaller countries, some duplicates)
tree_big = tree %>%
group_by(country_code_1, key_id_1)%>%
slice(1) %>%
ungroup()%>%
count(country_code_1)%>%
arrange(-n)%>%
as.data.frame() %>%
slice(1:34)
kable(tree_big)
| country_code_1 | n |
|---|---|
| AE | 50 |
| AU | 50 |
| CA | 50 |
| CZ | 50 |
| GB | 50 |
| MY | 50 |
| PL | 50 |
| SE | 50 |
| TR | 50 |
| US | 50 |
| BG | 49 |
| BR | 49 |
| ES | 49 |
| FI | 49 |
| FR | 49 |
| HK | 49 |
| HR | 49 |
| HU | 49 |
| ID | 49 |
| IN | 49 |
| NL | 49 |
| PH | 49 |
| RU | 49 |
| SA | 49 |
| SG | 49 |
| TH | 49 |
| VN | 49 |
| DE | 48 |
| IE | 48 |
| IT | 48 |
| KR | 48 |
| RS | 48 |
| SK | 48 |
| TW | 48 |
Remove NAs- very few, but not sure where they come from. Need to look into this. Subset to only big countries.
tree_clean <- tree %>%
filter(!is.na(cosine)) %>%
filter(country_code_1 %in% tree_big$country_code_1 & country_code_2 %in% tree_big$country_code_1)
get_unique_relation_id <- function (x, y){
pairs = c(x, y)
ordered = order(pairs)
paste0(pairs[ordered[1]], pairs[ordered[2]])
}
tree_means <- tree_clean %>%
group_by(country_code_1, country_code_2) %>%
summarize(mean_cosine = mean(cosine)) %>%
rowwise() %>%
mutate(all_codes = get_unique_relation_id(country_code_1, country_code_2)) %>%
group_by(all_codes)%>%
slice(1) %>%
ungroup()%>%
mutate(country_name_1 = countrycode(country_code_1, "iso2c","country.name"),
country_name_2 = countrycode(country_code_2, "iso2c","country.name"),
country_names = paste0(country_name_1,"_", country_name_2),
continent_name_1 = as.factor(countrycode(country_code_1, 'iso2c', 'continent')),
continent_name_2 = as.factor(countrycode(country_code_2, 'iso2c', 'continent')),
cont_order_1 = unclass(continent_name_1) %>% as.numeric,
cont_order_2 = unclass(continent_name_2) %>% as.numeric) %>%
select(-country_code_1, -country_code_2, -continent_name_1, -continent_name_2) %>%
arrange(-mean_cosine)
Distribution of mean country-wise similarities: Looks to be correlation between mean and varaince.
tree_means %>%
ungroup() %>%
select(mean_cosine, country_name_1, country_name_2) %>%
gather("country", "country_name", 2:3) %>%
select(-country) %>%
ggplot(aes(x = mean_cosine, y = reorder(country_name, mean_cosine))) +
geom_joy(scale = 2, aes(color = country_name), alpha = .3, size = 1) +
theme_joy(font_size = 13, grid = T) +
theme(axis.title.y = element_blank(),
legend.position = "none")
Pairwise similarity:
tree_means_2 = tree_means %>%
mutate(country_name_3 = country_name_1,
country_name_1 = country_name_2,
country_name_2 = country_name_3) %>%
select(-country_name_3)
tree_means %>%
bind_rows(tree_means_2) %>%
# mutate(country_name_1 = as.factor(country_name_1),
# country_name_2 = as.factor(country_name_2),
# country_name_1 = fct_reorder(country_name_1, cont_order_1),
# country_name_2 = fct_reorder(country_name_2, cont_order_2)) %>%
ggplot(aes(x = country_name_1,
y = country_name_2)) +
geom_raster(aes(fill = mean_cosine)) +
scale_fill_gradientn(colours = rev(terrain.colors(10))) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1, vjust =.25, size = 4),
axis.text.y = element_text(size = 4),
axis.title = element_blank())
Within country similarity is not necessarily bigger than across country. Note that when within country similarity is low, all across country variability also low.
Centroid geographic distance and linguistic distance. Continuous linguistic measures come from here: https://github.com/ddediu/lgfam-newick/blob/master/paper/family-trees-with-brlength.pdf.
dyadic <- read_csv("../../data/supplementary_data/cultural_sim_measures/all_dyadic_vars.csv")
#dyadic2 <- dyadic %>%
# rowwise() %>%
# mutate(all_codes = get_unique_relation_id(country_code_1, country_code_2)) %>%
# select(-country_code_1, -country_code_2) %>%
# ungroup()
# write_csv(dyadic2, "../../data/supplementary_data/cultural_sim_measures/all_dyadic_vars2.csv")
dyadic2 <- read_csv("../../data/supplementary_data/cultural_sim_measures/all_dyadic_vars2.csv") %>%
select(all_codes, everything())
is.na(dyadic2) <- do.call(cbind,lapply(dyadic2, is.infinite))
dyadic_clean <- dyadic2 %>%
mutate(all_codes = as.factor(all_codes)) %>%
group_by(all_codes) %>%
summarize(wals_euclidean_dist = mean(wals_euclidean_dist, na.rm = TRUE),
asjp_dist = mean(asjp_dist, na.rm = TRUE),
centroid_dist_meters = mean(centroid_dist_meters, na.rm = TRUE))
Summary of dyadic measures:
summary(dyadic_clean)
## all_codes wals_euclidean_dist asjp_dist centroid_dist_meters
## AEAE : 1 Min. : 0.00 Min. :0.0000 Min. : 0
## AEAR : 1 1st Qu.: 82.39 1st Qu.:0.7687 1st Qu.: 2108497
## AEAT : 1 Median :141.73 Median :0.8321 Median : 5328696
## AEAU : 1 Mean :117.52 Mean :0.7175 Mean : 6286402
## AEBA : 1 3rd Qu.:156.59 3rd Qu.:0.8755 3rd Qu.: 9655098
## AEBE : 1 Max. :216.48 Max. :0.9458 Max. :19758598
## (Other):2550 NA's :712 NA's :835 NA's :71
dyadic_clean %>%
gather("measure", "value" , 2:4) %>%
ggplot(aes(x = value, fill = measure)) +
geom_histogram() +
facet_wrap(~measure, scales = "free") +
theme_bw() +
theme(legend.position = "none")
Correlations:
correlate(dyadic_clean %>% select(-1),
use = "complete.obs") %>%
shave() %>%
kable()
| rowname | wals_euclidean_dist | asjp_dist | centroid_dist_meters |
|---|---|---|---|
| wals_euclidean_dist | NA | NA | NA |
| asjp_dist | 0.8062047 | NA | NA |
| centroid_dist_meters | 0.4637033 | 0.3450888 | NA |
all <- left_join(tree_means, dyadic_clean)
all_2 = all %>%
mutate(country_name_3 = country_name_1,
country_name_1 = country_name_2,
country_name_2 = country_name_3) %>%
select(-country_name_3)
all %>%
bind_rows(all_2)%>%
filter(country_name_1 != country_name_2) %>%
ggplot(aes(x = country_name_1,
y = country_name_2)) +
geom_raster(aes(fill = centroid_dist_meters)) +
scale_fill_gradientn(colours = terrain.colors(10)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1, vjust =.25, size = 4),
axis.text.y = element_text(size = 4),
axis.title = element_blank())
all %>%
filter(country_name_1 != country_name_2) %>%
ggplot(aes(x = centroid_dist_meters, y = mean_cosine)) +
geom_label(aes(label = all_codes)) +
geom_smooth(method = "lm")
all %>%
bind_rows(all_2)%>%
filter(country_name_1 != country_name_2) %>%
ggplot(aes(x = country_name_1,
y = country_name_2)) +
geom_raster(aes(fill = asjp_dist)) +
scale_fill_gradientn(colours = terrain.colors(10)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1, vjust =.25, size = 4),
axis.text.y = element_text(size = 4),
axis.title = element_blank())
all %>%
filter(country_name_1 != country_name_2) %>%
ggplot(aes(x = asjp_dist, y = mean_cosine)) +
geom_label(aes(label = all_codes)) +
geom_smooth(method = "lm")
all %>%
bind_rows(all_2)%>%
filter(country_name_1 != country_name_2) %>%
ggplot(aes(x = country_name_1,
y = country_name_2)) +
geom_raster(aes(fill = wals_euclidean_dist)) +
scale_fill_gradientn(colours = terrain.colors(10)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1, vjust =.25, size = 4),
axis.text.y = element_text(size = 4),
axis.title = element_blank())
all %>%
filter(country_name_1 != country_name_2) %>%
ggplot(aes(x = wals_euclidean_dist, y = mean_cosine)) +
geom_label(aes(label = all_codes)) +
geom_smooth(method = "lm")