ITEM <- "tree"
Here we’re just looking at one item: tree
get_unique_relation_id <- function (x, y){
pairs = c(x, y)
ordered = order(pairs)
paste0(pairs[ordered[1]], pairs[ordered[2]])
}
ratios <- read_csv(paste0("../../data/keras_similarities/pairwise_country/", ITEM, "_sim_ratios.csv")) %>%
rowwise() %>%
mutate(all_codes = get_unique_relation_id(country_code_1, country_code_2)) %>%
ungroup() %>%
select(all_codes, everything()) %>%
mutate(country_name_1 = as.factor(country_name_1),
country_name_2 = as.factor(country_name_2))
# write_csv(ratios, "../../data/supplementary_data/cultural_sim_measures/temp2.csv")
ratios <- read_csv( "../../data/supplementary_data/cultural_sim_measures/temp2.csv")
dict <- ratios %>%
select(all_codes, country_code_1,
country_code_2, country_name_1,
country_name_2, cont_order_1,
cont_order_2 ) %>%
distinct() %>%
mutate_all(as.factor) %>%
mutate(cont_order_1 = as.factor(cont_order_1),
cont_order_1 = fct_recode(cont_order_1,
"AM" = "2",
"AU" = "5",
"EU" = "4",
"AS" = "3",
"AF" = "1")) %>%
ungroup()
Dyadic predictive measures
Centroid geographic distance and linguistic distance. Continuous linguistic measures come from here: https://github.com/ddediu/lgfam-newick/blob/master/paper/family-trees-with-brlength.pdf.
dyadic <- read_csv("../../data/supplementary_data/cultural_sim_measures/all_dyadic_vars.csv") %>%
select(1, 4:8)
Merge together dists and sims
Note there a bunch of different ratios we could use here since it’s not symmetrical. Here I’m taking the average of the two.
all0 <- ratios %>%
left_join(dyadic_clean, by = "all_codes") %>%
select(all_codes, everything())
all1 <- ratios %>%
group_by(all_codes) %>%
slice(1) %>%
left_join(dyadic_cle an, by = "all_codes") %>%
select(all_codes, everything())%>%
mutate(country_name_1 = as.factor(country_name_1),
country_name_2 = as.factor(country_name_2))
all2 <- ratios %>%
group_by(all_codes) %>%
slice(2) %>%
left_join(dyadic_clean, by = "all_codes") %>%
select(all_codes, everything())%>%
mutate(country_name_1 = as.factor(country_name_1),
country_name_2 = as.factor(country_name_2))
all <- ratios %>%
mutate(all_codes = as.factor(all_codes),
country_code_1 = as.factor(country_code_1),
country_code_2 = as.factor(country_code_2)) %>%
group_by(all_codes) %>%
summarize(mean_ratio = mean(mean_ratio)) %>%
left_join(dyadic) %>%
full_join(dict) %>%
ungroup() %>%
mutate(all_codes = as.factor(all_codes),
country_code_1 = as.factor(country_code_1),
country_code_2 = as.factor(country_code_2),
cont_pair = paste0(as.character(cont_order_1), as.character(cont_order_2))) %>%
distinct()
Most similiar countries overall
N <- 1
c1max <- all %>%
select(country_name_1, country_name_2, mean_ratio) %>%
group_by(country_name_1) %>%
top_n(N, mean_ratio)
c2max <- all %>%
select(country_name_1, country_name_2, mean_ratio) %>%
group_by(country_name_2) %>%
top_n(N, mean_ratio)
both_max = bind_rows(c1max, c2max) %>%
ungroup() %>%
mutate(index = 1:n())
max = both_max %>%
gather("country", "country_name", 1:2) %>%
select(-country) %>%
#mutate(country_name = as.factor(country_name)) %>%
group_by(country_name) %>%
top_n(N, mean_ratio) %>%
left_join(both_max %>% select(-mean_ratio), by = "index") %>%
mutate(country_pair = ifelse(country_name == country_name_1,
as.character(country_name_2),
as.character(country_name_1))) %>%
select(country_name, country_pair, mean_ratio) %>%
ungroup()%>%
distinct() # remove reciprocals
Draw network
max2 <- max %>%
rowwise() %>%
mutate(all_codes = get_unique_relation_id(as.character(country_name),
as.character(country_pair)))
max3 <- max2 %>%
select(all_codes, everything()) %>%
ungroup() %>%
count(all_codes) %>%
left_join(max2 %>% select(-mean_ratio), by = "all_codes")
cont_dict <- dict %>%
ungroup() %>%
select(country_name_2, cont_order_2) %>%
distinct() %>%
filter(!is.na(cont_order_2)) %>%
rename(continent = cont_order_2)
graph <- max3 %>%
select(country_name, country_pair) %>%
mutate_all(as.factor) %>%
graph_from_data_frame(directed = FALSE, vertices = cont_dict) %>%
simplify()
E(graph)$weight=max3$n # add in weights
ggplot(ggnetwork(asNetwork(graph)),
aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(aes(size = weight), color = "grey60",
curvature = .15,
show.legend = FALSE) +
geom_nodes(aes(color = continent), show.legend = FALSE) +
geom_nodelabel(aes(label = vertex.names,
fill = continent),
show.legend = FALSE,
label.padding = unit(0.2,"lines"),
label.size = 0.4, size = 3,
force = .1) +
ggtitle(paste0("Top ", N, " most similar countries to each country")) +
theme_blank()

Between measure correlations
corrs <- correlate(all %>% select(2:7),
use = "pairwise.complete.obs")
rplot(corrs,
legend = TRUE,
colours = c("skyblue1", "white","indianred2")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

gather(corrs,
"variable1", "value", -1) %>%
filter(rowname == "mean_ratio") %>%
kable()
| mean_ratio |
mean_ratio |
NA |
| mean_ratio |
log_normalized_n_events_all |
0.0595072 |
| mean_ratio |
log_normalized_mean_imports_dollars |
0.0073009 |
| mean_ratio |
centroid_dist_meters |
-0.1492344 |
| mean_ratio |
wals_euclidean_dist |
-0.0128667 |
| mean_ratio |
asjp_dist |
-0.1116393 |
Scatter plots
all %>%
filter(country_name_1 != country_name_2) %>%
ggplot(aes(x = centroid_dist_meters, y = mean_ratio)) +
geom_point(size = .2) +
#geom_label(aes(label = all_codes)) +
geom_smooth(method = "lm") +
theme_minimal()

all %>%
filter(country_name_1 != country_name_2) %>%
ggplot(aes(x = asjp_dist, y = mean_ratio)) +
geom_point(size = .2) +
#geom_label(aes(label = all_codes)) +
geom_smooth(method = "lm") +
theme_minimal()

Models
lm(mean_ratio ~ centroid_dist_meters , all) %>%
tidy () %>%
kable()
| (Intercept) |
0.9999964 |
3e-07 |
3307943.35502 |
0 |
| centroid_dist_meters |
0.0000000 |
0e+00 |
-10.48681 |
0 |
lm(mean_ratio ~ asjp_dist, all) %>%
tidy () %>%
kable()
| (Intercept) |
0.9999977 |
7e-07 |
1.377711e+06 |
0 |
| asjp_dist |
-0.0000060 |
9e-07 |
-6.453534e+00 |
0 |
lm(mean_ratio ~ wals_euclidean_dist , all) %>%
tidy () %>%
kable()
| (Intercept) |
0.9999946 |
5e-07 |
2.082613e+06 |
0.0000000 |
| wals_euclidean_dist |
0.0000000 |
0e+00 |
-7.662508e-01 |
0.4435781 |
lm(mean_ratio ~ asjp_dist + centroid_dist_meters, all) %>%
tidy () %>%
kable()
| (Intercept) |
1.0e+00 |
8e-07 |
1.305412e+06 |
0 |
| asjp_dist |
-5.9e-06 |
1e-06 |
-6.183378e+00 |
0 |
| centroid_dist_meters |
0.0e+00 |
0e+00 |
-7.779464e+00 |
0 |
lm(mean_ratio ~ log_normalized_n_events_all + centroid_dist_meters, all) %>%
tidy () %>%
kable()
| (Intercept) |
0.9999961 |
1.9e-06 |
5.144605e+05 |
0.0000000 |
| log_normalized_n_events_all |
0.0000000 |
1.0e-07 |
-9.522800e-03 |
0.9924025 |
| centroid_dist_meters |
0.0000000 |
0.0e+00 |
-8.739032e+00 |
0.0000000 |
lm(mean_ratio ~ log_normalized_mean_imports_dollars + centroid_dist_meters, all) %>%
tidy () %>%
kable()
| (Intercept) |
0.9999877 |
2.5e-06 |
407932.388259 |
0.0000000 |
| log_normalized_mean_imports_dollars |
-0.0000004 |
1.0e-07 |
-3.480903 |
0.0005045 |
| centroid_dist_meters |
0.0000000 |
0.0e+00 |
-10.094815 |
0.0000000 |