ITEM <- "bread"
Here we’re just looking at one item: bread
get_unique_relation_id <- function (x, y){
pairs = c(x, y)
ordered = order(pairs)
paste0(pairs[ordered[1]], pairs[ordered[2]])
}
ratios <- read_csv(paste0("../../data/keras_similarities/pairwise_country/",ITEM, "_sim_ratios.csv")) %>%
rowwise() %>%
mutate(all_codes = get_unique_relation_id(country_code_1, country_code_2)) %>%
ungroup() %>%
select(all_codes, everything()) %>%
mutate(country_name_1 = as.factor(country_name_1),
country_name_2 = as.factor(country_name_2))
dict <- ratios %>%
select(all_codes,country_code_1, country_code_2,country_name_1, country_name_2, cont_order_1, cont_order_2 )
Centroid geographic distance and linguistic distance. Continuous linguistic measures come from here: https://github.com/ddediu/lgfam-newick/blob/master/paper/family-trees-with-brlength.pdf.
dyadic <- read_csv("../../data/supplementary_data/cultural_sim_measures/all_dyadic_vars.csv") %>%
select(1, 4:8)
Merge together dists and sims
Note there a bunch of different ratios we could use here since it’s not symmetrical. Here I’m taking the average of the two.
all0 <- ratios %>%
left_join(dyadic_clean, by = "all_codes") %>%
select(all_codes, everything())
all1 <- ratios %>%
group_by(all_codes) %>%
slice(1) %>%
left_join(dyadic_cle an, by = "all_codes") %>%
select(all_codes, everything())%>%
mutate(country_name_1 = as.factor(country_name_1),
country_name_2 = as.factor(country_name_2))
all2 <- ratios %>%
group_by(all_codes) %>%
slice(2) %>%
left_join(dyadic_clean, by = "all_codes") %>%
select(all_codes, everything())%>%
mutate(country_name_1 = as.factor(country_name_1),
country_name_2 = as.factor(country_name_2))
all <- ratios %>%
mutate(all_codes = as.factor(all_codes)) %>%
group_by(all_codes) %>%
summarize(mean_ratio = mean(mean_ratio)) %>%
left_join(dyadic) %>%
full_join(dict)
write_csv(all, "../../data/supplementary_data/cultural_sim_measures/temp.csv")
all <- read_csv( "../../data/supplementary_data/cultural_sim_measures/temp.csv") %>%
ungroup() %>%
mutate(all_codes = as.factor(all_codes),
country_code_1 = as.factor(country_code_1),
country_code_2 = as.factor(country_code_2)) %>%
distinct()
```
N <- 1
c1max <- all %>%
select(country_name_1, country_name_2, mean_ratio) %>%
group_by(country_name_1) %>%
top_n(N, mean_ratio)
c2max <- all %>%
select(country_name_1, country_name_2, mean_ratio) %>%
group_by(country_name_2) %>%
top_n(N, mean_ratio)
both_max = bind_rows(c1max, c2max) %>%
ungroup() %>%
mutate(index = 1:n())
max = both_max %>%
gather("country", "country_name", 1:2) %>%
select(-country) %>%
#mutate(country_name = as.factor(country_name)) %>%
group_by(country_name) %>%
top_n(N, mean_ratio) %>%
left_join(both_max %>% select(-mean_ratio), by = "index") %>%
mutate(country_pair = ifelse(country_name == country_name_1,
as.character(country_name_2),
as.character(country_name_1))) %>%
select(country_name, country_pair, mean_ratio) %>%
ungroup()%>%
distinct() # remove reciprocals
Draw network
max2 <- max %>%
rowwise() %>%
mutate(all_codes = get_unique_relation_id(as.character(country_name),
as.character(country_pair)))
max3 <- max2 %>%
select(all_codes, everything()) %>%
ungroup() %>%
count(all_codes) %>%
left_join(max2 %>% select(-mean_ratio), by = "all_codes")
cont_dict <- all %>%
select(country_name_1, cont_order_1) %>%
distinct() %>%
#bind_rows(data.frame(country_name_1 = "South Africa", cont_order_1 = 1)) %>%
mutate(cont_order_1 = as.factor(cont_order_1),
cont_order_1 = fct_recode(cont_order_1,
"AM" = "2",
"AU" = "5",
"EU" = "4",
"AS" = "3",
"AF" = "1"))
graph <- max3 %>%
select(country_name, country_pair) %>%
#distinct(country_name, country_pair) %>%
graph_from_data_frame(directed = FALSE, vertices = cont_dict) %>%
simplify()
E(graph)$weight=max3$n # add in weights
ggplot(ggnetwork(asNetwork(graph)),
aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(aes(size = weight), color = "grey60",
curvature = .15,
show.legend = FALSE) +
geom_nodes(aes(color = cont_order_1), show.legend = FALSE) +
geom_nodelabel(aes(label = vertex.names,
fill = cont_order_1),
show.legend = FALSE,
label.padding = unit(0.2,"lines"),
label.size = 0.4, size = 3,
force = .1) +
ggtitle(paste0("Top ", N, " most similar countries to each country")) +
theme_blank()
corrs <- correlate(all %>% select(2:7),
use = "pairwise.complete.obs")
rplot(corrs,
legend = TRUE,
colours = c("skyblue1", "white","indianred2")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
gather(corrs,
"variable1", "value", -1) %>%
filter(rowname == "mean_ratio") %>%
kable()
| rowname | variable1 | value |
|---|---|---|
| mean_ratio | mean_ratio | NA |
| mean_ratio | log_normalized_n_events_all | 0.0318593 |
| mean_ratio | log_normalized_mean_imports_dollars | -0.0262694 |
| mean_ratio | centroid_dist_meters | 0.0629343 |
| mean_ratio | wals_euclidean_dist | 0.0036824 |
| mean_ratio | asjp_dist | -0.0086418 |
all %>%
filter(country_name_1 != country_name_2) %>%
ggplot(aes(x = centroid_dist_meters, y = mean_ratio)) +
geom_point(size = .2) +
#geom_label(aes(label = all_codes)) +
geom_smooth(method = "lm") +
theme_minimal()
lm(mean_ratio ~ centroid_dist_meters , all) %>%
tidy () %>%
kable()
lm(mean_ratio ~ asjp_dist, all) %>%
tidy () %>%
kable()
lm(mean_ratio ~ wals_euclidean_dist , all) %>%
tidy () %>%
kable()
lm(mean_ratio ~ asjp_dist + centroid_dist_meters, all) %>%
tidy () %>%
kable()
lm(mean_ratio ~ wals_euclidean_dist + centroid_dist_meters, all) %>%
tidy () %>%
kable()