Predict pairwise similarities

ITEM <- "bread"

Here we’re just looking at one item: bread

get_unique_relation_id <- function (x, y){
  pairs = c(x, y)
  ordered = order(pairs)
  paste0(pairs[ordered[1]], pairs[ordered[2]])
}

ratios <- read_csv(paste0("../../data/keras_similarities/pairwise_country/",ITEM, "_sim_ratios.csv")) %>%
  rowwise() %>%
  mutate(all_codes = get_unique_relation_id(country_code_1, country_code_2))  %>%
  ungroup() %>%
  select(all_codes, everything()) %>%
  mutate(country_name_1 = as.factor(country_name_1),
         country_name_2 = as.factor(country_name_2))

dict <- ratios  %>%
  select(all_codes,country_code_1, country_code_2,country_name_1, country_name_2, cont_order_1, cont_order_2 )

Dyadic predictive measures

Centroid geographic distance and linguistic distance. Continuous linguistic measures come from here: https://github.com/ddediu/lgfam-newick/blob/master/paper/family-trees-with-brlength.pdf.

dyadic <- read_csv("../../data/supplementary_data/cultural_sim_measures/all_dyadic_vars.csv")  %>%
  select(1, 4:8)

Merge together dists and sims

Note there a bunch of different ratios we could use here since it’s not symmetrical. Here I’m taking the average of the two.

all0 <- ratios %>%
  left_join(dyadic_clean, by = "all_codes") %>%
  select(all_codes, everything())

all1 <- ratios %>%
  group_by(all_codes) %>%
  slice(1) %>%
  left_join(dyadic_cle an, by = "all_codes") %>%
  select(all_codes, everything())%>%
  mutate(country_name_1 = as.factor(country_name_1),
         country_name_2 = as.factor(country_name_2))

all2 <- ratios %>%
  group_by(all_codes) %>%
  slice(2) %>%
  left_join(dyadic_clean, by = "all_codes") %>%
  select(all_codes, everything())%>%
  mutate(country_name_1 = as.factor(country_name_1),
         country_name_2 = as.factor(country_name_2))

all <- ratios %>%   
  mutate(all_codes = as.factor(all_codes)) %>%
  group_by(all_codes) %>%
  summarize(mean_ratio = mean(mean_ratio)) %>%
  left_join(dyadic) %>%
  full_join(dict) 

write_csv(all, "../../data/supplementary_data/cultural_sim_measures/temp.csv")

all <- read_csv( "../../data/supplementary_data/cultural_sim_measures/temp.csv") %>% 
  ungroup() %>%
  mutate(all_codes = as.factor(all_codes),
         country_code_1 = as.factor(country_code_1),
         country_code_2 = as.factor(country_code_2)) %>%
  distinct()

```

Most similiar countries overall

N <- 1
c1max <- all %>%
  select(country_name_1, country_name_2, mean_ratio) %>%
  group_by(country_name_1) %>%
  top_n(N, mean_ratio)

c2max <- all %>%
  select(country_name_1, country_name_2, mean_ratio) %>%
  group_by(country_name_2) %>%
  top_n(N, mean_ratio)

both_max = bind_rows(c1max, c2max) %>%
           ungroup() %>%
           mutate(index = 1:n())

max = both_max %>%
        gather("country", "country_name", 1:2) %>%
        select(-country) %>%
        #mutate(country_name = as.factor(country_name)) %>%
        group_by(country_name) %>%
        top_n(N, mean_ratio) %>%
        left_join(both_max %>% select(-mean_ratio), by = "index") %>%
        mutate(country_pair = ifelse(country_name == country_name_1, 
                                     as.character(country_name_2),
                                     as.character(country_name_1))) %>%
        select(country_name, country_pair, mean_ratio) %>%
        ungroup()%>%
        distinct() # remove reciprocals

Draw network

max2 <- max %>%
  rowwise() %>%
  mutate(all_codes = get_unique_relation_id(as.character(country_name),
                                            as.character(country_pair)))  

max3 <- max2 %>%
  select(all_codes, everything()) %>%
  ungroup() %>%
  count(all_codes) %>%
  left_join(max2 %>% select(-mean_ratio), by = "all_codes")

cont_dict <- all %>%
  select(country_name_1, cont_order_1) %>%
  distinct() %>%
  #bind_rows(data.frame(country_name_1 = "South Africa", cont_order_1 = 1)) %>%
  mutate(cont_order_1 = as.factor(cont_order_1),
         cont_order_1 = fct_recode(cont_order_1, 
                                   "AM" = "2",
                                   "AU" = "5",
                                   "EU" = "4",
                                   "AS" = "3",
                                   "AF" = "1")) 
 
graph <- max3 %>%
      select(country_name, country_pair) %>%
      #distinct(country_name, country_pair) %>%
      graph_from_data_frame(directed = FALSE, vertices = cont_dict) %>%
      simplify() 

E(graph)$weight=max3$n # add in weights
  
ggplot(ggnetwork(asNetwork(graph)), 
            aes(x = x, y = y, xend = xend, yend = yend)) +
         geom_edges(aes(size = weight), color = "grey60", 
                    curvature = .15, 
                    show.legend = FALSE) +
         geom_nodes(aes(color = cont_order_1), show.legend = FALSE) +
         geom_nodelabel(aes(label =  vertex.names,
                                  fill = cont_order_1), 
                        show.legend = FALSE, 
                        label.padding = unit(0.2,"lines"), 
                        label.size = 0.4, size = 3,
                        force = .1) +
        ggtitle(paste0("Top ", N, " most similar countries to each country")) +
        theme_blank()

Between measure correlations

corrs <- correlate(all %>% select(2:7), 
                   use = "pairwise.complete.obs")   

rplot(corrs, 
      legend = TRUE,
      colours = c("skyblue1", "white","indianred2")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

gather(corrs, 
       "variable1", "value", -1) %>%
  filter(rowname == "mean_ratio") %>%
  kable()

rowname	variable1	value
mean_ratio	mean_ratio	NA
mean_ratio	log_normalized_n_events_all	0.0318593
mean_ratio	log_normalized_mean_imports_dollars	-0.0262694
mean_ratio	centroid_dist_meters	0.0629343
mean_ratio	wals_euclidean_dist	0.0036824
mean_ratio	asjp_dist	-0.0086418

Scatter plots

all %>%
  filter(country_name_1 != country_name_2) %>%
  ggplot(aes(x = centroid_dist_meters, y = mean_ratio)) +
  geom_point(size = .2) +
  #geom_label(aes(label = all_codes)) +
  geom_smooth(method = "lm") +
  theme_minimal()

Models

lm(mean_ratio ~ centroid_dist_meters , all) %>%
  tidy () %>%
  kable()

lm(mean_ratio ~ asjp_dist, all) %>%
  tidy () %>%
  kable()

lm(mean_ratio ~ wals_euclidean_dist , all) %>%
    tidy () %>%
  kable()

lm(mean_ratio ~ asjp_dist + centroid_dist_meters, all) %>%
  tidy () %>%
  kable()

lm(mean_ratio ~ wals_euclidean_dist + centroid_dist_meters, all) %>%
  tidy () %>%
  kable()