Predict pairwise similarities

ITEM <- "tree"

Here we’re just looking at one item: tree

get_unique_relation_id <- function (x, y){
  pairs = c(x, y)
  ordered = order(pairs)
  paste0(pairs[ordered[1]], pairs[ordered[2]])
}

ratios <- read_csv(paste0("../../data/keras_similarities/pairwise_country/", ITEM, "_sim_ratios.csv")) %>%
  rowwise() %>%
  mutate(all_codes = get_unique_relation_id(country_code_1, country_code_2))  %>%
  ungroup() %>%
  select(all_codes, everything()) %>%
  mutate(country_name_1 = as.factor(country_name_1),
         country_name_2 = as.factor(country_name_2))

# write_csv(ratios, "../../data/supplementary_data/cultural_sim_measures/temp2.csv")

ratios <- read_csv( "../../data/supplementary_data/cultural_sim_measures/temp2.csv") 

dict <- ratios  %>%
  select(all_codes, country_code_1, 
         country_code_2, country_name_1,
         country_name_2, cont_order_1, 
         cont_order_2 ) %>%
  distinct() %>%
  mutate_all(as.factor) %>%
  mutate(cont_order_1 = as.factor(cont_order_1),
         cont_order_1 = fct_recode(cont_order_1, 
                                   "AM" = "2",
                                   "AU" = "5",
                                   "EU" = "4",
                                   "AS" = "3",
                                   "AF" = "1")) %>%
  ungroup()

Dyadic predictive measures

Centroid geographic distance and linguistic distance. Continuous linguistic measures come from here: https://github.com/ddediu/lgfam-newick/blob/master/paper/family-trees-with-brlength.pdf.

dyadic <- read_csv("../../data/supplementary_data/cultural_sim_measures/all_dyadic_vars.csv")  %>%
  select(1, 4:8)

Merge together dists and sims

Note there a bunch of different ratios we could use here since it’s not symmetrical. Here I’m taking the average of the two.

all0 <- ratios %>%
  left_join(dyadic_clean, by = "all_codes") %>%
  select(all_codes, everything())

all1 <- ratios %>%
  group_by(all_codes) %>%
  slice(1) %>%
  left_join(dyadic_cle an, by = "all_codes") %>%
  select(all_codes, everything())%>%
  mutate(country_name_1 = as.factor(country_name_1),
         country_name_2 = as.factor(country_name_2))

all2 <- ratios %>%
  group_by(all_codes) %>%
  slice(2) %>%
  left_join(dyadic_clean, by = "all_codes") %>%
  select(all_codes, everything())%>%
  mutate(country_name_1 = as.factor(country_name_1),
         country_name_2 = as.factor(country_name_2))

all <- ratios %>%   
  mutate(all_codes = as.factor(all_codes),
         country_code_1 = as.factor(country_code_1),
         country_code_2 = as.factor(country_code_2)) %>%
  group_by(all_codes) %>%
  summarize(mean_ratio = mean(mean_ratio)) %>%
  left_join(dyadic) %>%
  full_join(dict) %>%
 ungroup() %>%
    mutate(all_codes = as.factor(all_codes),
         country_code_1 = as.factor(country_code_1),
         country_code_2 = as.factor(country_code_2),
         cont_pair = paste0(as.character(cont_order_1), as.character(cont_order_2))) %>%
  distinct()

Most similiar countries overall

N <- 1
c1max <- all %>%
  select(country_name_1, country_name_2, mean_ratio) %>%
  group_by(country_name_1) %>%
  top_n(N, mean_ratio)

c2max <- all %>%
  select(country_name_1, country_name_2, mean_ratio) %>%
  group_by(country_name_2) %>%
  top_n(N, mean_ratio)

both_max = bind_rows(c1max, c2max) %>%
           ungroup() %>%
           mutate(index = 1:n())

max = both_max %>%
        gather("country", "country_name", 1:2) %>%
        select(-country) %>%
        #mutate(country_name = as.factor(country_name)) %>%
        group_by(country_name) %>%
        top_n(N, mean_ratio) %>%
        left_join(both_max %>% select(-mean_ratio), by = "index") %>%
        mutate(country_pair = ifelse(country_name == country_name_1, 
                                     as.character(country_name_2),
                                     as.character(country_name_1))) %>%
        select(country_name, country_pair, mean_ratio) %>%
        ungroup()%>%
        distinct() # remove reciprocals

Draw network

max2 <- max %>%
  rowwise() %>%
  mutate(all_codes = get_unique_relation_id(as.character(country_name),
                                            as.character(country_pair)))  

max3 <- max2 %>%
  select(all_codes, everything()) %>%
  ungroup() %>%
  count(all_codes) %>%
  left_join(max2 %>% select(-mean_ratio), by = "all_codes")

cont_dict <- dict %>%
  ungroup() %>%
  select(country_name_2, cont_order_2) %>%
  distinct() %>%
  filter(!is.na(cont_order_2)) %>%
  rename(continent = cont_order_2)


graph <- max3 %>%
      select(country_name, country_pair) %>%
      mutate_all(as.factor) %>%
      graph_from_data_frame(directed = FALSE, vertices = cont_dict) %>%
      simplify() 

E(graph)$weight=max3$n # add in weights
  
ggplot(ggnetwork(asNetwork(graph)), 
            aes(x = x, y = y, xend = xend, yend = yend)) +
         geom_edges(aes(size = weight), color = "grey60", 
                    curvature = .15, 
                    show.legend = FALSE) +
         geom_nodes(aes(color = continent), show.legend = FALSE) +
         geom_nodelabel(aes(label =  vertex.names,
                                  fill = continent), 
                        show.legend = FALSE, 
                        label.padding = unit(0.2,"lines"), 
                        label.size = 0.4, size = 3,
                        force = .1) +
        ggtitle(paste0("Top ", N, " most similar countries to each country")) +
        theme_blank()

Between measure correlations

corrs <- correlate(all %>% select(2:7), 
                   use = "pairwise.complete.obs")  

rplot(corrs, 
      legend = TRUE,
      colours = c("skyblue1", "white","indianred2")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

gather(corrs, 
       "variable1", "value", -1) %>%
  filter(rowname == "mean_ratio") %>%
  kable()

rowname	variable1	value
mean_ratio	mean_ratio	NA
mean_ratio	log_normalized_n_events_all	0.0595072
mean_ratio	log_normalized_mean_imports_dollars	0.0073009
mean_ratio	centroid_dist_meters	-0.1492344
mean_ratio	wals_euclidean_dist	-0.0128667
mean_ratio	asjp_dist	-0.1116393

Scatter plots

all %>%
  filter(country_name_1 != country_name_2) %>%
  ggplot(aes(x = centroid_dist_meters, y = mean_ratio)) +
  geom_point(size = .2) +
  #geom_label(aes(label = all_codes)) +
  geom_smooth(method = "lm") +
  theme_minimal()

all %>%
  filter(country_name_1 != country_name_2) %>%
  ggplot(aes(x = asjp_dist, y = mean_ratio)) +
  geom_point(size = .2) +
  #geom_label(aes(label = all_codes)) +
  geom_smooth(method = "lm") +
  theme_minimal()

Models

lm(mean_ratio ~ centroid_dist_meters , all) %>%
  tidy () %>%
  kable()

term	estimate	std.error	statistic	p.value
(Intercept)	0.9999964	3e-07	3307943.35502	0
centroid_dist_meters	0.0000000	0e+00	-10.48681	0

lm(mean_ratio ~ asjp_dist, all) %>%
  tidy () %>%
  kable()

term	estimate	std.error	statistic	p.value
(Intercept)	0.9999977	7e-07	1.377711e+06	0
asjp_dist	-0.0000060	9e-07	-6.453534e+00	0

lm(mean_ratio ~ wals_euclidean_dist , all) %>%
    tidy () %>%
  kable()

term	estimate	std.error	statistic	p.value
(Intercept)	0.9999946	5e-07	2.082613e+06	0.0000000
wals_euclidean_dist	0.0000000	0e+00	-7.662508e-01	0.4435781

lm(mean_ratio ~ asjp_dist + centroid_dist_meters, all) %>%
  tidy () %>%
  kable()

term	estimate	std.error	statistic
(Intercept)	1.0e+00	8e-07	1.305412e+06
asjp_dist	-5.9e-06	1e-06	-6.183378e+00
centroid_dist_meters	0.0e+00	0e+00	-7.779464e+00

lm(mean_ratio ~ log_normalized_n_events_all + centroid_dist_meters, all) %>%
  tidy () %>%
  kable()

term	estimate	std.error	statistic	p.value
(Intercept)	0.9999961	1.9e-06	5.144605e+05	0.0000000
log_normalized_n_events_all	0.0000000	1.0e-07	-9.522800e-03	0.9924025
centroid_dist_meters	0.0000000	0.0e+00	-8.739032e+00	0.0000000

lm(mean_ratio ~ log_normalized_mean_imports_dollars + centroid_dist_meters, all) %>%
  tidy () %>%
  kable()

term	estimate	std.error	statistic	p.value
(Intercept)	0.9999877	2.5e-06	407932.388259	0.0000000
log_normalized_mean_imports_dollars	-0.0000004	1.0e-07	-3.480903	0.0005045
centroid_dist_meters	0.0000000	0.0e+00	-10.094815	0.0000000