Read in files

# Load the geographic, linguistic, and event-based dyadic measure tables.
# NOTE(review): read_csv() parses the string "NA" as missing by default; if
# any file holds the ISO-2 code "NA" (Namibia) it would be lost -- confirm.
geo <- select(
  read_csv("../../data/supplementary_data/cultural_sim_measures/geo/all_google_geo_dists.csv"),
  -capital_dist_meters
)
lang <- read_csv(
  "../../data/supplementary_data/cultural_sim_measures/lang/all_google_lang_dists.csv"
)
events <- read_csv(
  "../../data/supplementary_data/cultural_sim_measures/events/all_google_event_dists.csv"
)

These are our variables: “centroid_dist_meters”,
“wals_euclidean_dist”/“asjp_dist”,
“log_normalized_n_events_all”, and “log_normalized_mean_imports_dollars”.

# Build an order-independent identifier for a pair of codes.
#
# The two codes are concatenated with the one that sorts first in front, so
# (x, y) and (y, x) map to the same id. A missing code sorts last, matching
# the behaviour of order().
#
# Args:
#   x, y: codes to combine; scalars or equal-length vectors.
# Returns:
#   A character vector of ids, one per element of x/y.
get_unique_relation_id <- function(x, y) {
  # y goes first only when it sorts strictly before x (ties keep x first),
  # or when x is missing and y is not.
  y_first <- (is.na(x) & !is.na(y)) | (!is.na(x) & !is.na(y) & y < x)
  paste0(ifelse(y_first, y, x), ifelse(y_first, x, y))
}

# Full-join the three measure tables on their shared columns (events first,
# then geo, then lang), then add an order-independent pair id as the first
# column.
all_dyadic2 <- Reduce(full_join, list(events, geo, lang)) %>%
  rowwise() %>%
  mutate(
    all_codes = get_unique_relation_id(country_code_1, country_code_2)
  ) %>%
  ungroup() %>%
  select(all_codes, everything())

# Workaround for an unspecified bug: the joined table is re-read from a
# temporary CSV (the write that produces it is left commented out below), so
# the copy read here replaces the `all_dyadic2` built in memory.
# NOTE(review): the temp file can go stale if the inputs change -- confirm.
#write_csv(all_dyadic2, "../../data/supplementary_data/cultural_sim_measures/events/all_dyadic2_temp.csv")

all_dyadic2 <- read_csv(
  "../../data/supplementary_data/cultural_sim_measures/events/all_dyadic2_temp.csv"
)
all_dyadic2 <- mutate(all_dyadic2, all_codes = as.factor(all_codes))

# Recode every infinite value as missing: build a cell-wise logical mask with
# one column per variable and assign NA through it.
all_dyadic2[do.call(cbind, lapply(all_dyadic2, is.infinite))] <- NA

# Keep only the first row for each `all_codes` value.
dyadic_clean <- all_dyadic2 %>%
  group_by(all_codes) %>%
  slice(1) %>%
  ungroup()

Histograms

Summary of dyadic measures:

# Per-column summary of the de-duplicated table (one row per `all_codes`).
summary(dyadic_clean)
##    all_codes    country_code_2     country_code_1    
##  AEAE   :   1   Length:2556        Length:2556       
##  AEAR   :   1   Class :character   Class :character  
##  AEAT   :   1   Mode  :character   Mode  :character  
##  AEAU   :   1                                        
##  AEBA   :   1                                        
##  AEBE   :   1                                        
##  (Other):2550                                        
##  log_normalized_n_events_all log_normalized_mean_imports_dollars
##  Min.   :-20.258             Min.   :-31.90                     
##  1st Qu.:-16.251             1st Qu.:-24.07                     
##  Median :-15.175             Median :-22.90                     
##  Mean   :-15.126             Mean   :-22.98                     
##  3rd Qu.:-13.986             3rd Qu.:-21.78                     
##  Max.   : -9.761             Max.   :-17.43                     
##  NA's   :690                 NA's   :315                        
##  centroid_dist_meters wals_euclidean_dist   asjp_dist     
##  Min.   :       0     Min.   :  0.00      Min.   :0.0000  
##  1st Qu.: 2108497     1st Qu.: 82.39      1st Qu.:0.7687  
##  Median : 5328696     Median :141.73      Median :0.8321  
##  Mean   : 6286402     Mean   :117.52      Mean   :0.7175  
##  3rd Qu.: 9655098     3rd Qu.:156.59      3rd Qu.:0.8755  
##  Max.   :19758598     Max.   :216.48      Max.   :0.9458  
##  NA's   :71           NA's   :712         NA's   :835
# Histogram of each dyadic measure, one free-scaled facet per measure.
# The five measure columns are selected by name instead of by position (4:8),
# so the plot does not silently change if the column order does.
dyadic_clean %>%
  gather(
    "measure", "value",
    log_normalized_n_events_all,
    log_normalized_mean_imports_dollars,
    centroid_dist_meters,
    wals_euclidean_dist,
    asjp_dist
  ) %>%
  ggplot(aes(x = value, fill = measure)) +
  # 30 is ggplot2's default bin count; stating it silences the default-bins
  # message without changing the plot.
  geom_histogram(bins = 30) +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  theme(legend.position = "none")

Heatmap by continent

# Get continents: attach country names to every row and order the name
# factors by continent, which sets the axis order of the heatmaps below.
# NOTE(review): countrycode() accepts vectors, so rowwise() is probably
# unnecessary; it is kept because dropping it may change the factor level
# order (and so the axis order within a continent) -- confirm.
dist_by_continent <- all_dyadic2 %>%
  rowwise() %>%
  mutate(
    country_name_1 = as.factor(countrycode(country_code_1, "iso2c", "country.name")),
    country_name_2 = as.factor(countrycode(country_code_2, "iso2c", "country.name")),
    continent_name_1 = countrycode(country_code_1, "iso2c", "continent"),
    continent_name_2 = countrycode(country_code_2, "iso2c", "continent")
  ) %>%
  ungroup() %>%
  # Numeric rank of each continent, used only as the sort key below.
  mutate(
    cont_order_1 = as.numeric(as.factor(continent_name_1)),
    cont_order_2 = as.numeric(as.factor(continent_name_2))
  ) %>%
  select(-c(continent_name_1, continent_name_2)) %>%
  mutate(
    country_name_1 = fct_reorder(country_name_1, cont_order_1),
    country_name_2 = fct_reorder(country_name_2, cont_order_2)
  )

centroid_dist_meters

# Country-by-country heatmap of centroid distance; axes use the
# continent-ordered country name factors.
ggplot(dist_by_continent, aes(x = country_name_1, y = country_name_2)) +
  geom_raster(aes(fill = centroid_dist_meters)) +
  scale_fill_continuous(low = "#ffffcc", high = "#800026") +
  ggtitle("Geographical distance") +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(size = 6, angle = 45, hjust = 1, vjust = 1),
    axis.text.y = element_text(size = 6)
  )

wals_euclidean_dist

# Country-by-country heatmap of WALS Euclidean distance; axes use the
# continent-ordered country name factors.
ggplot(dist_by_continent, aes(x = country_name_1, y = country_name_2)) +
  geom_raster(aes(fill = wals_euclidean_dist)) +
  scale_fill_continuous(low = "#ffffcc", high = "#800026") +
  ggtitle("WALS distance") +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(size = 6, angle = 45, hjust = 1, vjust = 1),
    axis.text.y = element_text(size = 6)
  )

asjp_dist

# Country-by-country heatmap of ASJP distance; axes use the
# continent-ordered country name factors.
ggplot(dist_by_continent, aes(x = country_name_1, y = country_name_2)) +
  geom_raster(aes(fill = asjp_dist)) +
  scale_fill_continuous(low = "#ffffcc", high = "#800026") +
  ggtitle("ASJP distance") +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(size = 6, angle = 45, hjust = 1, vjust = 1),
    axis.text.y = element_text(size = 6)
  )

log_normalized_n_events_all

# Country-by-country heatmap of the log-normalized event count; axes use the
# continent-ordered country name factors.
ggplot(dist_by_continent, aes(x = country_name_1, y = country_name_2)) +
  geom_raster(aes(fill = log_normalized_n_events_all)) +
  scale_fill_continuous(low = "#ffffcc", high = "#800026") +
  ggtitle("events") +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(size = 6, angle = 45, hjust = 1, vjust = 1),
    axis.text.y = element_text(size = 6)
  )

log_normalized_mean_imports_dollars

# Country-by-country heatmap of log-normalized mean imports; axes use the
# continent-ordered country name factors.
ggplot(dist_by_continent, aes(x = country_name_1, y = country_name_2)) +
  geom_raster(aes(fill = log_normalized_mean_imports_dollars)) +
  scale_fill_continuous(low = "#ffffcc", high = "#800026") +
  ggtitle("mean_imports_dollars") +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(size = 6, angle = 45, hjust = 1, vjust = 1),
    axis.text.y = element_text(size = 6)
  )

Between measure correlations

# Pairwise correlations between the measures: drop the first three (id and
# country code) columns, correlate the rest, and blank out the redundant
# upper triangle.
corrs <- all_dyadic2 %>%
  select(-(1:3)) %>%
  correlate(use = "complete.obs") %>%
  shave()

# Correlation plot with the legend shown and x-axis labels tilted.
rplot(
  corrs,
  legend = TRUE,
  colours = c("skyblue1", "white", "indianred2")
) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Long-format table of the correlations, largest first, with the shaved
# (NA) cells removed.
corrs %>%
  gather("variable1", "value", -1) %>%
  ungroup() %>%
  arrange(desc(value)) %>%
  as.data.frame() %>%
  filter(!is.na(value)) %>%
  kable()
rowname variable1 value
asjp_dist wals_euclidean_dist 0.7704572
log_normalized_mean_imports_dollars log_normalized_n_events_all 0.4849369
wals_euclidean_dist centroid_dist_meters 0.3330267
asjp_dist centroid_dist_meters 0.1832161
wals_euclidean_dist log_normalized_mean_imports_dollars -0.1558068
asjp_dist log_normalized_mean_imports_dollars -0.1669058
wals_euclidean_dist log_normalized_n_events_all -0.2923945
asjp_dist log_normalized_n_events_all -0.3193012
centroid_dist_meters log_normalized_n_events_all -0.4088289
centroid_dist_meters log_normalized_mean_imports_dollars -0.4438697

Scatterplots

dist vs. imports

# Centroid distance against log-normalized mean imports, with a linear fit.
all_dyadic2 %>%
  ggplot(aes(x = centroid_dist_meters, y = log_normalized_mean_imports_dollars)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()

dist vs. events

# Centroid distance against log-normalized event count, with a linear fit.
all_dyadic2 %>%
  ggplot(aes(x = centroid_dist_meters, y = log_normalized_n_events_all)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()

events vs. imports

# Log-normalized event count against log-normalized mean imports, with a
# linear fit.
all_dyadic2 %>%
  ggplot(aes(x = log_normalized_n_events_all, y = log_normalized_mean_imports_dollars)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()

dist vs. asjp

# Centroid distance against ASJP distance, with a linear fit.
all_dyadic2 %>%
  ggplot(aes(x = centroid_dist_meters, y = asjp_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()

events vs. asjp

# Log-normalized event count against ASJP distance, with a linear fit.
all_dyadic2 %>%
  ggplot(aes(x = log_normalized_n_events_all, y = asjp_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()

imports vs. asjp

# Log-normalized mean imports against ASJP distance, with a linear fit.
all_dyadic2 %>%
  ggplot(aes(x = log_normalized_mean_imports_dollars, y = asjp_dist)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()

# Save the table with country names and continent ordering (all rows of
# `all_dyadic2`, not the de-duplicated `dyadic_clean`).
write_csv(dist_by_continent, "../../data/supplementary_data/cultural_sim_measures/all_dyadic_vars.csv")