ITEM <- "bread"
Here we’re just looking at one item: bread
Read in drawing pairwise-sims file.
if (ITEM == "tree"){ # this is just because within is in a separate file for tree - will fix this
tree <- read_feather("../../data/keras_similarities/pairwise_country/tree_all_sims.txt")
tree_within <- read_feather("../../data/keras_similarities/pairwise_country/tree_all_sims_within.txt")
d <- bind_rows(tree, tree_within) %>%
mutate(key_id_1 = as.character(key_id_1),
key_id_2 = as.character(key_id_2))
} else {
d <- read_feather(paste0("../../data/keras_similarities/pairwise_country/", ITEM , "_all_sims.txt"))
}
Number of unique participants per each country (sampled 50).(Note that for even smaller countries, some duplicates)
d_big = d %>%
group_by(country_code_1, key_id_1)%>%
slice(1) %>%
ungroup() %>%
count(country_code_1)%>%
arrange(-n)%>%
as.data.frame()
#kable(d_big)
Remove NAs- very few, these come from 8 drawings (not sure why)
d_clean <- d %>%
filter(!is.na(cosine)) %>%
filter(country_code_1 %in% d_big$country_code_1 & country_code_2 %in% d_big$country_code_1)
within_item_cosines <- d_clean %>%
filter(country_code_1 == country_code_2) %>%
filter(key_id_1 != key_id_2) %>%
rename(key_id = key_id_1) %>%
group_by(key_id) %>%
summarize(within_cosine = mean(cosine))
across_item_cosines <- d_clean %>%
filter(key_id_1 != key_id_2) %>%
filter(country_code_1 != country_code_2)
all_across_cosines <- across_item_cosines %>%
bind_rows(across_item_cosines %>% # get compliment
rename(country_code_1 = country_code_2,
country_code_2 = country_code_1,
key_id_1 = key_id_2,
key_id_2 = key_id_1)) %>%
rename(key_id = key_id_1) %>%
group_by(country_code_1, country_code_2, key_id) %>%
summarize(across_cosine = mean(cosine))
pairwise_ratios <- all_across_cosines %>%
left_join(within_item_cosines, by = "key_id") %>%
mutate(ratio = across_cosine/within_cosine) %>%
group_by(country_code_1, country_code_2) %>%
summarize(mean_ratio = mean(ratio))
# get contintents
pairwise_ratios_with_continents <- pairwise_ratios %>%
rowwise() %>%
mutate(country_name_1 = as.factor(countrycode(country_code_1, "iso2c","country.name")),
country_name_2 = as.factor(countrycode(country_code_2, "iso2c","country.name")),
continent_name_1 = countrycode(country_code_1, 'iso2c', 'continent'),
continent_name_2 = countrycode(country_code_2, 'iso2c', 'continent')) %>%
ungroup() %>%
mutate(cont_order_1 = as.factor(continent_name_1) %>% as.numeric,
cont_order_2 = as.factor(continent_name_2) %>% as.numeric) %>%
select(-continent_name_1, -continent_name_2) %>%
mutate(country_name_1 = fct_reorder(country_name_1, cont_order_1),
country_name_2 = fct_reorder(country_name_2, cont_order_2))
# plot heatmap
pairwise_ratios_with_continents %>%
ggplot(aes(x = country_name_1,
y = country_name_2)) +
geom_raster(aes(fill = mean_ratio)) +
scale_fill_continuous(low = "#ffffcc", high = "#800026") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1, vjust = 1, size = 6),
axis.text.y = element_text(size = 6),
axis.title = element_blank())
all_countries <- unique(pairwise_ratios_with_continents$country_name_1)
NUM_COUNTRIES <- length(all_countries)
m = matrix(nrow = NUM_COUNTRIES, ncol = NUM_COUNTRIES)
for (i in 1:dim(pairwise_ratios_with_continents)[1]){
sim = unlist(pairwise_ratios_with_continents[i, "mean_ratio"])
x = which(all_countries == unlist(pairwise_ratios_with_continents[i, "country_name_1"]))
y = which(all_countries == unlist(pairwise_ratios_with_continents[i, "country_name_2"]))
m[x,y] <- sim
}
colnames(m) = all_countries
rownames(m) = all_countries
# plot
ggdendrogram(hclust(dist(m)), size = 2)
write_csv(pairwise_ratios_with_continents,
paste0("../../data/keras_similarities/pairwise_country/", ITEM, "_sim_ratios.csv"))