Explore data

This analysis is very similar to what I did last May for tidyTuesday data set on cocktail recipes, so take a look at both to see what is the same and what is different for the 2 different datasets. Our modeling goal is to use unsupervised algorithms for dimensionality reduction with UN voting data to understand which countries are similar.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## Warning: package 'tibble' was built under R version 4.0.4
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'dplyr' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
unvotes <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-23/unvotes.csv")
## Parsed with column specification:
## cols(
##   rcid = col_double(),
##   country = col_character(),
##   country_code = col_character(),
##   vote = col_character()
## )
issues <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-23/issues.csv")
## Parsed with column specification:
## cols(
##   rcid = col_double(),
##   short_name = col_character(),
##   issue = col_character()
## )

Let’s create a wide version of this data set via pivot_wider() to use for modeling

unvotes_df <- unvotes %>% 
               select(country, rcid, vote) %>%
               mutate(
                              vote = factor(vote, levels = c("no", "abstain", "yes")),
                              vote = as.numeric(vote),
                              rcid = paste0("rcid_", rcid)
               ) %>%
               pivot_wider(names_from = "rcid", values_from = "vote", values_fill = 2)

Principal component analysis

This analysis only uses the recipe package, the tidymodels package for data preprocessing and feature engineering that contains functions for unsupervised methods. There are lots of options available, like step_ica() and step step_kpca(), but let’s implement a basic principal component analysis.

library(recipes)
## Warning: package 'recipes' was built under R version 4.0.3
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
pca_rec <- recipe(~., data  = unvotes_df) %>% 
               update_role(country, new_role = "id") %>%
               step_normalize(all_predictors()) %>% 
               step_pca(all_predictors(), num_comp = 5)

pca_prep <- prep(pca_rec)

pca_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          1
##  predictor       6202
## 
## Training data contained 200 data points and no missing data.
## 
## Operations:
## 
## Centering and scaling for rcid_3, rcid_4, rcid_5, rcid_6, rcid_7, ... [trained]
## PCA extraction with rcid_3, rcid_4, rcid_5, rcid_6, rcid_7, ... [trained]

We can look at where the countries are in the principal component space by baking the prepped recipe.

theme_set(theme_bw())

bake(pca_prep, new_data =  NULL) %>%
               ggplot(aes(PC1, PC2, label = country)) +
               geom_point(color = 'midnightblue', alpha = 0.7, size = 2) + 
               geom_text(check_overlap = TRUE, hjust = 'inward', family = 'IBMPlexSans') +
               labs(color = NULL)
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

We can look at which votes contribute to the component by tidying the prepped recipe. Let’s join the roll call votes up with the topics to see which topics contribute to the top principal components.

pca_comps <- tidy(pca_prep, 2) %>%
               filter(component %in% paste0("PC", 1:4)) %>%
               left_join(issues %>% mutate(terms = paste0('rcid_', rcid))) %>%
               filter(!is.na(terms)) %>%
               group_by(component) %>%
               top_n(8, abs(value)) %>%
               ungroup()
## Joining, by = "terms"
pca_comps %>%filter(!is.na(terms)) %>%
  mutate(value = abs(value)) %>%
  ggplot(aes(value, fct_reorder(terms, value), fill = issue)) +
  geom_col(position = "dodge") +
  facet_wrap(~component, scales = "free_y") +
  labs(
    x = "Absolute value of contribution",
    y = NULL, fill = NULL,
    title = "What issues are most important in UN voting country differences?",
    subtitle = "Human rights and economic development votes account for the most variation"
  )