In this notebook we will create an aligned UMAP for Swedish parliamentary election results on a municipality level.
First, load the necessary packages:
pacman::p_load(tidyverse,
janitor, # Useful tools, functions used: clean_names and remove_constant
hablar, # Nice complement to janitor, function used: retype
pxweb, # Interface for the API Statistics Sweden uses
reticulate, # Python bridge
tictoc, # Stop watch
plotly, # Interactive plots
gganimate) # Animated plots
If UMAP 0.5 (development version) isn’t installed you can install it with this code:
py_install("git+https://github.com/lmcinnes/umap.git@43181dec02e3932da9f7a0ee9c12abee5265d7b0", pip = TRUE)
Then we get the data from Statistics Sweden, and clean it up:
get_pxweb_data(url = "http://api.scb.se/OV0104/v1/doris/sv/ssd/ME/ME0104/ME0104C/ME0104T3",
dims = list(Region = c('0114', '0115', '0117', '0120', '0123', '0125', '0126', '0127', '0128', '0136', '0138', '0139', '0140', '0160', '0162', '0163', '0180', '0181', '0182', '0183', '0184', '0186', '0187', '0188', '0191', '0192', '0305', '0319', '0330', '0331', '0360', '0380', '0381', '0382', '0428', '0461', '0480', '0481', '0482', '0483', '0484', '0486', '0488', '0509', '0512', '0513', '0560', '0561', '0562', '0563', '0580', '0581', '0582', '0583', '0584', '0586', '0604', '0617', '0642', '0643', '0662', '0665', '0680', '0682', '0683', '0684', '0685', '0686', '0687', '0760', '0761', '0763', '0764', '0765', '0767', '0780', '0781', '0821', '0834', '0840', '0860', '0861', '0862', '0880', '0881', '0882', '0883', '0884', '0885', '0980', '1060', '1080', '1081', '1082', '1083', '1214', '1229', '1230', '1231', '1233', '1256', '1257', '1260', '1261', '1262', '1263', '1264', '1265', '1266', '1267', '1270', '1272', '1273', '1275', '1276', '1277', '1278', '1280', '1281', '1282', '1283', '1284', '1285', '1286', '1287', '1290', '1291', '1292', '1293', '1315', '1380', '1381', '1382', '1383', '1384', '1401', '1402', '1407', '1415', '1419', '1421', '1427', '1430', '1435', '1438', '1439', '1440', '1441', '1442', '1443', '1444', '1445', '1446', '1447', '1452', '1460', '1461', '1462', '1463', '1465', '1466', '1470', '1471', '1472', '1473', '1480', '1481', '1482', '1484', '1485', '1486', '1487', '1488', '1489', '1490', '1491', '1492', '1493', '1494', '1495', '1496', '1497', '1498', '1499', '1715', '1730', '1737', '1760', '1761', '1762', '1763', '1764', '1765', '1766', '1780', '1781', '1782', '1783', '1784', '1785', '1814', '1860', '1861', '1862', '1863', '1864', '1880', '1881', '1882', '1883', '1884', '1885', '1904', '1907', '1960', '1961', '1962', '1980', '1981', '1982', '1983', '1984', '2021', '2023', '2026', '2029', '2031', '2034', '2039', '2061', '2062', '2080', '2081', '2082', '2083', '2084', '2085', '2101', '2104', '2121', '2132', '2161', '2180', '2181', '2182', '2183', '2184', '2260', '2262', '2280', '2281', '2282', '2283', '2284', '2303', '2305', '2309', '2313', '2321', '2326', '2361', '2380', '2401', '2403', '2404', '2409', '2417', '2418', '2421', '2422', '2425', '2460', '2462', '2463', '2480', '2481', '2482', '2505', '2506', '2510', '2513', '2514', '2518', '2521', '2523', '2560', '2580', '2581', '2582', '2583', '2584'),
Partimm = c('M', 'C', 'FP', 'KD', 'MP', 'S', 'V', 'SD', 'ÖVRIGA'),
ContentsCode = c('ME0104B7'),
Tid = c('*')),
clean = TRUE) %>%
as_tibble() %>%
clean_names() %>%
retype() %>%
remove_constant() %>%
drop_na(values) %>%
mutate(values = values/100) %>%
spread(parti_mm, values, fill = 0) %>%
clean_names() %>%
separate(region, c("kommunid","kommun"), sep = " ", extra = "merge") ->
valresultat_kommuner
Some words in Swedish that are good to know:
Then we create the slices and the dictionary:
valresultat_kommuner %>%
arrange(tid, kommunid) %>%
select(-kommunid, -kommun) %>%
nest(data=-tid) %>%
mutate(data = map(data, ~as.matrix(.x) %>% r_to_py())) %>%
select(-tid) %>%
as.list %>%
.$data %>%
as.array() ->
kommun_slices
valresultat_kommuner %>% arrange(tid, kommunid) %>% group_by(tid) %>%
mutate(row = row_number()) %>%
group_by(kommunid) %>%
mutate(id_lead = lead(row)) %>%
ungroup() %>%
drop_na(id_lead) %>% select(tid, row, id_lead) %>%
mutate(tid = tid %>% factor %>% as.numeric) %>%
retype %>%
nest(data = c(-tid)) %>%
mutate(data = map(data, ~py_dict(.x$row, .x$id_lead))) %>% as.list() %>%
.$data ->
kommuner_dict
Then we import the aligned UMAP function from the UMAP package, in the Python session:
py_run_string("import umap.aligned_umap")
Then we can run the alignment UMAP. The reason for not using a converted function is due to the conversion of either the dictionary or the slices that doesn’t work properly. I haven’t figured out a way to solve that.
tic()
py_run_string("aligned_mapper = umap.aligned_umap.AlignedUMAP().fit(r.kommun_slices, relations=r.kommuner_dict)")
toc()
Then pull the embeddings, convert it into a Python list, and then into an R list:
py_run_string("embeddings = list(aligned_mapper.embeddings_)")
py$embeddings -> embeddings
Then we can make some plots, if we first combine the embeddings with the original data:
valresultat_kommuner %>%
arrange(tid, kommunid) %>%
nest(data=-tid) %>%
bind_cols(embeddings %>% enframe()) %>%
mutate(value = map(value, ~as.data.frame(.x))) %>%
unnest(c(data, value)) %>%
select(-name) ->
combined_data
combined_data %>%
ggplot(aes(V1, V2))+
geom_point()+
labs(x=NULL, y=NULL)+
coord_equal()+
facet_wrap(~tid)+
theme(axis.text = element_blank(), axis.ticks = element_blank(),
panel.grid.major = element_blank(), panel.grid.minor = element_blank())
Then we can create an interactive plot with a time slider. However, the set of municipalities changes over time, which creates “structural breaks” when the set of municipalies changes. This is a bug in Plotly. Compare e.g. 1998 with 2002.
combined_data %>%
rename(`Election Year` = tid) %>%
ggplot(aes(V1, V2))+
geom_point(aes(frame=`Election Year`,
label = kommun))+
labs(x=NULL, y=NULL)+
coord_equal() +
theme(axis.text = element_blank(), axis.ticks = element_blank(),
panel.grid.major = element_blank(), panel.grid.minor = element_blank()) ->
plotobj
ggplotly(plotobj,
tooltip = "label")
We can create an animated plot to compare with:
combined_data %>%
ggplot(aes(V1, V2, group=kommun))+
theme(axis.text = element_blank(), axis.ticks = element_blank(),
panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
geom_point()+
transition_time(tid)+
labs(title = 'Year: {frame_time}',
x=NULL,
y=NULL) -> animation_plot
anim_save("animation.gif", animation_plot)
Here we clearly see that the “structural breaks” we saw before was an artifact from Plotly.