As a final project, using the knowledge gained from the classes, we will examine the factors influencing voter turnout. We decided to choose Polish elections because of the access to complete election data on the communes level, knowledge of the local context and the interest in political trends in our country.
The dataset used in this project contains multiple location based (on the commune level) tables with information about the standard of living in a given commune, level of development, culture, expenses and many others obtained mostly from the Polish Central Statistical Office website.
We load multiple libraries for data manipulation and visualization together with tools for data modelling.
# data manipulation
library(tidyverse)
library(tidyr)
library(dplyr)
library(stringr)
library(sf)
library(rmapshaper)
library(tigris)
#visualizations
library(ggplot2)
library(rgdal)
library(leaflet)
library(RColorBrewer)
#modeling and analysis
library(spdep)
library(lmtest)
library(spatialreg)
library(texreg) #for table comparison between different models
# library(rgeos)We load the following csv files and shape files:
# csv files from GUS
unempl <- read.csv("data/bezrobocie.csv", header=TRUE, dec=",", sep=";", encoding = "UTF-8")
inc <- read.csv("data/dochody na mieszk 2016-2019.csv", header=TRUE, dec=",", sep=";", encoding = "UTF-8")
fem <- read.csv("data/feminizacja.csv", header=TRUE, dec=",", sep=";", encoding = "UTF-8")
people <- read.csv("data/ludnosc 2016-2019.csv", header=TRUE, dec=",", sep=";", encoding = "UTF-8")
people_prod <- read.csv("data/ludnosc ogol przed poprod.csv", header=TRUE, dec=",", sep=";", encoding = "UTF-8")
migration <- read.csv("data/saldo migracji stalych na 1000 osob.csv", header=TRUE, dec=",", sep=";", encoding = "UTF-8")
soc_ben <-read.csv("data/swiadczenie wychowawcze 500 sty-czer.csv", header=TRUE, dec=",", sep=";", encoding = "UTF-8")
wat_sew_gas <- read.csv("data/wodkangaz.csv", header=TRUE, dec=",", sep=";", encoding = "UTF-8")
exp <- read.csv("data/wydatki na mieszk 2016-2019.csv", header=TRUE, dec=",", sep=";", encoding = "UTF-8")
area <- read.csv("data/powierzchnia.csv", header=TRUE, sep=",", encoding = "UTF-8")
# data from elections (source: https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/pl/dane_w_arkuszach)
elect <- read.csv("data/wyniki_gl_na_listy_po_obwodach_sejm.csv", header=TRUE, dec=",", sep=";", encoding = "UTF-8")
elect_gm <- read.csv("data/wyniki_gl_na_listy_po_gminach_sejm.csv", header=TRUE, dec=",", sep=";", encoding = "UTF-8")
# shapefiles
voi <-readOGR("data", "Wojewodztwa", encoding = "UTF-8")
pov <-readOGR("data", "Powiaty", encoding = "UTF-8")
com <-readOGR("data", "Gminy", encoding = "UTF-8")
com_shp <- read_sf('data/Gminy.shp')# changing projections
voi<-spTransform(voi, CRS("+proj=longlat +datum=NAD83"))
pov<-spTransform(pov, CRS("+proj=longlat +datum=NAD83"))
com<-spTransform(com, CRS("+proj=longlat +datum=NAD83"))
# expenditures - we calculate a mean of the years since last elections
colnames(exp) <- c("Kod", "Nazwa", "gminy_2016_pln", "gminy_2017_pln",
"gminy_2018_pln", "gminy_2019_pln", "X")
# there are some NA's but last year is full so we can omit NA's and won't have any lacks of data
exp$expenses<-rowMeans(exp[,3:6], na.rm = T) #mean of years 2016-2019
exp[2:7] <- NULL # we leave only code & created data
# feminization
colnames(fem)[3] <- "femin"
fem[c(2,4)] <- NULL
# income - we calculate a mean of the years since last elections
colnames(inc) <- c("Kod", "Nazwa", "gminy_2016_pln", "gminy_2017_pln",
"gminy_2018_pln", "gminy_2019_pln", "X")
inc$income<-rowMeans(inc[,3:6], na.rm = T)
inc[2:7] <- NULL
# migration - we calculate a mean of the years since last elections
colnames(migration) <- c("Kod", "Nazwa", "saldo_migracji_2016", "saldo_migracji_2017",
"saldo_migracji_2018", "saldo_migracji_2019", "X")
migration$migration<-rowMeans(migration[,3:6], na.rm = T)
migration[2:7] <- NULL
# people_density
colnames(people)[6] <- "people_density2019"
people[c(2:5,7)] <- NULL
# people_prod - we divide number of pre- and post-working age population by total population
colnames(people_prod) <- c("Kod", "Nazwa", "All_2016", "All_2017", "All_2018",
"All_2019","przedprodukcyjny_2016", "przedprodukcyjny_2017",
"przedprodukcyjny_2018", "przedprodukcyjny_2019",
"poprodukcyjny_2016", "poprodukcyjny_2017", "poprodukcyjny_2018",
"poprodukcyjny_2019", "X")
people_prod$prework <- people_prod$przedprodukcyjny_2019 / people_prod$All_2019
people_prod$postwork <- people_prod$poprodukcyjny_2019 / people_prod$All_2019
people_prod[c(2:5, 7:15)] <- NULL
# soc_ben - we divide number of families receiving 500+ by total population of the commune
colnames(soc_ben)[3] <- "500+number_per_moth"
soc_ben$benefit500 <- soc_ben$`500+number_per_moth` / people_prod$All_2019
soc_ben[c(2:4)] <- NULL
# unemployment in 2019
colnames(unempl)[3] <- "unemployment"
unempl[c(2,4)] <- NULL
# wat_sew_gas: 3 separate variables
colnames(wat_sew_gas)[3:5] <- c("water", "sewage", "gas")
wat_sew_gas[c(2,6)] <- NULL
# area
colnames(area)[3] <- "area"
area[c(2,4)] <- NULL
colnames(area)[1] <- "Kod"
### merging all sets into one data by the code of commune
data <- Reduce(function(x,y) merge(x = x, y = y, by = "Kod"),
list(exp, fem, inc, migration, people, people_prod, soc_ben,
unempl, wat_sew_gas, area))
# The last digit in this code defines the type of gmina and is not needed.
# In data from elections this digit isn't included so we'll remove it from here also.
data$Kod <-
data$Kod %>% as.character() %>% substr(1, nchar(data$Kod)-1) %>% as.numeric()
#election data on the commune level preperation
matched <- intersect(data$Kod, elect_gm$Kod.TERYT)
all <- union(data$Kod, elect_gm$Kod.TERYT)
non_matched <- all[!all %in% matched]
# There are some numbers that don't match. Most of them results from differences in Warsaw.
# It is a city with code 146501, but in data from election it is divided by
# districts (codes 146502-146519). We need to merge this data together so we'll
# have one data for Warsaw.
elect_gm[c(5, 7:25)] <- NULL # Removing redundant columns
# Renaming columns: column with number of people entitled to vote and
# column with number of valid votes
colnames(elect_gm)[5:6] <- c("entitled", "votes")
# summing Warsaw data to the 1 obs
elect_gm[elect_gm$Kod.TERYT == 146502, 5:16] <-
colSums(elect_gm[elect_gm$Kod.TERYT >= 146501 &
elect_gm$Kod.TERYT <= 146519,
5:16]) # Sum of district values for numerical columns to the 1st district
# Renaming 1st district to the code and name of Warsaw
elect_gm[elect_gm$Kod.TERYT == 146502, 1:2] <- c(146501, "Warszawa")
# removing rest of Warsaw districts and abroad/ships
elect_gm <- elect_gm[elect_gm$Kod.TERYT < 146502 | elect_gm$Kod.TERYT > 149901,]
## Adding new variables from election results
# frequency - our target variable - in %
elect_gm$freq <- elect_gm$votes / elect_gm$entitled * 100
# votes for oposition - as a fraction of all votes:
# a will to change the current government as a possible factor to drive people to the elections
elect_gm$opos <- rowSums(elect_gm[c(7:11, 13:16)], na.rm = T) / elect_gm$votes
elect_gm[c(7:16)] <- NULL # removing redundant variables
# merging data community level
data <- merge(x = data, y = elect_gm, by.x = "Kod", by.y = "Kod.TERYT")
## places & districts
# There are some obs without teritorial codes, they are empty and from the
# unique places outside the country like military bases that didn't participate
# in voting -> we can remove them.
elect <- elect[!is.na(elect$Kod.TERYT),]
matched <- intersect(data$Kod, elect$Kod.TERYT)
all <- union(data$Kod, elect$Kod.TERYT)
non_matched <- all[!all %in% matched]
#Still we have the same problem with Warsaw divided by districts and with ships and abroad
#recoding Warsaw districts to general Warsaw code
elect[elect$Kod.TERYT >= 146502 & elect$Kod.TERYT <= 146519, 2] <- 146501
# WE will need only generally available locations and we will use them to count for
# community the average of people enabled to vote at a one place (obwod) to see
# if maybe possibility of bigger crowds and queues makes people less interested in
# voting in contrast to regions with more locations per number of enabled people.
# We will also calculate the number of physical places which may be useful later.
elect <- elect %>%
filter(Typ.obwodu == "stały") %>%
group_by(Kod.TERYT) %>%
summarise(places = length(unique(Siedziba)),
mean_enabled = mean(as.numeric(Liczba.wyborców.uprawnionych.do.głosowania)))
# Adding this data to our main one
data <- merge(x = data, y = elect, by.x = "Kod", by.y = "Kod.TERYT")
# now we can use area and this number of voting places to make a new variable
data$place_area <- data$area / data$places
#how big area more or less is for one place of voting. Bigger areas may mean for example
# that some people have farther from home and may be less willing to go there.
# It is a simplification, but it allows to measure this aspect which comes from
# the more detailed info and to average it on the level of gminy
# geo join for visualizations
com$JPT_KOD_JE <- str_sub(com$JPT_KOD_JE, 1, nchar(com$JPT_KOD_JE)-1)
com$JPT_KOD_JE <- gsub("^0", "",com$JPT_KOD_JE)
com_shp$JPT_KOD_JE <- str_sub(com_shp$JPT_KOD_JE, 1, nchar(com_shp$JPT_KOD_JE)-1)
com_shp$JPT_KOD_JE <- gsub("^0", "",com_shp$JPT_KOD_JE)
data$Kod <- as.character(data$Kod)
com_joined <- geo_join(com, data, 'JPT_KOD_JE', 'Kod', how = "left")
com_joined_vis <- geo_join(com_shp, data, 'JPT_KOD_JE', 'Kod', how = "left")
com_joined_vis <- st_as_sf(com_joined)
data <- as.data.frame(com_joined)
data <- data[30:54]
#simplify polygons for better performance
com_joined_light <- ms_simplify(com_joined_vis)palFreq <- colorNumeric("BuGn", NULL)
palOpos <- colorNumeric("Blues", NULL)
palUnemploy <- colorNumeric("OrRd", NULL)
com_joined_vis <-
com_joined_vis %>%
dplyr::mutate(lab_txt = paste0('<strong>Gmina</strong>: ',
Gmina,
'<br><strong>Powiat</strong>: ',
Powiat,
'<br><strong>Województwo</strong>: ',
Województwo,
'<br><strong>Election turnout</strong>: ',
round(freq,2), '%',
'<br><strong>Opposition results</strong>: ',
round(opos*100,2), '%',
'<br><strong>unemployment</strong>: ',
unemployment, '%'
))
labs <- as.list(com_joined_vis$lab_txt)
leaflet(com_joined_light,
options = leafletOptions(minZoom = 7)) %>%
addPolygons(color = "#575757",
weight = 1,
smoothFactor = 0.8,
opacity = 0.5,
fillOpacity = 0.75,
fillColor = ~palFreq(freq),
label = lapply(labs, htmltools::HTML),
highlightOptions = highlightOptions(color = "white",
weight = 2,
bringToFront = TRUE),
group = "Turnout") %>%
addPolygons(color = "#575757",
weight = 1,
smoothFactor = 0.8,
opacity = 0.5,
fillOpacity = 0.75,
fillColor = ~palOpos(opos),
label = lapply(labs, htmltools::HTML),
highlightOptions = highlightOptions(color = "white",
weight = 2,
bringToFront = TRUE),
group = "Oposition results") %>%
addPolygons(color = "#575757",
weight = 1,
smoothFactor = 0.8,
opacity = 0.5,
fillOpacity = 0.75,
fillColor = ~palUnemploy(unemployment),
label = lapply(labs, htmltools::HTML),
highlightOptions = highlightOptions(color = "white",
weight = 2,
bringToFront = TRUE),
group = "Unemployment") %>%
addLegend(position = "bottomleft" ,
pal = palFreq,
values = ~freq,
opacity = .5,
bins = 5,
title = 'Election turnout',
labFormat = labelFormat(suffix = '%'),
group = "Turnout") %>%
addLegend(position = "bottomleft" ,
pal = palOpos,
values = ~opos*100,
opacity = .5,
bins = 5,
title = 'Oposition results',
labFormat = labelFormat(suffix = '%'),
group = "Turnout") %>%
addLegend(position = "bottomleft" ,
pal = palUnemploy,
values = ~unemployment,
opacity = .5,
bins = 5,
title = 'Unemployment rate',
labFormat = labelFormat(suffix = '%'),
group = "Turnout") %>%
addLayersControl(baseGroups = c("Turnout", "Oposition results", "Unemployment"),
options = layersControlOptions(collapsed = FALSE))