Final Project
Loading Packages
#Load necessary packages
library(flextable)
library(ggthemes)
library(ggplot2)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::compose() masks flextable::compose()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'naniar'
##
## The following object is masked from 'package:skimr':
##
## n_complete
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following objects are masked from 'package:flextable':
##
## highlight, style
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.0
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.3.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::compose() masks flextable::compose()
## ✖ scales::discard() masks purrr::discard()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
Import the data into R
county_presidents <- read.csv("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/countypres_2000-2020.csv", stringsAsFactors = TRUE)
#Display first 5 rows from the imported dataset
head(county_presidents, n = 5)
## year state state_po county_name county_fips office candidate
## 1 2000 ALABAMA AL AUTAUGA 1001 US PRESIDENT AL GORE
## 2 2000 ALABAMA AL AUTAUGA 1001 US PRESIDENT GEORGE W. BUSH
## 3 2000 ALABAMA AL AUTAUGA 1001 US PRESIDENT RALPH NADER
## 4 2000 ALABAMA AL AUTAUGA 1001 US PRESIDENT OTHER
## 5 2000 ALABAMA AL BALDWIN 1003 US PRESIDENT AL GORE
## party candidatevotes totalvotes version mode
## 1 DEMOCRAT 4942 17208 20220315 TOTAL
## 2 REPUBLICAN 11993 17208 20220315 TOTAL
## 3 GREEN 160 17208 20220315 TOTAL
## 4 OTHER 113 17208 20220315 TOTAL
## 5 DEMOCRAT 13997 56480 20220315 TOTAL
1) Data Dictionary
dataDictionary <- tibble(Variable = colnames(county_presidents),
Description = c("Year of election (1976 to 2020)",
"State Name",
"State postal code abbreviation",
"County name",
"County FIPS code",
"Name of the public office to which the candidate is seeking election",
"Candidate Name",
"Party",
"Number of votes for candidate",
"Total votes cast in the election",
"Version",
"Mode"),
Type = map_chr(county_presidents, .f = function(x){typeof(x)[1]}),
Class = map_chr(county_presidents, .f = function(x){class(x)[1]}))
Variable | Description | Type | Class |
---|---|---|---|
year | Year of election (1976 to 2020) | integer | integer |
state | State Name | integer | factor |
state_po | State postal code abbreviation | integer | factor |
county_name | County name | integer | factor |
county_fips | County FIPS code | integer | integer |
office | Name of the public office to which the candidate is seeking election | integer | factor |
candidate | Candidate Name | integer | factor |
party | Party | integer | factor |
candidatevotes | Number of votes for candidate | integer | integer |
totalvotes | Total votes cast in the election | integer | integer |
version | Version | integer | integer |
mode | Mode | integer | factor |
2) Data Cleaning
a) Merging Data Sets
Merging of data sets was handled in Data Visualizations part.
b) String Manipulation
c) Missingness of data
Name | county_presidents |
Number of rows | 72617 |
Number of columns | 12 |
_______________________ | |
Column type frequency: | |
character | 1 |
factor | 6 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
candidate | 0 | 1 | 5 | 17 | 0 | 13 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
state | 0 | 1 | FALSE | 51 | TEX: 5334, GEO: 4452, VIR: 3736, NOR: 3600 |
state_po | 0 | 1 | FALSE | 51 | TX: 5334, GA: 4452, VA: 3736, NC: 3600 |
county_name | 0 | 1 | FALSE | 1892 | WAS: 727, FRA: 576, JEF: 563, JAC: 541 |
office | 0 | 1 | FALSE | 1 | US : 72617 |
party | 0 | 1 | FALSE | 5 | DEM: 20906, REP: 20906, OTH: 19815, GRE: 6035 |
mode | 0 | 1 | FALSE | 16 | TOT: 60583, ELE: 3737, ABS: 1995, PRO: 1832 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1 | 2011.30 | 7.52 | 2000 | 2004 | 2012 | 2020 | 2020 | ▇▃▃▃▇ |
county_fips | 57 | 1 | 30264.48 | 15397.70 | 1001 | 18103 | 29205 | 45057 | 56045 | ▃▇▅▆▇ |
candidatevotes | 0 | 1 | 10781.83 | 46034.94 | 0 | 115 | 1278 | 5848 | 3028885 | ▇▁▁▁▁ |
totalvotes | 0 | 1 | 42514.21 | 121951.40 | 0 | 5175 | 11194 | 29855 | 4264365 | ▇▁▁▁▁ |
version | 0 | 1 | 20220315.00 | 0.00 | 20220315 | 20220315 | 20220315 | 20220315 | 20220315 | ▁▁▇▁▁ |
Only county_fips variable has 57 missing values in this data set
# Lollipop chart for displaying data missingness
county_presidents |> gg_miss_var(show_pct = T) +
labs(title = "Missingness for County Presidents data") +
theme_few()
# Heatmap displaying data missingness
missing_data <- county_presidents |>
select(candidate, year, county_fips, state, office, party) |>
gg_miss_fct(fct = candidate) +
labs(x = "Candidate Name",
y = "Variable",
title = "Interactive heatmap displaying missingness for each variable based on candidate")
ggplotly(missing_data)
3) Exploratory Data Analysis
a) Tables of Summary Statistics
# Load function for printing tables nicely
source("https://raw.githubusercontent.com/dilernia/STA323/main/Functions/make_flex.R")
# Calculating descriptive statistics for candidatevotes in Michigan state
candidateVotesMichiganStats <- county_presidents |>
dplyr::filter(str_to_upper(state) == "MICHIGAN") |>
dplyr::group_by(year) |>
dplyr::summarize(
Minimum = min(candidatevotes, na.rm = TRUE),
Q1 = quantile(candidatevotes, na.rm = TRUE, probs = 0.25),
MEDIAN = median(candidatevotes, na.rm = TRUE),
Q3 = quantile(candidatevotes, na.rm = TRUE, probs = 0.75),
Maximum = max(candidatevotes, na.rm = TRUE),
Mean = mean(candidatevotes, na.rm = TRUE),
Range = Maximum - Minimum,
SD = sd(candidatevotes, na.rm = TRUE),
Variance = var(candidatevotes, na.rm = TRUE)
)
# Printing table of statistics
candidateVotesMichiganStats |>
make_flex(caption = "Quantitative summary statistics for candidate votes per year in Michigan.", ndigits = 2)
year | Minimum | Q1 | MEDIAN | Q3 | Maximum | Mean | Range | SD | Variance |
---|---|---|---|---|---|---|---|---|---|
2,000 | 10 | 246.00 | 2,066.50 | 7,731.50 | 530,414 | 12,749.13 | 530,404 | 42,555.29 | 1,810,952,999.80 |
2,004 | 28 | 550.00 | 5,441.00 | 12,964.00 | 600,047 | 19,434.75 | 600,019 | 55,577.13 | 3,088,817,269.88 |
2,008 | 44 | 759.00 | 5,860.00 | 13,208.00 | 660,085 | 20,087.41 | 660,041 | 58,529.93 | 3,425,752,186.34 |
2,012 | 25 | 590.00 | 5,107.00 | 12,659.00 | 595,846 | 18,999.84 | 595,821 | 54,957.68 | 3,020,346,565.63 |
2,016 | 93 | 1,732.00 | 4,950.00 | 14,646.00 | 519,444 | 19,274.23 | 519,351 | 51,570.21 | 2,659,487,057.90 |
2,020 | 2 | 55.50 | 473.00 | 7,521.00 | 597,170 | 13,347.72 | 597,168 | 48,569.33 | 2,358,979,348.85 |
# Calculating descriptive statistics for total votes in the year 2020
totalVotesIn2020Stats <- county_presidents |>
dplyr::filter(year == 2020) |>
group_by(candidate) |>
dplyr::summarize(
Minimum = min(totalvotes, na.rm = TRUE),
Q1 = quantile(totalvotes, na.rm = TRUE, probs = 0.25),
MEDIAN = median(totalvotes, na.rm = TRUE),
Q3 = quantile(totalvotes, na.rm = TRUE, probs = 0.75),
Maximum = max(totalvotes, na.rm = TRUE),
Mean = mean(totalvotes, na.rm = TRUE),
Range = Maximum - Minimum,
SD = sd(totalvotes, na.rm = TRUE),
Variance = var(totalvotes, na.rm = TRUE)
)
# Printing table of statistics
totalVotesIn2020Stats |>
make_flex(caption = "Quantitative summary statistics for total votes per candidate casted in the election in 2020", ndigits = 2)
candidate | Minimum | Q1 | MEDIAN | Q3 | Maximum | Mean | Range | SD | Variance |
---|---|---|---|---|---|---|---|---|---|
DONALD J TRUMP | 66 | 5,894.00 | 12,947.00 | 35,172.00 | 4,264,365 | 47,462.47 | 4,264,299 | 133,081.20 | 17,710,605,343.95 |
JO JORGENSEN | 66 | 5,832.00 | 12,893.00 | 35,311.00 | 4,264,365 | 47,929.03 | 4,264,299 | 134,838.63 | 18,181,457,114.49 |
JOSEPH R BIDEN JR | 66 | 5,894.00 | 12,947.00 | 35,172.00 | 4,264,365 | 47,462.47 | 4,264,299 | 133,081.20 | 17,710,605,343.95 |
OTHER | 66 | 6,924.25 | 15,204.50 | 42,829.00 | 4,264,365 | 55,797.31 | 4,264,299 | 153,835.07 | 23,665,229,745.10 |
#Frequency table for obtaining candidates participated in each year
candidate_occurrence <- with(county_presidents, tapply(candidate, year, function(x) unique(x)))
#Convert to dataframe and set column names
candidates_year <- candidate_occurrence |>
as.data.frame() |>
setNames(c("Candidates"))
# Make index as first column in dataframe
candidates_year <- cbind(Year = rownames(candidates_year), candidates_year)
rownames(candidates_year) <- rownames(1:nrow(candidates_year))
data.frame(lapply(candidates_year, as.character), stringsAsFactors=FALSE) |>
make_flex(caption = "Presidential candidates for US across years")
Year | Candidates |
---|---|
2000 | c("AL GORE", "GEORGE W. BUSH", "RALPH NADER", "OTHER") |
2004 | c("JOHN KERRY", "GEORGE W. BUSH", "OTHER") |
2008 | c("BARACK OBAMA", "JOHN MCCAIN", "OTHER") |
2012 | c("BARACK OBAMA", "MITT ROMNEY", "OTHER") |
2016 | c("HILLARY CLINTON", "DONALD TRUMP", "OTHER") |
2020 | c("JOSEPH R BIDEN JR", "OTHER", "DONALD J TRUMP", "JO JORGENSEN") |
b) Data Visualizations
county_presidents |>
group_by(candidate, party) |>
summarize(total_candidatevotes = sum(candidatevotes, na.rm = TRUE)) |>
ggplot(aes(x = forcats::fct_reorder(candidate, total_candidatevotes, .desc = TRUE), y = total_candidatevotes, fill = party)) +
geom_col() +
scale_y_continuous(labels = scales::label_comma(), expand = expansion(mult = c(0.0, 0.1))) +
scale_fill_manual(values = c("DEMOCRAT" = "blue", "GREEN" = "green", "LIBERTARIAN" = "#FFDF00", "OTHER" = "gray", "REPUBLICAN" = "red")) +
labs(x = "Candidate Name",
y = "Candidate Votes",
title = "Total number of votes polled across years for each \ncandidate in US",
fill = "Party") +
theme_few() +
theme(title = element_text(face = "bold"),
legend.position = "bottom",
legend.title = element_blank()) +
coord_flip()
## `summarise()` has grouped output by 'candidate'. You can override using the
## `.groups` argument.
county_presidents |>
group_by(year) |>
summarize(total_votes = sum(candidatevotes)) |>
ggplot(aes(x = year, y = total_votes)) +
geom_line(size = 0.5, color = "black") +
scale_y_continuous(labels = label_comma()) +
labs(title = "Total Number of votes across years in US",
x = "Year",
y = "Total Votes") +
theme_bw()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Downloading state-level shape files from US Census Bureau
if(!file.exists("cb_2018_us_state_500k.zip")) {
download.file(url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip",
destfile = "cb_2018_us_state_500k.zip")
}
# Create directory for geospatial files
if(!dir.exists("GeoFiles")) {
dir.create("GeoFiles")
}
# Unzipping files
utils::unzip("cb_2018_us_state_500k.zip",
exdir = "GeoFiles")
# Loading the shapefiles
state_shape <- st_read("GeoFiles//cb_2018_us_state_500k.shp",
quiet = TRUE)
# Downloading county-level shape files from US Census Bureau
if(!file.exists("cb_2018_us_county_500k.zip")) {
download.file(url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_500k.zip",
destfile = "cb_2018_us_county_500k.zip")
}
if(!dir.exists("GeoFiles")) {
dir.create("GeoFiles/")
}
# Unzipping files
utils::unzip("cb_2018_us_county_500k.zip",
exdir = "GeoFiles")
# Loading the shapefiles
county_shape <- st_read("GeoFiles//cb_2018_us_county_500k.shp",
quiet = TRUE)
# Downloading and importing state FIPS codes
if(!file.exists("state-geocodes-v2020.xlsx")) {
download.file("https://www2.census.gov/programs-surveys/popest/geographies/2020/state-geocodes-v2020.xlsx",
destfile = "state-geocodes-v2020.xlsx", mode = "wb")
}
state_fips <- readxl::read_excel("state-geocodes-v2020.xlsx", skip = 5) |>
dplyr::transmute(state_fips = `State (FIPS)`,
state = Name)
county_presidents <- county_presidents |>
dplyr::mutate(party_new = case_when(party == "DEMOCRAT" ~ "democrat",
party == "REPUBLICAN" ~ "republican",
TRUE ~ "other"),
prop_vote = candidatevotes / totalvotes,
state_fips = fips(state))
# Merging elections data with shape file data
state_shape_full <- state_shape |>
dplyr::left_join(county_presidents |>
dplyr::mutate(GEOID = str_pad(state_fips, width = 2, side = "left", pad = "0")) |>
group_by(year, state, party_new) |>
slice_max(order_by = prop_vote, n = 1) |>
ungroup() |>
pivot_wider(
id_cols = c(year:office, GEOID),
names_from = party_new,
values_from = c(candidate, prop_vote),
names_sep = "_",
names_glue = "{party_new}_{.value}") |>
rowwise() |>
mutate(across(c(democrat_candidate, other_candidate, republican_candidate),
~ str_to_title(str_c(rev(str_split(., ", ")[[1]]), collapse = " ")))),
by = c("GEOID" = "GEOID"))
percent_democratic <- state_shape_full |>
dplyr::filter(year == 2012,
!is.na(democrat_prop_vote)) |>
tigris::shift_geometry(geoid_column = "GEOID")
# Adding labels column for interactivity
percent_democratic <- percent_democratic |>
dplyr::mutate(Result = paste0("<b>", stringr::str_to_title(state), "<b><br>",
democrat_candidate, ": ", round(democrat_prop_vote, 4)*100, "% <br>",
republican_candidate, ": ", round(republican_prop_vote, 4)*100, "% <br>",
other_candidate, ": ", round(other_prop_vote, 4)*100, "%") |>
lapply(htmltools::HTML))
# Plot it
electionGG <- percent_democratic |>
ggplot(aes(fill = democrat_prop_vote,
text = Result)) +
geom_sf() +
scale_fill_gradient2(low = "#DC3220",
mid = "#ffffff",
high = "#005AB5",
midpoint = 0.50) +
labs(title = "United States 2012 Presidential Election - Democrats",
fill = "Percent Democrat") +
theme_void()
ggplotly(electionGG, tooltip = c("text"))