Final Project

Loading Packages

#Load necessary packages
library(flextable)
library(ggthemes)
library(ggplot2)
library(stringr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::compose() masks flextable::compose()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(skimr)
library(naniar)

## 
## Attaching package: 'naniar'
## 
## The following object is masked from 'package:skimr':
## 
##     n_complete

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following objects are masked from 'package:flextable':
## 
##     highlight, style
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(sf)

## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE

library(leaflet)
library(usmap)
library(tigris)

## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.

library(gtrendsR)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.0 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::compose()  masks flextable::compose()
## ✖ scales::discard() masks purrr::discard()
## ✖ plotly::filter()  masks dplyr::filter(), stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

Import the data into R

county_presidents <- read.csv("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/countypres_2000-2020.csv", stringsAsFactors = TRUE)

#Display first 5 rows from the imported dataset
head(county_presidents, n = 5)

##   year   state state_po county_name county_fips       office      candidate
## 1 2000 ALABAMA       AL     AUTAUGA        1001 US PRESIDENT        AL GORE
## 2 2000 ALABAMA       AL     AUTAUGA        1001 US PRESIDENT GEORGE W. BUSH
## 3 2000 ALABAMA       AL     AUTAUGA        1001 US PRESIDENT    RALPH NADER
## 4 2000 ALABAMA       AL     AUTAUGA        1001 US PRESIDENT          OTHER
## 5 2000 ALABAMA       AL     BALDWIN        1003 US PRESIDENT        AL GORE
##        party candidatevotes totalvotes  version  mode
## 1   DEMOCRAT           4942      17208 20220315 TOTAL
## 2 REPUBLICAN          11993      17208 20220315 TOTAL
## 3      GREEN            160      17208 20220315 TOTAL
## 4      OTHER            113      17208 20220315 TOTAL
## 5   DEMOCRAT          13997      56480 20220315 TOTAL

1) Data Dictionary

dataDictionary <- tibble(Variable = colnames(county_presidents),
                         Description = c("Year of election (1976 to 2020)",
                                         "State Name",
                                         "State postal code abbreviation",
                                         "County name",
                                         "County FIPS code",
                                         "Name of the public office to which the candidate is seeking election",
                                         "Candidate Name",
                                         "Party",
                                         "Number of votes for candidate",
                                         "Total votes cast in the election",
                                         "Version",
                                         "Mode"),
                         Type = map_chr(county_presidents, .f = function(x){typeof(x)[1]}),
                         Class = map_chr(county_presidents, .f = function(x){class(x)[1]}))

dataDictionary |>
  flextable(cwidth = 2)

Variable	Description	Type	Class
year	Year of election (1976 to 2020)	integer	integer
state	State Name	integer	factor
state_po	State postal code abbreviation	integer	factor
county_name	County name	integer	factor
county_fips	County FIPS code	integer	integer
office	Name of the public office to which the candidate is seeking election	integer	factor
candidate	Candidate Name	integer	factor
party	Party	integer	factor
candidatevotes	Number of votes for candidate	integer	integer
totalvotes	Total votes cast in the election	integer	integer
version	Version	integer	integer
mode	Mode	integer	factor

2) Data Cleaning

a) Merging Data Sets

Merging of data sets was handled in Data Visualizations part.

b) String Manipulation

#Clean the candidate names
county_presidents <- county_presidents |>
  mutate(candidate = str_replace(candidate, "Donald Trump|Donald J Trump", "Donald Trump")) |>
  mutate(candidate = str_replace(candidate, "Joseph R Biden Jr", "Joe Biden"))

c) Missingness of data

county_presidents |>
  skim()

Data summary
Name	county_presidents
Number of rows	72617
Number of columns	12
_______________________
Column type frequency:
character	1
factor	6
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
candidate	0	1	5	17	0	13	0

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
state	1	FALSE	51	TEX: 5334, GEO: 4452, VIR: 3736, NOR: 3600
state_po	1	FALSE	51	TX: 5334, GA: 4452, VA: 3736, NC: 3600
county_name	1	FALSE	1892	WAS: 727, FRA: 576, JEF: 563, JAC: 541
office	1	FALSE	1	US : 72617
party	1	FALSE	5	DEM: 20906, REP: 20906, OTH: 19815, GRE: 6035
mode	1	FALSE	16	TOT: 60583, ELE: 3737, ABS: 1995, PRO: 1832

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1	2011.30	7.52	2000	2004	2012	2020	2020	▇▃▃▃▇
county_fips	57	1	30264.48	15397.70	1001	18103	29205	45057	56045	▃▇▅▆▇
candidatevotes	0	1	10781.83	46034.94	0	115	1278	5848	3028885	▇▁▁▁▁
totalvotes	0	1	42514.21	121951.40	0	5175	11194	29855	4264365	▇▁▁▁▁
version	0	1	20220315.00	0.00	20220315	20220315	20220315	20220315	20220315	▁▁▇▁▁

Only county_fips variable has 57 missing values in this data set

# Lollipop chart for displaying data missingness
county_presidents |> gg_miss_var(show_pct = T) +
  labs(title = "Missingness for County Presidents data") +
  theme_few()

# Heatmap displaying data missingness
missing_data <- county_presidents |>
  select(candidate, year, county_fips, state, office, party) |>
  gg_miss_fct(fct = candidate) +
  labs(x = "Candidate Name", 
       y = "Variable", 
       title = "Interactive heatmap displaying missingness for each variable based on candidate")

ggplotly(missing_data)

3) Exploratory Data Analysis

a) Tables of Summary Statistics

# Load function for printing tables nicely
source("https://raw.githubusercontent.com/dilernia/STA323/main/Functions/make_flex.R")

# Calculating descriptive statistics for candidatevotes in Michigan state
candidateVotesMichiganStats <- county_presidents |>
  dplyr::filter(str_to_upper(state) == "MICHIGAN") |>
  dplyr::group_by(year) |>
  dplyr::summarize(
  Minimum = min(candidatevotes, na.rm = TRUE),
  Q1 = quantile(candidatevotes, na.rm = TRUE, probs = 0.25),
  MEDIAN = median(candidatevotes, na.rm = TRUE),
  Q3 = quantile(candidatevotes, na.rm = TRUE, probs = 0.75),
  Maximum = max(candidatevotes, na.rm = TRUE),
  Mean = mean(candidatevotes, na.rm = TRUE),
  Range = Maximum - Minimum,
  SD = sd(candidatevotes, na.rm = TRUE),
  Variance = var(candidatevotes, na.rm = TRUE)
)

# Printing table of statistics
candidateVotesMichiganStats |>
  make_flex(caption = "Quantitative summary statistics for candidate votes per year in Michigan.", ndigits = 2)

Table 1: Quantitative summary statistics for candidate votes per year in Michigan.
year	Minimum	Q1	MEDIAN	Q3	Maximum	Mean	Range	SD	Variance
2,000	10	246.00	2,066.50	7,731.50	530,414	12,749.13	530,404	42,555.29	1,810,952,999.80
2,004	28	550.00	5,441.00	12,964.00	600,047	19,434.75	600,019	55,577.13	3,088,817,269.88
2,008	44	759.00	5,860.00	13,208.00	660,085	20,087.41	660,041	58,529.93	3,425,752,186.34
2,012	25	590.00	5,107.00	12,659.00	595,846	18,999.84	595,821	54,957.68	3,020,346,565.63
2,016	93	1,732.00	4,950.00	14,646.00	519,444	19,274.23	519,351	51,570.21	2,659,487,057.90
2,020	2	55.50	473.00	7,521.00	597,170	13,347.72	597,168	48,569.33	2,358,979,348.85

# Calculating descriptive statistics for total votes in the year 2020
totalVotesIn2020Stats <- county_presidents |>
  dplyr::filter(year == 2020) |>
  group_by(candidate) |>
  dplyr::summarize(
  Minimum = min(totalvotes, na.rm = TRUE),
  Q1 = quantile(totalvotes, na.rm = TRUE, probs = 0.25),
  MEDIAN = median(totalvotes, na.rm = TRUE),
  Q3 = quantile(totalvotes, na.rm = TRUE, probs = 0.75),
  Maximum = max(totalvotes, na.rm = TRUE),
  Mean = mean(totalvotes, na.rm = TRUE),
  Range = Maximum - Minimum,
  SD = sd(totalvotes, na.rm = TRUE),
  Variance = var(totalvotes, na.rm = TRUE)
)

# Printing table of statistics
totalVotesIn2020Stats |> 
  make_flex(caption = "Quantitative summary statistics for total votes per candidate casted in the election in 2020", ndigits = 2)

Table 2: Quantitative summary statistics for total votes per candidate casted in the election in 2020
candidate	Minimum	Q1	MEDIAN	Q3	Maximum	Mean	Range	SD	Variance
DONALD J TRUMP	66	5,894.00	12,947.00	35,172.00	4,264,365	47,462.47	4,264,299	133,081.20	17,710,605,343.95
JO JORGENSEN	66	5,832.00	12,893.00	35,311.00	4,264,365	47,929.03	4,264,299	134,838.63	18,181,457,114.49
JOSEPH R BIDEN JR	66	5,894.00	12,947.00	35,172.00	4,264,365	47,462.47	4,264,299	133,081.20	17,710,605,343.95
OTHER	66	6,924.25	15,204.50	42,829.00	4,264,365	55,797.31	4,264,299	153,835.07	23,665,229,745.10

#Frequency table for obtaining candidates participated in each year
candidate_occurrence <- with(county_presidents, tapply(candidate, year, function(x) unique(x)))

#Convert to dataframe and set column names
candidates_year <- candidate_occurrence |>
  as.data.frame() |>
  setNames(c("Candidates")) 

# Make index as first column in dataframe
candidates_year <- cbind(Year = rownames(candidates_year), candidates_year) 
rownames(candidates_year) <- rownames(1:nrow(candidates_year))

data.frame(lapply(candidates_year, as.character), stringsAsFactors=FALSE) |>
  make_flex(caption = "Presidential candidates for US across years")

Table 3: Presidential candidates for US across years
Year	Candidates
2000	c("AL GORE", "GEORGE W. BUSH", "RALPH NADER", "OTHER")
2004	c("JOHN KERRY", "GEORGE W. BUSH", "OTHER")
2008	c("BARACK OBAMA", "JOHN MCCAIN", "OTHER")
2012	c("BARACK OBAMA", "MITT ROMNEY", "OTHER")
2016	c("HILLARY CLINTON", "DONALD TRUMP", "OTHER")
2020	c("JOSEPH R BIDEN JR", "OTHER", "DONALD J TRUMP", "JO JORGENSEN")

b) Data Visualizations

county_presidents |>
  group_by(candidate, party) |>
  summarize(total_candidatevotes = sum(candidatevotes, na.rm = TRUE)) |>
  ggplot(aes(x = forcats::fct_reorder(candidate, total_candidatevotes, .desc = TRUE), y = total_candidatevotes, fill = party)) +
  geom_col() +
  scale_y_continuous(labels = scales::label_comma(), expand = expansion(mult = c(0.0, 0.1))) +
  scale_fill_manual(values = c("DEMOCRAT" = "blue", "GREEN" =  "green", "LIBERTARIAN" = "#FFDF00", "OTHER" = "gray", "REPUBLICAN" = "red")) +
  labs(x = "Candidate Name",
       y = "Candidate Votes",
       title = "Total number of votes polled across years for each \ncandidate in US",
       fill = "Party") +
  theme_few() + 
  theme(title = element_text(face = "bold"),
        legend.position = "bottom",
        legend.title = element_blank()) +
  coord_flip()

## `summarise()` has grouped output by 'candidate'. You can override using the
## `.groups` argument.

county_presidents |>
  group_by(year) |>
  summarize(total_votes = sum(candidatevotes)) |>
  ggplot(aes(x = year, y = total_votes)) +
  geom_line(size = 0.5, color = "black") +
  scale_y_continuous(labels = label_comma()) +
  labs(title = "Total Number of votes across years in US",
       x = "Year",
       y = "Total  Votes") +
  theme_bw()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Downloading state-level shape files from US Census Bureau
if(!file.exists("cb_2018_us_state_500k.zip")) {
download.file(url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip",
              destfile = "cb_2018_us_state_500k.zip")
}

# Create directory for geospatial files
if(!dir.exists("GeoFiles")) {
dir.create("GeoFiles")
}

# Unzipping files
utils::unzip("cb_2018_us_state_500k.zip",
             exdir = "GeoFiles")

# Loading the shapefiles
state_shape <- st_read("GeoFiles//cb_2018_us_state_500k.shp",
                       quiet = TRUE)

# Downloading county-level shape files from US Census Bureau
if(!file.exists("cb_2018_us_county_500k.zip")) {
download.file(url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_500k.zip",
              destfile = "cb_2018_us_county_500k.zip")
}

if(!dir.exists("GeoFiles")) {
  dir.create("GeoFiles/")
}

# Unzipping files
utils::unzip("cb_2018_us_county_500k.zip",
             exdir = "GeoFiles")

# Loading the shapefiles
county_shape <- st_read("GeoFiles//cb_2018_us_county_500k.shp",
                        quiet = TRUE)

# Downloading and importing state FIPS codes
if(!file.exists("state-geocodes-v2020.xlsx")) {
download.file("https://www2.census.gov/programs-surveys/popest/geographies/2020/state-geocodes-v2020.xlsx",
              destfile = "state-geocodes-v2020.xlsx", mode = "wb")
}

state_fips <- readxl::read_excel("state-geocodes-v2020.xlsx", skip = 5) |> 
  dplyr::transmute(state_fips = `State (FIPS)`,
                   state = Name)

county_presidents <- county_presidents |>
    dplyr::mutate(party_new = case_when(party == "DEMOCRAT" ~ "democrat",
                                    party == "REPUBLICAN" ~ "republican",
                                    TRUE ~ "other"),
                  prop_vote = candidatevotes / totalvotes,
                  state_fips = fips(state))

# Merging elections data with shape file data
state_shape_full <- state_shape |> 
  dplyr::left_join(county_presidents |> 
    dplyr::mutate(GEOID = str_pad(state_fips, width = 2, side = "left", pad = "0")) |> 
    group_by(year, state, party_new) |> 
    slice_max(order_by = prop_vote, n = 1) |> 
    ungroup() |>
    pivot_wider(
        id_cols = c(year:office, GEOID),
        names_from = party_new,
        values_from = c(candidate, prop_vote),
        names_sep = "_",
        names_glue = "{party_new}_{.value}") |> 
      rowwise() |>
    mutate(across(c(democrat_candidate, other_candidate, republican_candidate),
                  ~ str_to_title(str_c(rev(str_split(., ", ")[[1]]), collapse = " ")))),
    by = c("GEOID" = "GEOID"))

percent_democratic <- state_shape_full |> 
  dplyr::filter(year == 2012,
                !is.na(democrat_prop_vote)) |> 
  tigris::shift_geometry(geoid_column = "GEOID")

# Adding labels column for interactivity
percent_democratic <- percent_democratic |> 
dplyr::mutate(Result = paste0("<b>", stringr::str_to_title(state), "<b><br>",
  democrat_candidate, ": ", round(democrat_prop_vote, 4)*100, "% <br>",
  republican_candidate, ": ", round(republican_prop_vote, 4)*100, "% <br>",
  other_candidate, ": ", round(other_prop_vote, 4)*100, "%") |> 
  lapply(htmltools::HTML))

# Plot it
electionGG <- percent_democratic |> 
  ggplot(aes(fill = democrat_prop_vote,
             text = Result)) +
  geom_sf() +
 scale_fill_gradient2(low = "#DC3220",
                        mid = "#ffffff",
                        high = "#005AB5",
                        midpoint = 0.50) + 
  labs(title = "United States 2012 Presidential Election - Democrats", 
       fill = "Percent Democrat") +
  theme_void()

ggplotly(electionGG, tooltip = c("text"))

county_presidents_michigan <- county_presidents |>
  dplyr::mutate(GEOID = str_pad(county_fips, width = 5, side = "left", pad = "0"),
                Result = paste0("<b>", stringr::str_to_title(county_name), "<b><br>", round(prop_vote, 4) * 100, "% <br>")) |>
  dplyr::filter(str_to_upper(state) == "MICHIGAN")

# Merge shape and census data
ggMapData <- county_shape |> 
  full_join(county_presidents_michigan, by = c("GEOID" = "GEOID")) |> 
  dplyr::filter(year == 2020)

# Fixing issue with Alaska and Hawaii
ggMapDataFix <- ggMapData |> 
  tigris::shift_geometry()

## Warning: None of your features are in Alaska, Hawaii, or Puerto Rico, so no geometries will be shifted.
## Transforming your object's CRS to 'ESRI:102003'

# Plotting by thresholding max poverty level
michiganGG <- ggMapDataFix |> rowwise() |> 
  mutate(min_prop_vote = min(prop_vote, 1)) |> 
  ggplot(aes(fill = min_prop_vote, text = Result)) +
  geom_sf(color = "#C0C0C0", size = 0.1) +
  scale_fill_gradient(low = "white", high = "dodgerblue", limits = c(0, 1)) +
  labs(title = "County-level voting rates in Michigan, 2020", fill = "Voting rate") +
  theme_void()

plotly::ggplotly(michiganGG)

county_presidents |>
  group_by(year, party) |>
  summarize(total_votes = sum(totalvotes, na.rm = TRUE)) |>
  ggplot(aes(x = year, y = total_votes, color = party)) +
  geom_point() +
  facet_grid(. ~ party) +
  geom_smooth(se = FALSE) +
  scale_fill_manual(values = c("DEMOCRAT" = "blue", "GREEN" =  "green", "LIBERTARIAN" = "#FFDF00", "OTHER" = "gray", "REPUBLICAN" = "red")) +
  scale_y_continuous(labels = label_comma()) +
  labs(x = "Year",
       y = "Total Votes",
       color = "Party",
       title = "Total votes polled for each party across years in the state of Michigan")  +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 40), legend.position = "none")

## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small.  fewer data values than degrees of freedom.

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : at 1999.9

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : radius 0.01

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : all data on boundary of neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1999.9

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 0.1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : at 2020.1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : radius 0.01

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : all data on boundary of neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 0.01

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : zero-width neighborhood. make span bigger

## Warning: Failed to fit group 2.
## Caused by error in `predLoess()`:
## ! NA/NaN/Inf in foreign function call (arg 5)

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

search <- gtrends(c("obama", "trump", "biden"), time= "2004-01-01 2020-12-31", geo = "US")

time_trend <- search$interest_over_time |>
  dplyr::mutate(hits=ifelse(hits=="<1",0.5,as.numeric(hits)),
                date=as.Date(date))

## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `hits = ifelse(hits == "<1", 0.5, as.numeric(hits))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion

time_trend |>
  ggplot(aes(x = date, y = hits, color = keyword)) +
  geom_line() +
  labs(title = "Search trends for president names in US during 2004 to 2020",
       x = "Year",
       y = "Seach Hits",
       color = "Keyword") +
  theme_bw()

4) Monte Carlo Methods of Inference

# Obtain a sample of data
trialCountyData <- county_presidents |>
  dplyr::filter(party %in% c("REPUBLICAN", "DEMOCRAT") & state == "MICHIGAN") |>
  select(party, candidatevotes) |>
  arrange(party) |>
  droplevels()

head(trialCountyData)

##      party candidatevotes
## 1 DEMOCRAT           2696
## 2 DEMOCRAT           2071
## 3 DEMOCRAT          15495
## 4 DEMOCRAT           7053
## 5 DEMOCRAT           4329
## 6 DEMOCRAT           3685

# Implement welch's two-sample t-test
tResult <- trialCountyData |>
  t_test(candidatevotes ~ party, order = c("DEMOCRAT", "REPUBLICAN"), alternative = "greater")

tResult |>
  flextable()

statistic	t_df	p_value	alternative	estimate	lower_ci	upper_ci
0.8688701	832.5574	0.1925843	greater	3,613.359	-3,234.693	Inf

Statement of hypotheses

\[H_0 :\mu_{\text{Democrat}} \le \mu_{Republican}\] \[H_a :\mu_{\text{Democrat}} > \mu_{Republican}\]

where \(\mu_{\text{Democrat}}\) is the average number of candidate votes polled for Democrats and, \(\mu_{Republican}\) is the average number of candidate votes polled for Republicans.

Implementing the permutation test

# Number of permutations to do
nperms <- 500

set.seed(1994)

# Instantiating vector for test statistics
permTs <- vector(length = nperms)

# Calculating t-test statistic for each permutation
for(p in 1:nperms) {
  permTs[p] <- trialCountyData |> 
    mutate(party = sample(party, replace = FALSE)) |> 
    t_test(response = candidatevotes, 
         explanatory = party,
         order = c("DEMOCRAT", "REPUBLICAN"),
         alternative = "greater") |> 
    pull(statistic)
}

Creating a histogram displaying the null distribution obtained for the randomization test

tibble(value = permTs) |>
  ggplot(aes(x = value)) +
  geom_histogram(color = "white") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(x = "Test statistic",
       y = "Frequency",
       title = "Randomization test null distribution")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Adding vertical lines to the plot to indicate where the 5th percentile is (a red dotted line), and where our observed test statistic is (solid blue line).

fifthPercentile <- quantile(permTs, probs = 0.05)

tibble(value = permTs) |>
  ggplot(aes(x = value)) +
  geom_histogram(color = "white") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  geom_vline(xintercept = quantile(permTs, probs = 0.95), color = "red", lty = "dotted", linewidth = 0.75) + 
  geom_vline(xintercept = quantile(tResult$statistic, probs = 0.95), color = "blue", lty = "solid", linewidth = 0.5) + 
  labs(x = "Test statistic",
       y = "Frequency",
       title = "Randomization test null distribution")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculating the p-value for the randomization test.

# calculating p-value for the randomization test
mean(permTs >= tResult$statistic)

## [1] 0.178

janitor::tabyl(permTs >= tResult$statistic)

##  permTs >= tResult$statistic   n percent
##                        FALSE 411   0.822
##                         TRUE  89   0.178

Since p-value is > 0.05, we fail to reject null hypotheses

Interepretation in context: There is statistically significant evidence that the average candidates votes polled for democrats was less than or equal to on average than for republicans group at the 5% significance level.

5) Bootstrap Methods of Inference

Create a subset of the data that has observations for only Repulican candidates

sampleData <- county_presidents |>
  dplyr::filter(party == "REPUBLICAN") |>
  select(party, prop_vote)

Sample size for this dataset

nrow(sampleData)

## [1] 20906

Sample Median for this dataset

median(sampleData$prop_vote, na.rm = TRUE)

## [1] 0.5834376

Next, we resample from our original sample dataset 10000 times to obtain \(B\)

# Number of bootstrap samples
B <- 10000

# Instantiating matrix for bootstrap samples
boots <- matrix(NA, nrow = nrow(sampleData), ncol = B)

# Sampling with replacement B times
for(b in 1:B) {
boots[, b] <- sampleData |> 
  slice_sample(prop = 1, replace = TRUE) |> 
  dplyr::pull(prop_vote)
}

# Instantiating vector for bootstrap medians
boot_medians <- vector(length = B)

# Calculating medians for bootstrap samples
for(b in 1:B) {
  boot_medians[b] <- median(boots[, b], na.rm = TRUE)
}

Obtain a parametric bootstrap estimate of the standard error of the sample median

# calculating confidence interval bounds
quantile(boot_medians, probs = c(0.025, 0.975), na.rm = TRUE)

##      2.5%     97.5% 
## 0.5801967 0.5858483

tibble(value = boot_medians) |>
  ggplot(aes(x = value)) +
  geom_histogram(color = "black") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  geom_vline(xintercept= quantile(boot_medians, probs = 0.025, na.rm = TRUE), size=1, color="blue",linetype="dotted") +
  geom_vline(xintercept= quantile(boot_medians, probs = 0.975, na.rm = TRUE),size=1, color="blue",linetype="dotted")+
  labs(x = "Sample Median",
       y = "Count",
       title = "Non-parametric bootstrap distribution of sample medians")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Confidence interval interpretation:

We’re 95% confident that 50% of republican candidates in US elections during 2000-2020 will have a proportion of votes between 58% and 58.6% or more.

6) Conclusion

In this project, we first imported the Presidential elections data at county level in USA from 2000 to 2020. Then we check the data missingness using interactive heatmap and lollipop chart, clean the candidate names using string functions. Next, we printed summary statistics and frequency tables for the variables in the dataset. Then we created data visualizations for identifying valuable insights into voting trends, party-wise voting patterns. Also we identified search interests of specific keywords during the election years. Next, we implemented Monte Carlo methods of inference to check if average candidate votes polled for democrats is greater than that of republicans in Michigan. Lastly, we implemented non-parametric bootstrap method on republican party voting proportions.

This file was published in Rpubs and it can be found using following link:

https://rpubs.com/anugolur/1176499