Final Project

Loading Packages

#Load necessary packages
library(flextable)
library(ggthemes)
library(ggplot2)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::compose() masks flextable::compose()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(skimr)
library(naniar)
## 
## Attaching package: 'naniar'
## 
## The following object is masked from 'package:skimr':
## 
##     n_complete
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following objects are masked from 'package:flextable':
## 
##     highlight, style
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(leaflet)
library(usmap)
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(gtrendsR)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.0 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::compose()  masks flextable::compose()
## ✖ scales::discard() masks purrr::discard()
## ✖ plotly::filter()  masks dplyr::filter(), stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

Import the data into R

county_presidents <- read.csv("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/countypres_2000-2020.csv", stringsAsFactors = TRUE)

#Display first 5 rows from the imported dataset
head(county_presidents, n = 5)
##   year   state state_po county_name county_fips       office      candidate
## 1 2000 ALABAMA       AL     AUTAUGA        1001 US PRESIDENT        AL GORE
## 2 2000 ALABAMA       AL     AUTAUGA        1001 US PRESIDENT GEORGE W. BUSH
## 3 2000 ALABAMA       AL     AUTAUGA        1001 US PRESIDENT    RALPH NADER
## 4 2000 ALABAMA       AL     AUTAUGA        1001 US PRESIDENT          OTHER
## 5 2000 ALABAMA       AL     BALDWIN        1003 US PRESIDENT        AL GORE
##        party candidatevotes totalvotes  version  mode
## 1   DEMOCRAT           4942      17208 20220315 TOTAL
## 2 REPUBLICAN          11993      17208 20220315 TOTAL
## 3      GREEN            160      17208 20220315 TOTAL
## 4      OTHER            113      17208 20220315 TOTAL
## 5   DEMOCRAT          13997      56480 20220315 TOTAL

1) Data Dictionary

dataDictionary <- tibble(Variable = colnames(county_presidents),
                         Description = c("Year of election (1976 to 2020)",
                                         "State Name",
                                         "State postal code abbreviation",
                                         "County name",
                                         "County FIPS code",
                                         "Name of the public office to which the candidate is seeking election",
                                         "Candidate Name",
                                         "Party",
                                         "Number of votes for candidate",
                                         "Total votes cast in the election",
                                         "Version",
                                         "Mode"),
                         Type = map_chr(county_presidents, .f = function(x){typeof(x)[1]}),
                         Class = map_chr(county_presidents, .f = function(x){class(x)[1]}))
dataDictionary |>
  flextable(cwidth = 2)

Variable

Description

Type

Class

year

Year of election (1976 to 2020)

integer

integer

state

State Name

integer

factor

state_po

State postal code abbreviation

integer

factor

county_name

County name

integer

factor

county_fips

County FIPS code

integer

integer

office

Name of the public office to which the candidate is seeking election

integer

factor

candidate

Candidate Name

integer

factor

party

Party

integer

factor

candidatevotes

Number of votes for candidate

integer

integer

totalvotes

Total votes cast in the election

integer

integer

version

Version

integer

integer

mode

Mode

integer

factor

2) Data Cleaning

a) Merging Data Sets

Merging of data sets was handled in Data Visualizations part.

b) String Manipulation

#Clean the candidate names
county_presidents <- county_presidents |>
  mutate(candidate = str_replace(candidate, "Donald Trump|Donald J Trump", "Donald Trump")) |>
  mutate(candidate = str_replace(candidate, "Joseph R Biden Jr", "Joe Biden"))

c) Missingness of data

county_presidents |>
  skim()
Data summary
Name county_presidents
Number of rows 72617
Number of columns 12
_______________________
Column type frequency:
character 1
factor 6
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
candidate 0 1 5 17 0 13 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
state 0 1 FALSE 51 TEX: 5334, GEO: 4452, VIR: 3736, NOR: 3600
state_po 0 1 FALSE 51 TX: 5334, GA: 4452, VA: 3736, NC: 3600
county_name 0 1 FALSE 1892 WAS: 727, FRA: 576, JEF: 563, JAC: 541
office 0 1 FALSE 1 US : 72617
party 0 1 FALSE 5 DEM: 20906, REP: 20906, OTH: 19815, GRE: 6035
mode 0 1 FALSE 16 TOT: 60583, ELE: 3737, ABS: 1995, PRO: 1832

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2011.30 7.52 2000 2004 2012 2020 2020 ▇▃▃▃▇
county_fips 57 1 30264.48 15397.70 1001 18103 29205 45057 56045 ▃▇▅▆▇
candidatevotes 0 1 10781.83 46034.94 0 115 1278 5848 3028885 ▇▁▁▁▁
totalvotes 0 1 42514.21 121951.40 0 5175 11194 29855 4264365 ▇▁▁▁▁
version 0 1 20220315.00 0.00 20220315 20220315 20220315 20220315 20220315 ▁▁▇▁▁

Only county_fips variable has 57 missing values in this data set

# Lollipop chart for displaying data missingness
county_presidents |> gg_miss_var(show_pct = T) +
  labs(title = "Missingness for County Presidents data") +
  theme_few()

# Heatmap displaying data missingness
missing_data <- county_presidents |>
  select(candidate, year, county_fips, state, office, party) |>
  gg_miss_fct(fct = candidate) +
  labs(x = "Candidate Name", 
       y = "Variable", 
       title = "Interactive heatmap displaying missingness for each variable based on candidate")

ggplotly(missing_data)

3) Exploratory Data Analysis

a) Tables of Summary Statistics

# Load function for printing tables nicely
source("https://raw.githubusercontent.com/dilernia/STA323/main/Functions/make_flex.R")
# Calculating descriptive statistics for candidatevotes in Michigan state
candidateVotesMichiganStats <- county_presidents |>
  dplyr::filter(str_to_upper(state) == "MICHIGAN") |>
  dplyr::group_by(year) |>
  dplyr::summarize(
  Minimum = min(candidatevotes, na.rm = TRUE),
  Q1 = quantile(candidatevotes, na.rm = TRUE, probs = 0.25),
  MEDIAN = median(candidatevotes, na.rm = TRUE),
  Q3 = quantile(candidatevotes, na.rm = TRUE, probs = 0.75),
  Maximum = max(candidatevotes, na.rm = TRUE),
  Mean = mean(candidatevotes, na.rm = TRUE),
  Range = Maximum - Minimum,
  SD = sd(candidatevotes, na.rm = TRUE),
  Variance = var(candidatevotes, na.rm = TRUE)
)

# Printing table of statistics
candidateVotesMichiganStats |>
  make_flex(caption = "Quantitative summary statistics for candidate votes per year in Michigan.", ndigits = 2)
Table 1: Quantitative summary statistics for candidate votes per year in Michigan.

year

Minimum

Q1

MEDIAN

Q3

Maximum

Mean

Range

SD

Variance

2,000

10

246.00

2,066.50

7,731.50

530,414

12,749.13

530,404

42,555.29

1,810,952,999.80

2,004

28

550.00

5,441.00

12,964.00

600,047

19,434.75

600,019

55,577.13

3,088,817,269.88

2,008

44

759.00

5,860.00

13,208.00

660,085

20,087.41

660,041

58,529.93

3,425,752,186.34

2,012

25

590.00

5,107.00

12,659.00

595,846

18,999.84

595,821

54,957.68

3,020,346,565.63

2,016

93

1,732.00

4,950.00

14,646.00

519,444

19,274.23

519,351

51,570.21

2,659,487,057.90

2,020

2

55.50

473.00

7,521.00

597,170

13,347.72

597,168

48,569.33

2,358,979,348.85

# Calculating descriptive statistics for total votes in the year 2020
totalVotesIn2020Stats <- county_presidents |>
  dplyr::filter(year == 2020) |>
  group_by(candidate) |>
  dplyr::summarize(
  Minimum = min(totalvotes, na.rm = TRUE),
  Q1 = quantile(totalvotes, na.rm = TRUE, probs = 0.25),
  MEDIAN = median(totalvotes, na.rm = TRUE),
  Q3 = quantile(totalvotes, na.rm = TRUE, probs = 0.75),
  Maximum = max(totalvotes, na.rm = TRUE),
  Mean = mean(totalvotes, na.rm = TRUE),
  Range = Maximum - Minimum,
  SD = sd(totalvotes, na.rm = TRUE),
  Variance = var(totalvotes, na.rm = TRUE)
)

# Printing table of statistics
totalVotesIn2020Stats |> 
  make_flex(caption = "Quantitative summary statistics for total votes per candidate casted in the election in 2020", ndigits = 2)
Table 2: Quantitative summary statistics for total votes per candidate casted in the election in 2020

candidate

Minimum

Q1

MEDIAN

Q3

Maximum

Mean

Range

SD

Variance

DONALD J TRUMP

66

5,894.00

12,947.00

35,172.00

4,264,365

47,462.47

4,264,299

133,081.20

17,710,605,343.95

JO JORGENSEN

66

5,832.00

12,893.00

35,311.00

4,264,365

47,929.03

4,264,299

134,838.63

18,181,457,114.49

JOSEPH R BIDEN JR

66

5,894.00

12,947.00

35,172.00

4,264,365

47,462.47

4,264,299

133,081.20

17,710,605,343.95

OTHER

66

6,924.25

15,204.50

42,829.00

4,264,365

55,797.31

4,264,299

153,835.07

23,665,229,745.10

#Frequency table for obtaining candidates participated in each year
candidate_occurrence <- with(county_presidents, tapply(candidate, year, function(x) unique(x)))

#Convert to dataframe and set column names
candidates_year <- candidate_occurrence |>
  as.data.frame() |>
  setNames(c("Candidates")) 

# Make index as first column in dataframe
candidates_year <- cbind(Year = rownames(candidates_year), candidates_year) 
rownames(candidates_year) <- rownames(1:nrow(candidates_year))

data.frame(lapply(candidates_year, as.character), stringsAsFactors=FALSE) |>
  make_flex(caption = "Presidential candidates for US across years")
Table 3: Presidential candidates for US across years

Year

Candidates

2000

c("AL GORE", "GEORGE W. BUSH", "RALPH NADER", "OTHER")

2004

c("JOHN KERRY", "GEORGE W. BUSH", "OTHER")

2008

c("BARACK OBAMA", "JOHN MCCAIN", "OTHER")

2012

c("BARACK OBAMA", "MITT ROMNEY", "OTHER")

2016

c("HILLARY CLINTON", "DONALD TRUMP", "OTHER")

2020

c("JOSEPH R BIDEN JR", "OTHER", "DONALD J TRUMP", "JO JORGENSEN")

b) Data Visualizations

county_presidents |>
  group_by(candidate, party) |>
  summarize(total_candidatevotes = sum(candidatevotes, na.rm = TRUE)) |>
  ggplot(aes(x = forcats::fct_reorder(candidate, total_candidatevotes, .desc = TRUE), y = total_candidatevotes, fill = party)) +
  geom_col() +
  scale_y_continuous(labels = scales::label_comma(), expand = expansion(mult = c(0.0, 0.1))) +
  scale_fill_manual(values = c("DEMOCRAT" = "blue", "GREEN" =  "green", "LIBERTARIAN" = "#FFDF00", "OTHER" = "gray", "REPUBLICAN" = "red")) +
  labs(x = "Candidate Name",
       y = "Candidate Votes",
       title = "Total number of votes polled across years for each \ncandidate in US",
       fill = "Party") +
  theme_few() + 
  theme(title = element_text(face = "bold"),
        legend.position = "bottom",
        legend.title = element_blank()) +
  coord_flip()
## `summarise()` has grouped output by 'candidate'. You can override using the
## `.groups` argument.

county_presidents |>
  group_by(year) |>
  summarize(total_votes = sum(candidatevotes)) |>
  ggplot(aes(x = year, y = total_votes)) +
  geom_line(size = 0.5, color = "black") +
  scale_y_continuous(labels = label_comma()) +
  labs(title = "Total Number of votes across years in US",
       x = "Year",
       y = "Total  Votes") +
  theme_bw()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Downloading state-level shape files from US Census Bureau
if(!file.exists("cb_2018_us_state_500k.zip")) {
download.file(url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip",
              destfile = "cb_2018_us_state_500k.zip")
}

# Create directory for geospatial files
if(!dir.exists("GeoFiles")) {
dir.create("GeoFiles")
}

# Unzipping files
utils::unzip("cb_2018_us_state_500k.zip",
             exdir = "GeoFiles")

# Loading the shapefiles
state_shape <- st_read("GeoFiles//cb_2018_us_state_500k.shp",
                       quiet = TRUE)
# Downloading county-level shape files from US Census Bureau
if(!file.exists("cb_2018_us_county_500k.zip")) {
download.file(url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_500k.zip",
              destfile = "cb_2018_us_county_500k.zip")
}

if(!dir.exists("GeoFiles")) {
  dir.create("GeoFiles/")
}

# Unzipping files
utils::unzip("cb_2018_us_county_500k.zip",
             exdir = "GeoFiles")

# Loading the shapefiles
county_shape <- st_read("GeoFiles//cb_2018_us_county_500k.shp",
                        quiet = TRUE)
# Downloading and importing state FIPS codes
if(!file.exists("state-geocodes-v2020.xlsx")) {
download.file("https://www2.census.gov/programs-surveys/popest/geographies/2020/state-geocodes-v2020.xlsx",
              destfile = "state-geocodes-v2020.xlsx", mode = "wb")
}

state_fips <- readxl::read_excel("state-geocodes-v2020.xlsx", skip = 5) |> 
  dplyr::transmute(state_fips = `State (FIPS)`,
                   state = Name)
county_presidents <- county_presidents |>
    dplyr::mutate(party_new = case_when(party == "DEMOCRAT" ~ "democrat",
                                    party == "REPUBLICAN" ~ "republican",
                                    TRUE ~ "other"),
                  prop_vote = candidatevotes / totalvotes,
                  state_fips = fips(state))
# Merging elections data with shape file data
state_shape_full <- state_shape |> 
  dplyr::left_join(county_presidents |> 
    dplyr::mutate(GEOID = str_pad(state_fips, width = 2, side = "left", pad = "0")) |> 
    group_by(year, state, party_new) |> 
    slice_max(order_by = prop_vote, n = 1) |> 
    ungroup() |>
    pivot_wider(
        id_cols = c(year:office, GEOID),
        names_from = party_new,
        values_from = c(candidate, prop_vote),
        names_sep = "_",
        names_glue = "{party_new}_{.value}") |> 
      rowwise() |>
    mutate(across(c(democrat_candidate, other_candidate, republican_candidate),
                  ~ str_to_title(str_c(rev(str_split(., ", ")[[1]]), collapse = " ")))),
    by = c("GEOID" = "GEOID"))
percent_democratic <- state_shape_full |> 
  dplyr::filter(year == 2012,
                !is.na(democrat_prop_vote)) |> 
  tigris::shift_geometry(geoid_column = "GEOID")

# Adding labels column for interactivity
percent_democratic <- percent_democratic |> 
dplyr::mutate(Result = paste0("<b>", stringr::str_to_title(state), "<b><br>",
  democrat_candidate, ": ", round(democrat_prop_vote, 4)*100, "% <br>",
  republican_candidate, ": ", round(republican_prop_vote, 4)*100, "% <br>",
  other_candidate, ": ", round(other_prop_vote, 4)*100, "%") |> 
  lapply(htmltools::HTML))

# Plot it
electionGG <- percent_democratic |> 
  ggplot(aes(fill = democrat_prop_vote,
             text = Result)) +
  geom_sf() +
 scale_fill_gradient2(low = "#DC3220",
                        mid = "#ffffff",
                        high = "#005AB5",
                        midpoint = 0.50) + 
  labs(title = "United States 2012 Presidential Election - Democrats", 
       fill = "Percent Democrat") +
  theme_void()

ggplotly(electionGG, tooltip = c("text"))
county_presidents_michigan <- county_presidents |>
  dplyr::mutate(GEOID = str_pad(county_fips, width = 5, side = "left", pad = "0"),
                Result = paste0("<b>", stringr::str_to_title(county_name), "<b><br>", round(prop_vote, 4) * 100, "% <br>")) |>
  dplyr::filter(str_to_upper(state) == "MICHIGAN")

# Merge shape and census data
ggMapData <- county_shape |> 
  full_join(county_presidents_michigan, by = c("GEOID" = "GEOID")) |> 
  dplyr::filter(year == 2020)

# Fixing issue with Alaska and Hawaii
ggMapDataFix <- ggMapData |> 
  tigris::shift_geometry()
## Warning: None of your features are in Alaska, Hawaii, or Puerto Rico, so no geometries will be shifted.
## Transforming your object's CRS to 'ESRI:102003'
# Plotting by thresholding max poverty level
michiganGG <- ggMapDataFix |> rowwise() |> 
  mutate(min_prop_vote = min(prop_vote, 1)) |> 
  ggplot(aes(fill = min_prop_vote, text = Result)) +
  geom_sf(color = "#C0C0C0", size = 0.1) +
  scale_fill_gradient(low = "white", high = "dodgerblue", limits = c(0, 1)) +
  labs(title = "County-level voting rates in Michigan, 2020", fill = "Voting rate") +
  theme_void()

plotly::ggplotly(michiganGG)
county_presidents |>
  group_by(year, party) |>
  summarize(total_votes = sum(totalvotes, na.rm = TRUE)) |>
  ggplot(aes(x = year, y = total_votes, color = party)) +
  geom_point() +
  facet_grid(. ~ party) +
  geom_smooth(se = FALSE) +
  scale_fill_manual(values = c("DEMOCRAT" = "blue", "GREEN" =  "green", "LIBERTARIAN" = "#FFDF00", "OTHER" = "gray", "REPUBLICAN" = "red")) +
  scale_y_continuous(labels = label_comma()) +
  labs(x = "Year",
       y = "Total Votes",
       color = "Party",
       title = "Total votes polled for each party across years in the state of Michigan")  +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 40), legend.position = "none")
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small.  fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : at 1999.9
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : radius 0.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1999.9
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 0.1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : at 2020.1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : radius 0.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 0.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : zero-width neighborhood. make span bigger
## Warning: Failed to fit group 2.
## Caused by error in `predLoess()`:
## ! NA/NaN/Inf in foreign function call (arg 5)
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

search <- gtrends(c("obama", "trump", "biden"), time= "2004-01-01 2020-12-31", geo = "US")
time_trend <- search$interest_over_time |>
  dplyr::mutate(hits=ifelse(hits=="<1",0.5,as.numeric(hits)),
                date=as.Date(date))
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `hits = ifelse(hits == "<1", 0.5, as.numeric(hits))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
time_trend |>
  ggplot(aes(x = date, y = hits, color = keyword)) +
  geom_line() +
  labs(title = "Search trends for president names in US during 2004 to 2020",
       x = "Year",
       y = "Seach Hits",
       color = "Keyword") +
  theme_bw()

4) Monte Carlo Methods of Inference

# Obtain a sample of data
trialCountyData <- county_presidents |>
  dplyr::filter(party %in% c("REPUBLICAN", "DEMOCRAT") & state == "MICHIGAN") |>
  select(party, candidatevotes) |>
  arrange(party) |>
  droplevels()

head(trialCountyData)
##      party candidatevotes
## 1 DEMOCRAT           2696
## 2 DEMOCRAT           2071
## 3 DEMOCRAT          15495
## 4 DEMOCRAT           7053
## 5 DEMOCRAT           4329
## 6 DEMOCRAT           3685
# Implement welch's two-sample t-test
tResult <- trialCountyData |>
  t_test(candidatevotes ~ party, order = c("DEMOCRAT", "REPUBLICAN"), alternative = "greater")

tResult |>
  flextable()

statistic

t_df

p_value

alternative

estimate

lower_ci

upper_ci

0.8688701

832.5574

0.1925843

greater

3,613.359

-3,234.693

Inf

Statement of hypotheses

\[H_0 :\mu_{\text{Democrat}} \le \mu_{Republican}\] \[H_a :\mu_{\text{Democrat}} > \mu_{Republican}\]

where \(\mu_{\text{Democrat}}\) is the average number of candidate votes polled for Democrats and, \(\mu_{Republican}\) is the average number of candidate votes polled for Republicans.

Implementing the permutation test

# Number of permutations to do
nperms <- 500

set.seed(1994)

# Instantiating vector for test statistics
permTs <- vector(length = nperms)

# Calculating t-test statistic for each permutation
for(p in 1:nperms) {
  permTs[p] <- trialCountyData |> 
    mutate(party = sample(party, replace = FALSE)) |> 
    t_test(response = candidatevotes, 
         explanatory = party,
         order = c("DEMOCRAT", "REPUBLICAN"),
         alternative = "greater") |> 
    pull(statistic)
}

Creating a histogram displaying the null distribution obtained for the randomization test

tibble(value = permTs) |>
  ggplot(aes(x = value)) +
  geom_histogram(color = "white") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(x = "Test statistic",
       y = "Frequency",
       title = "Randomization test null distribution")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Adding vertical lines to the plot to indicate where the 5th percentile is (a red dotted line), and where our observed test statistic is (solid blue line).

fifthPercentile <- quantile(permTs, probs = 0.05)

tibble(value = permTs) |>
  ggplot(aes(x = value)) +
  geom_histogram(color = "white") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  geom_vline(xintercept = quantile(permTs, probs = 0.95), color = "red", lty = "dotted", linewidth = 0.75) + 
  geom_vline(xintercept = quantile(tResult$statistic, probs = 0.95), color = "blue", lty = "solid", linewidth = 0.5) + 
  labs(x = "Test statistic",
       y = "Frequency",
       title = "Randomization test null distribution")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculating the p-value for the randomization test.

# calculating p-value for the randomization test
mean(permTs >= tResult$statistic)
## [1] 0.178
janitor::tabyl(permTs >= tResult$statistic)
##  permTs >= tResult$statistic   n percent
##                        FALSE 411   0.822
##                         TRUE  89   0.178

Since p-value is > 0.05, we fail to reject null hypotheses

Interepretation in context: There is statistically significant evidence that the average candidates votes polled for democrats was less than or equal to on average than for republicans group at the 5% significance level.

5) Bootstrap Methods of Inference

Create a subset of the data that has observations for only Repulican candidates

sampleData <- county_presidents |>
  dplyr::filter(party == "REPUBLICAN") |>
  select(party, prop_vote)

Sample size for this dataset

nrow(sampleData)
## [1] 20906

Sample Median for this dataset

median(sampleData$prop_vote, na.rm = TRUE)
## [1] 0.5834376

Next, we resample from our original sample dataset 10000 times to obtain \(B\)

# Number of bootstrap samples
B <- 10000

# Instantiating matrix for bootstrap samples
boots <- matrix(NA, nrow = nrow(sampleData), ncol = B)

# Sampling with replacement B times
for(b in 1:B) {
boots[, b] <- sampleData |> 
  slice_sample(prop = 1, replace = TRUE) |> 
  dplyr::pull(prop_vote)
}
# Instantiating vector for bootstrap medians
boot_medians <- vector(length = B)

# Calculating medians for bootstrap samples
for(b in 1:B) {
  boot_medians[b] <- median(boots[, b], na.rm = TRUE)
}

Obtain a parametric bootstrap estimate of the standard error of the sample median

# calculating confidence interval bounds
quantile(boot_medians, probs = c(0.025, 0.975), na.rm = TRUE)
##      2.5%     97.5% 
## 0.5801967 0.5858483
tibble(value = boot_medians) |>
  ggplot(aes(x = value)) +
  geom_histogram(color = "black") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  geom_vline(xintercept= quantile(boot_medians, probs = 0.025, na.rm = TRUE), size=1, color="blue",linetype="dotted") +
  geom_vline(xintercept= quantile(boot_medians, probs = 0.975, na.rm = TRUE),size=1, color="blue",linetype="dotted")+
  labs(x = "Sample Median",
       y = "Count",
       title = "Non-parametric bootstrap distribution of sample medians")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Confidence interval interpretation:

We’re 95% confident that 50% of republican candidates in US elections during 2000-2020 will have a proportion of votes between 58% and 58.6% or more.

6) Conclusion

In this project, we first imported the Presidential elections data at county level in USA from 2000 to 2020. Then we check the data missingness using interactive heatmap and lollipop chart, clean the candidate names using string functions. Next, we printed summary statistics and frequency tables for the variables in the dataset. Then we created data visualizations for identifying valuable insights into voting trends, party-wise voting patterns. Also we identified search interests of specific keywords during the election years. Next, we implemented Monte Carlo methods of inference to check if average candidate votes polled for democrats is greater than that of republicans in Michigan. Lastly, we implemented non-parametric bootstrap method on republican party voting proportions.

This file was published in Rpubs and it can be found using following link:

https://rpubs.com/anugolur/1176499