Setting up: Vaccination Data
# Load CDC Covid-19 Vaccination Data for US counties over the course of the pandemic
vaccine <- read_csv("~/Desktop/EC524/524covid.csv", show_col_types = FALSE)
# This data is updated weekly and the file includes all past entries.
# Subset the data to a single cross-section of observations: January 1, 2022
vaccine <- subset(vaccine, Date == '01/01/22')
# Removing counties with missing FIPS
vaccine <- subset(vaccine, FIPS!="UNK")
# Rename variables for ease
vaccine %<>%
rename(
"County"="Recip_County",
"State"="Recip_State" ,
"Population"="Census2019" ,
"Vax12Plus"="Series_Complete_12PlusPop_Pct",
"Population18"="Census2019_18PlusPop",
"Population65"="Census2019_65PlusPop"
)
# Remove rows with missing vaccine data (missing rates are recorded as 0)
vaccine <- subset(vaccine, Vax12Plus != 0)
# Remove Guam, Puerto Rico, and the Virgin Islands so 51 "states" (50 + DC) remain instead of 54
# (!= would recycle element-wise against the vector, so use %in% for a set comparison)
vaccine <- subset(vaccine, !(State %in% c("GU", "PR", "VI")))
# Pad FIPS codes to 5 digits with leading zeros (read_csv drops them)
vaccine$FIPS<-str_pad(vaccine$FIPS,width=5,side="left",pad="0")
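# Sanity check (added sketch, not in the original pipeline): str_pad restores the
# leading zero that read_csv drops from numeric-looking FIPS codes
stopifnot(str_pad("1001", width = 5, side = "left", pad = "0") == "01001")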
# Create a dummy variable for metro areas
# Metro_status has one missing value - Kootenai County, Idaho, which includes
# Coeur d'Alene and is a metropolitan area - so the missing value is coded 1.
# %in% is used because ifelse(NA == 'Metro', ...) would propagate the NA.
vaccine %<>% mutate(Metro = ifelse(Metro_status %in% 'Non-metro', 0, 1))
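# Sanity check (added sketch): the dummy should now contain no NAs and only 0/1
stopifnot(!any(is.na(vaccine$Metro)), all(vaccine$Metro %in% c(0, 1)))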
# Creating a variable for the percent of population under 18 by finding the percent
# of the population over 18 and taking the difference
vaccine %<>% mutate(PopUnder18=(100-((Population18/Population)*100)))
# Rounding to 2 digits
vaccine$PopUnder18 <- round(vaccine$PopUnder18, digits = 2)
# Creating a variable for the percent of population over 65 (a high risk group)
vaccine %<>% mutate(Pop65Plus=((Population65/Population)*100))
# Rounding to 2 digits
vaccine$Pop65Plus <- round(vaccine$Pop65Plus, digits = 2)
# Creating a variable for the percent of population between 18 and 65
vaccine %<>% mutate(Pop18to65=(100-PopUnder18-Pop65Plus))
# Rounding to 2 digits
vaccine$Pop18to65 <- round(vaccine$Pop18to65, digits = 2)
# Subset data with only relevant variables
vaccine <- subset(vaccine, select=c("FIPS", "County", "State",
"Vax12Plus", "Metro", "Population",
"Pop18to65", "Pop65Plus", "PopUnder18"))
Setting up: Education
# Load education dataset from the USDA Economic Research Service
education <- read_csv("~/Desktop/EC524/Education.csv", show_col_types = FALSE)
# Pad FIPS codes
education$FIPS<-str_pad(education$FIPS,width=5,side="left",pad="0")
# Remove county as it is formatted differently
education = select(education, -c(County))
# Merge the vaccine data and the education data by FIPS code and State
df <- merge(vaccine, education, by=c("FIPS", "State"))
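# Diagnostic (added sketch): merge() keeps only FIPS/State pairs found in both
# tables, so comparing row counts shows how many counties were silently dropped
nrow(vaccine) - nrow(df)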
Setting up: WFH
# Load work from home dataset from the USDA Economic Research Service
WFH <- read_csv("~/Desktop/EC524/WFH.csv", show_col_types = FALSE)
# Pad FIPS. The raw file did not include a FIPS column; I created one in Excel.
WFH$FIPS<-str_pad(WFH$FIPS,width=5,side="left",pad="0")
# Round population density
WFH$PopDensity <- round(WFH$PopDensity, digits = 2)
# Subset with relevant variables
WFH <- subset(WFH, select=c("FIPS", "PopDensity", "WFH"))
# Merge to df by FIPS
df <- merge(df, WFH, by=c("FIPS"))
# Express WFH as a percentage of the working-age (18-65) share of the population
df %<>% mutate(WFH=(WFH/Pop18to65)*100)
Setting up: Poverty
# Load poverty dataset from the USDA Economic Research Service
poverty <- read_csv("~/Desktop/EC524/Poverty.csv", show_col_types = FALSE)
# Remove state and county, as those variables are formatted differently
poverty = select(poverty, -c(STATEFP, COUNTYFP))
# Pad FIPS
poverty$FIPS<-str_pad(poverty$FIPS,width=5,side="left",pad="0")
# Merge to df by FIPS
df <- merge(df, poverty, by=c("FIPS"))
Setting up: Election 2012
# Load the dataset - includes elections from 2000-2020 and information on each party.
# We are interested in elections within the past 10 years.
electionsall <- read_csv("~/Desktop/EC524/countypres_2000-2020.csv", show_col_types = FALSE)
# Remove unnecessary variables
electionsall = select(electionsall, -c(state, county_name, office, version, candidate, mode))
# Rename variables for ease
electionsall %<>%
rename(
"State"="state_po",
"FIPS"="county_fips")
# Pad FIPS
electionsall$FIPS<-str_pad(electionsall$FIPS,width=5,side="left",pad="0")
# Subset 2012 election
election2012 <- subset(electionsall, year == '2012')
# Subset Democrat party for 2012 election
dem2012 <- subset(election2012, party == 'DEMOCRAT')
# Create a variable for the percent of total votes cast for Democrats in the 2012 election
dem2012 %<>% mutate(dem2012pct=candidatevotes/totalvotes*100)
# Round
dem2012$dem2012pct <- round(dem2012$dem2012pct, digits = 2)
# Subset to remove irrelevant variables
dem2012 = select(dem2012, -c(year, totalvotes, candidatevotes, party))
# Merge to df by FIPS and State
df <- merge(df, dem2012, by=c("FIPS", "State"))
# Subset Republican party for 2012 election
rep2012 <- subset(election2012, party == 'REPUBLICAN')
# Create a variable for the percent of total votes cast for Republicans in the 2012 election
rep2012 %<>% mutate(rep2012pct=candidatevotes/totalvotes*100)
# Round
rep2012$rep2012pct <- round(rep2012$rep2012pct, digits = 2)
# Subset to remove irrelevant variables
rep2012 = select(rep2012, -c(year, totalvotes, candidatevotes, party))
# Merge to df by FIPS and State
df <- merge(df, rep2012, by=c("FIPS", "State"))
# Create a dummy variable for Democrat majority in 2012 election
df%<>% mutate(dem2012 = ifelse(dem2012pct > rep2012pct, 1,0))
Setting up: Election 2016
# Subset 2016 elections
election2016 <- subset(electionsall, year == '2016')
# Subset Democrat party for 2016 election
dem2016 <- subset(election2016, party == 'DEMOCRAT')
# Create a variable for the percent of total votes cast for Democrats in the 2016 election
dem2016 <- dem2016 %>% mutate(dem2016pct=candidatevotes/totalvotes*100)
# Round
dem2016$dem2016pct <- round(dem2016$dem2016pct, digits = 2)
# Subset to remove irrelevant variables
dem2016 = select(dem2016, -c(year, totalvotes, candidatevotes, party))
# Merge to df by FIPS and State
df <- merge(df, dem2016, by=c("FIPS", "State"))
# Subset Republican party for 2016 election
rep2016 <- subset(election2016, party == 'REPUBLICAN')
# Create a variable for the percent of total votes cast for Republicans in the 2016 election
rep2016 <- rep2016 %>% mutate(rep2016pct=candidatevotes/totalvotes*100)
# Round
rep2016$rep2016pct <- round(rep2016$rep2016pct, digits = 2)
# Subset to remove irrelevant variables
rep2016 = select(rep2016, -c(year, totalvotes, candidatevotes, party))
# Merge to df by FIPS and State
df <- merge(df, rep2016, by=c("FIPS", "State"))
# Create a dummy variable for Democrat majority in 2016 election
df<-df%>% mutate(dem2016 = ifelse(dem2016pct > rep2016pct, 1,0))
Setting up: Election 2020
# Subset 2020 elections
election2020<- subset(electionsall, year == '2020')
# Subset Democrat party for 2020 election
dem2020 <- subset(election2020, party == 'DEMOCRAT')
# In the 2020 results, several counties report their votes across multiple rows
# (the raw data split some counties by vote mode, and the mode column was dropped
# above), so collapse them to one row per county
dem2020 <- dem2020 %>%
  group_by(FIPS) %>%
  # Sum candidate votes across a county's rows; totalvotes and State are
  # constant within a county, so keep the first value of each
  dplyr::summarise(candidatevotes = sum(candidatevotes),
                   totalvotes = first(totalvotes),
                   State = first(State)) %>%
  as.data.frame()
# Defensive: drop any remaining duplicated FIPS
dem2020 <- dem2020[!duplicated(dem2020$FIPS), ]
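# Toy illustration (added sketch, hypothetical numbers): a county reported across
# two rows has its candidate votes summed while totalvotes stays constant
tibble(FIPS = "99001", candidatevotes = c(100, 50), totalvotes = 300) %>%
  group_by(FIPS) %>%
  summarise(candidatevotes = sum(candidatevotes), totalvotes = first(totalvotes))
# returns one row: candidatevotes = 150, totalvotes = 300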
# Create a variable for the percent of total votes cast for Democrats in the 2020 election
dem2020 <- dem2020 %>% mutate(dem2020pct=candidatevotes/totalvotes*100)
# Round
dem2020$dem2020pct <- round(dem2020$dem2020pct, digits = 2)
# Remove irrelevant variables
dem2020 = select(dem2020, -c(totalvotes, candidatevotes))
# Merge to df by FIPS and State
df <- merge(df, dem2020, by=c("FIPS", "State"))
# Subset Republican party for 2020 election
rep2020 <- subset(election2020, party == 'REPUBLICAN')
# Collapse multi-row counties the same way
rep2020 <- rep2020 %>%
  group_by(FIPS) %>%
  # Sum candidate votes across a county's rows; keep one totalvotes and State
  dplyr::summarise(candidatevotes = sum(candidatevotes),
                   totalvotes = first(totalvotes),
                   State = first(State)) %>%
  as.data.frame()
# Defensive: drop any remaining duplicated FIPS
rep2020 <- rep2020[!duplicated(rep2020$FIPS), ]
# Create a variable for the percent of total votes cast for Republicans in the 2020 election
rep2020 %<>% mutate(rep2020pct=candidatevotes/totalvotes*100)
# Round
rep2020$rep2020pct <- round(rep2020$rep2020pct, digits = 2)
# Remove irrelevant variables
rep2020 = select(rep2020, -c(totalvotes, candidatevotes))
# Merge to DF by FIPS and State
df <- merge(df, rep2020, by=c("FIPS", "State"))
# Create a dummy variable for Democrat majority in 2020 election
df<-df%>% mutate(dem2020 = ifelse(dem2020pct > rep2020pct, 1,0))
Setting up: Election Voting Patterns
# Create variables for voting patterns
# Number of the past three presidential elections (2012, 2016, 2020) won by Democrats
df %<>% mutate(pastvoting_dem = (dem2012 + dem2016 + dem2020))
# Average percent of votes Republican in past 3 presidential elections
df%<>% mutate(avgpct_rep = ((rep2012pct+rep2016pct+rep2020pct)/3))
# Average percent of votes Democrat in past 3 presidential elections
df%<>% mutate(avgpct_dem = ((dem2012pct+dem2016pct+dem2020pct)/3))
# Round
df$pastvoting_dem <- round(df$pastvoting_dem, digits = 2)
df$avgpct_rep <- round(df$avgpct_rep, digits = 2)
df$avgpct_dem <- round(df$avgpct_dem, digits = 2)
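# Quick check (added sketch): pastvoting_dem counts Democratic wins across the
# three elections, so it should only take integer values 0 through 3
df %>% count(pastvoting_dem)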
# Remove unnecessary variables
df <- subset(df, select=-c(dem2012pct, dem2016pct,dem2020pct, rep2012pct, rep2016pct,
rep2020pct, Pop18to65))
# Drop counties with missing election data
df %<>% filter(!is.na(avgpct_dem))
# Rename df to covid for the analysis below
covid <- df
# Optionally write a CSV so the cleaning steps need not be rerun every time
# write.csv(df,"~/Desktop/EC524/524Project_743.csv", row.names = FALSE)
Working with Data
Exploring Data: Map
# Look at newly formed dataset
# skim(covid)
# look at covid vaccination rates across the US
p_load(usmap)
covid_map <- covid %>% rename(fips = FIPS)
covid_map %<>% mutate(above_med = ifelse(Vax12Plus >= 54.3, "Above Median", "Below Median"))
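# Note (added sketch): 54.3 hard-codes the sample median of Vax12Plus; an
# equivalent version that stays in sync with the data would be
# covid_map %<>% mutate(above_med = ifelse(Vax12Plus >= median(Vax12Plus),
#                                          "Above Median", "Below Median"))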
# graph vaccination rates
plot_usmap(
  data = covid_map, values = "above_med") +
  scale_fill_manual(values = c("Above Median" = "darkcyan", "Below Median" = "pink"),
                    name = "Vaccination Rate vs. Median (Jan. 2022)") +
  ggtitle("COVID-19 Vaccination Rates by County, Relative to the Median") +
  theme(legend.position = "right",
        panel.background = element_rect(fill = "lavender"),
        plot.title = element_text(hjust = 0.5))

Exploring Data: Correlation Heat Map
# correlation heat map
# build the correlation matrix (ID and other excluded columns dropped by position)
covid1 <- covid %>% subset(select = -c(1, 2, 3, 17, 18, 19))
covid1 %<>% filter(!is.na(avgpct_dem))
cormat <- round(cor(covid1),2)
# load melt package
p_load(reshape2)
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
upper_tri <- get_upper_tri(cormat)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
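# Peek (added sketch): melt() returns the matrix in long form, one row per kept
# cell, with columns Var1, Var2, and value
head(melted_cormat, 3)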
# plot heatmap
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "darkcyan", high = "indianred1", mid = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab",
                       name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  coord_fixed() +
  ggtitle("Correlation Matrix Heatmap") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "Variables", x = "Variables")

Split & Recipe
# set seed
set.seed(34986)
# data cleaning
# remove county and state columns
covid %<>% subset(select = -c(2,3))
# Make Split
covid_split = covid %>% initial_split(prop = 0.8, strata = Vax12Plus) # split 80-20 into two datasets
#separate training and testing data
covid_train = covid_split %>% training()
covid_test = covid_split %>% testing()
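# Sanity check (added sketch): with prop = 0.8 the split should be roughly 80/20
c(train = nrow(covid_train), test = nrow(covid_test))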
# define the recipe:
covid_recipe =
  recipe(Vax12Plus ~ ., data = covid_train) %>%
  # normalize numeric predictors
  step_normalize(all_predictors() & all_numeric()) %>%
  # KNN imputation for categorical predictors (a no-op here: no nominal predictors remain)
  step_impute_knn(all_predictors() & all_nominal(), neighbors = 5) %>%
  # create dummies for categorical variables (also a no-op for the same reason)
  step_dummy(all_predictors() & all_nominal()) %>%
  # keep FIPS as an identifier rather than a predictor
  update_role(FIPS, new_role = "id variable")
# define the 5-fold split
covid_cv = covid_train %>% vfold_cv(v = 5)
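# Optional inspection (added sketch): prep() estimates the recipe on the training
# data and bake(new_data = NULL) returns the processed training set
# covid_recipe %>% prep() %>% bake(new_data = NULL) %>% head()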
Linear Regression
# Linear regression
set.seed(1)
# set model
model_linear = linear_reg() %>%
set_engine("lm")
# create workflow
workflow_linear = workflow() %>%
add_model(model_linear) %>%
add_recipe(covid_recipe)
# fit on the cross-validation folds and compute RMSE, R-squared, and MAE
# (the workflow already carries the model and recipe, so it is the only object passed)
cv_linear =
  workflow_linear %>%
  fit_resamples(
    resamples = covid_cv,
    metrics = yardstick::metric_set(yardstick::rmse, yardstick::rsq, yardstick::mae)
  )
# examine metrics
cv_linear %>% collect_metrics()
# choose a model with best rmse and finalize workflow
final_lin =
workflow_linear %>%
finalize_workflow(select_best(cv_linear, metric = "rmse"))
# fit model on training data
final_fit_lin = final_lin %>% fit(data = covid_train)
# last_fit() refits on the training set and evaluates on the held-out test set
final_fit_lin = final_lin %>% last_fit(covid_split)
# get test-sample predictions
final_fit_lin %>% collect_predictions() %>% head()
Linear Regression Results
set.seed(1)
# Linear
# get metrics of final fit
final_fit_lin %>% collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 10.1 Preprocessor1_Model1
## 2 rsq standard 0.469 Preprocessor1_Model1
Elasticnet
# Elasticnet
# set seed
set.seed(1)
# set tuning values for penalty and mixture
alphas = seq(from = 0, to = 1, by = 0.1)
lambdas = 10^seq(from = 2, to = -3, length = 1e2)
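# Scale note (added): 11 mixture values x 100 penalty values = 1,100 candidate
# models, each fit on all 5 folds
length(alphas) * length(lambdas)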
# define the elasticnet model
model_net =
linear_reg(penalty = tune(),
mixture = tune()) %>%
set_engine("glmnet") %>%
set_mode("regression")
# define our workflow
workflow_net = workflow() %>%
add_model(model_net) %>%
add_recipe(covid_recipe)
# cross-validate elasticnet with lambdas and alphas
cv_net =
workflow_net %>%
tune_grid(
covid_cv,
grid = expand_grid(mixture = alphas, penalty = lambdas),
metrics = yardstick::metric_set(yardstick::rmse, yardstick::rsq, yardstick::mae)
)
# collect best hyperparameter values for each metric
#cv_net %>% show_best(metric = "rmse", n = 3)
#cv_net %>% show_best(metric = "rsq", n = 3)
#cv_net %>% collect_metrics(metric = "mae")
# the best model by RMSE and R-squared is essentially a lasso (mixture = 1) with penalty = 0.001
# finalize workflow with best rmse model
final_net =
workflow_net%>%
finalize_workflow(select_best(cv_net, metric = "rmse"))
# fit final model to training data
final_fit_net = final_net %>% fit(data = covid_train)
# last_fit(): refit on the training data and evaluate on the held-out test set
final_fit_net = final_net %>% last_fit(covid_split)
# get test-sample predictions
final_fit_net %>% collect_predictions() %>% head()
Elasticnet Results
set.seed(1)
# get metrics of final fit
final_fit_net %>% collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 10.1 Preprocessor1_Model1
## 2 rsq standard 0.469 Preprocessor1_Model1
Elasticnet Graph of Metrics
# graph of metrics
autoplot(cv_net, metric = "rmse") + scale_x_continuous(limits = c(0, 1)) +
scale_y_continuous(limits=c(10.25,10.8))

autoplot(cv_net, metric = "mae") + scale_x_continuous(limits = c(0, 1))+
scale_y_continuous(limits=c(7.3,7.9))

Random Forest
# random forest
# set seed
set.seed(3440)
# create model tuning mtry and min_n
tune_spec <- rand_forest(
mtry = tune(),
trees = 200,
min_n = tune()
) %>%
set_mode("regression") %>%
set_engine("ranger", importance = "impurity")
# define workflow
tune_wf <- workflow() %>%
add_recipe(covid_recipe) %>%
add_model(tune_spec)
# create grid of possible mtry and min_n values
rf_grid <- grid_regular(
mtry(range = c(4, 10)),
min_n(range = c(2, 8)),
levels = 5
)
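# Scale note (added): levels = 5 takes 5 evenly spaced values per hyperparameter,
# so the grid holds 5 x 5 = 25 mtry/min_n combinations
nrow(rf_grid)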
# cross-validate to tune hyperparameters
regular_res <- tune_grid(
tune_wf,
resamples = covid_cv,
grid = rf_grid,
metrics = yardstick::metric_set(yardstick::rmse, yardstick::rsq, yardstick::mae)
)
# collect best hyperparameters from metrics
# regular_res %>% show_best(metric="rmse")
# regular_res %>% show_best(metric="rsq")
# regular_res %>% show_best(metric="mae")
# final fit with best model determined by rmse
final_rf =
tune_wf%>%
finalize_workflow(select_best(regular_res, metric = "rmse"))
# fit model on training data
final_fit_rf = final_rf %>% fit(data = covid_train)
# last_fit(): refit on the training data and evaluate on the held-out test set
final_fit_rf = final_rf %>% last_fit(covid_split)
# get test-sample predictions
final_fit_rf %>% collect_predictions() %>% head()
Random Forest Results
set.seed(3440)
# get metrics of final fit
final_fit_rf %>% collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 9.85 Preprocessor1_Model1
## 2 rsq standard 0.496 Preprocessor1_Model1
Random Forest Graph of Metrics
# graph of metrics
autoplot(regular_res, metric = "rmse")

autoplot(regular_res, metric = "mae")

Random Forest Variable Importance
# variable importance
p_load('vip')
# extract the fitted model from the finalized workflow and plot importance
# (extract_workflow()/extract_fit_parsnip() replace the deprecated
# pluck(".workflow", 1)/pull_workflow_fit() idiom)
final_fit_rf %>%
  extract_workflow() %>%
  extract_fit_parsnip() %>%
  vip(num_features = 20)
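# Alternative (added sketch): vip::vi() returns the same importances as a tibble,
# handy for tables or custom plots
# final_fit_rf %>% extract_workflow() %>% extract_fit_parsnip() %>% vi()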

KNN
# KNN
# set seed
set.seed(3440)
# create knn model tuning neighbors
model_knn =
nearest_neighbor(neighbors = tune()) %>%
set_mode("regression") %>%
set_engine("kknn")
# define knn workflow
workflow_knn =
workflow() %>%
add_model(model_knn) %>%
add_recipe(covid_recipe)
# cross-validate to find best neighbors
fit_knn_cv =
workflow_knn %>%
tune_grid(
covid_cv,
grid = data.frame(neighbors = c(1, 5, seq(10, 100, 10))),
metrics = yardstick::metric_set(yardstick::rmse, yardstick::rsq, yardstick::mae)
)
# collect best number of neighbors
fit_knn_cv %>% show_best(metric = "rmse", n = 3)
fit_knn_cv%>% show_best(metric="rsq", n=3)
fit_knn_cv%>% show_best(metric="mae", n=3)
# final workflow on knn model, selection from best rmse
final_knn =
workflow_knn %>%
finalize_workflow(select_best(fit_knn_cv, metric = "rmse"))
# fit on training data
final_fit_knn = final_knn %>% fit(data = covid_train)
# last_fit(): refit on the training data and evaluate on the held-out test set
final_fit_knn = final_knn %>% last_fit(covid_split)
# get test-sample predictions
final_fit_knn %>% collect_predictions() %>% head()
KNN Results
# collect test rmse
set.seed(3440)
final_fit_knn %>% collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 10.1 Preprocessor1_Model1
## 2 rsq standard 0.474 Preprocessor1_Model1
KNN Graph of Metrics
# graph knn cv metrics (no color aesthetic here, so the unused color scale is dropped)
fit_knn_cv %>%
  collect_metrics(summarize = TRUE) %>%
  ggplot(aes(x = neighbors, y = mean)) +
  geom_line(size = 0.7, alpha = 0.6) +
  geom_point(size = 2.5) +
  facet_wrap(~ toupper(.metric), scales = "free", nrow = 1) +
  scale_x_continuous("Neighbors (k)", labels = scales::label_number()) +
  scale_y_continuous("Estimate") +
  theme_minimal()

# show each knn fold's metrics
fit_knn_cv %>%
collect_metrics(summarize = FALSE) %>%
ggplot(aes(x = neighbors, y = .estimate, color = id)) +
geom_line(size = 0.7, alpha = 0.6) +
geom_point(size = 2.5) +
facet_wrap(~ toupper(.metric), scales = "free", nrow = 1) +
scale_x_continuous("Neighbors (k)", labels = scales::label_number()) +
scale_y_continuous("Estimate") +
scale_color_viridis_d("CV Folds:") +
theme_minimal() +
theme(legend.position = "bottom")
