Setup

Standard tidyverse and plotly

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

The Data

I used two TinyTuesday DataSets, Emergency Room Data, and US House Results

Emergency Room Data

# Using R
# Option 1: tidytuesdayR R package 
## install.packages("tidytuesdayR")

tuesdata <- tidytuesdayR::tt_load('2025-04-08')

## ---- Compiling #TidyTuesday Information for 2025-04-08 ----
## --- There is 1 file available ---
## 
## 
## ── Downloading files ───────────────────────────────────────────────────────────
## 
##   1 of 1: "care_state.csv"

## OR
tuesdata <- tidytuesdayR::tt_load(2025, week = 14)

## ---- Compiling #TidyTuesday Information for 2025-04-08 ----
## --- There is 1 file available ---
## 
## 
## ── Downloading files ───────────────────────────────────────────────────────────
## 
##   1 of 1: "care_state.csv"

care_state <- tuesdata$care_state

# Option 2: Read directly from GitHub

care_state <- readr::read_csv("C:/Users/japhi/OneDrive/Desktop/Data Analysis/Final Project/care_state.csv")

## Rows: 1232 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): state, condition, measure_id, measure_name, footnote
## dbl  (1): score
## date (2): start_date, end_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Emergency Room Data Cleanup

# Data downloaded manually from https://data.cms.gov/provider-data/dataset/apyc-v239
library(tidyverse)
library(here)

## Warning: package 'here' was built under R version 4.5.2

## here() starts at C:/Users/japhi/OneDrive/Desktop/Data Analysis/Final Project

library(janitor)

## Warning: package 'janitor' was built under R version 4.5.2

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

care_state <- readr::read_csv("C:/Users/japhi/OneDrive/Desktop/Data Analysis/Final Project/care_state.csv", col_types = cols(score = col_character())) |> 
  janitor::clean_names() |> 
  dplyr::mutate(
    score = dplyr::na_if(score, "Not Available") |> 
      as.double(),
    dplyr::across(
      dplyr::ends_with("_date"),
      lubridate::mdy
    )
  )

## Warning: There were 2 warnings in `dplyr::mutate()`.
## The first warning was:
## ℹ In argument: `dplyr::across(dplyr::ends_with("_date"), lubridate::mdy)`.
## Caused by warning:
## ! All formats failed to parse. No formats found.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

House Results Data

# Option 1: tidytuesdayR package 
## install.packages("tidytuesdayR")

tuesdata <- tidytuesdayR::tt_load('2023-11-07')

## ---- Compiling #TidyTuesday Information for 2023-11-07 ----
## --- There is 1 file available ---
## 
## 
## ── Downloading files ───────────────────────────────────────────────────────────
## 
##   1 of 1: "house.csv"

## OR
tuesdata <- tidytuesdayR::tt_load(2023, week = 45)

## ---- Compiling #TidyTuesday Information for 2023-11-07 ----
## --- There is 1 file available ---
## 
## 
## ── Downloading files ───────────────────────────────────────────────────────────
## 
##   1 of 1: "house.csv"

house <- tuesdata$house

# Option 2: Read directly from GitHub

house <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-11-07/house.csv')

## Rows: 32452 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): state, state_po, office, district, stage, candidate, party, mode
## dbl (7): year, state_fips, state_cen, state_ic, candidatevotes, totalvotes, ...
## lgl (5): runoff, special, writein, unofficial, fusion_ticket
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

What are these Datasets?

Emergency Room Data:

Contains rows for each state for each measurement
Each Type of measurement is given it’s own code and description
Each Description tells you how to interpret the data

House Results Data:

Contains rows for each district for each election
Contains data for decades, but we are only using the most recent (2022)
Contains results from the General Election and Primaries, but only looking at the General Elections

Ratio of Democrats to Republicans Winners

Find out who won by party in 2022 (Most Recent)

#Who won each election
h = house %>% filter(year %in% c("2022"), stage=="GEN") %>% 
  group_by(state, district) %>% 
  mutate(
    won = if_else(candidatevotes == max(candidatevotes), TRUE, FALSE),
    party = as_factor(party)) %>% 
    ungroup()

#Districts Per State
h = h %>% group_by(state_po) %>% 
  mutate(districts_total = max(district)) %>% 
  ungroup()

r_parties = h %>% filter(party == "REPUBLICAN") %>% 
  group_by(state_po) %>% 
  summarize(
    r_wins = sum(won) ) %>% 
  ungroup()

d_parties = h %>% 
  group_by(state_po) %>% 
  filter(party == "DEMOCRAT") %>% 
  summarize(
    d_wins = sum(won) ) %>% 
  ungroup()

parties = full_join(r_parties, d_parties)

## Joining with `by = join_by(state_po)`

parties = parties %>% mutate(
  r_wins = replace_na(r_wins, 0),
  d_wins = replace_na(d_wins, 0)
)
parties = parties %>% mutate(
  d_ratio = d_wins / (d_wins + r_wins),
  mostly_d = d_ratio > 0.5)

View(parties)

Visualizing Democrat Rates with Cleaveland Plot

g = parties %>%  ggplot(aes(x=d_ratio,y=reorder(state_po,d_ratio))) + geom_point() + geom_line()
ggplotly(g)

How long are patients in emergency rooms (by state)

c = care_state %>% filter(measure_id == "OP_18b")
g = c %>% ggplot(aes(score, reorder(state, score))) + geom_point()
ggplotly(g)

Combining the Data:

str(care_state)

## tibble [1,232 × 8] (S3: tbl_df/tbl/data.frame)
##  $ state       : chr [1:1232] "AK" "AK" "AK" "AK" ...
##  $ condition   : chr [1:1232] "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" "Emergency Department" "Emergency Department" ...
##  $ measure_id  : chr [1:1232] "HCP_COVID_19" "IMM_3" "OP_18b" "OP_18b_HIGH_MIN" ...
##  $ measure_name: chr [1:1232] "Percentage of healthcare personnel who are up to date with COVID-19 vaccinations" "Healthcare workers given influenza vaccination Higher percentages are better" "Average (median) time patients spent in the emergency department before leaving from the visit A lower number o"| __truncated__ "Average time patients spent in the emergency department before being sent home A lower number of minutes is better (high)" ...
##  $ score       : num [1:1232] 7.3 80 140 157 136 136 NA 196 230 182 ...
##  $ footnote    : chr [1:1232] NA NA "25, 26" "25, 26" ...
##  $ start_date  : Date[1:1232], format: NA NA ...
##  $ end_date    : Date[1:1232], format: NA NA ...

str(house)

## spc_tbl_ [32,452 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ year          : num [1:32452] 1976 1976 1976 1976 1976 ...
##  $ state         : chr [1:32452] "ALABAMA" "ALABAMA" "ALABAMA" "ALABAMA" ...
##  $ state_po      : chr [1:32452] "AL" "AL" "AL" "AL" ...
##  $ state_fips    : num [1:32452] 1 1 1 1 1 1 1 1 1 1 ...
##  $ state_cen     : num [1:32452] 63 63 63 63 63 63 63 63 63 63 ...
##  $ state_ic      : num [1:32452] 41 41 41 41 41 41 41 41 41 41 ...
##  $ office        : chr [1:32452] "US HOUSE" "US HOUSE" "US HOUSE" "US HOUSE" ...
##  $ district      : chr [1:32452] "001" "001" "001" "002" ...
##  $ stage         : chr [1:32452] "GEN" "GEN" "GEN" "GEN" ...
##  $ runoff        : logi [1:32452] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ special       : logi [1:32452] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ candidate     : chr [1:32452] "BILL DAVENPORT" "JACK EDWARDS" "WRITEIN" "J CAROLE KEAHEY" ...
##  $ party         : chr [1:32452] "DEMOCRAT" "REPUBLICAN" NA "DEMOCRAT" ...
##  $ writein       : logi [1:32452] FALSE FALSE TRUE FALSE FALSE TRUE ...
##  $ mode          : chr [1:32452] "TOTAL" "TOTAL" "TOTAL" "TOTAL" ...
##  $ candidatevotes: num [1:32452] 58906 98257 7 66288 90069 ...
##  $ totalvotes    : num [1:32452] 157170 157170 157170 156362 156362 ...
##  $ unofficial    : logi [1:32452] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ version       : num [1:32452] 20230706 20230706 20230706 20230706 20230706 ...
##  $ fusion_ticket : logi [1:32452] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   year = col_double(),
##   ..   state = col_character(),
##   ..   state_po = col_character(),
##   ..   state_fips = col_double(),
##   ..   state_cen = col_double(),
##   ..   state_ic = col_double(),
##   ..   office = col_character(),
##   ..   district = col_character(),
##   ..   stage = col_character(),
##   ..   runoff = col_logical(),
##   ..   special = col_logical(),
##   ..   candidate = col_character(),
##   ..   party = col_character(),
##   ..   writein = col_logical(),
##   ..   mode = col_character(),
##   ..   candidatevotes = col_double(),
##   ..   totalvotes = col_double(),
##   ..   unofficial = col_logical(),
##   ..   version = col_double(),
##   ..   fusion_ticket = col_logical()
##   .. )
##  - attr(*, "problems")=<externalptr>

Note that both of these datasets have a “state” column, so we can join the data.

First, Vaccination Rates By Medical Professionals

care_state = care_state %>% mutate(state_po = state)
c = care_state %>% filter(measure_id == "HCP_COVID_19")
Joined = parties %>% full_join(c, by="state_po")
str(Joined)

## tibble [56 × 13] (S3: tbl_df/tbl/data.frame)
##  $ state_po    : chr [1:56] "AK" "AL" "AR" "AZ" ...
##  $ r_wins      : int [1:56] 0 6 4 6 12 3 0 0 20 9 ...
##  $ d_wins      : int [1:56] 1 1 0 3 40 5 5 1 8 5 ...
##  $ d_ratio     : num [1:56] 1 0.143 0 0.333 0.769 ...
##  $ mostly_d    : logi [1:56] TRUE FALSE FALSE FALSE TRUE TRUE ...
##  $ state       : chr [1:56] "AK" "AL" "AR" "AZ" ...
##  $ condition   : chr [1:56] "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" ...
##  $ measure_id  : chr [1:56] "HCP_COVID_19" "HCP_COVID_19" "HCP_COVID_19" "HCP_COVID_19" ...
##  $ measure_name: chr [1:56] "Percentage of healthcare personnel who are up to date with COVID-19 vaccinations" "Percentage of healthcare personnel who are up to date with COVID-19 vaccinations" "Percentage of healthcare personnel who are up to date with COVID-19 vaccinations" "Percentage of healthcare personnel who are up to date with COVID-19 vaccinations" ...
##  $ score       : num [1:56] 7.3 5.9 2.7 17.4 22.4 11.8 13.2 38.3 6.4 10.3 ...
##  $ footnote    : chr [1:56] NA NA NA NA ...
##  $ start_date  : Date[1:56], format: NA NA ...
##  $ end_date    : Date[1:56], format: NA NA ...

g = Joined %>%  ggplot(aes(d_ratio, score, label=state)) + geom_point() + 
  ggtitle("State Politics vs. Vaccination Rate (COVID) in Healthcare") +
  geom_smooth(method="lm")
ggplotly(g)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

For Influenza

care_state = care_state %>% mutate(state_po = state)
c = care_state %>% filter(measure_id == "IMM_3")
Joined = parties %>% full_join(c, by="state_po")
str(Joined)

## tibble [56 × 13] (S3: tbl_df/tbl/data.frame)
##  $ state_po    : chr [1:56] "AK" "AL" "AR" "AZ" ...
##  $ r_wins      : int [1:56] 0 6 4 6 12 3 0 0 20 9 ...
##  $ d_wins      : int [1:56] 1 1 0 3 40 5 5 1 8 5 ...
##  $ d_ratio     : num [1:56] 1 0.143 0 0.333 0.769 ...
##  $ mostly_d    : logi [1:56] TRUE FALSE FALSE FALSE TRUE TRUE ...
##  $ state       : chr [1:56] "AK" "AL" "AR" "AZ" ...
##  $ condition   : chr [1:56] "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" ...
##  $ measure_id  : chr [1:56] "IMM_3" "IMM_3" "IMM_3" "IMM_3" ...
##  $ measure_name: chr [1:56] "Healthcare workers given influenza vaccination Higher percentages are better" "Healthcare workers given influenza vaccination Higher percentages are better" "Healthcare workers given influenza vaccination Higher percentages are better" "Healthcare workers given influenza vaccination Higher percentages are better" ...
##  $ score       : num [1:56] 80 76 81 85 73 92 85 84 60 76 ...
##  $ footnote    : chr [1:56] NA NA NA NA ...
##  $ start_date  : Date[1:56], format: NA NA ...
##  $ end_date    : Date[1:56], format: NA NA ...

g = Joined %>%  ggplot(aes(d_ratio, score, label=state)) + geom_point() + 
  ggtitle("State Politics vs. Vaccination Rate (Influenza) in Healthcare") +
  geom_smooth(method="lm")
ggplotly(g)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Opiod Abuse

c = care_state %>% filter(measure_id == "SAFE_USE_OF_OPIOIDS")
Joined = parties %>% full_join(c, by="state_po")
str(Joined)

## tibble [56 × 13] (S3: tbl_df/tbl/data.frame)
##  $ state_po    : chr [1:56] "AK" "AL" "AR" "AZ" ...
##  $ r_wins      : int [1:56] 0 6 4 6 12 3 0 0 20 9 ...
##  $ d_wins      : int [1:56] 1 1 0 3 40 5 5 1 8 5 ...
##  $ d_ratio     : num [1:56] 1 0.143 0 0.333 0.769 ...
##  $ mostly_d    : logi [1:56] TRUE FALSE FALSE FALSE TRUE TRUE ...
##  $ state       : chr [1:56] "AK" "AL" "AR" "AZ" ...
##  $ condition   : chr [1:56] "Electronic Clinical Quality Measure" "Electronic Clinical Quality Measure" "Electronic Clinical Quality Measure" "Electronic Clinical Quality Measure" ...
##  $ measure_id  : chr [1:56] "SAFE_USE_OF_OPIOIDS" "SAFE_USE_OF_OPIOIDS" "SAFE_USE_OF_OPIOIDS" "SAFE_USE_OF_OPIOIDS" ...
##  $ measure_name: chr [1:56] "Safe Use of Opioids - Concurrent Prescribing" "Safe Use of Opioids - Concurrent Prescribing" "Safe Use of Opioids - Concurrent Prescribing" "Safe Use of Opioids - Concurrent Prescribing" ...
##  $ score       : num [1:56] 16 14 16 11 14 15 18 15 16 15 ...
##  $ footnote    : chr [1:56] NA NA NA NA ...
##  $ start_date  : Date[1:56], format: NA NA ...
##  $ end_date    : Date[1:56], format: NA NA ...

g = Joined %>%  ggplot(aes(d_ratio, score, label=state)) + geom_point() + ggtitle("Politics vs. Safe Use of Opiods") +
  geom_smooth(method = "lm")
ggplotly(g)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Analyzing The Pattern

#Analyze this line
lm_o = lm(score~d_ratio, data=Joined)
summary(lm_o)

## 
## Call:
## lm(formula = score ~ d_ratio, data = Joined)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0554 -0.9179  0.0905  1.1723  3.9446 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.4720     0.4125  35.085   <2e-16 ***
## d_ratio       0.5833     0.6984   0.835    0.408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.833 on 48 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.01432,    Adjusted R-squared:  -0.006211 
## F-statistic: 0.6975 on 1 and 48 DF,  p-value: 0.4077

Interestingly, It’s not statistically significant, even though it looks like it could have been.

Wait Times

c = care_state %>% filter(measure_id == "OP_18b")
Joined = parties %>% full_join(c, by="state_po")
str(Joined)

## tibble [56 × 13] (S3: tbl_df/tbl/data.frame)
##  $ state_po    : chr [1:56] "AK" "AL" "AR" "AZ" ...
##  $ r_wins      : int [1:56] 0 6 4 6 12 3 0 0 20 9 ...
##  $ d_wins      : int [1:56] 1 1 0 3 40 5 5 1 8 5 ...
##  $ d_ratio     : num [1:56] 1 0.143 0 0.333 0.769 ...
##  $ mostly_d    : logi [1:56] TRUE FALSE FALSE FALSE TRUE TRUE ...
##  $ state       : chr [1:56] "AK" "AL" "AR" "AZ" ...
##  $ condition   : chr [1:56] "Emergency Department" "Emergency Department" "Emergency Department" "Emergency Department" ...
##  $ measure_id  : chr [1:56] "OP_18b" "OP_18b" "OP_18b" "OP_18b" ...
##  $ measure_name: chr [1:56] "Average (median) time patients spent in the emergency department before leaving from the visit A lower number o"| __truncated__ "Average (median) time patients spent in the emergency department before leaving from the visit A lower number o"| __truncated__ "Average (median) time patients spent in the emergency department before leaving from the visit A lower number o"| __truncated__ "Average (median) time patients spent in the emergency department before leaving from the visit A lower number o"| __truncated__ ...
##  $ score       : num [1:56] 140 145 133 168 184 133 193 217 158 160 ...
##  $ footnote    : chr [1:56] "25, 26" "25, 26" "25, 26" "25, 26" ...
##  $ start_date  : Date[1:56], format: NA NA ...
##  $ end_date    : Date[1:56], format: NA NA ...

g = Joined %>%  ggplot(aes(score, reorder(state, score), label=state, color=mostly_d)) + geom_point() + 
  ggtitle("Politics vs Wait Times") +
  geom_smooth(method = "lm") + 
  scale_color_manual(values = c("TRUE" = "blue", "FALSE" = "red"))
ggplotly(g)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_smooth()`).

Analyzing the ratio

lm_w = lm(score~d_ratio, data=Joined)
summary(lm_w)

## 
## Call:
## lm(formula = score ~ d_ratio, data = Joined)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -69.207 -17.163   1.078  12.920  73.454 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  130.922      5.495  23.825  < 2e-16 ***
## d_ratio       53.285      9.305   5.727 6.54e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.42 on 48 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.4059, Adjusted R-squared:  0.3935 
## F-statistic:  32.8 on 1 and 48 DF,  p-value: 6.538e-07

Here, p-value: 6.538e-07, so it’s very significant.

Emergency Rooms and Politics