Jackson Philbrook
2025-12-06
Standard tidyverse and plotly
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
I used two TinyTuesday DataSets, Emergency Room Data, and US House Results
Emergency Room Data
# Using R
# Option 1: tidytuesdayR R package
## install.packages("tidytuesdayR")
tuesdata <- tidytuesdayR::tt_load('2025-04-08')## ---- Compiling #TidyTuesday Information for 2025-04-08 ----
## --- There is 1 file available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 1: "care_state.csv"
## ---- Compiling #TidyTuesday Information for 2025-04-08 ----
## --- There is 1 file available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 1: "care_state.csv"
care_state <- tuesdata$care_state
# Option 2: Read directly from GitHub
care_state <- readr::read_csv("C:/Users/japhi/OneDrive/Desktop/Data Analysis/Final Project/care_state.csv")## Rows: 1232 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): state, condition, measure_id, measure_name, footnote
## dbl (1): score
## date (2): start_date, end_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Emergency Room Data Cleanup
# Data downloaded manually from https://data.cms.gov/provider-data/dataset/apyc-v239
library(tidyverse)
library(here)## Warning: package 'here' was built under R version 4.5.2
## here() starts at C:/Users/japhi/OneDrive/Desktop/Data Analysis/Final Project
## Warning: package 'janitor' was built under R version 4.5.2
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
care_state <- readr::read_csv("C:/Users/japhi/OneDrive/Desktop/Data Analysis/Final Project/care_state.csv", col_types = cols(score = col_character())) |>
janitor::clean_names() |>
dplyr::mutate(
score = dplyr::na_if(score, "Not Available") |>
as.double(),
dplyr::across(
dplyr::ends_with("_date"),
lubridate::mdy
)
)## Warning: There were 2 warnings in `dplyr::mutate()`.
## The first warning was:
## ℹ In argument: `dplyr::across(dplyr::ends_with("_date"), lubridate::mdy)`.
## Caused by warning:
## ! All formats failed to parse. No formats found.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
House Results Data
# Option 1: tidytuesdayR package
## install.packages("tidytuesdayR")
tuesdata <- tidytuesdayR::tt_load('2023-11-07')## ---- Compiling #TidyTuesday Information for 2023-11-07 ----
## --- There is 1 file available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 1: "house.csv"
## ---- Compiling #TidyTuesday Information for 2023-11-07 ----
## --- There is 1 file available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 1: "house.csv"
house <- tuesdata$house
# Option 2: Read directly from GitHub
house <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-11-07/house.csv')## Rows: 32452 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): state, state_po, office, district, stage, candidate, party, mode
## dbl (7): year, state_fips, state_cen, state_ic, candidatevotes, totalvotes, ...
## lgl (5): runoff, special, writein, unofficial, fusion_ticket
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Emergency Room Data:
House Results Data:
Find out who won by party in 2022 (Most Recent)
#Who won each election
h = house %>% filter(year %in% c("2022"), stage=="GEN") %>%
group_by(state, district) %>%
mutate(
won = if_else(candidatevotes == max(candidatevotes), TRUE, FALSE),
party = as_factor(party)) %>%
ungroup()
#Districts Per State
h = h %>% group_by(state_po) %>%
mutate(districts_total = max(district)) %>%
ungroup()
r_parties = h %>% filter(party == "REPUBLICAN") %>%
group_by(state_po) %>%
summarize(
r_wins = sum(won) ) %>%
ungroup()
d_parties = h %>%
group_by(state_po) %>%
filter(party == "DEMOCRAT") %>%
summarize(
d_wins = sum(won) ) %>%
ungroup()
parties = full_join(r_parties, d_parties)## Joining with `by = join_by(state_po)`
## tibble [1,232 × 8] (S3: tbl_df/tbl/data.frame)
## $ state : chr [1:1232] "AK" "AK" "AK" "AK" ...
## $ condition : chr [1:1232] "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" "Emergency Department" "Emergency Department" ...
## $ measure_id : chr [1:1232] "HCP_COVID_19" "IMM_3" "OP_18b" "OP_18b_HIGH_MIN" ...
## $ measure_name: chr [1:1232] "Percentage of healthcare personnel who are up to date with COVID-19 vaccinations" "Healthcare workers given influenza vaccination Higher percentages are better" "Average (median) time patients spent in the emergency department before leaving from the visit A lower number o"| __truncated__ "Average time patients spent in the emergency department before being sent home A lower number of minutes is better (high)" ...
## $ score : num [1:1232] 7.3 80 140 157 136 136 NA 196 230 182 ...
## $ footnote : chr [1:1232] NA NA "25, 26" "25, 26" ...
## $ start_date : Date[1:1232], format: NA NA ...
## $ end_date : Date[1:1232], format: NA NA ...
## spc_tbl_ [32,452 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ year : num [1:32452] 1976 1976 1976 1976 1976 ...
## $ state : chr [1:32452] "ALABAMA" "ALABAMA" "ALABAMA" "ALABAMA" ...
## $ state_po : chr [1:32452] "AL" "AL" "AL" "AL" ...
## $ state_fips : num [1:32452] 1 1 1 1 1 1 1 1 1 1 ...
## $ state_cen : num [1:32452] 63 63 63 63 63 63 63 63 63 63 ...
## $ state_ic : num [1:32452] 41 41 41 41 41 41 41 41 41 41 ...
## $ office : chr [1:32452] "US HOUSE" "US HOUSE" "US HOUSE" "US HOUSE" ...
## $ district : chr [1:32452] "001" "001" "001" "002" ...
## $ stage : chr [1:32452] "GEN" "GEN" "GEN" "GEN" ...
## $ runoff : logi [1:32452] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ special : logi [1:32452] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ candidate : chr [1:32452] "BILL DAVENPORT" "JACK EDWARDS" "WRITEIN" "J CAROLE KEAHEY" ...
## $ party : chr [1:32452] "DEMOCRAT" "REPUBLICAN" NA "DEMOCRAT" ...
## $ writein : logi [1:32452] FALSE FALSE TRUE FALSE FALSE TRUE ...
## $ mode : chr [1:32452] "TOTAL" "TOTAL" "TOTAL" "TOTAL" ...
## $ candidatevotes: num [1:32452] 58906 98257 7 66288 90069 ...
## $ totalvotes : num [1:32452] 157170 157170 157170 156362 156362 ...
## $ unofficial : logi [1:32452] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ version : num [1:32452] 20230706 20230706 20230706 20230706 20230706 ...
## $ fusion_ticket : logi [1:32452] FALSE FALSE FALSE FALSE FALSE FALSE ...
## - attr(*, "spec")=
## .. cols(
## .. year = col_double(),
## .. state = col_character(),
## .. state_po = col_character(),
## .. state_fips = col_double(),
## .. state_cen = col_double(),
## .. state_ic = col_double(),
## .. office = col_character(),
## .. district = col_character(),
## .. stage = col_character(),
## .. runoff = col_logical(),
## .. special = col_logical(),
## .. candidate = col_character(),
## .. party = col_character(),
## .. writein = col_logical(),
## .. mode = col_character(),
## .. candidatevotes = col_double(),
## .. totalvotes = col_double(),
## .. unofficial = col_logical(),
## .. version = col_double(),
## .. fusion_ticket = col_logical()
## .. )
## - attr(*, "problems")=<externalptr>
Note that both of these datasets have a “state” column, so we can join the data.
care_state = care_state %>% mutate(state_po = state)
c = care_state %>% filter(measure_id == "HCP_COVID_19")
Joined = parties %>% full_join(c, by="state_po")
str(Joined)## tibble [56 × 13] (S3: tbl_df/tbl/data.frame)
## $ state_po : chr [1:56] "AK" "AL" "AR" "AZ" ...
## $ r_wins : int [1:56] 0 6 4 6 12 3 0 0 20 9 ...
## $ d_wins : int [1:56] 1 1 0 3 40 5 5 1 8 5 ...
## $ d_ratio : num [1:56] 1 0.143 0 0.333 0.769 ...
## $ mostly_d : logi [1:56] TRUE FALSE FALSE FALSE TRUE TRUE ...
## $ state : chr [1:56] "AK" "AL" "AR" "AZ" ...
## $ condition : chr [1:56] "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" ...
## $ measure_id : chr [1:56] "HCP_COVID_19" "HCP_COVID_19" "HCP_COVID_19" "HCP_COVID_19" ...
## $ measure_name: chr [1:56] "Percentage of healthcare personnel who are up to date with COVID-19 vaccinations" "Percentage of healthcare personnel who are up to date with COVID-19 vaccinations" "Percentage of healthcare personnel who are up to date with COVID-19 vaccinations" "Percentage of healthcare personnel who are up to date with COVID-19 vaccinations" ...
## $ score : num [1:56] 7.3 5.9 2.7 17.4 22.4 11.8 13.2 38.3 6.4 10.3 ...
## $ footnote : chr [1:56] NA NA NA NA ...
## $ start_date : Date[1:56], format: NA NA ...
## $ end_date : Date[1:56], format: NA NA ...
g = Joined %>% ggplot(aes(d_ratio, score, label=state)) + geom_point() +
ggtitle("State Politics vs. Vaccination Rate (COVID) in Healthcare") +
geom_smooth(method="lm")
ggplotly(g)## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
For Influenza
care_state = care_state %>% mutate(state_po = state)
c = care_state %>% filter(measure_id == "IMM_3")
Joined = parties %>% full_join(c, by="state_po")
str(Joined)## tibble [56 × 13] (S3: tbl_df/tbl/data.frame)
## $ state_po : chr [1:56] "AK" "AL" "AR" "AZ" ...
## $ r_wins : int [1:56] 0 6 4 6 12 3 0 0 20 9 ...
## $ d_wins : int [1:56] 1 1 0 3 40 5 5 1 8 5 ...
## $ d_ratio : num [1:56] 1 0.143 0 0.333 0.769 ...
## $ mostly_d : logi [1:56] TRUE FALSE FALSE FALSE TRUE TRUE ...
## $ state : chr [1:56] "AK" "AL" "AR" "AZ" ...
## $ condition : chr [1:56] "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" "Healthcare Personnel Vaccination" ...
## $ measure_id : chr [1:56] "IMM_3" "IMM_3" "IMM_3" "IMM_3" ...
## $ measure_name: chr [1:56] "Healthcare workers given influenza vaccination Higher percentages are better" "Healthcare workers given influenza vaccination Higher percentages are better" "Healthcare workers given influenza vaccination Higher percentages are better" "Healthcare workers given influenza vaccination Higher percentages are better" ...
## $ score : num [1:56] 80 76 81 85 73 92 85 84 60 76 ...
## $ footnote : chr [1:56] NA NA NA NA ...
## $ start_date : Date[1:56], format: NA NA ...
## $ end_date : Date[1:56], format: NA NA ...
g = Joined %>% ggplot(aes(d_ratio, score, label=state)) + geom_point() +
ggtitle("State Politics vs. Vaccination Rate (Influenza) in Healthcare") +
geom_smooth(method="lm")
ggplotly(g)## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
c = care_state %>% filter(measure_id == "SAFE_USE_OF_OPIOIDS")
Joined = parties %>% full_join(c, by="state_po")
str(Joined)## tibble [56 × 13] (S3: tbl_df/tbl/data.frame)
## $ state_po : chr [1:56] "AK" "AL" "AR" "AZ" ...
## $ r_wins : int [1:56] 0 6 4 6 12 3 0 0 20 9 ...
## $ d_wins : int [1:56] 1 1 0 3 40 5 5 1 8 5 ...
## $ d_ratio : num [1:56] 1 0.143 0 0.333 0.769 ...
## $ mostly_d : logi [1:56] TRUE FALSE FALSE FALSE TRUE TRUE ...
## $ state : chr [1:56] "AK" "AL" "AR" "AZ" ...
## $ condition : chr [1:56] "Electronic Clinical Quality Measure" "Electronic Clinical Quality Measure" "Electronic Clinical Quality Measure" "Electronic Clinical Quality Measure" ...
## $ measure_id : chr [1:56] "SAFE_USE_OF_OPIOIDS" "SAFE_USE_OF_OPIOIDS" "SAFE_USE_OF_OPIOIDS" "SAFE_USE_OF_OPIOIDS" ...
## $ measure_name: chr [1:56] "Safe Use of Opioids - Concurrent Prescribing" "Safe Use of Opioids - Concurrent Prescribing" "Safe Use of Opioids - Concurrent Prescribing" "Safe Use of Opioids - Concurrent Prescribing" ...
## $ score : num [1:56] 16 14 16 11 14 15 18 15 16 15 ...
## $ footnote : chr [1:56] NA NA NA NA ...
## $ start_date : Date[1:56], format: NA NA ...
## $ end_date : Date[1:56], format: NA NA ...
g = Joined %>% ggplot(aes(d_ratio, score, label=state)) + geom_point() + ggtitle("Politics vs. Safe Use of Opiods") +
geom_smooth(method = "lm")
ggplotly(g)## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
Analyzing The Pattern
##
## Call:
## lm(formula = score ~ d_ratio, data = Joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0554 -0.9179 0.0905 1.1723 3.9446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.4720 0.4125 35.085 <2e-16 ***
## d_ratio 0.5833 0.6984 0.835 0.408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.833 on 48 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.01432, Adjusted R-squared: -0.006211
## F-statistic: 0.6975 on 1 and 48 DF, p-value: 0.4077
Interestingly, It’s not statistically significant, even though it looks like it could have been.
c = care_state %>% filter(measure_id == "OP_18b")
Joined = parties %>% full_join(c, by="state_po")
str(Joined)## tibble [56 × 13] (S3: tbl_df/tbl/data.frame)
## $ state_po : chr [1:56] "AK" "AL" "AR" "AZ" ...
## $ r_wins : int [1:56] 0 6 4 6 12 3 0 0 20 9 ...
## $ d_wins : int [1:56] 1 1 0 3 40 5 5 1 8 5 ...
## $ d_ratio : num [1:56] 1 0.143 0 0.333 0.769 ...
## $ mostly_d : logi [1:56] TRUE FALSE FALSE FALSE TRUE TRUE ...
## $ state : chr [1:56] "AK" "AL" "AR" "AZ" ...
## $ condition : chr [1:56] "Emergency Department" "Emergency Department" "Emergency Department" "Emergency Department" ...
## $ measure_id : chr [1:56] "OP_18b" "OP_18b" "OP_18b" "OP_18b" ...
## $ measure_name: chr [1:56] "Average (median) time patients spent in the emergency department before leaving from the visit A lower number o"| __truncated__ "Average (median) time patients spent in the emergency department before leaving from the visit A lower number o"| __truncated__ "Average (median) time patients spent in the emergency department before leaving from the visit A lower number o"| __truncated__ "Average (median) time patients spent in the emergency department before leaving from the visit A lower number o"| __truncated__ ...
## $ score : num [1:56] 140 145 133 168 184 133 193 217 158 160 ...
## $ footnote : chr [1:56] "25, 26" "25, 26" "25, 26" "25, 26" ...
## $ start_date : Date[1:56], format: NA NA ...
## $ end_date : Date[1:56], format: NA NA ...
g = Joined %>% ggplot(aes(score, reorder(state, score), label=state, color=mostly_d)) + geom_point() +
ggtitle("Politics vs Wait Times") +
geom_smooth(method = "lm") +
scale_color_manual(values = c("TRUE" = "blue", "FALSE" = "red"))
ggplotly(g)## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_smooth()`).
Analyzing the ratio
##
## Call:
## lm(formula = score ~ d_ratio, data = Joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69.207 -17.163 1.078 12.920 73.454
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 130.922 5.495 23.825 < 2e-16 ***
## d_ratio 53.285 9.305 5.727 6.54e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.42 on 48 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.4059, Adjusted R-squared: 0.3935
## F-statistic: 32.8 on 1 and 48 DF, p-value: 6.538e-07
Here, p-value: 6.538e-07, so it’s very significant.
T Election Data and Science Lab. (2017). U.S. House 1976–2022 (Version 12) [Data set]. Harvard Dataverse. https://doi.org/10.7910/DVN/IG0UN2
Centers for Medicare & Medicaid Services (CMS). (2025). Timely and Effective Care – State [Data set]. Hospital Compare. https://data.cms.gov/provider-data/dataset/apyc-v239