Title
Code block description
#Importing the tidyverse package
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.6
✔ forcats 1.0.1 ✔ stringr 1.6.0
✔ ggplot2 4.0.1 ✔ tibble 3.3.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.2
✔ purrr 1.2.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Creating a path to files
root <- file.path("/Users/0524f/Dropbox/Team 1")
#Reading in cvs files with the data
df <- read_csv(file.path(root, "2_data", "updated_data", "latest_data.csv"))
Rows: 19698 Columns: 19
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): sysid, aha_id, validated, deal_type, name, sysname, lagsys, lagsys...
dbl (9): deal, id, parent_id, year, mcrnum, merger, acquirer, target, year_lag
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df0 <- read_csv(file.path(root, "2_data", "raw_data.csv"))
Rows: 19698 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): sysid, aha_id, mcrnum, deal_type, name, sysname, lagsys, lagsysname...
dbl (8): deal, id, parent_id, year, merger, acquirer, target, validated
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Creating a Comprehensive Summary Stats Table for "How many unique hospitals existed in our old vs new dataset for the whole sample period?"
#i. By Year
#Summary of the new/latest dataset
summary_new_dataset <- df %>%
group_by(year) %>%
summarize(Unique_New_Hospitals = n_distinct(aha_id), .groups = 'drop')
#Summary of the raw/old dataset
summary_old_dataset <- df0 %>%
group_by(year) %>%
summarize(Unique_Old_Hospitals = n_distinct(aha_id), .groups = 'drop')
#Putting the summaries into a combined table
unique_hospitals_table <- full_join(summary_old_dataset, summary_new_dataset, by = "year") %>%
arrange(year)
#Display
unique_hospitals_table
# A tibble: 6 × 3
year Unique_Old_Hospitals Unique_New_Hospitals
<dbl> <int> <int>
1 2016 3341 3341
2 2017 3351 3351
3 2018 3300 3300
4 2019 3245 3245
5 2020 3236 3236
6 2021 3225 3225
#iii. Graphing part i
# Reshape the data from wide to long format to make it look better on the graph
plot_data <- unique_hospitals_table %>%
pivot_longer(
cols = c(Unique_Old_Hospitals, Unique_New_Hospitals),
names_to = "Dataset_Type",
values_to = "Hospital_Count"
)
# Create the plot
ggplot(plot_data, aes(x = year, y = Hospital_Count, color = Dataset_Type)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "Unique Hospitals Over Time: Old vs. New Datasets",
x = "Year",
y = "Number of Unique Hospitals",
color = "Legend"
) +
scale_color_manual(
values = c("Unique_Old_Hospitals" = "blue", "Unique_New_Hospitals" = "red"),
labels = c("Unique_Old_Hospitals" = "Old Dataset", "Unique_New_Hospitals" = "New Dataset")
)
#Creating a Comprehensive Summary Stats Table for "How many unique hospitals existed in our old vs new dataset for the whole sample period?"
#iia. By number of unique systems including independent hospitals
#Put in data
hospital_systems_in <- function(data) {
data %>%
#Group by year
group_by(year) %>%
#Count the number of unique hospital systems
summarize(Unique_Systems = n_distinct(sysid), .groups = 'drop')
}
#Summary for latest/new dataset
summary_new <- hospital_systems_in(df) %>%
rename(New_Systems_Inc = Unique_Systems)
#Summary for raw/old dataset
summary_old <- hospital_systems_in(df0) %>%
rename(Old_Systems_Inc = Unique_Systems)
#Combine the old versus new
unique_systems_in_table <- left_join(summary_old, summary_new, by = "year")
#Print Table in Quarto
unique_systems_in_table
# A tibble: 6 × 3
year Old_Systems_Inc New_Systems_Inc
<dbl> <int> <int>
1 2016 1365 1387
2 2017 1344 1353
3 2018 1294 1300
4 2019 1267 1257
5 2020 1234 1233
6 2021 1206 1215
#iii.Graphing part iia.
#Reshape
plot_data_in <- unique_systems_in_table %>%
pivot_longer(cols = c(Old_Systems_Inc, New_Systems_Inc),
names_to = "Dataset_Source",
values_to = "System_Count")
# Create plot graph
ggplot(plot_data_in, aes(x = year, y = System_Count, color = Dataset_Source)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
theme_minimal() +
labs(title = "Unique Hospital Systems by Year",
subtitle = "Including Independent Hospitals",
x = "Year",
y = "Number of Unique Systems",
color = "Legend") +
scale_color_manual(values = c("New_Systems_Inc" = "blue", "Old_Systems_Inc" = "red"),
labels = c("New Dataset", "Old Dataset"))
#iib.By number of unique systems excluding independent hospitals
#Put in data
hospital_systems_ex <- function(data) {
data %>%
#Filtering out independent systems
filter(sysid != aha_id) %>%
#Group by Year
group_by(year) %>%
#Count the number of unique hospital systems
summarize(Unique_Systems = n_distinct(sysid), .groups = 'drop')
}
#Summary for latest/new dataset
summary_new <- hospital_systems_ex(df) %>%
rename(New_Systems_Ex = Unique_Systems)
#Summary for raw/old dataset
summary_old <- hospital_systems_ex(df0) %>%
rename(Old_Systems_Ex = Unique_Systems)
#Combine
unique_systems_ex_table <- left_join(summary_old, summary_new, by = "year")
#Display
unique_systems_ex_table
# A tibble: 6 × 3
year Old_Systems_Ex New_Systems_Ex
<dbl> <int> <int>
1 2016 369 380
2 2017 362 373
3 2018 356 366
4 2019 356 362
5 2020 353 363
6 2021 347 357
#Reshape
plot_data_ex <- unique_systems_ex_table %>%
pivot_longer(
cols = c(Old_Systems_Ex, New_Systems_Ex),
names_to = "Dataset_Source",
values_to = "System_Count"
)
# Create the plot graph
ggplot(plot_data_ex, aes(x = year, y = System_Count, color = Dataset_Source)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "Unique Hospital Systems by Year",
subtitle = "Excluding Independent Hospitals",
x = "Year",
y = "Number of Unique Systems",
color = "Legend"
) +
scale_color_manual(
values = c("Old_Systems_Ex" = "orange", "New_Systems_Ex" = "pink"),
labels = c("New Dataset", "Old Dataset")
)
```
The echo: false option disables the printing of code (only output is displayed).