Summary Stats and Figures

Title

Code block description

#Importing the tidyverse package
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.1     ✔ tibble    3.3.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Creating a path to files
root <- file.path("/Users/0524f/Dropbox/Team 1")

#Reading in cvs files with the data
df <- read_csv(file.path(root, "2_data", "updated_data", "latest_data.csv"))
Rows: 19698 Columns: 19
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): sysid, aha_id, validated, deal_type, name, sysname, lagsys, lagsys...
dbl  (9): deal, id, parent_id, year, mcrnum, merger, acquirer, target, year_lag

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df0 <- read_csv(file.path(root, "2_data", "raw_data.csv"))
Rows: 19698 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): sysid, aha_id, mcrnum, deal_type, name, sysname, lagsys, lagsysname...
dbl (8): deal, id, parent_id, year, merger, acquirer, target, validated

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Creating a Comprehensive Summary Stats Table for "How many unique hospitals existed in our old vs new dataset for the whole sample period?" 
#i. By Year

#Summary of the new/latest dataset
summary_new_dataset <- df %>%
  group_by(year) %>%
  summarize(Unique_New_Hospitals = n_distinct(aha_id), .groups = 'drop')

#Summary of the raw/old dataset
summary_old_dataset <- df0 %>%
  group_by(year) %>%
  summarize(Unique_Old_Hospitals = n_distinct(aha_id), .groups = 'drop')

#Putting the summaries into a combined table
unique_hospitals_table <- full_join(summary_old_dataset, summary_new_dataset, by = "year") %>%
  arrange(year)

#Display
unique_hospitals_table
# A tibble: 6 × 3
   year Unique_Old_Hospitals Unique_New_Hospitals
  <dbl>                <int>                <int>
1  2016                 3341                 3341
2  2017                 3351                 3351
3  2018                 3300                 3300
4  2019                 3245                 3245
5  2020                 3236                 3236
6  2021                 3225                 3225
#iii. Graphing part i

# Reshape the data from wide to long format to make it look better on the graph
plot_data <- unique_hospitals_table %>%
  pivot_longer(
    cols = c(Unique_Old_Hospitals, Unique_New_Hospitals),
    names_to = "Dataset_Type",
    values_to = "Hospital_Count"
  )

# Create the plot
ggplot(plot_data, aes(x = year, y = Hospital_Count, color = Dataset_Type)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "Unique Hospitals Over Time: Old vs. New Datasets",
    x = "Year",
    y = "Number of Unique Hospitals",
    color = "Legend"
  ) +
  scale_color_manual(
    values = c("Unique_Old_Hospitals" = "blue", "Unique_New_Hospitals" = "red"),
    labels = c("Unique_Old_Hospitals" = "Old Dataset", "Unique_New_Hospitals" = "New Dataset")
  )

#Creating a Comprehensive Summary Stats Table for "How many unique hospitals existed in our old vs new dataset for the whole sample period?" 
#iia. By number of unique systems including independent hospitals

#Put in data
hospital_systems_in <- function(data) {
  data %>%
    
  #Group by year
  group_by(year) %>% 
  
  #Count the number of unique hospital systems
  summarize(Unique_Systems = n_distinct(sysid), .groups = 'drop')
}
#Summary for latest/new dataset
summary_new <- hospital_systems_in(df) %>%
  rename(New_Systems_Inc = Unique_Systems)

#Summary for raw/old dataset
summary_old <- hospital_systems_in(df0) %>%
  rename(Old_Systems_Inc = Unique_Systems)

#Combine the old versus new
unique_systems_in_table <- left_join(summary_old, summary_new, by = "year")

#Print Table in Quarto
unique_systems_in_table
# A tibble: 6 × 3
   year Old_Systems_Inc New_Systems_Inc
  <dbl>           <int>           <int>
1  2016            1365            1387
2  2017            1344            1353
3  2018            1294            1300
4  2019            1267            1257
5  2020            1234            1233
6  2021            1206            1215
#iii.Graphing part iia.

#Reshape
plot_data_in <- unique_systems_in_table %>%
  pivot_longer(cols = c(Old_Systems_Inc, New_Systems_Inc), 
               names_to = "Dataset_Source", 
               values_to = "System_Count")

# Create plot graph
ggplot(plot_data_in, aes(x = year, y = System_Count, color = Dataset_Source)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(title = "Unique Hospital Systems by Year",
       subtitle = "Including Independent Hospitals",
       x = "Year",
       y = "Number of Unique Systems",
       color = "Legend") +
  scale_color_manual(values = c("New_Systems_Inc" = "blue", "Old_Systems_Inc" = "red"),
                     labels = c("New Dataset", "Old Dataset"))

#iib.By number of unique systems excluding independent hospitals

#Put in data
hospital_systems_ex <- function(data) {
  data %>%
    
    #Filtering out independent systems
    filter(sysid != aha_id) %>%
    
    #Group by Year
    group_by(year) %>% 
    
    #Count the number of unique hospital systems
    summarize(Unique_Systems = n_distinct(sysid), .groups = 'drop')
}

#Summary for latest/new dataset
summary_new <- hospital_systems_ex(df) %>%
  rename(New_Systems_Ex = Unique_Systems)

#Summary for raw/old dataset
summary_old <- hospital_systems_ex(df0) %>%
  rename(Old_Systems_Ex = Unique_Systems)

#Combine
unique_systems_ex_table <- left_join(summary_old, summary_new, by = "year")


#Display
unique_systems_ex_table
# A tibble: 6 × 3
   year Old_Systems_Ex New_Systems_Ex
  <dbl>          <int>          <int>
1  2016            369            380
2  2017            362            373
3  2018            356            366
4  2019            356            362
5  2020            353            363
6  2021            347            357
#Reshape
plot_data_ex <- unique_systems_ex_table %>%
  pivot_longer(
    cols = c(Old_Systems_Ex, New_Systems_Ex), 
    names_to = "Dataset_Source", 
    values_to = "System_Count"
  )

# Create the plot graph
ggplot(plot_data_ex, aes(x = year, y = System_Count, color = Dataset_Source)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +  
  theme_minimal() +
  labs(
    title = "Unique Hospital Systems by Year",
    subtitle = "Excluding Independent Hospitals",
    x = "Year",
    y = "Number of Unique Systems",
    color = "Legend"
  ) +
  scale_color_manual(
    values = c("Old_Systems_Ex" = "orange", "New_Systems_Ex" = "pink"),
    labels = c("New Dataset", "Old Dataset")
  )

```

The echo: false option disables the printing of code (only output is displayed).