Summary Stats and Figures

installing and accessing data

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.1     ✔ tibble    3.3.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
root <- file.path("/Users/tanha/Downloads/statsfigures") # paste your path to the Team 1 folder
raw_data <- read_csv(file.path(root, "raw_data.csv"))
Rows: 19698 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): sysid, aha_id, mcrnum, deal_type, name, sysname, lagsys, lagsysname...
dbl (8): deal, id, parent_id, year, merger, acquirer, target, validated

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
latest_data <- read_csv(file.path(root, "latest_data.csv"))
Rows: 19698 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): sysid, aha_id, validated, deal_type, name, sysname, lagsys, lagsysn...
dbl (8): deal, id, parent_id, year, mcrnum, merger, acquirer, target

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Part B: Number of Deals per Year

#Count unique deals per year for OLD dataset

raw_deals_per_year <- raw_data %>%
  group_by(year) %>%
  summarise(deal = n_distinct(deal), .groups = "drop")
  
  
#Count unique deals per year for NEW dataset

latest_deals_per_year <- latest_data %>%
  group_by(year) %>%
  summarise(deal = n_distinct(deal), .groups = "drop")

#Combine datasets for plotting
deals_combined <- bind_rows(
  raw_deals_per_year %>% mutate(dataset = "Old"),
  latest_deals_per_year %>% mutate(dataset = "New")
)

#Plot bar graph
ggplot(deals_combined, aes(x = factor(year), y = deal, fill = dataset)) +
  geom_bar(stat = "identity", position = "dodge") +       # side-by-side bars
  labs(
    title = "Number of Deals Per Year: Old vs New Dataset",
    x = "Year",
    y = "Number of Deals",
    fill = "Dataset"
  ) +
  geom_text(aes(label = deal),                               # add labels on top
            position = position_dodge(width = 0.9), 
            vjust = -0.5) +
  scale_fill_manual(values = c("Old" = "steelblue", "New" = "lightgreen")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # rotate x-axis if needed

Part C Average Number of Hospitals Participating in a Deal

# Count hospitals per deal for OLD dataset
raw_hospitals_per_deal <- raw_data %>%
  group_by(year, deal) %>%
  summarise(hospitals_in_deal = n_distinct(aha_id), .groups = "drop")

# Count hospitals per deal for NEW dataset
latest_hospitals_per_deal <- latest_data %>%
  group_by(year, deal) %>%
  summarise(hospitals_in_deal = n_distinct(aha_id), .groups = "drop")

# PLot each deal per year
# OLD dataset
ggplot(raw_hospitals_per_deal, aes(x = factor(deal), y = hospitals_in_deal)) +
  geom_col(fill = "steelblue") +
  facet_wrap(~ year, scales = "free_x") +
  labs(
    title = "Number of Hospitals per Deal by Year (Old Dataset)",
    x = "Deal Number",
    y = "Number of Hospitals"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))

# NEW dataset
ggplot(latest_hospitals_per_deal, aes(x = factor(deal), y = hospitals_in_deal)) +
  geom_col(fill = "lightgreen") +
  facet_wrap(~ year, scales = "free_x") +
  labs(
    title = "Number of Hospitals per Deal by Year (New Dataset)",
    x = "Deal Number",
    y = "Number of Hospitals"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))

#Average number of hospitals per deal across years
# Compute averages per year (OLD)
raw_avg_per_year <- raw_hospitals_per_deal %>%
  group_by(year) %>%
  summarise(avg_hospitals = mean(hospitals_in_deal), .groups = "drop")

# Compute averages per year (NEW)
latest_avg_per_year <- latest_hospitals_per_deal %>%
  group_by(year) %>%
  summarise(avg_hospitals = mean(hospitals_in_deal), .groups = "drop")

# Plot line graph comparing OLD vs NEW
ggplot() +
  geom_line(data = raw_avg_per_year, aes(x = year, y = avg_hospitals), color = "steelblue", size = 1.2) +
  geom_point(data = raw_avg_per_year, aes(x = year, y = avg_hospitals), color = "steelblue", size = 3) +
  geom_line(data = latest_avg_per_year, aes(x = year, y = avg_hospitals), color = "lightgreen", size = 1.2) +
  geom_point(data = latest_avg_per_year, aes(x = year, y = avg_hospitals), color = "lightgreen", size = 3) +
  labs(
    title = "Average Number of Hospitals per Deal Across Years",
    x = "Year",
    y = "Average Hospitals per Deal",
    color = "Dataset"
  ) +
  theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Ignoring unknown labels:
• colour : "Dataset"

Summary Stats Table

# Overall stats for OLD dataset
raw_stats <- raw_hospitals_per_deal %>%
  summarise(
    num_deals_total = n(),
    min_hospitals = min(hospitals_in_deal),
    median_hospitals = median(hospitals_in_deal),
    mean_hospitals = mean(hospitals_in_deal),
    max_hospitals = max(hospitals_in_deal)
  ) %>%
  mutate(dataset = "Old")  # label

# Compute number of deals per year for OLD dataset
raw_deals_per_year <- raw_hospitals_per_deal %>%
  group_by(year) %>%
  summarise(num_deals_year = n(), .groups = "drop")
  
  
  
# Overall stats for NEW dataset
latest_stats <- latest_hospitals_per_deal %>%
  summarise(
    num_deals_total = n(),
    min_hospitals = min(hospitals_in_deal),
    median_hospitals = median(hospitals_in_deal),
    mean_hospitals = mean(hospitals_in_deal),
    max_hospitals = max(hospitals_in_deal)
  ) %>%
  mutate(dataset = "New")  # label

# Compute number of deals per year for NEW dataset
latest_deals_per_year <- latest_hospitals_per_deal %>%
  group_by(year) %>%
  summarise(num_deals_year = n(), .groups = "drop")
  
  
summary_table <- bind_rows(raw_stats, latest_stats) %>%
  select(dataset, everything())


cat("=== Overall Summary Stats (Old vs New) ===\n")
=== Overall Summary Stats (Old vs New) ===
print(summary_table)
# A tibble: 2 × 6
  dataset num_deals_total min_hospitals median_hospitals mean_hospitals
  <chr>             <int>         <int>            <int>          <dbl>
1 Old                 445             1                4           44.3
2 New                 445             1                4           44.3
# ℹ 1 more variable: max_hospitals <int>
cat("\n=== Number of Deals Per Year (Old Dataset) ===\n")

=== Number of Deals Per Year (Old Dataset) ===
print(raw_deals_per_year)
# A tibble: 6 × 2
   year num_deals_year
  <dbl>          <int>
1  2016             88
2  2017             84
3  2018             67
4  2019             81
5  2020             75
6  2021             50
cat("\n=== Number of Deals Per Year (New Dataset) ===\n")

=== Number of Deals Per Year (New Dataset) ===
print(latest_deals_per_year)
# A tibble: 6 × 2
   year num_deals_year
  <dbl>          <int>
1  2016             88
2  2017             84
3  2018             67
4  2019             81
5  2020             75
6  2021             50