# Load necessary libraries
library(readxl)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(tibble) # Needed for tibble() function used in population cleaning
# ---- 1. READ EXPENDITURE FILES & CLEAN ----
rev_raw <- read_excel("Revenue Expenditure.xlsx", sheet = 1, skip = 2)
## New names:
## • `` -> `...1`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
## • `` -> `...16`
## • `` -> `...17`
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...22`
## • `` -> `...23`
## • `` -> `...24`
## • `` -> `...25`
## • `` -> `...26`
## • `` -> `...27`
## • `` -> `...28`
## • `` -> `...29`
## • `` -> `...30`
## • `` -> `...31`
## • `` -> `...32`
cap_raw <- read_excel("Capital Expendituree.xlsx", sheet = 1, skip = 2)
## New names:
## • `` -> `...1`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
## • `` -> `...16`
## • `` -> `...17`
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...22`
## • `` -> `...23`
## • `` -> `...24`
## • `` -> `...25`
## • `` -> `...26`
## • `` -> `...27`
## • `` -> `...28`
## • `` -> `...29`
## • `` -> `...30`
## • `` -> `...31`
# Basic cleaning: Drop empty columns and rename year column
rev_raw <- rev_raw %>% select(where(~!all(is.na(.))))
cap_raw <- cap_raw %>% select(where(~!all(is.na(.))))
names(rev_raw)[1] <- "Year"
names(cap_raw)[1] <- "Year"

# Remove footer rows and parse the year
rev_raw <- rev_raw %>% 
  filter(!is.na(Year), !grepl("source|description", tolower(as.character(Year)))) %>%
  mutate(year = as.integer(str_extract(as.character(Year), "(19|20)\\d{2}")))

cap_raw <- cap_raw %>% 
  filter(!is.na(Year), !grepl("source|description", tolower(as.character(Year)))) %>%
  mutate(year = as.integer(str_extract(as.character(Year), "(19|20)\\d{2}")))

# Handle potential extra header row (common fix for these datasets)
if (any(grepl("Medical and Public Health|Health Finance", names(rev_raw)[2], ignore.case = TRUE))) {
  rev_raw <- read_excel("Revenue Expenditure.xlsx", sheet = 1, skip = 3)
  rev_raw <- rev_raw[, colSums(!is.na(rev_raw)) > 0]
  names(rev_raw)[1] <- "Year"
  rev_raw <- rev_raw %>%
    filter(!is.na(Year)) %>%
    filter(!grepl("source|description", tolower(as.character(Year)))) %>%
    mutate(year = as.integer(str_extract(as.character(Year), "(19|20)\\d{2}")))
}
if (any(grepl("Medical and Public Health|Health Finance", names(cap_raw)[2], ignore.case = TRUE))) {
  cap_raw <- read_excel("Capital Expendituree.xlsx", sheet = 1, skip = 3)
  cap_raw <- cap_raw[, colSums(!is.na(cap_raw)) > 0]
  names(cap_raw)[1] <- "Year"
  cap_raw <- cap_raw %>%
    filter(!is.na(Year)) %>%
    filter(!grepl("source|description", tolower(as.character(Year)))) %>%
    mutate(year = as.integer(str_extract(as.character(Year), "(19|20)\\d{2}")))
}

# ---- 2. PIVOT TO LONG FORMAT & JOIN ----
state_cols_rev <- setdiff(names(rev_raw), c("Year","year"))
state_cols_cap <- setdiff(names(cap_raw), c("Year","year"))

# Pivot Revenue data
rev <- rev_raw %>%
  pivot_longer(cols = all_of(state_cols_rev), names_to = "state", values_to = "rev_amount") %>%
  mutate(
    state = gsub("\\s+", " ", trimws(as.character(state))),
    rev_amount = as.numeric(gsub(",", "", as.character(rev_amount))) 
  ) %>%
  select(state, year, rev_amount) %>%
  filter(!is.na(year), !is.na(rev_amount))

# Pivot Capital data
cap <- cap_raw %>%
  pivot_longer(cols = all_of(state_cols_cap), names_to = "state", values_to = "cap_amount") %>%
  mutate(
    state = gsub("\\s+", " ", trimws(as.character(state))),
    cap_amount = as.numeric(gsub(",", "", as.character(cap_amount)))
  ) %>%
  select(state, year, cap_amount) %>%
  filter(!is.na(year), !is.na(cap_amount))

# Full join and calculate total expenditure
df <- full_join(rev, cap, by = c("state","year")) %>%
  mutate(
    rev_amount = ifelse(is.na(rev_amount), 0, rev_amount),
    cap_amount = ifelse(is.na(cap_amount), 0, cap_amount),
    total_health_exp = rev_amount + cap_amount
  ) %>%
  arrange(state, year)

head(df, 10)
## # A tibble: 10 × 5
##    state           year rev_amount cap_amount total_health_exp
##    <chr>          <int>      <dbl>      <dbl>            <dbl>
##  1 Andhra Pradesh  2000     101332       4794           106126
##  2 Andhra Pradesh  2001     104163       5331           109494
##  3 Andhra Pradesh  2002     109117       3347           112464
##  4 Andhra Pradesh  2003     120039        823           120862
##  5 Andhra Pradesh  2004     118478       1510           119988
##  6 Andhra Pradesh  2005     127787        549           128336
##  7 Andhra Pradesh  2006     149447        662           150109
##  8 Andhra Pradesh  2007     204240       4303           208543
##  9 Andhra Pradesh  2008     238613       2952           241565
## 10 Andhra Pradesh  2009     280052       3985           284037
# ---- 3. CAGR TABLE & TOP-7 PLOT ----
cagr_table <- df %>%
  group_by(state) %>%
  summarise(
    year_start = min(year[total_health_exp > 0], na.rm = TRUE),
    year_end   = max(year[total_health_exp > 0], na.rm = TRUE),
    x_start    = total_health_exp[year == year_start][1],
    x_end      = total_health_exp[year == year_end][1],
    n_years    = year_end - year_start,
    .groups = "drop"
  ) %>%
  filter(is.finite(x_start), is.finite(x_end), n_years > 0, x_start > 0) %>%
  mutate(cagr = (x_end / x_start)^(1 / n_years) - 1) %>%
  arrange(desc(cagr))

print(cagr_table)
## # A tibble: 31 × 7
##    state             year_start year_end x_start  x_end n_years  cagr
##    <chr>                  <int>    <int>   <dbl>  <dbl>   <int> <dbl>
##  1 Uttarakhand             2000     2020    2925 252533      20 0.250
##  2 Chhattisgarh            2000     2020    6820 538955      20 0.244
##  3 Haryana                 2000     2020   26390 621020      20 0.171
##  4 Assam                   2000     2020   28558 609529      20 0.165
##  5 Odisha                  2000     2020   38301 725234      20 0.158
##  6 Jharkhand               2001     2020   29661 458231      19 0.155
##  7 Goa                     2000     2020    8325 146028      20 0.154
##  8 Sikkim                  2000     2020    2999  52544      20 0.154
##  9 Arunachal Pradesh       2000     2020    5772  99255      20 0.153
## 10 Jammu and Kashmir       2000     2020   36368 590230      20 0.150
## # ℹ 21 more rows
top_states <- head(cagr_table$state, 7)

# Plotting the time series for the top 7 states
ggplot(
  df %>% filter(state %in% top_states),
  aes(x = year, y = total_health_exp, color = state)
) +
  geom_line(linewidth = 1) +
  geom_point(size = 1.3) +
  scale_y_continuous(labels = scales::label_number(scale_cut = scales::cut_si("₹"))) +
  labs(
    title = "Total Health Expenditure — Top 7 States by CAGR",
    x = "Year",
    y = "₹ (nominal)",
    color = "State"
  ) +
  theme_minimal(base_size = 12)

# ---- 4. MERGE GDP AND POPULATION DATA & COMPUTE METRICS (Robust Version) ----
gdp_file <- "gdp.csv"
pop_file <- "pop.csv"

# ----- GDP: read and pivot long -----
gdp_raw <- read_csv(gdp_file, show_col_types = FALSE)

state_col_gdp <- names(gdp_raw)[grepl("state|state/ut|state_ut|region|unit", tolower(names(gdp_raw)))]
if (length(state_col_gdp) == 0) state_col_gdp <- names(gdp_raw)[1]
state_col_gdp <- state_col_gdp[1]

val_cols_gdp <- names(gdp_raw)[grepl("gsdp|gdp|\\d{4}", tolower(names(gdp_raw)))]
val_cols_gdp <- setdiff(val_cols_gdp, state_col_gdp)
if (length(val_cols_gdp) == 0) val_cols_gdp <- setdiff(names(gdp_raw), state_col_gdp)

gdp_long <- gdp_raw %>%
  pivot_longer(cols = all_of(val_cols_gdp), names_to = "gdp_col", values_to = "gdp_raw") %>%
  mutate(
    state = gsub("\\s+", " ", trimws(.data[[state_col_gdp]])),
    year = as.integer(str_extract(gdp_col, "(19|20)\\d{2}")),
    year = ifelse(is.na(year), as.integer(str_extract(gdp_col, "\\d{4}")), year),
    gdp = as.numeric(gsub(",", "", as.character(gdp_raw)))
  ) %>%
  select(state, year, gdp) %>%
  filter(!is.na(state))

if (all(is.na(gdp_long$year))) gdp_long <- gdp_long %>% mutate(year = 2011)
gdp <- gdp_long %>% arrange(state, year)

# ----- POPULATION: read and pivot long (handles snapshot columns too) -----
pop_raw <- read_csv(pop_file, show_col_types = FALSE)
state_col_pop <- names(pop_raw)[grepl("state|state/ut|state_ut|region|name", tolower(names(pop_raw)))]
if (length(state_col_pop) == 0) state_col_pop <- names(pop_raw)[1]
state_col_pop <- state_col_pop[1]

val_cols_pop <- names(pop_raw)[grepl("pop|population|\\d{4}", tolower(names(pop_raw)))]
val_cols_pop <- setdiff(val_cols_pop, state_col_pop)

if (length(val_cols_pop) > 0) {
  pop_long <- pop_raw %>%
    pivot_longer(cols = all_of(val_cols_pop), names_to = "pop_col", values_to = "pop_raw") %>%
    mutate(
      state = gsub("\\s+", " ", trimws(.data[[state_col_pop]])),
      year = as.integer(str_extract(pop_col, "(19|20)\\d{2}")),
      year = ifelse(is.na(year), as.integer(str_extract(pop_col, "\\d{4}")), year),
      population = as.numeric(gsub(",", "", as.character(pop_raw)))
    ) %>%
    select(state, year, population) %>%
    filter(!is.na(state))
  pop_snapshot <- pop_long %>% filter(is.na(year)) %>% select(state, population) %>% distinct()
  pop_long <- pop_long %>% filter(!is.na(year))
} else {
  num_cols <- names(pop_raw)[sapply(pop_raw, function(x) all(is.na(as.numeric(gsub(",","",as.character(x)))) == FALSE, na.rm = TRUE))]
  pop_val_col <- setdiff(num_cols, state_col_pop)
  if (length(pop_val_col) == 0) stop("Could not detect population column in pop.csv; open the file and confirm column names.")
  pop_snapshot <- pop_raw %>% rename(state = !!state_col_pop) %>%
    transmute(state = gsub("\\s+", " ", trimws(state)),
              population = as.numeric(gsub(",", "", as.character(.data[[pop_val_col[1]]])))) %>%
    distinct()
  pop_long <- tibble(state = character(), year = integer(), population = numeric())
}

# create full pop table for all years present in df
years_needed <- sort(unique(df$year))
pop_from_years <- pop_long
if (nrow(pop_snapshot) > 0) {
  snap_rep <- expand.grid(state = pop_snapshot$state, year = years_needed, stringsAsFactors = FALSE) %>%
    as_tibble() %>%
    left_join(pop_snapshot, by = "state")
  pop_full <- bind_rows(pop_from_years, snap_rep) %>%
    group_by(state, year) %>%
    summarise(population = first(na.omit(population)), .groups = "drop")
} else {
  pop_full <- pop_from_years
}

states_union <- sort(unique(c(df$state, gdp$state, pop_full$state)))
pop_full <- expand.grid(state = states_union, year = years_needed, stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  left_join(pop_full, by = c("state","year")) %>%
  group_by(state) %>%
  fill(population, .direction = "downup") %>%
  ungroup()

# ----- Harmonize common state name variants -----
recode_pairs <- c("Orissa" = "Odisha", "Pondicherry" = "Puducherry",
                  "NCT of Delhi" = "Delhi", "Jammu and Kashmir" = "Jammu & Kashmir",
                  "Uttaranchal" = "Uttarakhand", "Andhra Pradesh (Undivided)" = "Andhra Pradesh")
gdp <- gdp %>% mutate(state = recode(state, !!!recode_pairs, .default = state))
pop_full <- pop_full %>% mutate(state = recode(state, !!!recode_pairs, .default = state))
df <- df %>% mutate(state = gsub("\\s+", " ", trimws(as.character(state)))) %>% mutate(state = recode(state, !!!recode_pairs, .default = state))

# ----- Diagnostics: print a sample of states so you can check -----
cat("sample states in df:", paste(head(unique(df$state), 12), collapse = ", "), "\n")
## sample states in df: Andhra Pradesh, Arunachal Pradesh, Assam, Bihar, Chhattisgarh, Delhi, Goa, Gujarat, Haryana, Himachal Pradesh, Jammu & Kashmir, Jharkhand
cat("sample states in gdp:", paste(head(unique(gdp$state), 12), collapse = ", "), "\n")
## sample states in gdp: ---, Andhra Pradesh (Post-Bifurcation), Andhra Pradesh, Arunachal Pradesh, Assam, Bihar, Chhattisgarh, Goa, Gujarat, Haryana, Himachal Pradesh, Jammu & Kashmir (Undivided)
cat("sample states in pop:", paste(head(unique(pop_full$state), 12), collapse = ", "), "\n")
## sample states in pop: ---, Andaman & Nicobar Islands, Andhra Pradesh, Andhra Pradesh (Post-Bifurcation), Arunachal Pradesh, Assam, Bihar, Chandigarh, Chhattisgarh, Dadra & Nagar Haveli and Daman & Diu, Delhi, Goa
# ----- Fill GDP for all states x years: use latest available per state if needed -----
gdp_latest <- gdp %>% arrange(state, year) %>% group_by(state) %>% summarise(gdp_latest = last(na.omit(gdp)), .groups="drop")
gdp_full <- expand.grid(state = states_union, year = years_needed, stringsAsFactors = FALSE) %>%
  as_tibble() %>% left_join(gdp %>% select(state, year, gdp), by = c("state","year")) %>%
  left_join(gdp_latest, by = "state") %>%
  mutate(gdp = ifelse(is.na(gdp), gdp_latest, gdp)) %>%
  select(state, year, gdp)

# ----- Join with df and compute metrics -----
df2 <- df %>%
  left_join(gdp_full, by = c("state","year")) %>%
  left_join(pop_full %>% rename(population = population), by = c("state","year"))

# --- FINAL DIAGNOSTIC CHECK (key to solving blank plots) ---
cat("rows after join:", nrow(df2), "\n")
## rows after join: 673
cat("rows with GDP & POP:", sum(!is.na(df2$gdp) & !is.na(df2$population)), "\n")
## rows with GDP & POP: 573
# Filter and compute metrics
df2 <- df2 %>% filter(!is.na(gdp), !is.na(population)) %>%
  mutate(pche = total_health_exp / population,
         he_pct_gdp = 100 * total_health_exp / gdp)

# ----- Plots -----
si_rupee <- scales::label_number(scale_cut = scales::cut_si("₹"))
p1 <- ggplot(df2, aes(x=year, y=pche, group=state)) +
  geom_line(alpha=0.35, linewidth=0.7) +
  stat_summary(aes(group=1), fun=median, geom="line", linewidth=1) +
  scale_y_continuous(labels = si_rupee) + labs(title="PCHE", x="Year", y="₹ per person") + theme_minimal()
p2 <- ggplot(df2, aes(x=year, y=he_pct_gdp, group=state)) +
  geom_line(alpha=0.35, linewidth=0.7) +
  stat_summary(aes(group=1), fun=median, geom="line", linewidth=1) +
  labs(title="Health Expenditure as % of GSDP", x="Year", y="% of GSDP") + theme_minimal()
print(p1); print(p2)

# ---- 5. ELASTICITY TEST (LOG-LOG MODEL) ----
if (exists("df2") && nrow(df2) > 0) {
  df2 <- df2 %>%
    mutate(
      pci = gdp / population, # Per-Capita Income (PCI)
      ln_pche = log(pche),
      ln_pci  = log(pci)
    ) %>%
    filter(is.finite(ln_pche), is.finite(ln_pci))
  
  model <- lm(ln_pche ~ ln_pci, data = df2)
  summary(model)
  
  beta <- coef(model)["ln_pci"]
  if (!is.na(beta)) {
    cat("\nElasticity (beta) =", round(beta, 3), "\n")
    if (beta < 1) cat("Conclusion: beta <", 1, "→ public healthcare is an **INCOME NECESSITY**.\n")
    else if (beta > 1) cat("Conclusion: beta >", 1, "→ public healthcare is an **INCOME LUXURY**.\n")
    else cat("Conclusion: beta ≈ 1 → unit elasticity.\n")
  }
} else {
  message("Elasticity test skipped: df2 is empty.")
}
## 
## Elasticity (beta) = 0.546 
## Conclusion: beta < 1 → public healthcare is an **INCOME NECESSITY**.
# ---- 6. KDE / OVERLAPPING DENSITIES for last 3 years ----
if (exists("df2") && nrow(df2) > 0) {
  years_avail <- sort(unique(df2$year))
  years_pick <- tail(years_avail, 3)

  kde_df <- df2 %>% filter(year %in% years_pick)
  rupee_labeler <- label_number(scale_cut = scales::cut_si("₹"))

  ggplot(kde_df, aes(x = pche, color = factor(year))) +
    geom_density(linewidth = 1) +
    scale_x_continuous(labels = rupee_labeler) +
    labs(
      title = "Distribution of Per-Capita Health Expenditure (PCHE)",
      subtitle = paste("Years:", paste(years_pick, collapse = ", ")),
      x = "PCHE (₹ per person)", y = "Density", color = "Year"
    ) +
    theme_minimal(base_size = 12)
} else {
  message("KDE plot skipped: df2 is empty.")
}