# Load necessary libraries
library(readxl)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(tibble) # Needed for tibble() function used in population cleaning
# ---- 1. READ EXPENDITURE FILES & CLEAN ----
rev_raw <- read_excel("Revenue Expenditure.xlsx", sheet = 1, skip = 2)
## New names:
## • `` -> `...1`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
## • `` -> `...16`
## • `` -> `...17`
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...22`
## • `` -> `...23`
## • `` -> `...24`
## • `` -> `...25`
## • `` -> `...26`
## • `` -> `...27`
## • `` -> `...28`
## • `` -> `...29`
## • `` -> `...30`
## • `` -> `...31`
## • `` -> `...32`
cap_raw <- read_excel("Capital Expendituree.xlsx", sheet = 1, skip = 2)
## New names:
## • `` -> `...1`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
## • `` -> `...16`
## • `` -> `...17`
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...22`
## • `` -> `...23`
## • `` -> `...24`
## • `` -> `...25`
## • `` -> `...26`
## • `` -> `...27`
## • `` -> `...28`
## • `` -> `...29`
## • `` -> `...30`
## • `` -> `...31`
# Basic cleaning: Drop empty columns and rename year column
rev_raw <- rev_raw %>% select(where(~!all(is.na(.))))
cap_raw <- cap_raw %>% select(where(~!all(is.na(.))))
names(rev_raw)[1] <- "Year"
names(cap_raw)[1] <- "Year"
# Remove footer rows and parse the year
rev_raw <- rev_raw %>%
filter(!is.na(Year), !grepl("source|description", tolower(as.character(Year)))) %>%
mutate(year = as.integer(str_extract(as.character(Year), "(19|20)\\d{2}")))
cap_raw <- cap_raw %>%
filter(!is.na(Year), !grepl("source|description", tolower(as.character(Year)))) %>%
mutate(year = as.integer(str_extract(as.character(Year), "(19|20)\\d{2}")))
# Handle potential extra header row (common fix for these datasets)
if (any(grepl("Medical and Public Health|Health Finance", names(rev_raw)[2], ignore.case = TRUE))) {
rev_raw <- read_excel("Revenue Expenditure.xlsx", sheet = 1, skip = 3)
rev_raw <- rev_raw[, colSums(!is.na(rev_raw)) > 0]
names(rev_raw)[1] <- "Year"
rev_raw <- rev_raw %>%
filter(!is.na(Year)) %>%
filter(!grepl("source|description", tolower(as.character(Year)))) %>%
mutate(year = as.integer(str_extract(as.character(Year), "(19|20)\\d{2}")))
}
if (any(grepl("Medical and Public Health|Health Finance", names(cap_raw)[2], ignore.case = TRUE))) {
cap_raw <- read_excel("Capital Expendituree.xlsx", sheet = 1, skip = 3)
cap_raw <- cap_raw[, colSums(!is.na(cap_raw)) > 0]
names(cap_raw)[1] <- "Year"
cap_raw <- cap_raw %>%
filter(!is.na(Year)) %>%
filter(!grepl("source|description", tolower(as.character(Year)))) %>%
mutate(year = as.integer(str_extract(as.character(Year), "(19|20)\\d{2}")))
}
# ---- 2. PIVOT TO LONG FORMAT & JOIN ----
state_cols_rev <- setdiff(names(rev_raw), c("Year","year"))
state_cols_cap <- setdiff(names(cap_raw), c("Year","year"))
# Pivot Revenue data
rev <- rev_raw %>%
pivot_longer(cols = all_of(state_cols_rev), names_to = "state", values_to = "rev_amount") %>%
mutate(
state = gsub("\\s+", " ", trimws(as.character(state))),
rev_amount = as.numeric(gsub(",", "", as.character(rev_amount)))
) %>%
select(state, year, rev_amount) %>%
filter(!is.na(year), !is.na(rev_amount))
# Pivot Capital data
cap <- cap_raw %>%
pivot_longer(cols = all_of(state_cols_cap), names_to = "state", values_to = "cap_amount") %>%
mutate(
state = gsub("\\s+", " ", trimws(as.character(state))),
cap_amount = as.numeric(gsub(",", "", as.character(cap_amount)))
) %>%
select(state, year, cap_amount) %>%
filter(!is.na(year), !is.na(cap_amount))
# Full join and calculate total expenditure
df <- full_join(rev, cap, by = c("state","year")) %>%
mutate(
rev_amount = ifelse(is.na(rev_amount), 0, rev_amount),
cap_amount = ifelse(is.na(cap_amount), 0, cap_amount),
total_health_exp = rev_amount + cap_amount
) %>%
arrange(state, year)
head(df, 10)
## # A tibble: 10 × 5
## state year rev_amount cap_amount total_health_exp
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Andhra Pradesh 2000 101332 4794 106126
## 2 Andhra Pradesh 2001 104163 5331 109494
## 3 Andhra Pradesh 2002 109117 3347 112464
## 4 Andhra Pradesh 2003 120039 823 120862
## 5 Andhra Pradesh 2004 118478 1510 119988
## 6 Andhra Pradesh 2005 127787 549 128336
## 7 Andhra Pradesh 2006 149447 662 150109
## 8 Andhra Pradesh 2007 204240 4303 208543
## 9 Andhra Pradesh 2008 238613 2952 241565
## 10 Andhra Pradesh 2009 280052 3985 284037
# ---- 3. CAGR TABLE & TOP-7 PLOT ----
cagr_table <- df %>%
group_by(state) %>%
summarise(
year_start = min(year[total_health_exp > 0], na.rm = TRUE),
year_end = max(year[total_health_exp > 0], na.rm = TRUE),
x_start = total_health_exp[year == year_start][1],
x_end = total_health_exp[year == year_end][1],
n_years = year_end - year_start,
.groups = "drop"
) %>%
filter(is.finite(x_start), is.finite(x_end), n_years > 0, x_start > 0) %>%
mutate(cagr = (x_end / x_start)^(1 / n_years) - 1) %>%
arrange(desc(cagr))
print(cagr_table)
## # A tibble: 31 × 7
## state year_start year_end x_start x_end n_years cagr
## <chr> <int> <int> <dbl> <dbl> <int> <dbl>
## 1 Uttarakhand 2000 2020 2925 252533 20 0.250
## 2 Chhattisgarh 2000 2020 6820 538955 20 0.244
## 3 Haryana 2000 2020 26390 621020 20 0.171
## 4 Assam 2000 2020 28558 609529 20 0.165
## 5 Odisha 2000 2020 38301 725234 20 0.158
## 6 Jharkhand 2001 2020 29661 458231 19 0.155
## 7 Goa 2000 2020 8325 146028 20 0.154
## 8 Sikkim 2000 2020 2999 52544 20 0.154
## 9 Arunachal Pradesh 2000 2020 5772 99255 20 0.153
## 10 Jammu and Kashmir 2000 2020 36368 590230 20 0.150
## # ℹ 21 more rows
top_states <- head(cagr_table$state, 7)
# Plotting the time series for the top 7 states
ggplot(
df %>% filter(state %in% top_states),
aes(x = year, y = total_health_exp, color = state)
) +
geom_line(linewidth = 1) +
geom_point(size = 1.3) +
scale_y_continuous(labels = scales::label_number(scale_cut = scales::cut_si("₹"))) +
labs(
title = "Total Health Expenditure — Top 7 States by CAGR",
x = "Year",
y = "₹ (nominal)",
color = "State"
) +
theme_minimal(base_size = 12)

# ---- 4. MERGE GDP AND POPULATION DATA & COMPUTE METRICS (Robust Version) ----
gdp_file <- "gdp.csv"
pop_file <- "pop.csv"
# ----- GDP: read and pivot long -----
gdp_raw <- read_csv(gdp_file, show_col_types = FALSE)
state_col_gdp <- names(gdp_raw)[grepl("state|state/ut|state_ut|region|unit", tolower(names(gdp_raw)))]
if (length(state_col_gdp) == 0) state_col_gdp <- names(gdp_raw)[1]
state_col_gdp <- state_col_gdp[1]
val_cols_gdp <- names(gdp_raw)[grepl("gsdp|gdp|\\d{4}", tolower(names(gdp_raw)))]
val_cols_gdp <- setdiff(val_cols_gdp, state_col_gdp)
if (length(val_cols_gdp) == 0) val_cols_gdp <- setdiff(names(gdp_raw), state_col_gdp)
gdp_long <- gdp_raw %>%
pivot_longer(cols = all_of(val_cols_gdp), names_to = "gdp_col", values_to = "gdp_raw") %>%
mutate(
state = gsub("\\s+", " ", trimws(.data[[state_col_gdp]])),
year = as.integer(str_extract(gdp_col, "(19|20)\\d{2}")),
year = ifelse(is.na(year), as.integer(str_extract(gdp_col, "\\d{4}")), year),
gdp = as.numeric(gsub(",", "", as.character(gdp_raw)))
) %>%
select(state, year, gdp) %>%
filter(!is.na(state))
if (all(is.na(gdp_long$year))) gdp_long <- gdp_long %>% mutate(year = 2011)
gdp <- gdp_long %>% arrange(state, year)
# ----- POPULATION: read and pivot long (handles snapshot columns too) -----
pop_raw <- read_csv(pop_file, show_col_types = FALSE)
state_col_pop <- names(pop_raw)[grepl("state|state/ut|state_ut|region|name", tolower(names(pop_raw)))]
if (length(state_col_pop) == 0) state_col_pop <- names(pop_raw)[1]
state_col_pop <- state_col_pop[1]
val_cols_pop <- names(pop_raw)[grepl("pop|population|\\d{4}", tolower(names(pop_raw)))]
val_cols_pop <- setdiff(val_cols_pop, state_col_pop)
if (length(val_cols_pop) > 0) {
pop_long <- pop_raw %>%
pivot_longer(cols = all_of(val_cols_pop), names_to = "pop_col", values_to = "pop_raw") %>%
mutate(
state = gsub("\\s+", " ", trimws(.data[[state_col_pop]])),
year = as.integer(str_extract(pop_col, "(19|20)\\d{2}")),
year = ifelse(is.na(year), as.integer(str_extract(pop_col, "\\d{4}")), year),
population = as.numeric(gsub(",", "", as.character(pop_raw)))
) %>%
select(state, year, population) %>%
filter(!is.na(state))
pop_snapshot <- pop_long %>% filter(is.na(year)) %>% select(state, population) %>% distinct()
pop_long <- pop_long %>% filter(!is.na(year))
} else {
num_cols <- names(pop_raw)[sapply(pop_raw, function(x) all(is.na(as.numeric(gsub(",","",as.character(x)))) == FALSE, na.rm = TRUE))]
pop_val_col <- setdiff(num_cols, state_col_pop)
if (length(pop_val_col) == 0) stop("Could not detect population column in pop.csv; open the file and confirm column names.")
pop_snapshot <- pop_raw %>% rename(state = !!state_col_pop) %>%
transmute(state = gsub("\\s+", " ", trimws(state)),
population = as.numeric(gsub(",", "", as.character(.data[[pop_val_col[1]]])))) %>%
distinct()
pop_long <- tibble(state = character(), year = integer(), population = numeric())
}
# create full pop table for all years present in df
years_needed <- sort(unique(df$year))
pop_from_years <- pop_long
if (nrow(pop_snapshot) > 0) {
snap_rep <- expand.grid(state = pop_snapshot$state, year = years_needed, stringsAsFactors = FALSE) %>%
as_tibble() %>%
left_join(pop_snapshot, by = "state")
pop_full <- bind_rows(pop_from_years, snap_rep) %>%
group_by(state, year) %>%
summarise(population = first(na.omit(population)), .groups = "drop")
} else {
pop_full <- pop_from_years
}
states_union <- sort(unique(c(df$state, gdp$state, pop_full$state)))
pop_full <- expand.grid(state = states_union, year = years_needed, stringsAsFactors = FALSE) %>%
as_tibble() %>%
left_join(pop_full, by = c("state","year")) %>%
group_by(state) %>%
fill(population, .direction = "downup") %>%
ungroup()
# ----- Harmonize common state name variants -----
recode_pairs <- c("Orissa" = "Odisha", "Pondicherry" = "Puducherry",
"NCT of Delhi" = "Delhi", "Jammu and Kashmir" = "Jammu & Kashmir",
"Uttaranchal" = "Uttarakhand", "Andhra Pradesh (Undivided)" = "Andhra Pradesh")
gdp <- gdp %>% mutate(state = recode(state, !!!recode_pairs, .default = state))
pop_full <- pop_full %>% mutate(state = recode(state, !!!recode_pairs, .default = state))
df <- df %>% mutate(state = gsub("\\s+", " ", trimws(as.character(state)))) %>% mutate(state = recode(state, !!!recode_pairs, .default = state))
# ----- Diagnostics: print a sample of states so you can check -----
cat("sample states in df:", paste(head(unique(df$state), 12), collapse = ", "), "\n")
## sample states in df: Andhra Pradesh, Arunachal Pradesh, Assam, Bihar, Chhattisgarh, Delhi, Goa, Gujarat, Haryana, Himachal Pradesh, Jammu & Kashmir, Jharkhand
cat("sample states in gdp:", paste(head(unique(gdp$state), 12), collapse = ", "), "\n")
## sample states in gdp: ---, Andhra Pradesh (Post-Bifurcation), Andhra Pradesh, Arunachal Pradesh, Assam, Bihar, Chhattisgarh, Goa, Gujarat, Haryana, Himachal Pradesh, Jammu & Kashmir (Undivided)
cat("sample states in pop:", paste(head(unique(pop_full$state), 12), collapse = ", "), "\n")
## sample states in pop: ---, Andaman & Nicobar Islands, Andhra Pradesh, Andhra Pradesh (Post-Bifurcation), Arunachal Pradesh, Assam, Bihar, Chandigarh, Chhattisgarh, Dadra & Nagar Haveli and Daman & Diu, Delhi, Goa
# ----- Fill GDP for all states x years: use latest available per state if needed -----
gdp_latest <- gdp %>% arrange(state, year) %>% group_by(state) %>% summarise(gdp_latest = last(na.omit(gdp)), .groups="drop")
gdp_full <- expand.grid(state = states_union, year = years_needed, stringsAsFactors = FALSE) %>%
as_tibble() %>% left_join(gdp %>% select(state, year, gdp), by = c("state","year")) %>%
left_join(gdp_latest, by = "state") %>%
mutate(gdp = ifelse(is.na(gdp), gdp_latest, gdp)) %>%
select(state, year, gdp)
# ----- Join with df and compute metrics -----
df2 <- df %>%
left_join(gdp_full, by = c("state","year")) %>%
left_join(pop_full %>% rename(population = population), by = c("state","year"))
# --- FINAL DIAGNOSTIC CHECK (key to solving blank plots) ---
cat("rows after join:", nrow(df2), "\n")
## rows after join: 673
cat("rows with GDP & POP:", sum(!is.na(df2$gdp) & !is.na(df2$population)), "\n")
## rows with GDP & POP: 573
# Filter and compute metrics
df2 <- df2 %>% filter(!is.na(gdp), !is.na(population)) %>%
mutate(pche = total_health_exp / population,
he_pct_gdp = 100 * total_health_exp / gdp)
# ----- Plots -----
si_rupee <- scales::label_number(scale_cut = scales::cut_si("₹"))
p1 <- ggplot(df2, aes(x=year, y=pche, group=state)) +
geom_line(alpha=0.35, linewidth=0.7) +
stat_summary(aes(group=1), fun=median, geom="line", linewidth=1) +
scale_y_continuous(labels = si_rupee) + labs(title="PCHE", x="Year", y="₹ per person") + theme_minimal()
p2 <- ggplot(df2, aes(x=year, y=he_pct_gdp, group=state)) +
geom_line(alpha=0.35, linewidth=0.7) +
stat_summary(aes(group=1), fun=median, geom="line", linewidth=1) +
labs(title="Health Expenditure as % of GSDP", x="Year", y="% of GSDP") + theme_minimal()
print(p1); print(p2)


# ---- 5. ELASTICITY TEST (LOG-LOG MODEL) ----
if (exists("df2") && nrow(df2) > 0) {
df2 <- df2 %>%
mutate(
pci = gdp / population, # Per-Capita Income (PCI)
ln_pche = log(pche),
ln_pci = log(pci)
) %>%
filter(is.finite(ln_pche), is.finite(ln_pci))
model <- lm(ln_pche ~ ln_pci, data = df2)
summary(model)
beta <- coef(model)["ln_pci"]
if (!is.na(beta)) {
cat("\nElasticity (beta) =", round(beta, 3), "\n")
if (beta < 1) cat("Conclusion: beta <", 1, "→ public healthcare is an **INCOME NECESSITY**.\n")
else if (beta > 1) cat("Conclusion: beta >", 1, "→ public healthcare is an **INCOME LUXURY**.\n")
else cat("Conclusion: beta ≈ 1 → unit elasticity.\n")
}
} else {
message("Elasticity test skipped: df2 is empty.")
}
##
## Elasticity (beta) = 0.546
## Conclusion: beta < 1 → public healthcare is an **INCOME NECESSITY**.
# ---- 6. KDE / OVERLAPPING DENSITIES for last 3 years ----
if (exists("df2") && nrow(df2) > 0) {
years_avail <- sort(unique(df2$year))
years_pick <- tail(years_avail, 3)
kde_df <- df2 %>% filter(year %in% years_pick)
rupee_labeler <- label_number(scale_cut = scales::cut_si("₹"))
ggplot(kde_df, aes(x = pche, color = factor(year))) +
geom_density(linewidth = 1) +
scale_x_continuous(labels = rupee_labeler) +
labs(
title = "Distribution of Per-Capita Health Expenditure (PCHE)",
subtitle = paste("Years:", paste(years_pick, collapse = ", ")),
x = "PCHE (₹ per person)", y = "Density", color = "Year"
) +
theme_minimal(base_size = 12)
} else {
message("KDE plot skipped: df2 is empty.")
}
