This report analyzes the District Health Survey (NFHS-5) dataset, which covers health, education, maternal care, child nutrition, and disease indicators across hundreds of Indian districts. The goal is to explore patterns in district-level health outcomes and identify key regional disparities.
# Load required libraries
library(tidyverse) # Data wrangling
library(ggplot2) # Visualization
library(GGally) # Pair plots
library(stats) # ANOVA, regression
library(class) # KNN
library(cluster) # K-means
# Load the dataset
# The CSV location can be overridden through the NFHS5_DISTRICT_CSV
# environment variable, so the report is not tied to one machine. When the
# variable is unset or empty, the original path is used.
data_path <- Sys.getenv("NFHS5_DISTRICT_CSV", unset = "")
if (!nzchar(data_path)) {
  data_path <- "C:/Users/asus/Downloads/District_health_Survey (1).csv"
}
if (!file.exists(data_path)) {
  stop("District health survey CSV not found: ", data_path, call. = FALSE)
}
# Empty cells, "NA", "*" and "NS" are all read as missing values;
# text columns stay character rather than becoming factors.
raw_data <- read.csv(
  data_path,
  stringsAsFactors = FALSE,
  na.strings = c("", "NA", "*", "NS")
)
# Preview
dim(raw_data)
## [1] 706 109
head(raw_data[, 1:5])
# ---- DATA CLEANING ----
# Step 1: Clean column names - make them short and R-friendly
# Names are applied by position, so the order of this vector must match the
# column order of the raw CSV (columns 1 to 109). Any columns beyond the last
# listed name keep their original names.
short_names <- c(
  # 1-5: identifiers and sample sizes
  "District", "State", "HH_surveyed", "Women_interviewed", "Men_interviewed",
  # 6-17: population and household profile
  "Female_school_attendance", "Pop_below_15yrs", "Sex_ratio_total",
  "Sex_ratio_birth", "Birth_registration", "Death_registration",
  "Electricity_access", "Clean_water_access", "Sanitation_access",
  "Clean_fuel_cooking", "Iodized_salt", "Health_insurance",
  # 18-24: schooling, marriage and fertility
  "Pre_primary_attendance", "Women_literacy", "Women_10plus_schooling",
  "Child_marriage_women", "High_order_births", "Teen_pregnancy",
  "Menstrual_hygiene",
  # 25-36: family planning
  "FP_any_method", "FP_modern_method", "FP_female_sterilization",
  "FP_male_sterilization", "FP_IUD", "FP_pill", "FP_condom", "FP_injectable",
  "Unmet_FP_need_total", "Unmet_FP_spacing", "HW_talked_FP",
  "FP_side_effects_told",
  # 37-53: maternity and delivery care
  "ANC_first_trimester", "ANC_4plus_visits", "Tetanus_protection",
  "IFA_100days", "IFA_180days", "MCP_card", "Postnatal_care_mother",
  "OOP_delivery_cost", "Home_birth_checkup", "Postnatal_care_child",
  "Institutional_birth", "Institutional_birth_public", "Home_birth_skilled",
  "Skilled_birth_attendant", "Caesarean_births", "Caesarean_private",
  "Caesarean_public",
  # 54-65: child vaccination
  "Full_vaccination_card_recall", "Full_vaccination_card_only", "BCG_vaccine",
  "Polio_3dose", "DPT_3dose", "MCV1_vaccine", "MCV2_vaccine",
  "Rotavirus_3dose", "HepB_3dose", "VitA_dose", "Vacc_public_facility",
  "Vacc_private_facility",
  # 66-71: childhood illness
  "Diarrhoea_prevalence", "Diarrhoea_ORS", "Diarrhoea_zinc",
  "Diarrhoea_health_facility", "ARI_prevalence", "ARI_health_facility",
  # 72-82: child feeding and growth
  "Breastfed_within_1hr", "Exclusive_breastfeeding", "Solid_food_6to8months",
  "Adequate_diet_breastfed", "Adequate_diet_nonbreastfed",
  "Adequate_diet_total", "Stunting", "Wasting", "Severe_wasting",
  "Underweight", "Overweight_children",
  # 83-85: women's body measurements
  "Women_BMI_low", "Women_overweight_obese", "Women_high_waist_hip",
  # 86-90: anaemia
  "Anaemia_children", "Anaemia_nonpreg_women", "Anaemia_preg_women",
  "Anaemia_all_women", "Anaemia_teen_women",
  # 91-96: blood sugar
  "Women_high_blood_sugar", "Women_very_high_blood_sugar",
  "Women_diabetes_total", "Men_high_blood_sugar",
  "Men_very_high_blood_sugar", "Men_diabetes_total",
  # 97-102: hypertension
  "Women_mild_hypertension", "Women_moderate_hypertension",
  "Women_hypertension_total", "Men_mild_hypertension",
  "Men_moderate_hypertension", "Men_hypertension_total",
  # 103-105: screening
  "Cervical_cancer_screening", "Breast_exam", "Oral_exam",
  # 106-109: tobacco and alcohol
  "Women_tobacco", "Men_tobacco", "Women_alcohol", "Men_alcohol"
)
# Fail early with a readable message if the CSV has fewer columns than names;
# positional renaming would otherwise fail with an obscure names-length error.
if (ncol(raw_data) < length(short_names)) {
  stop(
    "Expected at least ", length(short_names), " columns in raw_data, found ",
    ncol(raw_data),
    call. = FALSE
  )
}
colnames(raw_data)[seq_along(short_names)] <- short_names
# Step 2: Handle bracketed values like (12.3) - the original note describes
# these as unreliable estimates. The brackets are stripped and the number
# inside them is KEPT; nothing is dropped at this step.
#
# clean_numeric(x): takes a vector, removes every "(" and ")", trims
# surrounding whitespace and converts to numeric. Entries that still cannot
# be parsed as a number come back as NA.
clean_numeric <- function(x) {
  # One regex pass removes both kinds of bracket
  unbracketed <- trimws(gsub("[()]", "", x))
  # Coercion warnings for unparseable entries are muted on purpose
  suppressWarnings(as.numeric(unbracketed))
}
# Apply numeric cleaning to all columns except District and State
# The two identifier columns stay character but are whitespace-trimmed: the
# raw State labels contain padded values (the recorded state tables print
# " Lakshadweep "), which would otherwise leak into grouped summaries and
# plot labels. Recorded outputs further down in this report were produced
# before trimming and still show the padded label until the report is re-run.
df <- raw_data %>%
  mutate(
    across(c(District, State), trimws),
    across(-c(District, State), clean_numeric)
  )
cat("Dataset dimensions after cleaning:", nrow(df), "rows x", ncol(df), "columns\n")
## Dataset dimensions after cleaning: 706 rows x 109 columns
# Count of districts and variables
cat("Total number of districts:", nrow(df), "\n")
## Total number of districts: 706
cat("Total number of variables:", ncol(df), "\n")
## Total number of variables: 109
cat("Number of states/UTs covered:", length(unique(df$State)), "\n")
## Number of states/UTs covered: 36
Interpretation: The dataset covers 706 districts across India’s states and Union Territories, with 109 health and demographic variables. This comprehensive breadth allows analysis of wide regional health disparities.
# Categorize variables by health domain
# Hand-written overview table: one row per domain, with example indicators
# and an approximate variable count (the counts are typed in, not computed).
domain_names <- c(
  "Demographics & Household",
  "Education & Literacy",
  "Maternal & Reproductive Health",
  "Child Health & Vaccination",
  "Nutrition (Children & Women)",
  "Disease & NCD (Blood Sugar, BP)",
  "Substance Use",
  "Infrastructure & Access"
)
domain_examples <- c(
  "Sex ratio, HH surveyed, population below 15 yrs",
  "Women literacy, school attendance, 10+ years schooling",
  "ANC visits, institutional births, postnatal care",
  "BCG, DPT, Polio, MCV, full vaccination coverage",
  "Stunting, wasting, underweight, BMI, anaemia",
  "Diabetes, hypertension for men and women",
  "Tobacco use, alcohol consumption (men & women)",
  "Electricity, clean water, sanitation, clean fuel"
)
domain_counts <- c(5, 4, 15, 12, 14, 12, 4, 8)
categories <- data.frame(
  Category = domain_names,
  Examples = domain_examples,
  Approx_Variables = domain_counts
)
print(categories)
## Category
## 1 Demographics & Household
## 2 Education & Literacy
## 3 Maternal & Reproductive Health
## 4 Child Health & Vaccination
## 5 Nutrition (Children & Women)
## 6 Disease & NCD (Blood Sugar, BP)
## 7 Substance Use
## 8 Infrastructure & Access
## Examples Approx_Variables
## 1 Sex ratio, HH surveyed, population below 15 yrs 5
## 2 Women literacy, school attendance, 10+ years schooling 4
## 3 ANC visits, institutional births, postnatal care 15
## 4 BCG, DPT, Polio, MCV, full vaccination coverage 12
## 5 Stunting, wasting, underweight, BMI, anaemia 14
## 6 Diabetes, hypertension for men and women 12
## 7 Tobacco use, alcohol consumption (men & women) 4
## 8 Electricity, clean water, sanitation, clean fuel 8
Interpretation: Variables span eight major public health domains. Maternal and reproductive health has the largest coverage (~15 variables), reflecting NFHS-5’s focus on women and child wellbeing. Disease/NCD indicators provide a newer dimension compared to earlier surveys.
# Women's literacy distribution (5-point bins)
ggplot(data = df, mapping = aes(x = Women_literacy)) +
  geom_histogram(fill = "steelblue", color = "white", binwidth = 5) +
  ggtitle("Distribution of Women's Literacy Rate Across Districts") +
  xlab("Women Literacy Rate (%)") +
  ylab("Number of Districts") +
  theme_minimal()
# Child stunting distribution (3-point bins)
ggplot(data = df, mapping = aes(x = Stunting)) +
  geom_histogram(fill = "tomato", color = "white", binwidth = 3) +
  ggtitle("Distribution of Child Stunting Across Districts") +
  xlab("Stunting (%)") +
  ylab("Number of Districts") +
  theme_minimal()
# Anaemia in all women (3-point bins)
ggplot(data = df, mapping = aes(x = Anaemia_all_women)) +
  geom_histogram(fill = "darkorange", color = "white", binwidth = 3) +
  ggtitle("Distribution of Anaemia Among Women Across Districts") +
  xlab("Anaemia Prevalence (%)") +
  ylab("Number of Districts") +
  theme_minimal()
Interpretation: Women’s literacy ranges widely, with a concentration between 60–90%, indicating progress but significant pockets of exclusion. Stunting is high (often 25–45%), reflecting persistent malnutrition. Anaemia among women is widespread with most districts recording 40–70%, pointing to a near-universal nutritional deficiency problem.
# Count missing values per variable
# Result: one row per variable that has at least one NA, sorted from most to
# fewest missing values and cut to the 20 worst variables.
na_per_column <- df %>%
  summarise(across(everything(), function(col) sum(is.na(col))))
missing_summary <- na_per_column %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable",
    values_to = "Missing_Count"
  ) %>%
  filter(Missing_Count > 0) %>%
  arrange(desc(Missing_Count)) %>%
  head(20)
print(missing_summary)
## # A tibble: 20 × 2
## Variable Missing_Count
## <chr> <int>
## 1 Adequate_diet_nonbreastfed 643
## 2 Solid_food_6to8months 642
## 3 Diarrhoea_ORS 492
## 4 Diarrhoea_zinc 492
## 5 Diarrhoea_health_facility 492
## 6 Home_birth_checkup 422
## 7 Exclusive_breastfeeding 261
## 8 ARI_health_facility 224
## 9 Caesarean_private 150
## 10 Anaemia_preg_women 134
## 11 Full_vaccination_card_only 22
## 12 Vacc_public_facility 16
## 13 Vacc_private_facility 16
## 14 Full_vaccination_card_recall 13
## 15 BCG_vaccine 13
## 16 Polio_3dose 13
## 17 DPT_3dose 13
## 18 MCV1_vaccine 13
## 19 MCV2_vaccine 13
## 20 Rotavirus_3dose 13
# Plot top 15 variables with most missing values
# missing_summary is already sorted in descending order, so its first 15 rows
# are the 15 largest counts; reorder() sorts the bars by count.
top_missing <- head(missing_summary, 15)
ggplot(
  data = top_missing,
  mapping = aes(x = reorder(Variable, Missing_Count), y = Missing_Count)
) +
  geom_bar(stat = "identity", fill = "firebrick") +
  coord_flip() +
  ggtitle("Top 15 Variables with Most Missing Values") +
  xlab("Variable") +
  ylab("Missing Count") +
  theme_minimal()
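Interpretation: The highest missingness is in child-feeding and illness-treatment indicators: adequate diet for non-breastfed children (missing in 643 of 706 districts), solid food at 6–8 months (642), and the three diarrhoea-treatment variables (492 each), followed by home birth check-up (422). These indicators apply only to small sub-groups of children or births, so missing values most likely reflect too few eligible cases in a district's sample rather than non-response; for programme-dependent indicators, uneven service availability may also contribute. Vaccination variables, including rotavirus, are missing in only 13–22 districts.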
# Summary statistics for key indicators
key_vars <- c(
  "Women_literacy", "Institutional_birth", "Full_vaccination_card_recall",
  "Stunting", "Anaemia_all_women", "Women_tobacco", "Men_tobacco",
  "Men_alcohol", "Sanitation_access", "Clean_water_access"
)
# Min / Max / Mean per indicator, NAs ignored, rounded to one decimal.
# The list names become the "{.fn}" part of the temporary column names.
stat_funs <- list(
  Min = function(v) round(min(v, na.rm = TRUE), 1),
  Max = function(v) round(max(v, na.rm = TRUE), 1),
  Mean = function(v) round(mean(v, na.rm = TRUE), 1)
)
# Columns are first named "<variable>__<stat>", then split on "__" and
# reshaped to one row per variable with one column per statistic.
summary_table <- df %>%
  select(all_of(key_vars)) %>%
  summarise(across(everything(), stat_funs, .names = "{.col}__{.fn}")) %>%
  pivot_longer(
    everything(),
    names_to = c("Variable", "Stat"),
    names_sep = "__"
  ) %>%
  pivot_wider(names_from = Stat, values_from = value)
print(summary_table)
## # A tibble: 10 × 4
## Variable Min Max Mean
## <chr> <dbl> <dbl> <dbl>
## 1 Women_literacy 38.6 99.7 74.3
## 2 Institutional_birth 21.4 100 88.7
## 3 Full_vaccination_card_recall 38.3 100 77.7
## 4 Stunting 13.2 60.6 33.5
## 5 Anaemia_all_women 14.9 93.5 55.9
## 6 Women_tobacco 0.1 70.6 11.6
## 7 Men_tobacco 6.8 80.6 40.6
## 8 Men_alcohol 0.1 68.4 23.2
## 9 Sanitation_access 29.2 99.9 71.9
## 10 Clean_water_access 41.2 100 93.7
Interpretation: Institutional births range from 21.4% to 100% (mean 88.7%), highlighting vast access inequalities despite a high average. Full vaccination averages about 78% across districts, with the lowest district at 38.3%. Tobacco use among men (mean 40.6%) is far higher than among women (mean 11.6%), while anaemia affects more than half of women on average (55.9%).
# Filter districts where women's literacy > 80%
# Only the identifying columns and the literacy rate are kept; rows are sorted
# from highest to lowest literacy. Districts with a missing rate are dropped
# by the comparison.
high_literacy <- df %>%
  select(District, State, Women_literacy) %>%
  filter(Women_literacy > 80) %>%
  arrange(desc(Women_literacy))
cat("Number of districts with women's literacy > 80%:", nrow(high_literacy), "\n\n")
## Number of districts with women's literacy > 80%: 240
print(head(high_literacy, 20))
## District State Women_literacy
## 1 Kottayam Kerala 99.7
## 2 Alappuzha Kerala 99.7
## 3 Pathanamthitta Kerala 99.7
## 4 Serchhip Mizoram 99.7
## 5 Mahe Puducherry 99.7
## 6 Thrissur Kerala 99.4
## 7 Ernakulam Kerala 99.3
## 8 Malappuram Kerala 99.2
## 9 Kannur Kerala 99.1
## 10 Kozhikode Kerala 99.1
## 11 Aizawl Mizoram 98.9
## 12 Thiruvananthapuram Kerala 98.5
## 13 Kollam Kerala 98.2
## 14 Kanniyakumari Tamil Nadu 98.0
## 15 Champhai Mizoram 97.7
## 16 Kolasib Mizoram 96.9
## 17 Lakshadweep Lakshadweep 96.5
## 18 Chennai Tamil Nadu 96.1
## 19 Kasaragod Kerala 95.9
## 20 Kohima Nagaland 95.2
Interpretation: Approximately one-third of districts have women’s literacy above 80%, mostly concentrated in southern states (Kerala, Tamil Nadu, Andhra Pradesh) and northeastern states. This highlights the north-south literacy divide in India.
# High malnutrition = stunting above 35%
# Low healthcare = institutional birth below 60%
# Both conditions must hold; rows where either value is missing are dropped.
high_mal_low_health <- df %>%
  filter(Stunting > 35, Institutional_birth < 60) %>%
  select(District, State, Stunting, Institutional_birth, Underweight) %>%
  arrange(desc(Stunting))
cat("Districts with high malnutrition and low healthcare access:", nrow(high_mal_low_health), "\n\n")
## Districts with high malnutrition and low healthcare access: 10
print(head(high_mal_low_health, 20))
## District State Stunting Institutional_birth Underweight
## 1 West Khasi Hills Meghalaya 59.0 41.7 31.1
## 2 South West Khasi Hills Meghalaya 51.4 41.7 27.6
## 3 East Jantia Hills Meghalaya 49.8 48.4 23.6
## 4 West Jaintia Hills Meghalaya 48.7 42.2 28.3
## 5 Zunheboto Nagaland 44.0 35.0 44.5
## 6 Ribhoi Meghalaya 42.5 56.9 29.6
## 7 Kishanganj Bihar 38.8 54.6 41.1
## 8 Tuensang Nagaland 37.1 34.8 34.2
## 9 Kiphire Nagaland 36.9 34.8 25.0
## 10 Mon Nagaland 35.5 21.4 23.3
Interpretation: These districts represent the most vulnerable pockets — high stunting combined with low institutional delivery indicates a breakdown of both nutritional support and healthcare access. Of the ten districts identified, five are in Meghalaya and four in Nagaland, with one (Kishanganj) in Bihar, so under these thresholds the overlap is concentrated in the northeastern hill states.
# Compare diabetes and hypertension: women vs men
# Focus on diabetes total and hypertension total
# Gap = men's rate minus women's rate. A district is kept when either gap
# exceeds 5 percentage points; the result is sorted by the hypertension gap.
gender_gap <- df %>%
  select(
    District, State,
    Women_diabetes_total, Men_diabetes_total,
    Women_hypertension_total, Men_hypertension_total
  ) %>%
  mutate(
    Diabetes_gap = Men_diabetes_total - Women_diabetes_total,
    BP_gap = Men_hypertension_total - Women_hypertension_total
  ) %>%
  filter(Diabetes_gap > 5 | BP_gap > 5) %>%
  arrange(desc(BP_gap))
cat("Districts where men's NCD burden significantly exceeds women's:", nrow(gender_gap), "\n\n")
## Districts where men's NCD burden significantly exceeds women's: 234
print(head(gender_gap, 15))
## District State Women_diabetes_total
## 1 Mon Nagaland 9.0
## 2 Central NCT of Delhi 11.7
## 3 Bageshwar Uttarakhand 7.9
## 4 Imphal East Manipur 15.9
## 5 Tehri Garhwal Uttarakhand 8.9
## 6 Dibang Valley Arunachal Pradesh 9.4
## 7 Rudraprayag Uttarakhand 10.4
## 8 Bathinda Punjab 14.0
## 9 Lower Dibang Valley Arunachal Pradesh 8.9
## 10 West District Sikkim 7.4
## 11 Garhwal Uttarakhand 10.6
## 12 Papum Pare Arunachal Pradesh 9.5
## 13 Upper Subansiri Arunachal Pradesh 6.6
## 14 Mansa Punjab 11.6
## 15 Hazaribagh Jharkhand 8.8
## Men_diabetes_total Women_hypertension_total Men_hypertension_total
## 1 13.0 20.4 35.7
## 2 15.8 26.5 41.5
## 3 12.0 20.2 34.9
## 4 18.5 21.9 36.0
## 5 11.9 20.0 34.0
## 6 12.2 32.2 45.4
## 7 16.3 18.7 31.7
## 8 14.5 32.2 45.1
## 9 13.2 25.6 38.1
## 10 13.6 32.7 45.1
## 11 12.1 22.1 34.3
## 12 12.8 20.6 32.7
## 13 10.6 25.4 37.5
## 14 10.9 30.2 42.2
## 15 13.5 18.8 30.8
## Diabetes_gap BP_gap
## 1 4.0 15.3
## 2 4.1 15.0
## 3 4.1 14.7
## 4 2.6 14.1
## 5 3.0 14.0
## 6 2.8 13.2
## 7 5.9 13.0
## 8 0.5 12.9
## 9 4.3 12.5
## 10 6.2 12.4
## 11 1.5 12.2
## 12 3.3 12.1
## 13 4.0 12.1
## 14 -0.7 12.0
## 15 4.7 12.0
Interpretation: In many districts, men show higher hypertension and diabetes rates, potentially linked to higher tobacco and alcohol use. However, women often face more anaemia and low BMI — different dimensions of health disadvantage.
# High tobacco = Men tobacco > 50% OR Women tobacco > 20%
# High alcohol = Men alcohol > 30%
# A district qualifies when ANY of the three thresholds is exceeded;
# Women_alcohol is shown for context only and is not part of the filter.
substance_use <- df %>%
  select(District, State, Men_tobacco, Women_tobacco, Men_alcohol, Women_alcohol) %>%
  filter(Men_tobacco > 50 | Women_tobacco > 20 | Men_alcohol > 30) %>%
  arrange(desc(Men_tobacco))
cat("Districts with high tobacco or alcohol use:", nrow(substance_use), "\n\n")
## Districts with high tobacco or alcohol use: 309
print(head(substance_use, 20))
## District State Men_tobacco Women_tobacco
## 1 Mamit Mizoram 80.6 70.2
## 2 Lawngtlai Mizoram 77.2 65.4
## 3 Nicobars Andaman & Nicobar Islands 76.8 63.5
## 4 Kolasib Mizoram 75.3 70.6
## 5 Lunglei Mizoram 75.0 62.8
## 6 Champhai Mizoram 74.5 66.7
## 7 Serchhip Mizoram 74.2 62.7
## 8 Churachandpur Manipur 73.8 62.1
## 9 West Khasi Hills Meghalaya 73.3 50.2
## 10 North & Middle Andaman Andaman & Nicobar Islands 70.5 46.8
## 11 Ukhrul Manipur 69.1 32.9
## 12 Namsai Arunachal Pradesh 68.8 25.3
## 13 Aizawl Mizoram 68.7 54.7
## 14 East Jantia Hills Meghalaya 68.5 49.2
## 15 Anjaw Arunachal Pradesh 66.1 22.1
## 16 Chandel Manipur 64.7 47.9
## 17 West Jaintia Hills Meghalaya 64.7 44.8
## 18 Chandel Mizoram 64.7 47.9
## 19 Hamirpur Uttar Pradesh 64.5 17.0
## 20 Kendujhar Odisha 64.2 34.9
## Men_alcohol Women_alcohol
## 1 26.2 0.4
## 2 27.2 0.9
## 3 64.5 29.6
## 4 21.7 0.9
## 5 19.2 0.9
## 6 21.3 1.6
## 7 22.4 0.4
## 8 38.1 1.2
## 9 34.4 1.6
## 10 45.3 5.1
## 11 36.4 0.9
## 12 61.6 28.4
## 13 25.8 1.0
## 14 42.0 3.6
## 15 68.4 31.6
## 16 39.1 1.2
## 17 36.9 1.3
## 18 39.1 1.2
## 19 13.3 0.2
## 20 43.2 13.6
Interpretation: High substance use clusters are visible in northeastern states and tribal belt areas where cultural practices influence alcohol and tobacco use. These districts need targeted behavioral change communication programs alongside health services.
# Proxy for high population: high number of households surveyed
# Poor health: stunting > 35% AND anaemia > 60% AND institutional birth < 70%
# NOTE(review): HH_surveyed counts surveyed households, not residents, and the
# recorded run below returns 0 rows - confirm that this proxy and the 1000
# cut-off are what was intended.
high_pop_poor_health <- df %>%
  filter(
    HH_surveyed > 1000,
    Stunting > 35,
    Anaemia_all_women > 60,
    Institutional_birth < 70
  ) %>%
  select(
    District, State, HH_surveyed,
    Stunting, Anaemia_all_women, Institutional_birth
  ) %>%
  arrange(desc(HH_surveyed))
cat("High-population districts with poor health outcomes:", nrow(high_pop_poor_health), "\n\n")
## High-population districts with poor health outcomes: 0
print(head(high_pop_poor_health, 15))
## [1] District State HH_surveyed
## [4] Stunting Anaemia_all_women Institutional_birth
## <0 rows> (or 0-length row.names)
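Interpretation: No district satisfies all four conditions, so the table above is empty. The number of households surveyed reflects the survey's sample size rather than the district's population, which makes it a weak proxy; actual population figures would be needed to identify large districts. Districts that are both populous and have poor health outcomes would carry the highest absolute burden of disease, and prioritizing them in national health programs would yield the greatest improvement in overall national indicators.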
# State-wise average literacy
# Unweighted mean of district literacy rates per state (NAs ignored), rounded
# to one decimal, plus the number of district rows per state.
state_literacy <- df %>%
  group_by(State) %>%
  summarise(
    Avg_Women_Literacy = mean(Women_literacy, na.rm = TRUE),
    Districts_Count = n()
  ) %>%
  mutate(Avg_Women_Literacy = round(Avg_Women_Literacy, 1)) %>%
  arrange(desc(Avg_Women_Literacy))
print(state_literacy)
## # A tibble: 36 × 3
## State Avg_Women_Literacy Districts_Count
## <chr> <dbl> <int>
## 1 "Kerala" 97.9 14
## 2 " Lakshadweep " 96.5 1
## 3 "Goa" 93.1 2
## 4 "Mizoram" 91.3 8
## 5 "Puducherry" 91.1 4
## 6 "Himachal Pradesh" 90.2 12
## 7 "Sikkim" 86.9 4
## 8 "Andaman & Nicobar Islands" 86.1 3
## 9 "Meghalaya" 86 11
## 10 "Tamil Nadu" 85.7 32
## # ℹ 26 more rows
# Plot the top 20 states by average women's literacy
# state_literacy is already sorted in descending order, so its first 20 rows
# are the 20 highest averages.
top_literacy_states <- head(state_literacy, 20)
ggplot(
  data = top_literacy_states,
  mapping = aes(x = reorder(State, Avg_Women_Literacy), y = Avg_Women_Literacy)
) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  ggtitle("Top 20 States by Average Women's Literacy Rate") +
  xlab("State") +
  ylab("Average Women's Literacy (%)") +
  theme_minimal()
Interpretation: Kerala, Goa, and Mizoram consistently top literacy rankings. States like Rajasthan, Bihar, and Uttar Pradesh show significantly lower averages, reflecting persistent gender education gaps in northern India.
# Full vaccination (card + recall) by state
# Unweighted mean of district values per state (NAs ignored), rounded to one
# decimal and sorted from highest to lowest coverage.
state_vaccination <- df %>%
  group_by(State) %>%
  summarise(
    Avg_Full_Vaccination = mean(Full_vaccination_card_recall, na.rm = TRUE)
  ) %>%
  mutate(Avg_Full_Vaccination = round(Avg_Full_Vaccination, 1)) %>%
  arrange(desc(Avg_Full_Vaccination))
print(state_vaccination)
## # A tibble: 36 × 2
## State Avg_Full_Vaccination
## <chr> <dbl>
## 1 "Odisha" 91.8
## 2 "Sikkim" 91.4
## 3 "Dadra and Nagar Haveli & Daman and Diu" 90.4
## 4 "Tamil Nadu" 89.3
## 5 "Himachal Pradesh" 88.4
## 6 "Ladakh" 88.2
## 7 "Jammu & Kashmir" 87.1
## 8 "West Bengal" 86.7
## 9 "Karnataka" 86.4
## 10 " Lakshadweep " 86.1
## # ℹ 26 more rows
# Bar chart of the 15 states with the highest average full-vaccination rate
# (state_vaccination is already sorted in descending order).
top_vaccination_states <- head(state_vaccination, 15)
ggplot(
  data = top_vaccination_states,
  mapping = aes(x = reorder(State, Avg_Full_Vaccination), y = Avg_Full_Vaccination)
) +
  geom_bar(stat = "identity", fill = "darkgreen") +
  coord_flip() +
  ggtitle("Top 15 States by Average Child Full Vaccination Rate") +
  xlab("State") +
  ylab("Full Vaccination Coverage (%)") +
  theme_minimal()
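Interpretation: Odisha, Sikkim, and Dadra and Nagar Haveli & Daman and Diu lead in child vaccination, each averaging above 90%, followed by Tamil Nadu and Himachal Pradesh. States lower in the ranking (outside the ten rows printed above) have weaker coverage, and the lowest district-level value is 38.3%, indicating infrastructure and outreach gaps that need urgent intervention.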
# Comparing underweight women and hypertension by state
# Per-state unweighted means of three district indicators (NAs ignored,
# rounded to one decimal), sorted by the share of women with low BMI.
state_nutrition_bp <- df %>%
  group_by(State) %>%
  summarise(across(
    c(Women_BMI_low, Women_hypertension_total, Men_hypertension_total),
    function(v) round(mean(v, na.rm = TRUE), 1)
  )) %>%
  rename(
    Avg_Women_Underweight_BMI = Women_BMI_low,
    Avg_Women_Hypertension = Women_hypertension_total,
    Avg_Men_Hypertension = Men_hypertension_total
  ) %>%
  arrange(desc(Avg_Women_Underweight_BMI))
print(head(state_nutrition_bp, 20))
## # A tibble: 20 × 4
## State Avg_Women_Underweigh…¹ Avg_Women_Hypertension Avg_Men_Hypertension
## <chr> <dbl> <dbl> <dbl>
## 1 Jharkhand 26.9 17.2 22
## 2 Gujarat 26 20.2 20.1
## 3 Bihar 25.9 15.4 17.9
## 4 Chhattisg… 25 23.1 27.1
## 5 Madhya Pr… 23.5 20.5 22.8
## 6 Dadra and… 21.9 16.6 17
## 7 Odisha 21.9 22 25.1
## 8 Maharastra 21.7 22.8 23.7
## 9 Rajasthan 20.2 15.2 17.9
## 10 Telangana 20 25.8 30.9
## 11 Uttar Pra… 19.1 18.3 21.6
## 12 Karnataka 17.7 24.9 26.9
## 13 Tripura 17.2 20 21.6
## 14 Assam 17.1 19.3 21.3
## 15 West Beng… 16 20.5 20.7
## 16 Haryana 15.2 21.2 25
## 17 Andhra Pr… 15.1 25.1 28.6
## 18 Goa 13.8 27.5 26.6
## 19 Himachal … 13.5 21 23.8
## 20 Uttarakha… 13.4 21.8 31.5
## # ℹ abbreviated name: ¹Avg_Women_Underweight_BMI
Interpretation: States with high female underweight rates (e.g., Jharkhand, Bihar) are often the same states with lower hypertension rates — paradoxically indicating the double burden: undernourished populations in some regions coexist with over-nourished, hypertensive populations in others.
# ANC first trimester + institutional birth + postnatal care
# Per-state unweighted means of the three district indicators (NAs ignored),
# rounded to one decimal and sorted by institutional birth rate.
state_maternal <- df %>%
  group_by(State) %>%
  summarise(
    Avg_ANC_1st_Trimester = mean(ANC_first_trimester, na.rm = TRUE),
    Avg_Institutional_Birth = mean(Institutional_birth, na.rm = TRUE),
    Avg_Postnatal_Care = mean(Postnatal_care_mother, na.rm = TRUE)
  ) %>%
  mutate(across(starts_with("Avg_"), function(v) round(v, 1))) %>%
  arrange(desc(Avg_Institutional_Birth))
print(state_maternal)
## # A tibble: 36 × 4
## State Avg_ANC_1st_Trimester Avg_Institutional_Bi…¹ Avg_Postnatal_Care
## <chr> <dbl> <dbl> <dbl>
## 1 "Goa" 68.7 99.8 96.3
## 2 "Puducherry" 87.7 99.8 95.2
## 3 "Kerala" 93.1 99.7 93
## 4 "Tamil Nadu" 77.5 99.7 93.1
## 5 " Lakshadwee… 99.6 99.6 92.6
## 6 "Andaman & N… 72.2 98.3 88.6
## 7 "Karnataka" 71.5 97.4 88.2
## 8 "Andhra Prad… 81.7 96.9 90.5
## 9 "Chandigarh" 82.3 96.9 90.6
## 10 "Telangana" 88.2 96.7 87.5
## # ℹ 26 more rows
## # ℹ abbreviated name: ¹Avg_Institutional_Birth
Interpretation: Southern states dominate in all three maternal health indicators. Goa, Kerala, and Tamil Nadu achieve near-universal institutional delivery and postnatal care, while states like UP and Bihar still have significant proportions of home births with limited skilled care.
# Using anaemia + stunting + diabetes total as composite disease burden
# Score = 0.4 * mean anaemia (women) + 0.4 * mean stunting
#       + 0.2 * mean diabetes (men).
# The score is built from the UNROUNDED state means and only then rounded,
# so it is not simply the weighted sum of the displayed Avg_* columns.
state_disease <- df %>%
  group_by(State) %>%
  summarise(
    anaemia_mean = mean(Anaemia_all_women, na.rm = TRUE),
    stunting_mean = mean(Stunting, na.rm = TRUE),
    diabetes_mean = mean(Men_diabetes_total, na.rm = TRUE)
  ) %>%
  mutate(
    Avg_Anaemia_Women = round(anaemia_mean, 1),
    Avg_Stunting = round(stunting_mean, 1),
    Avg_Diabetes_Men = round(diabetes_mean, 1),
    Disease_Burden_Score = round(
      anaemia_mean * 0.4 +
        stunting_mean * 0.4 +
        diabetes_mean * 0.2,
      1
    )
  ) %>%
  select(
    State, Avg_Anaemia_Women, Avg_Stunting,
    Avg_Diabetes_Men, Disease_Burden_Score
  ) %>%
  arrange(desc(Disease_Burden_Score))
print(state_disease)
## # A tibble: 36 × 5
## State Avg_Anaemia_Women Avg_Stunting Avg_Diabetes_Men Disease_Burden_Score
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Ladakh 92.8 30.4 8.3 50.9
## 2 Bihar 64.7 42.6 16 46.1
## 3 West Be… 71.5 33.2 21.1 46.1
## 4 Jharkha… 66.3 40.2 14.2 45.4
## 5 Gujarat 64.6 38.7 16.3 44.6
## 6 Tripura 67 33 19.5 43.9
## 7 Assam 65.7 35.7 15.4 43.7
## 8 Meghala… 54.9 43 13.4 41.9
## 9 Odisha 64.3 31.7 16.6 41.7
## 10 Chhatti… 63 35.7 10.5 41.6
## # ℹ 26 more rows
Interpretation: States scoring highest on the composite disease burden index often combine high anaemia and stunting with rising non-communicable diseases. This represents a triple burden — undernutrition, child malnutrition, and adult NCDs — all occurring simultaneously in the same regions.
# Health score: high literacy + high institutional birth + high vaccination + low stunting + low anaemia
# Normalize using simple composite (higher = healthier)
# Weights: literacy 20, institutional birth 25, full vaccination 20,
# (100 - stunting) 20, (100 - anaemia) 15, so the maximum possible score is
# 100. A missing value in any component makes the district's score NA.
raw_health_score <- with(
  df,
  (Women_literacy / 100) * 20 +
    (Institutional_birth / 100) * 25 +
    (Full_vaccination_card_recall / 100) * 20 +
    ((100 - Stunting) / 100) * 20 +
    ((100 - Anaemia_all_women) / 100) * 15
)
df_health_score <- df %>%
  mutate(Health_Score = round(raw_health_score, 2))
# Ten highest-scoring districts, shown with the score's component indicators
top10_healthy <- df_health_score %>%
  arrange(desc(Health_Score)) %>%
  select(
    District, State, Health_Score, Women_literacy,
    Institutional_birth, Full_vaccination_card_recall,
    Stunting, Anaemia_all_women
  ) %>%
  head(10)
print(top10_healthy)
## District State Health_Score Women_literacy
## 1 Kozhikode Kerala 88.85 99.1
## 2 Kanniyakumari Tamil Nadu 88.50 98.0
## 3 Kollam Kerala 88.22 98.2
## 4 Kasaragod Kerala 87.31 95.9
## 5 Idukki Kerala 87.14 94.4
## 6 Ernakulam Kerala 87.00 99.3
## 7 Chennai Tamil Nadu 86.91 96.1
## 8 Shahid Bhagat Singh Nagar Punjab 86.75 89.9
## 9 Ramanagara Karnataka 86.60 82.7
## 10 Champhai Mizoram 86.48 97.7
## Institutional_birth Full_vaccination_card_recall Stunting Anaemia_all_women
## 1 100.0 88.8 21.3 29.8
## 2 100.0 96.0 17.3 45.6
## 3 100.0 85.4 15.5 36.0
## 4 100.0 92.5 25.3 35.4
## 5 100.0 89.6 24.3 32.0
## 6 99.1 82.6 22.0 31.7
## 7 100.0 96.6 20.4 50.3
## 8 99.3 100.0 17.9 49.8
## 9 100.0 100.0 15.6 45.5
## 10 96.7 85.3 27.2 25.7
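Interpretation: The healthiest districts are concentrated in southern India — eight of the top ten are in Kerala (5), Tamil Nadu (2), and Karnataka (1) — joined by one district each from Punjab and Mizoram. High literacy, near-universal institutional delivery, and strong vaccination programs collectively produce better health outcomes in these districts, which can serve as models for other states.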
# Ten lowest-scoring districts on Health_Score (ascending sort puts the
# lowest scores first), shown with the score's component indicators.
bottom10_unhealthy <- df_health_score %>%
  arrange(Health_Score) %>%
  select(
    District, State, Health_Score, Women_literacy,
    Institutional_birth, Full_vaccination_card_recall,
    Stunting, Anaemia_all_women
  ) %>%
  head(10)
print(bottom10_unhealthy)
## District State Health_Score Women_literacy Institutional_birth
## 1 Bastar Chhattisgarh 52.06 54.6 63.5
## 2 Pakur Jharkhand 52.16 46.7 64.6
## 3 Araria Bihar 52.44 43.7 66.2
## 4 Bahraich Uttar Pradesh 52.87 41.6 67.7
## 5 Deoghar Jharkhand 53.19 54.2 61.3
## 6 Kishanganj Bihar 54.19 48.0 54.6
## 7 Bijapur Chhattisgarh 54.27 46.3 63.6
## 8 Sitamarhi Bihar 54.66 51.7 64.4
## 9 Sahibganj Jharkhand 54.81 54.9 64.7
## 10 Kiphire Nagaland 54.91 73.7 34.8
## Full_vaccination_card_recall Stunting Anaemia_all_women
## 1 57.3 48.1 77.2
## 2 69.4 51.3 79.7
## 3 61.6 49.9 67.9
## 4 51.8 52.1 48.8
## 5 54.5 41.7 70.2
## 6 67.3 38.8 65.1
## 7 78.4 53.8 72.1
## 8 66.6 54.2 61.7
## 9 66.0 49.1 71.5
## 10 42.8 36.9 31.4
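Interpretation: Districts at the bottom of the health index typically have women's literacy of roughly 40–55%, institutional delivery of about 55–68%, and child stunting mostly above 40%. Nine of the ten are in Bihar, Jharkhand, Chhattisgarh, and Uttar Pradesh; Kiphire (Nagaland) is the exception, with higher literacy (73.7%) but very low institutional delivery (34.8%) and vaccination coverage (42.8%). These represent India's most deprived health zones and require immediate, targeted policy interventions.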
# Rank states on both dimensions
# State means are rounded to one decimal BEFORE ranking, so states that share
# a rounded value tie (rank() gives tied states their average rank).
# Rank 1 = highest average; Composite_Rank is the mean of the three ranks.
state_ranking <- df %>%
  group_by(State) %>%
  summarise(
    Avg_Literacy = mean(Women_literacy, na.rm = TRUE),
    Avg_Institutional_B = mean(Institutional_birth, na.rm = TRUE),
    Avg_Sanitation = mean(Sanitation_access, na.rm = TRUE)
  ) %>%
  mutate(across(starts_with("Avg_"), function(v) round(v, 1))) %>%
  mutate(
    Literacy_Rank = rank(-Avg_Literacy),
    Healthcare_Rank = rank(-Avg_Institutional_B),
    Sanitation_Rank = rank(-Avg_Sanitation),
    Composite_Rank = round(
      (Literacy_Rank + Healthcare_Rank + Sanitation_Rank) / 3,
      1
    )
  ) %>%
  arrange(Composite_Rank)
print(state_ranking)
## # A tibble: 36 × 8
## State Avg_Literacy Avg_Institutional_B Avg_Sanitation Literacy_Rank
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 "Kerala" 97.9 99.7 98.6 1
## 2 " Lakshadweep " 96.5 99.6 99.8 2
## 3 "Goa" 93.1 99.8 88 3
## 4 "Puducherry" 91.1 99.8 88.2 5
## 5 "Andaman & Nic… 86.1 98.3 86.4 8
## 6 "Sikkim" 86.9 96.4 87.3 7
## 7 "Chandigarh" 83 96.9 85 16
## 8 "Mizoram" 91.3 81.1 94.2 4
## 9 "Tamil Nadu" 85.7 99.7 71.6 10
## 10 "Haryana" 81.4 95.8 85 18
## # ℹ 26 more rows
## # ℹ 3 more variables: Healthcare_Rank <dbl>, Sanitation_Rank <dbl>,
## # Composite_Rank <dbl>
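Interpretation: Kerala, Lakshadweep, and Goa rank highest on the composite of the three dimensions. Tamil Nadu matches them on institutional births (99.7%) but falls to ninth overall because of lower sanitation access (71.6%). The rankings suggest a positive association between literacy and health access — states investing in education tend to also see better utilization of healthcare services — although no correlation is computed here.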
# Low disease = low stunting + low anaemia in women
# Disease_Index is the plain average of the two percentages, rounded to one
# decimal; the 15 districts with the lowest index are kept.
low_disease <- df %>%
  select(District, State, Stunting, Anaemia_all_women) %>%
  mutate(Disease_Index = round((Stunting + Anaemia_all_women) / 2, 1)) %>%
  select(District, State, Disease_Index, Stunting, Anaemia_all_women) %>%
  arrange(Disease_Index) %>%
  head(15)
print(low_disease)
## District State Disease_Index Stunting
## 1 Imphal West Manipur 21.2 15.6
## 2 Kohima Nagaland 21.6 28.3
## 3 Siang Arunachal Pradesh 22.6 21.3
## 4 Lower Dibang Valley Arunachal Pradesh 24.5 14.3
## 5 Ukhrul Manipur 24.5 27.1
## 6 Longding Arunachal Pradesh 25.0 15.8
## 7 Kozhikode Kerala 25.6 21.3
## 8 Bishnupur Manipur 25.6 15.5
## 9 Pithoragarh Uttarakhand 25.6 25.6
## 10 Kra Daadi Arunachal Pradesh 25.7 28.8
## 11 Bageshwar Uttarakhand 25.7 23.6
## 12 Kollam Kerala 25.8 15.5
## 13 Alappuzha Kerala 26.0 20.1
## 14 Thiruvananthapuram Kerala 26.2 19.5
## 15 Champhai Mizoram 26.4 27.2
## Anaemia_all_women
## 1 26.8
## 2 14.9
## 3 23.9
## 4 34.6
## 5 21.9
## 6 34.1
## 7 29.8
## 8 35.8
## 9 25.5
## 10 22.6
## 11 27.8
## 12 36.0
## 13 31.9
## 14 33.0
## 15 25.7
Interpretation: The fifteen districts with the lowest combined disease burden belong to Arunachal Pradesh (4), Kerala (4), Manipur (3), Uttarakhand (2), Nagaland (1), and Mizoram (1). Interestingly, most of them are northeastern districts that perform well on these indicators despite lower income levels, highlighting the role of social indicators like education and community engagement.
# Gap = high literacy but poor health outcomes (stunting or anaemia)
# Gap_Score = literacy - (100 - mean(stunting, anaemia)), rounded to two
# decimals. It is large when literacy is high while stunting/anaemia are also
# high. Districts missing any of the three inputs are excluded; the 15
# largest gaps are kept.
edu_health_gap <- df %>%
  filter(
    !is.na(Women_literacy),
    !is.na(Stunting),
    !is.na(Anaemia_all_women)
  ) %>%
  mutate(
    Gap_Score = round(
      Women_literacy - (100 - (Stunting + Anaemia_all_women) / 2),
      2
    )
  ) %>%
  select(District, State, Women_literacy, Stunting, Anaemia_all_women, Gap_Score) %>%
  arrange(desc(Gap_Score)) %>%
  head(15)
print(edu_health_gap)
## District State Women_literacy Stunting
## 1 Mahe Puducherry 99.7 48.2
## 2 West Khasi Hills Meghalaya 87.4 59.0
## 3 Ribhoi Meghalaya 89.5 42.5
## 4 Vadodara Gujarat 84.6 42.3
## 5 Lahul & Spiti Himachal Pradesh 86.2 28.5
## 6 Kargil Ladakh 77.2 36.5
## 7 South West Khasi Hills Meghalaya 85.9 51.4
## 8 Jorhat Assam 85.1 38.7
## 9 Thane Maharastra 90.5 40.8
## 10 Kangra Himachal Pradesh 94.4 28.0
## 11 East Khasi Hills Meghalaya 93.7 44.6
## 12 Valsad Gujarat 82.9 37.8
## 13 Jammu Jammu & Kashmir 91.5 27.0
## 14 Bilaspur Himachal Pradesh 91.2 40.1
## 15 Bhandara Maharastra 89.1 31.3
## Anaemia_all_women Gap_Score
## 1 38.4 43.00
## 2 51.8 42.80
## 3 62.4 41.95
## 4 72.3 41.90
## 5 82.1 41.50
## 6 92.0 41.45
## 7 58.9 41.05
## 8 71.8 40.35
## 9 58.8 40.30
## 10 63.4 40.10
## 11 48.2 40.10
## 12 75.7 39.65
## 13 66.6 38.30
## 14 53.0 37.75
## 15 65.3 37.40
Interpretation: Some districts show paradoxically high literacy alongside persistently poor health outcomes (high stunting/anaemia). This may indicate that education alone does not translate into health behavior change without adequate healthcare infrastructure and nutrition programs.
# Scatter plot of women's literacy against institutional births (one point per
# district), with a straight-line (lm) trend drawn on top.
df %>%
  drop_na(Women_literacy, Institutional_birth) %>%
  ggplot(aes(Women_literacy, Institutional_birth)) +
  geom_point(colour = "steelblue", size = 1.5, alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  labs(
    title = "Women's Literacy vs. Institutional Birth Rate",
    subtitle = "Each point = one district",
    x = "Women's Literacy Rate (%)",
    y = "Institutional Birth Rate (%)"
  ) +
  theme_minimal()
Insight: There is a clear positive relationship — districts with higher female literacy consistently show higher institutional delivery rates, confirming that education is a key driver of maternal healthcare utilization.
# Horizontal bar chart of the 20 states with the highest mean district-level
# stunting (missing district values are ignored when averaging).
df %>%
  group_by(State) %>%
  summarise(Avg_Stunting = mean(Stunting, na.rm = TRUE)) %>%
  arrange(desc(Avg_Stunting)) %>%
  slice_head(n = 20) %>%
  ggplot(aes(x = reorder(State, Avg_Stunting), y = Avg_Stunting)) +
  geom_col(fill = "coral") +
  # Flip so state names read horizontally
  coord_flip() +
  labs(
    title = "Top 20 States by Average Child Stunting Rate",
    x = "State",
    y = "Average Stunting (%)"
  ) +
  theme_minimal()
Insight: Uttar Pradesh, Bihar, and Jharkhand report the highest stunting averages, while coastal and southern states show significantly lower rates. Stunting reflects chronic food insecurity, poor sanitation, and limited healthcare access.
# Box plots of women's anaemia prevalence, grouped by the district's level of
# tobacco use among women.
df %>%
  filter(!is.na(Women_tobacco) & !is.na(Anaemia_all_women)) %>%
  mutate(
    # cut() builds right-closed intervals by default: (-Inf, 5], (5, 15],
    # (15, Inf). A district at exactly 5% therefore lands in the first bin,
    # so the label says "<=5%" rather than "<5%" to match the actual binning.
    Tobacco_Group = cut(
      Women_tobacco,
      breaks = c(-Inf, 5, 15, Inf),
      labels = c("Low (<=5%)", "Medium (5-15%)", "High (>15%)")
    )
  ) %>%
  ggplot(aes(x = Tobacco_Group, y = Anaemia_all_women, fill = Tobacco_Group)) +
  geom_boxplot() +
  labs(
    title = "Anaemia in Women by Tobacco Use Level",
    x = "Women's Tobacco Use Category",
    y = "Anaemia Prevalence (%)",
    fill = "Tobacco Group"
  ) +
  theme_minimal() +
  # The x axis already names each group, so the fill legend is hidden
  theme(legend.position = "none")
Insight: Districts with higher women’s tobacco use show distinctly different anaemia distributions. High tobacco use areas also tend to be high anaemia areas, suggesting shared underlying deprivation factors.
# Scatter plot of full vaccination coverage against child stunting (one point
# per district), with a straight-line (lm) trend drawn on top.
df %>%
  drop_na(Full_vaccination_card_recall, Stunting) %>%
  ggplot(aes(Full_vaccination_card_recall, Stunting)) +
  geom_point(colour = "darkgreen", size = 1.5, alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  labs(
    title = "Full Vaccination Coverage vs. Child Stunting",
    subtitle = "Each point = one district",
    x = "Full Vaccination Coverage (%)",
    y = "Stunting Rate (%)"
  ) +
  theme_minimal()
Insight: Higher vaccination coverage is negatively correlated with stunting — districts with better immunization programs also tend to have lower malnutrition, as both reflect stronger primary healthcare systems.
# Pairwise scatter/correlation matrix (GGally) for five key indicators.
# Only districts with all five values present are plotted.
df_pairs <- df %>%
  select(
    Women_literacy, Institutional_birth, Stunting,
    Anaemia_all_women, Men_tobacco
  ) %>%
  drop_na()
ggpairs(
  df_pairs,
  title = "Pair Plot: Key Health Indicators Across Districts",
  # Short labels, given in the same order as the selected columns
  columnLabels = c("Literacy", "Inst.Birth", "Stunting", "Anaemia", "Tobacco(M)")
)
Insight: The pair plot reveals strong positive correlation between literacy and institutional births, and negative correlations between literacy and stunting/anaemia. Tobacco shows weak but positive association with anaemia in some dimensions.
# Composite Health Index: a weighted sum of seven indicators whose weights add
# up to 100. Each input is divided by 100 first (assumes the indicators are
# percentages on a 0-100 scale - TODO confirm against the data dictionary).
df_advanced <- df %>%
  # Step 1: rescale; stunting and anaemia are inverted so higher = better
  mutate(
    Lit_norm   = Women_literacy / 100,
    IB_norm    = Institutional_birth / 100,
    Vacc_norm  = Full_vaccination_card_recall / 100,
    Stunt_inv  = 1 - (Stunting / 100),
    Anaem_inv  = 1 - (Anaemia_all_women / 100),
    San_norm   = Sanitation_access / 100,
    Water_norm = Clean_water_access / 100
  ) %>%
  # Step 2: weighted sum; a district missing any input gets an NA index
  mutate(
    Health_Index = round(
      15 * Lit_norm +
        20 * IB_norm +
        15 * Vacc_norm +
        20 * Stunt_inv +
        15 * Anaem_inv +
        10 * San_norm +
        5 * Water_norm,
      2
    )
  )
summary(df_advanced$Health_Index)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 51.38 67.38 72.86 72.18 77.20 89.08 13
# Histogram of the composite Health Index across districts (bins 2 points wide)
df_advanced %>%
  ggplot(aes(x = Health_Index)) +
  geom_histogram(binwidth = 2, colour = "white", fill = "steelblue") +
  labs(
    title = "Distribution of Composite Health Index Across Districts",
    x = "Health Index Score (0–100)",
    y = "Number of Districts"
  ) +
  theme_minimal()
Insight: The composite health index is roughly bell-shaped and centered around 72–73 (interquartile range roughly 67–77), with a somewhat longer left tail reaching down to about 51 — indicating that while most districts achieve moderate-to-good health levels, a significant minority remains severely deprived.
# Split districts into five equal-sized rank groups on Health_Index
# (1 = lowest scores, 5 = highest) and attach a readable label to each group.
df_advanced <- df_advanced %>%
  mutate(
    Health_Quintile = ntile(Health_Index, 5),
    # Look the label up by quintile number; a missing quintile yields NA
    Health_Category = c(
      "Poor", "Below Average", "Average", "Good", "Excellent"
    )[Health_Quintile]
  )
# How many districts fall in each category
table(df_advanced$Health_Category)
##
## Average Below Average Excellent Good Poor
## 139 139 138 138 139
# States contributing the most districts to the "Poor" category (top 10)
df_advanced %>%
  filter(Health_Category == "Poor") %>%
  count(State, sort = TRUE) %>%
  slice_head(n = 10)
Insight: The “Poor” health quintile is dominated by districts from UP, Bihar, Jharkhand, and Madhya Pradesh. This geographic clustering of poor health outcomes underlines the need for region-specific interventions rather than one-size-fits-all programs.
# Gender gap in tobacco use (men minus women, in percentage points), bucketed
# into four bands.
df_advanced <- df_advanced %>%
  mutate(
    Tobacco_gender_gap = Men_tobacco - Women_tobacco,
    Gap_category = case_when(
      # Without this branch, a district with missing tobacco data fails every
      # comparison below and falls through to the TRUE catch-all, so it would
      # be mislabelled "Low Gap".
      is.na(Tobacco_gender_gap) ~ NA_character_,
      Tobacco_gender_gap > 40 ~ "Very High Gap",
      Tobacco_gender_gap > 20 ~ "High Gap",
      Tobacco_gender_gap > 10 ~ "Moderate Gap",
      # Remaining gaps are <= 10, including cases where women exceed men
      TRUE ~ "Low Gap"
    )
  )
# useNA = "ifany" keeps unclassified districts visible in the tally.
# NOTE(review): re-knit the report so the printed counts reflect this change.
table(df_advanced$Gap_category, useNA = "ifany")
##
## High Gap Low Gap Moderate Gap Very High Gap
## 410 17 157 122
Insight: Most districts show a high gender gap in tobacco use, meaning men consume tobacco far more than women. However, regions with “Low Gap” are often those where both men and women have high tobacco usage — indicating communities where tobacco use is normalized across genders.
# ---- K-means: choose the number of clusters ----
# Indicators used to cluster the districts
cluster_vars <- c(
  "Women_literacy", "Institutional_birth",
  "Stunting", "Anaemia_all_women", "Sanitation_access"
)
# Keep identifiers plus the clustering indicators; drop incomplete rows
df_cluster <- df %>%
  select(District, State, all_of(cluster_vars)) %>%
  drop_na()
# Standardise each indicator (mean 0, sd 1) so none dominates the distances
scaled_data <- scale(df_cluster[cluster_vars])
# Fix the RNG so the random k-means starts are reproducible
set.seed(42)
# Total within-cluster sum of squares for k = 1..8 (elbow method)
wss_values <- vapply(
  seq_len(8),
  function(n_clusters) {
    kmeans(scaled_data, centers = n_clusters, nstart = 10)$tot.withinss
  },
  numeric(1)
)
# Elbow plot: look for the k after which the curve flattens
plot(
  seq_along(wss_values), wss_values,
  type = "b", pch = 19, col = "steelblue",
  main = "Elbow Method for Optimal K",
  xlab = "Number of Clusters (K)",
  ylab = "Total Within-Cluster SS"
)
# Final k-means fit with k = 4 (the value read off the elbow plot);
# 25 random starts, seeded for reproducibility.
set.seed(42)
km_model <- kmeans(scaled_data, centers = 4, nstart = 25)
# Attach each district's cluster assignment as a factor column
df_cluster$Cluster <- factor(km_model$cluster)
# Per-cluster profile: size plus the mean of every clustering indicator,
# rounded to one decimal place
cluster_summary <- df_cluster %>%
  group_by(Cluster) %>%
  summarise(
    n = n(),
    across(all_of(cluster_vars), ~ round(mean(.x), 1))
  ) %>%
  rename(
    Avg_Literacy = Women_literacy,
    Avg_Inst_Birth = Institutional_birth,
    Avg_Stunting = Stunting,
    Avg_Anaemia = Anaemia_all_women,
    Avg_Sanitation = Sanitation_access
  )
print(cluster_summary)
## # A tibble: 4 × 7
## Cluster n Avg_Literacy Avg_Inst_Birth Avg_Stunting Avg_Anaemia
## <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 332 72.8 91.9 33.5 59.5
## 2 2 136 58.8 79.9 44 63.2
## 3 3 38 81 56.8 34.8 36.8
## 4 4 200 86.2 95.3 26.2 48.7
## # ℹ 1 more variable: Avg_Sanitation <dbl>
# Visualise the clusters in the literacy-stunting plane, coloured by cluster
df_cluster %>%
  ggplot(aes(Women_literacy, Stunting, colour = Cluster)) +
  geom_point(size = 2, alpha = 0.5) +
  labs(
    title = "K-Means Clustering: Women's Literacy vs Stunting",
    x = "Women's Literacy (%)",
    y = "Stunting (%)",
    colour = "Cluster"
  ) +
  theme_minimal()
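Insight: K-means identifies 4 distinct district health profiles: (1) the largest cluster (332 districts), with moderate literacy, high institutional births, and high anaemia; (2) low literacy + high stunting (central/eastern); (3) a small cluster (38 districts) with fairly high literacy and the lowest anaemia but markedly low institutional births; and (4) high literacy + low stunting with near-universal institutional delivery (southern districts). These clusters can guide targeted public health policy.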
# ---- KNN: predict a district's health quintile ----
# Predictor columns fed to the classifier
knn_vars <- c(
  "Women_literacy", "Institutional_birth",
  "Stunting", "Anaemia_all_women", "Sanitation_access"
)
# Predictors + target, complete rows only; the target must be a factor
df_knn <- df_advanced %>%
  select(all_of(knn_vars), Health_Quintile) %>%
  drop_na() %>%
  mutate(Health_Quintile = factor(Health_Quintile))
# Reproducible 70/30 train-test split
set.seed(123)
n <- nrow(df_knn)
train_idx <- sample(seq_len(n), size = floor(0.7 * n))
train_data <- df_knn[train_idx, ]
test_data <- df_knn[-train_idx, ]
# Standardise using the training set's centre and spread only, and apply
# those same values to the test set
train_scaled <- scale(train_data[knn_vars])
test_scaled <- scale(
  test_data[knn_vars],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
# Classify each test district from its 7 nearest training districts
knn_pred <- knn(
  train = train_scaled,
  test = test_scaled,
  cl = train_data$Health_Quintile,
  k = 7
)
# Cross-tabulate actual vs predicted; accuracy = share on the diagonal
conf_matrix <- table(
  Actual = test_data$Health_Quintile,
  Predicted = knn_pred
)
accuracy <- round(sum(diag(conf_matrix)) / sum(conf_matrix) * 100, 2)
cat("KNN Classification Accuracy (k=7):", accuracy, "%\n\n")
## KNN Classification Accuracy (k=7): 70.19 %
# Show the confusion matrix: rows are actual quintiles, columns are predictions
print(conf_matrix)
## Predicted
## Actual 1 2 3 4 5
## 1 32 6 0 0 0
## 2 5 33 7 2 0
## 3 0 5 25 11 0
## 4 0 1 10 20 8
## 5 0 0 0 7 36
Insight: The KNN model achieves reasonable accuracy in classifying a district’s health quintile from key indicators. The model performs best at identifying the extreme quintiles (Excellent and Poor), where health profiles are most distinct. Misclassification mostly occurs in middle quintiles that share overlapping characteristics.
# ---- Linear regression: institutional birth rate ----
# Predictors: women's literacy, sanitation access, full vaccination coverage,
# and anaemia among women. Only districts with all five columns are used.
reg_data <- df %>%
  select(
    Institutional_birth,
    Women_literacy,
    Sanitation_access,
    Full_vaccination_card_recall,
    Anaemia_all_women
  ) %>%
  drop_na()
# Ordinary least squares fit
lm_model <- lm(
  Institutional_birth ~ Women_literacy + Sanitation_access +
    Full_vaccination_card_recall + Anaemia_all_women,
  data = reg_data
)
# Coefficients, significance, and goodness of fit
summary(lm_model)
##
## Call:
## lm(formula = Institutional_birth ~ Women_literacy + Sanitation_access +
## Full_vaccination_card_recall + Anaemia_all_women, data = reg_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.471 -4.947 2.133 6.652 20.646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.29375 4.34388 6.513 1.42e-10 ***
## Women_literacy 0.20630 0.04433 4.654 3.91e-06 ***
## Sanitation_access 0.07439 0.03900 1.907 0.0569 .
## Full_vaccination_card_recall 0.37437 0.03389 11.046 < 2e-16 ***
## Anaemia_all_women 0.18729 0.03777 4.958 8.97e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.39 on 688 degrees of freedom
## Multiple R-squared: 0.2599, Adjusted R-squared: 0.2556
## F-statistic: 60.4 on 4 and 688 DF, p-value: < 2.2e-16
# Actual vs fitted institutional birth rate; points on the dashed 45-degree
# line are districts the model predicts exactly.
reg_data[["Predicted"]] <- predict(lm_model)
reg_data %>%
  ggplot(aes(Predicted, Institutional_birth)) +
  geom_point(colour = "steelblue", size = 1.5, alpha = 0.3) +
  geom_abline(slope = 1, intercept = 0, colour = "red", linetype = "dashed") +
  labs(
    title = "Linear Regression: Actual vs Predicted Institutional Birth Rate",
    x = "Predicted Institutional Birth Rate (%)",
    y = "Actual Institutional Birth Rate (%)"
  ) +
  theme_minimal()
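Interpretation: The regression model shows that women’s literacy and vaccination coverage are significant positive predictors of institutional births, while sanitation access is positive but only marginally significant (p ≈ 0.057). The R-squared of about 0.26 means these four indicators explain roughly a quarter of the variation in institutional delivery. Contrary to expectation, anaemia has a positive coefficient (about 0.19), so once the other predictors are controlled for, this model does not show that high anaemia goes with lower institutional delivery; the counter-intuitive sign may reflect confounding and deserves further investigation.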
Level 1 (Exploration): - The dataset covers 706 districts and 109 variables across 8 public health domains - Anaemia is near-universal — most districts report 40–70% prevalence among women - Missing data is concentrated in newer or small-scale program indicators
Level 2 (Filtering): - One-third of districts have women’s literacy above 80%, mostly in southern and northeastern states - High malnutrition + low healthcare access coexist in UP, Bihar, and Jharkhand - Substance use is highest in northeastern and tribal belt districts
Level 3 (Grouping): - Southern states dominate all positive health indicators — literacy, vaccination, maternal care - States like Bihar and Uttar Pradesh consistently rank lowest across multiple health dimensions - A double burden is visible: some states face both undernutrition AND rising NCDs
Level 4 (Ranking): - Top 10 healthiest districts are concentrated in Goa, Kerala, and Tamil Nadu - Bottom 10 districts show a convergence of failures: low literacy, low institutional delivery, high stunting - The education-health gap exists — some literate districts still have poor health, pointing to infrastructure deficits
Advanced Engineering: - The Composite Health Index reveals a left-skewed distribution — most districts are moderately healthy but a significant minority is severely deprived - K-means clustering identifies 4 distinct health profiles useful for targeted policy design
Machine Learning: - KNN classifier can predict a district’s health quintile with about 70% test accuracy using just 5 indicators - Linear regression shows that literacy and vaccination coverage are significant predictors of institutional delivery rates (sanitation is only marginally significant), although the model explains only about 26% of the variation
This comprehensive analysis of the District Health Survey (NFHS-5) dataset reveals deep regional disparities in health outcomes across Indian districts. The core findings can be summarized as follows:
1. The North-South Divide Persists: Southern states — Kerala, Goa, Tamil Nadu — consistently outperform northern states on almost every health indicator: literacy, maternal care, child nutrition, vaccination, and sanitation. This gap reflects decades of differential investment in education and primary healthcare.
2. Education is the Foundation of Health: Women’s literacy emerges as the single strongest predictor of institutional delivery, vaccination coverage, and reduced stunting. Programs that prioritize girls’ education and female empowerment produce measurable health dividends.
3. Malnutrition Remains a Critical Challenge: Despite economic growth, child stunting (25–45%) and anaemia among women (40–70%) remain alarmingly high across most districts. These are not just food security issues — they reflect failures in water, sanitation, and healthcare access acting together.
4. A Double Burden is Emerging: While undernutrition dominates in rural and tribal districts, rising NCDs (hypertension and diabetes) are simultaneously increasing in urban and semi-urban areas. Health policy must address both ends of the nutritional spectrum.
5. Targeted Interventions Are Essential: The K-means clustering and composite health index analyses identify specific district clusters that require different types of intervention. A one-size-fits-all approach to national health programs will be inefficient; geographically-targeted strategies will yield better outcomes.
6. Machine Learning Offers Policy Utility: The KNN model demonstrates that just 5 variables — literacy, institutional delivery, stunting, anaemia, and sanitation — can classify a district’s health quintile with about 70% accuracy, and the regression model links institutional delivery to literacy and vaccination coverage. This approach can be used to predict which districts are at risk of deteriorating health outcomes in future surveys.
In conclusion, improving India’s health landscape requires simultaneous investments in female education, infrastructure (water, sanitation, clean cooking fuel), primary healthcare access, and nutritional programs. The data clearly shows that districts that succeed on all these dimensions simultaneously achieve dramatically better health outcomes for their populations.
End of Report ```