library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ purrr     1.2.1
## ✔ ggplot2   4.0.2     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(GGally)
library(ggthemes)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Load the dataset and print its column names

# Read the raw CSV from disk and pass it straight through janitor's
# clean_names(); the column names printed below come out in snake_case.
# NOTE(review): the path is an absolute, machine-specific location — confirm
# it exists before running this on another machine.
df <- clean_names(
  read.csv("C:\\Users\\asus\\Downloads\\dHealth.csv")
)

# List every column name so the positional rename further down can be checked
# against it.
print(colnames(df))
##   [1] "district_names"                                                                                                                                                           
##   [2] "state_ut"                                                                                                                                                                 
##   [3] "number_of_households_surveyed"                                                                                                                                            
##   [4] "number_of_women_age_15_49_years_interviewed"                                                                                                                              
##   [5] "number_of_men_age_15_54_years_interviewed"                                                                                                                                
##   [6] "female_population_age_6_years_and_above_who_ever_attended_school"                                                                                                         
##   [7] "population_below_age_15_years"                                                                                                                                            
##   [8] "x_sex_ratio_of_the_total_population_females_per_1_000_males"                                                                                                              
##   [9] "sex_ratio_at_birth_for_children_born_in_the_last_five_years_females_per_1_000_males"                                                                                      
##  [10] "children_under_age_5_years_whose_birth_was_registered_with_the_civil_authority"                                                                                           
##  [11] "deaths_in_the_last_3_years_registered_with_the_civil_authority"                                                                                                           
##  [12] "population_living_in_households_with_electricity"                                                                                                                         
##  [13] "population_living_in_households_with_an_improved_drinking_water_source1"                                                                                                  
##  [14] "population_living_in_households_that_use_an_improved_sanitation_facility2"                                                                                                
##  [15] "households_using_clean_fuel_for_cooking3"                                                                                                                                 
##  [16] "households_using_iodized_salt"                                                                                                                                            
##  [17] "households_with_any_usual_member_covered_under_a_health_insurance_financing_scheme"                                                                                       
##  [18] "children_age_5_years_who_attended_pre_primary_school_during_the_school_year_2019_20"                                                                                      
##  [19] "women_age_15_49_who_are_literate4"                                                                                                                                        
##  [20] "women_age_15_49_with_10_or_more_years_of_schooling"                                                                                                                       
##  [21] "women_age_20_24_years_married_before_age_18_years"                                                                                                                        
##  [22] "births_in_the_5_years_preceding_the_survey_that_are_third_or_higher_order"                                                                                                
##  [23] "women_age_15_19_years_who_were_already_mothers_or_pregnant_at_the_time_of_the_survey"                                                                                     
##  [24] "women_age_15_24_years_who_use_hygienic_methods_of_protection_during_their_menstrual_period5"                                                                              
##  [25] "current_use_of_family_planning_methods_currently_married_women_age_15_49_years_any_method6"                                                                               
##  [26] "current_use_of_family_planning_methods_currently_married_women_age_15_49_years_any_modern_method6"                                                                        
##  [27] "current_use_of_family_planning_methods_currently_married_women_age_15_49_years_female_sterilization"                                                                      
##  [28] "current_use_of_family_planning_methods_currently_married_women_age_15_49_years_male_sterilization"                                                                        
##  [29] "current_use_of_family_planning_methods_currently_married_women_age_15_49_years_iud_ppiud"                                                                                 
##  [30] "current_use_of_family_planning_methods_currently_married_women_age_15_49_years_pill"                                                                                      
##  [31] "current_use_of_family_planning_methods_currently_married_women_age_15_49_years_condom"                                                                                    
##  [32] "current_use_of_family_planning_methods_currently_married_women_age_15_49_years_injectables"                                                                               
##  [33] "total_unmet_need_for_family_planning_currently_married_women_age_15_49_years_7"                                                                                           
##  [34] "unmet_need_for_spacing_currently_married_women_age_15_49_years_7"                                                                                                         
##  [35] "health_worker_ever_talked_to_female_non_users_about_family_planning"                                                                                                      
##  [36] "current_users_ever_told_about_side_effects_of_current_method_of_family_planning8"                                                                                         
##  [37] "mothers_who_had_an_antenatal_check_up_in_the_first_trimester_for_last_birth_in_the_5_years_before_the_survey"                                                             
##  [38] "mothers_who_had_at_least_4_antenatal_care_visits_for_last_birth_in_the_5_years_before_the_survey"                                                                         
##  [39] "mothers_whose_last_birth_was_protected_against_neonatal_tetanus_for_last_birth_in_the_5_years_before_the_survey_9"                                                        
##  [40] "mothers_who_consumed_iron_folic_acid_for_100_days_or_more_when_they_were_pregnant_for_last_birth_in_the_5_years_before_the_survey"                                        
##  [41] "mothers_who_consumed_iron_folic_acid_for_180_days_or_more_when_they_were_pregnant_for_last_birth_in_the_5_years_before_the_survey"                                        
##  [42] "registered_pregnancies_for_which_the_mother_received_a_mother_and_child_protection_mcp_card_for_last_birth_in_the_5_years_before_the_survey"                              
##  [43] "mothers_who_received_postnatal_care_from_a_doctor_nurse_lhv_anm_midwife_other_health_personnel_within_2_days_of_delivery_for_last_birth_in_the_5_years_before_the_survey" 
##  [44] "average_out_of_pocket_expenditure_per_delivery_in_a_public_health_facility_for_last_birth_in_the_5_years_before_the_survey_rs"                                            
##  [45] "children_born_at_home_who_were_taken_to_a_health_facility_for_a_check_up_within_24_hours_of_birth_for_last_birth_in_the_5_years_before_the_survey"                        
##  [46] "children_who_received_postnatal_care_from_a_doctor_nurse_lhv_anm_midwife_other_health_personnel_within_2_days_of_delivery_for_last_birth_in_the_5_years_before_the_survey"
##  [47] "institutional_births_in_the_5_years_before_the_survey"                                                                                                                    
##  [48] "institutional_births_in_public_facility_in_the_5_years_before_the_survey"                                                                                                 
##  [49] "home_births_that_were_conducted_by_skilled_health_personnel_in_the_5_years_before_the_survey_10"                                                                          
##  [50] "births_attended_by_skilled_health_personnel_in_the_5_years_before_the_survey_10"                                                                                          
##  [51] "births_delivered_by_caesarean_section_in_the_5_years_before_the_survey"                                                                                                   
##  [52] "births_in_a_private_health_facility_that_were_delivered_by_caesarean_section_in_the_5_years_before_the_survey"                                                            
##  [53] "births_in_a_public_health_facility_that_were_delivered_by_caesarean_section_in_the_5_years_before_the_survey"                                                             
##  [54] "children_age_12_23_months_fully_vaccinated_based_on_information_from_either_vaccination_card_or_mother_s_recall11"                                                        
##  [55] "children_age_12_23_months_fully_vaccinated_based_on_information_from_vaccination_card_only12"                                                                             
##  [56] "children_age_12_23_months_who_have_received_bcg"                                                                                                                          
##  [57] "children_age_12_23_months_who_have_received_3_doses_of_polio_vaccine13"                                                                                                   
##  [58] "children_age_12_23_months_who_have_received_3_doses_of_penta_or_dpt_vaccine"                                                                                              
##  [59] "children_age_12_23_months_who_have_received_the_first_dose_of_measles_containing_vaccine_mcv"                                                                             
##  [60] "children_age_24_35_months_who_have_received_a_second_dose_of_measles_containing_vaccine_mcv"                                                                              
##  [61] "children_age_12_23_months_who_have_received_3_doses_of_rotavirus_vaccine14"                                                                                               
##  [62] "children_age_12_23_months_who_have_received_3_doses_of_penta_or_hepatitis_b_vaccine"                                                                                      
##  [63] "children_age_9_35_months_who_received_a_vitamin_a_dose_in_the_last_6_months"                                                                                              
##  [64] "children_age_12_23_months_who_received_most_of_their_vaccinations_in_a_public_health_facility"                                                                            
##  [65] "children_age_12_23_months_who_received_most_of_their_vaccinations_in_a_private_health_facility"                                                                           
##  [66] "prevalence_of_diarrhoea_in_the_2_weeks_preceding_the_survey_children_under_age_5_years"                                                                                   
##  [67] "children_with_diarrhoea_in_the_2_weeks_preceding_the_survey_who_received_oral_rehydration_salts_ors_children_under_age_5_years"                                           
##  [68] "children_with_diarrhoea_in_the_2_weeks_preceding_the_survey_who_received_zinc_children_under_age_5_years"                                                                 
##  [69] "children_swith_diarrhoea_in_the_2_weeks_preceding_the_survey_taken_to_a_health_facility_or_health_provider_children_under_age_5_years"                                    
##  [70] "children_prevalence_of_symptoms_of_acute_respiratory_infection_ari_in_the_2_weeks_preceding_the_survey_children_under_age_5_years"                                        
##  [71] "children_with_fever_or_symptoms_of_ari_in_the_2_weeks_preceding_the_survey_taken_to_a_health_facility_or_health_provider_children_under_age_5_years"                      
##  [72] "children_under_age_3_years_breastfed_within_one_hour_of_birth15"                                                                                                          
##  [73] "children_under_age_6_months_exclusively_breastfed16"                                                                                                                      
##  [74] "children_age_6_8_months_receiving_solid_or_semi_solid_food_and_breastmilk16"                                                                                              
##  [75] "breastfeeding_children_age_6_23_months_receiving_an_adequate_diet16_17"                                                                                                   
##  [76] "non_breastfeeding_children_age_6_23_months_receiving_an_adequate_diet16_17"                                                                                               
##  [77] "total_children_age_6_23_months_receiving_an_adequate_diet16_17"                                                                                                           
##  [78] "children_under_5_years_who_are_stunted_height_for_age_18"                                                                                                                 
##  [79] "children_under_5_years_who_are_wasted_weight_for_height_18"                                                                                                               
##  [80] "children_under_5_years_who_are_severely_wasted_weight_for_height_19"                                                                                                      
##  [81] "children_under_5_years_who_are_underweight_weight_for_age_18"                                                                                                             
##  [82] "children_under_5_years_who_are_overweight_weight_for_height_20"                                                                                                           
##  [83] "women_age_15_49_years_whose_body_mass_index_bmi_is_below_normal_bmi_18_5_kg_m2_21"                                                                                        
##  [84] "women_age_15_49_years_who_are_overweight_or_obese_bmi_25_0_kg_m2_21"                                                                                                      
##  [85] "women_age_15_49_years_who_have_high_risk_waist_to_hip_ratio_0_85"                                                                                                         
##  [86] "children_age_6_59_months_who_are_anaemic_11_0_g_dl_22"                                                                                                                    
##  [87] "non_pregnant_women_age_15_49_years_who_are_anaemic_12_0_g_dl_22"                                                                                                          
##  [88] "pregnant_women_age_15_49_years_who_are_anaemic_11_0_g_dl_22"                                                                                                              
##  [89] "all_women_age_15_49_years_who_are_anaemic22"                                                                                                                              
##  [90] "all_women_age_15_19_years_who_are_anaemic22"                                                                                                                              
##  [91] "women_age_15_years_and_above_with_high_141_160_mg_dl_blood_sugar_level23"                                                                                                 
##  [92] "women_age_15_years_and_above_wih_very_high_160_mg_dl_blood_sugar_level23"                                                                                                 
##  [93] "women_age_15_years_and_above_wih_high_or_very_high_140_mg_dl_blood_sugar_level_or_taking_medicine_to_control_blood_sugar_level23"                                         
##  [94] "men_age_15_years_and_above_wih_high_141_160_mg_dl_blood_sugar_level23"                                                                                                    
##  [95] "men_age_15_years_and_above_wih_very_high_160_mg_dl_blood_sugar_level23"                                                                                                   
##  [96] "men_age_15_years_and_above_wih_high_or_very_high_140_mg_dl_blood_sugar_level_or_taking_medicine_to_control_blood_sugar_level23"                                           
##  [97] "women_age_15_years_and_above_wih_mildly_elevated_blood_pressure_systolic_140_159_mm_of_hg_and_or_diastolic_90_99_mm_of_hg"                                                
##  [98] "women_age_15_years_and_above_wih_moderately_or_severely_elevated_blood_pressure_systolic_160_mm_of_hg_and_or_diastolic_100_mm_of_hg"                                      
##  [99] "women_age_15_years_and_above_wih_elevated_blood_pressure_systolic_140_mm_of_hg_and_or_diastolic_90_mm_of_hg_or_taking_medicine_to_control_blood_pressure"                 
## [100] "men_age_15_years_and_above_wih_mildly_elevated_blood_pressure_systolic_140_159_mm_of_hg_and_or_diastolic_90_99_mm_of_hg"                                                  
## [101] "men_age_15_years_and_above_wih_moderately_or_severely_elevated_blood_pressure_systolic_160_mm_of_hg_and_or_diastolic_100_mm_of_hg"                                        
## [102] "men_age_15_years_and_above_wih_elevated_blood_pressure_systolic_140_mm_of_hg_and_or_diastolic_90_mm_of_hg_or_taking_medicine_to_control_blood_pressure"                   
## [103] "women_age_30_49_years_ever_undergone_a_screening_test_for_cervical_cancer"                                                                                                
## [104] "women_age_30_49_years_ever_undergone_a_breast_examination_for_breast_cancer"                                                                                              
## [105] "women_age_30_49_years_ever_undergone_an_oral_cavity_examination_for_oral_cancer"                                                                                          
## [106] "women_age_15_years_and_above_who_use_any_kind_of_tobacco"                                                                                                                 
## [107] "men_age_15_years_and_above_who_use_any_kind_of_tobacco"                                                                                                                   
## [108] "women_age_15_years_and_above_who_consume_alcohol"                                                                                                                         
## [109] "men_age_15_years_and_above_who_consume_alcohol"

Rename all the columns to shorter names

# Short replacement names for the columns of `df`.
# The rename below is purely positional: entry i of this vector becomes the
# name of column i, so the order here must match the column order of `df`
# exactly (see the names printed right after loading the data).
new_names <- c(
  "district", "state", "hh_surveyed", "women_surveyed", "men_surveyed",
  "female_schooling", "pop_under_15", "sex_ratio_total", "sex_ratio_birth",
  "child_birth_reg", "death_reg", "hh_electricity", "hh_water", "hh_sanitation",
  "hh_cooking_fuel", "hh_iodized_salt", "hh_insurance", "child_preprimary",
  "women_literate", "women_10yr_schooling", "women_early_marriage",
  "births_3rd_order", "teen_pregnancy", "menstrual_hygiene", "fp_any",
  "fp_modern", "fp_female_ster", "fp_male_ster", "fp_iud", "fp_pill",
  "fp_condom", "fp_injectable", "fp_unmet_total", "fp_unmet_spacing",
  "fp_hw_talk", "fp_side_effects_info", "anc_1st_tri", "anc_4plus",
  "mother_tetanus", "ifa_100d", "ifa_180d", "mcp_card", "pnc_mother_2d",
  "delivery_cost_public", "home_birth_checkup", "pnc_child_2d", "inst_births",
  "inst_births_public", "home_birth_skilled", "birth_skilled_total",
  "birth_c_section", "birth_c_sec_private", "birth_c_sec_public",
  "vacc_full_all", "vacc_full_card", "vacc_bcg", "vacc_polio3", "vacc_penta_dpt3",
  "vacc_measles1", "vacc_measles2", "vacc_rotavirus3", "vacc_penta_hepb3",
  "vacc_vita", "vacc_facility_public", "vacc_facility_private",
  "child_diarrhea_prev", "child_diarrhea_ors", "child_diarrhea_zinc",
  "child_diarrhea_hfac", "child_ari_prev", "child_ari_hfac", "bf_early",
  "bf_exclusive_6m", "bf_solid_6_8m", "diet_adequate_bf", "diet_adequate_nonbf",
  "diet_adequate_total", "stunting", "wasting", "severe_wasting", "underweight",
  "child_overweight", "women_bmi_low", "women_obese", "women_high_whr",
  "child_anaemia", "women_nonpreg_anaemia", "women_preg_anaemia",
  "women_total_anaemia", "teen_anaemia", "women_sugar_high", "women_sugar_vhigh",
  "women_sugar_elevated", "men_sugar_high", "men_sugar_vhigh", "men_sugar_elevated",
  "women_bp_mild", "women_bp_mod_sev", "women_bp_elevated", "men_bp_mild",
  "men_bp_mod_sev", "men_bp_elevated", "screening_cervical", "screening_breast",
  "screening_oral", "women_tobacco", "men_tobacco", "women_alcohol", "men_alcohol"
)

# Guard the positional rename. Without this check a vector shorter than
# ncol(df) silently leaves the trailing columns named NA, and a longer one
# fails with an unhelpful low-level error.
if (length(new_names) != ncol(df)) {
  stop(
    "Cannot rename columns: `new_names` has ", length(new_names),
    " entries but `df` has ", ncol(df), " columns.",
    call. = FALSE
  )
}

df <- df %>%
  setNames(new_names)
# Print the new names
print(names(df))
##   [1] "district"              "state"                 "hh_surveyed"          
##   [4] "women_surveyed"        "men_surveyed"          "female_schooling"     
##   [7] "pop_under_15"          "sex_ratio_total"       "sex_ratio_birth"      
##  [10] "child_birth_reg"       "death_reg"             "hh_electricity"       
##  [13] "hh_water"              "hh_sanitation"         "hh_cooking_fuel"      
##  [16] "hh_iodized_salt"       "hh_insurance"          "child_preprimary"     
##  [19] "women_literate"        "women_10yr_schooling"  "women_early_marriage" 
##  [22] "births_3rd_order"      "teen_pregnancy"        "menstrual_hygiene"    
##  [25] "fp_any"                "fp_modern"             "fp_female_ster"       
##  [28] "fp_male_ster"          "fp_iud"                "fp_pill"              
##  [31] "fp_condom"             "fp_injectable"         "fp_unmet_total"       
##  [34] "fp_unmet_spacing"      "fp_hw_talk"            "fp_side_effects_info" 
##  [37] "anc_1st_tri"           "anc_4plus"             "mother_tetanus"       
##  [40] "ifa_100d"              "ifa_180d"              "mcp_card"             
##  [43] "pnc_mother_2d"         "delivery_cost_public"  "home_birth_checkup"   
##  [46] "pnc_child_2d"          "inst_births"           "inst_births_public"   
##  [49] "home_birth_skilled"    "birth_skilled_total"   "birth_c_section"      
##  [52] "birth_c_sec_private"   "birth_c_sec_public"    "vacc_full_all"        
##  [55] "vacc_full_card"        "vacc_bcg"              "vacc_polio3"          
##  [58] "vacc_penta_dpt3"       "vacc_measles1"         "vacc_measles2"        
##  [61] "vacc_rotavirus3"       "vacc_penta_hepb3"      "vacc_vita"            
##  [64] "vacc_facility_public"  "vacc_facility_private" "child_diarrhea_prev"  
##  [67] "child_diarrhea_ors"    "child_diarrhea_zinc"   "child_diarrhea_hfac"  
##  [70] "child_ari_prev"        "child_ari_hfac"        "bf_early"             
##  [73] "bf_exclusive_6m"       "bf_solid_6_8m"         "diet_adequate_bf"     
##  [76] "diet_adequate_nonbf"   "diet_adequate_total"   "stunting"             
##  [79] "wasting"               "severe_wasting"        "underweight"          
##  [82] "child_overweight"      "women_bmi_low"         "women_obese"          
##  [85] "women_high_whr"        "child_anaemia"         "women_nonpreg_anaemia"
##  [88] "women_preg_anaemia"    "women_total_anaemia"   "teen_anaemia"         
##  [91] "women_sugar_high"      "women_sugar_vhigh"     "women_sugar_elevated" 
##  [94] "men_sugar_high"        "men_sugar_vhigh"       "men_sugar_elevated"   
##  [97] "women_bp_mild"         "women_bp_mod_sev"      "women_bp_elevated"    
## [100] "men_bp_mild"           "men_bp_mod_sev"        "men_bp_elevated"      
## [103] "screening_cervical"    "screening_breast"      "screening_oral"       
## [106] "women_tobacco"         "men_tobacco"           "women_alcohol"        
## [109] "men_alcohol"
# Hand the renamed data frame to the data viewer for manual inspection.
view(df)

# Show the leading rows (6 is head()'s default, spelled out here).
# NOTE(review): the printed rows contain cells such as "(92.6)" and "*", so
# those columns are presumably stored as text — confirm before doing any
# numeric analysis on them.
head(df, n = 6L)
##                  district                     state hh_surveyed women_surveyed
## 1                Nicobars Andaman & Nicobar Islands         882            764
## 2 North & Middle Andaman  Andaman & Nicobar Islands         874            789
## 3          South Andaman  Andaman & Nicobar Islands         868            844
## 4             Srikakulam             Andhra Pradesh         874            780
## 5           Vizianagaram             Andhra Pradesh         902            853
## 6          Visakhapatnam             Andhra Pradesh         869            818
##   men_surveyed female_schooling pop_under_15 sex_ratio_total sex_ratio_birth
## 1          125             78.0         23.0             973            927 
## 2          108             82.7         19.8             950            844 
## 3          134             84.7         21.0             967            935 
## 4          100             60.0         20.7            1140           1163 
## 5          134             56.0         20.6            1114            898 
## 6          112             66.8         21.4            1066            974 
##   child_birth_reg death_reg hh_electricity hh_water hh_sanitation
## 1            98.0     83.2            97.9     98.8          83.5
## 2           100.0    (92.6)           93.2     92.2          86.4
## 3            96.5     92.2            99.6     97.9          89.3
## 4            95.0     71.0            99.9     87.7          71.6
## 5            95.4     81.7            99.5     93.1          61.7
## 6            90.5     71.3            99.6     91.8          77.8
##   hh_cooking_fuel hh_iodized_salt hh_insurance child_preprimary women_literate
## 1            56.9            99.4          2.7           (29.5)           87.5
## 2            61.3            99.9          2.1           (30.1)           84.0
## 3            91.9            99.7          1.2           (50.8)           86.7
## 4            74.7            76.5         75.6            (0.0)           64.3
## 5            60.3            85.0         76.7           (25.2)           58.3
## 6            72.9            82.2         64.9            (5.0)           69.5
##   women_10yr_schooling women_early_marriage births_3rd_order teen_pregnancy
## 1                 53.5                11.4              0.0            1.8 
## 2                 41.0                15.4              1.5            3.8 
## 3                 57.5                17.1              0.5            2.8 
## 4                 42.5                25.4              0.0            5.5 
## 5                 37.6                33.7              2.2           12.7 
## 6                 46.0                25.4              0.0            9.5 
##   menstrual_hygiene fp_any fp_modern fp_female_ster fp_male_ster fp_iud fp_pill
## 1             100.0   65.3      57.2           46.4          0.0    2.7     2.0
## 2             100.0   84.1      73.1           48.3          0.6    6.4     7.8
## 3              98.2   57.1      50.5           34.0          0.0    2.8     1.8
## 4              78.0   72.3      72.2           71.0          0.3    0.6     0.0
## 5              72.5   71.2      71.2           69.3          1.3    0.0     0.0
## 6              85.7   68.0      67.7           64.9          1.5    0.3     0.0
##   fp_condom fp_injectable fp_unmet_total fp_unmet_spacing fp_hw_talk
## 1       4.9           1.2            9.5              3.3       40.4
## 2       9.3           0.0            5.8              1.3       23.2
## 3      10.6           0.3           17.6              8.6       31.2
## 4       0.3           0.0            5.7              3.6       16.0
## 5       0.6           0.0            6.7              4.7       21.1
## 6       0.8           0.0            4.8              2.4       15.2
##   fp_side_effects_info anc_1st_tri anc_4plus mother_tetanus ifa_100d ifa_180d
## 1                49.4        62.8      71.7           78.0     72.6     43.9 
## 2                83.2        74.5      79.2           91.1     83.7     24.1 
## 3                88.2        79.4      85.9           92.1     81.0     61.9 
## 4                45.8        79.7      78.4           94.4     67.5     35.3 
## 5                36.4        76.1      71.4           91.3     59.6     32.4 
## 6                35.3        79.4      58.6           88.0     75.0     40.1 
##   mcp_card pnc_mother_2d delivery_cost_public home_birth_checkup pnc_child_2d
## 1    97.9          85.1                 2278                   *        92.5 
## 2    99.2          92.5                 1904                   *        94.3 
## 3    98.9          88.1                 3460                   *        89.8 
## 4   100.0          90.8                 3479                   *        97.7 
## 5    98.8          83.9                 1931                   *        89.2 
## 6    97.0          84.8                 2200                   *        90.9 
##   inst_births inst_births_public home_birth_skilled birth_skilled_total
## 1        97.8               96.7                0.8                98.6
## 2        97.7               95.0                0.7                98.3
## 3        99.5               83.8                0.0                96.9
## 4        97.9               52.2                0.5                96.4
## 5        99.0               70.6                0.5                97.6
## 6        95.3               69.3                0.0                94.4
##   birth_c_section birth_c_sec_private birth_c_sec_public vacc_full_all
## 1            11.5                   *              10.7         (64.2)
## 2            12.9                   *              11.4              *
## 3            37.1              (79.1)              29.6         (76.3)
## 4            57.0               73.8               44.5         (82.8)
## 5            41.3               70.3               30.3         (76.8)
## 6            26.5               57.2               16.8         (76.5)
##   vacc_full_card vacc_bcg vacc_polio3 vacc_penta_dpt3 vacc_measles1
## 1         (94.1)   (80.4)      (69.1)          (71.9)        (67.3)
## 2              *        *           *               *             *
## 3         (96.6)  (100.0)      (79.0)          (94.8)        (81.7)
## 4        (100.0)   (93.3)      (82.8)          (89.7)        (93.3)
## 5              *  (100.0)      (76.8)          (90.3)        (92.5)
## 6         (93.5)   (97.9)      (76.5)          (90.5)        (95.1)
##   vacc_measles2 vacc_rotavirus3 vacc_penta_hepb3 vacc_vita vacc_facility_public
## 1        (20.7)           (3.1)           (68.6)     94.9               (100.0)
## 2             *               *                *    (89.6)                    *
## 3        (33.7)           (0.0)           (85.3)     84.0                (93.1)
## 4        (34.9)          (74.8)           (89.7)     69.6                (97.0)
## 5        (35.0)          (77.3)           (83.6)     85.9               (100.0)
## 6        (45.1)          (72.9)           (79.6)     91.3                (83.8)
##   vacc_facility_private child_diarrhea_prev child_diarrhea_ors
## 1                 (0.0)                 5.7                  *
## 2                     *                 4.5                  *
## 3                 (4.3)                 6.0                  *
## 4                 (3.0)                11.9                  *
## 5                 (0.0)                 7.5                  *
## 6                 (9.7)                 8.1                  *
##   child_diarrhea_zinc child_diarrhea_hfac child_ari_prev child_ari_hfac
## 1                   *                   *            1.8         (85.7)
## 2                   *                   *            7.0              *
## 3                   *                   *            0.0         (77.3)
## 4                   *                   *            1.3         (79.7)
## 5                   *                   *            1.4         (83.5)
## 6                   *                   *            2.0         (72.3)
##   bf_early bf_exclusive_6m bf_solid_6_8m diet_adequate_bf diet_adequate_nonbf
## 1    55.4                *             *           (19.4)                   *
## 2    27.3                *             *            (6.5)                   *
## 3    51.1                *             *           (22.3)                   *
## 4    42.8                *             *           (14.0)                   *
## 5    55.6                *             *            (2.5)                   *
## 6    64.3                *             *            (6.9)                   *
##   diet_adequate_total stunting wasting severe_wasting underweight
## 1              (18.7)    21.6    15.7            7.8        24.6 
## 2               (5.9)    27.0    27.0            8.3        42.8 
## 3               23.5     21.1    12.6            3.5        17.4 
## 4               16.1     19.7    19.5            7.4        21.4 
## 5                1.8     36.4    19.2            8.3        32.2 
## 6               11.8     31.0    21.5           11.2        33.5 
##   child_overweight women_bmi_low women_obese women_high_whr child_anaemia
## 1             1.5            8.2        39.1           62.5         37.7 
## 2             0.8            8.6        35.9           79.3         30.4 
## 3             7.2           10.0        39.0           78.2         43.4 
## 4             4.5           13.8        27.2           54.0         59.6 
## 5             4.7           16.9        28.8           58.0         66.7 
## 6             4.8           17.4        23.8           58.0         72.6 
##   women_nonpreg_anaemia women_preg_anaemia women_total_anaemia teen_anaemia
## 1                  38.4                  *                38.3        48.0 
## 2                  62.5                  *                62.1        47.8 
## 3                  57.6                  *                57.7        43.2 
## 4                  62.8                  *                62.6        59.2 
## 5                  64.6                  *                64.0        73.9 
## 6                  58.6                  *                58.0        58.9 
##   women_sugar_high women_sugar_vhigh women_sugar_elevated men_sugar_high
## 1              7.4               3.9                 13.1            9.6
## 2              7.2               6.4                 16.7            9.1
## 3              7.5               9.5                 18.4            9.3
## 4              8.2               7.8                 17.4            6.8
## 5              6.2               7.0                 14.3            5.8
## 6              6.1               8.6                 17.0            7.3
##   men_sugar_vhigh men_sugar_elevated women_bp_mild women_bp_mod_sev
## 1             4.4               15.4          23.2              8.5
## 2             6.9               18.3          18.4              4.0
## 3             7.8               18.1          12.7              4.9
## 4             8.6               17.6          12.8              5.9
## 5             7.5               14.5          12.9              6.6
## 6             8.5               18.2          12.1              5.9
##   women_bp_elevated men_bp_mild men_bp_mod_sev men_bp_elevated
## 1              35.4        32.9           11.1            47.0
## 2              27.4        22.6            6.0            32.2
## 3              23.0        17.9            6.1            26.9
## 4              22.1        14.4            5.5            22.9
## 5              25.2        14.8            6.4            25.1
## 6              23.9        17.0            7.0            29.2
##   screening_cervical screening_breast screening_oral women_tobacco men_tobacco
## 1               13.4             13.2            5.4          63.5        76.8
## 2                1.7              0.3           15.8          46.8        70.5
## 3                1.3              0.7            8.0          19.6        50.8
## 4                1.0              0.2            3.8           7.1        21.3
## 5                4.9              0.6            7.3          11.4        21.5
## 6                1.7              0.7            4.1           6.3        22.8
##   women_alcohol men_alcohol
## 1          29.6        64.5
## 2           5.1        45.3
## 3           1.7        32.8
## 4           0.6        28.3
## 5           0.8        32.3
## 6           1.3        30.2

#2 # Check the structure of the dataset

# Print a compact summary of df: one line per column with its type and
# leading values.
df %>% str()
## 'data.frame':    706 obs. of  109 variables:
##  $ district             : chr  "Nicobars" "North & Middle Andaman " "South Andaman " "Srikakulam " ...
##  $ state                : chr  "Andaman & Nicobar Islands" "Andaman & Nicobar Islands" "Andaman & Nicobar Islands" "Andhra Pradesh" ...
##  $ hh_surveyed          : num  882 874 868 874 902 869 888 884 865 851 ...
##  $ women_surveyed       : num  764 789 844 780 853 818 824 841 820 807 ...
##  $ men_surveyed         : num  125 108 134 100 134 112 105 122 119 93 ...
##  $ female_schooling     : num  78 82.7 84.7 60 56 66.8 75.4 75.4 74 64.9 ...
##  $ pop_under_15         : num  23 19.8 21 20.7 20.6 21.4 20.5 21.5 20.4 22.4 ...
##  $ sex_ratio_total      : num  973 950 967 1140 1114 ...
##  $ sex_ratio_birth      : chr  "927 " "844 " "935 " "1163 " ...
##  $ child_birth_reg      : num  98 100 96.5 95 95.4 90.5 93 93.5 96.4 92.3 ...
##  $ death_reg            : chr  "83.2 " "(92.6)" "92.2 " "71.0 " ...
##  $ hh_electricity       : num  97.9 93.2 99.6 99.9 99.5 99.6 98.8 99.3 99.6 99.2 ...
##  $ hh_water             : num  98.8 92.2 97.9 87.7 93.1 91.8 97.9 99.1 94.4 99.3 ...
##  $ hh_sanitation        : num  83.5 86.4 89.3 71.6 61.7 77.8 77.7 80.8 79.1 83.4 ...
##  $ hh_cooking_fuel      : num  56.9 61.3 91.9 74.7 60.3 72.9 80.3 86.8 89.8 91.7 ...
##  $ hh_iodized_salt      : num  99.4 99.9 99.7 76.5 85 82.2 81.2 83.4 87.5 85.8 ...
##  $ hh_insurance         : num  2.7 2.1 1.2 75.6 76.7 64.9 66.4 67.6 68.1 71.1 ...
##  $ child_preprimary     : chr  "(29.5)" "(30.1)" "(50.8)" "(0.0)" ...
##  $ women_literate       : num  87.5 84 86.7 64.3 58.3 69.5 77.9 77 76.9 68.5 ...
##  $ women_10yr_schooling : num  53.5 41 57.5 42.5 37.6 46 43.2 46.5 46.2 32.6 ...
##  $ women_early_marriage : chr  "11.4 " "15.4 " "17.1 " "25.4 " ...
##  $ births_3rd_order     : chr  "0.0 " "1.5 " "0.5 " "0.0 " ...
##  $ teen_pregnancy       : chr  "1.8 " "3.8 " "2.8 " "5.5 " ...
##  $ menstrual_hygiene    : num  100 100 98.2 78 72.5 85.7 71 84.4 92.6 88 ...
##  $ fp_any               : num  65.3 84.1 57.1 72.3 71.2 68 66.3 77.8 79.1 73.3 ...
##  $ fp_modern            : num  57.2 73.1 50.5 72.2 71.2 67.7 66.3 77.2 78.1 73.2 ...
##  $ fp_female_ster       : num  46.4 48.3 34 71 69.3 64.9 64.1 74.5 76.5 72.9 ...
##  $ fp_male_ster         : num  0 0.6 0 0.3 1.3 1.5 0.9 0.7 1 0 ...
##  $ fp_iud               : num  2.7 6.4 2.8 0.6 0 0.3 0.1 0.6 0 0.2 ...
##  $ fp_pill              : num  2 7.8 1.8 0 0 0 0.3 0.4 0 0 ...
##  $ fp_condom            : num  4.9 9.3 10.6 0.3 0.6 0.8 1.1 0.6 0.4 0.1 ...
##  $ fp_injectable        : num  1.2 0 0.3 0 0 0 0 0.4 0 0 ...
##  $ fp_unmet_total       : num  9.5 5.8 17.6 5.7 6.7 4.8 8 3 2.5 3.2 ...
##  $ fp_unmet_spacing     : num  3.3 1.3 8.6 3.6 4.7 2.4 4.4 1.8 1.4 1.7 ...
##  $ fp_hw_talk           : num  40.4 23.2 31.2 16 21.1 15.2 12.5 12.5 16.1 16.6 ...
##  $ fp_side_effects_info : chr  "49.4 " "83.2 " "88.2 " "45.8 " ...
##  $ anc_1st_tri          : chr  "62.8 " "74.5 " "79.4 " "79.7 " ...
##  $ anc_4plus            : chr  "71.7 " "79.2 " "85.9 " "78.4 " ...
##  $ mother_tetanus       : chr  "78.0 " "91.1 " "92.1 " "94.4 " ...
##  $ ifa_100d             : chr  "72.6 " "83.7 " "81.0 " "67.5 " ...
##  $ ifa_180d             : chr  "43.9 " "24.1 " "61.9 " "35.3 " ...
##  $ mcp_card             : chr  "97.9 " "99.2 " "98.9 " "100.0 " ...
##  $ pnc_mother_2d        : chr  "85.1 " "92.5 " "88.1 " "90.8 " ...
##  $ delivery_cost_public : chr  "2278 " "1904 " "3460 " "3479 " ...
##  $ home_birth_checkup   : chr  "*" "*" "*" "*" ...
##  $ pnc_child_2d         : chr  "92.5 " "94.3 " "89.8 " "97.7 " ...
##  $ inst_births          : num  97.8 97.7 99.5 97.9 99 95.3 96.6 98.7 98.9 98.6 ...
##  $ inst_births_public   : num  96.7 95 83.8 52.2 70.6 69.3 46 48.8 40.1 49.6 ...
##  $ home_birth_skilled   : num  0.8 0.7 0 0.5 0.5 0 2.2 0.7 0.5 0.9 ...
##  $ birth_skilled_total  : num  98.6 98.3 96.9 96.4 97.6 94.4 89.9 98.5 98 95.5 ...
##  $ birth_c_section      : num  11.5 12.9 37.1 57 41.3 26.5 52.2 55.7 66.1 53.8 ...
##  $ birth_c_sec_private  : chr  "*" "*" "(79.1)" "73.8 " ...
##  $ birth_c_sec_public   : chr  "10.7 " "11.4 " "29.6 " "44.5 " ...
##  $ vacc_full_all        : chr  "(64.2)" "*" "(76.3)" "(82.8)" ...
##  $ vacc_full_card       : chr  "(94.1)" "*" "(96.6)" "(100.0)" ...
##  $ vacc_bcg             : chr  "(80.4)" "*" "(100.0)" "(93.3)" ...
##  $ vacc_polio3          : chr  "(69.1)" "*" "(79.0)" "(82.8)" ...
##  $ vacc_penta_dpt3      : chr  "(71.9)" "*" "(94.8)" "(89.7)" ...
##  $ vacc_measles1        : chr  "(67.3)" "*" "(81.7)" "(93.3)" ...
##  $ vacc_measles2        : chr  "(20.7)" "*" "(33.7)" "(34.9)" ...
##  $ vacc_rotavirus3      : chr  "(3.1)" "*" "(0.0)" "(74.8)" ...
##  $ vacc_penta_hepb3     : chr  "(68.6)" "*" "(85.3)" "(89.7)" ...
##  $ vacc_vita            : chr  "94.9 " "(89.6)" "84.0 " "69.6 " ...
##  $ vacc_facility_public : chr  "(100.0)" "*" "(93.1)" "(97.0)" ...
##  $ vacc_facility_private: chr  "(0.0)" "*" "(4.3)" "(3.0)" ...
##  $ child_diarrhea_prev  : num  5.7 4.5 6 11.9 7.5 8.1 13.3 2.7 7.8 10 ...
##  $ child_diarrhea_ors   : chr  "*" "*" "*" "*" ...
##  $ child_diarrhea_zinc  : chr  "*" "*" "*" "*" ...
##  $ child_diarrhea_hfac  : chr  "*" "*" "*" "*" ...
##  $ child_ari_prev       : num  1.8 7 0 1.3 1.4 2 2.2 1 2.4 1 ...
##  $ child_ari_hfac       : chr  "(85.7)" "*" "(77.3)" "(79.7)" ...
##  $ bf_early             : chr  "55.4 " "27.3 " "51.1 " "42.8 " ...
##  $ bf_exclusive_6m      : chr  "*" "*" "*" "*" ...
##  $ bf_solid_6_8m        : chr  "*" "*" "*" "*" ...
##  $ diet_adequate_bf     : chr  "(19.4)" "(6.5)" "(22.3)" "(14.0)" ...
##  $ diet_adequate_nonbf  : chr  "*" "*" "*" "*" ...
##  $ diet_adequate_total  : chr  "(18.7)" "(5.9)" "23.5 " "16.1 " ...
##  $ stunting             : chr  "21.6 " "27.0 " "21.1 " "19.7 " ...
##  $ wasting              : chr  "15.7 " "27.0 " "12.6 " "19.5 " ...
##  $ severe_wasting       : chr  "7.8 " "8.3 " "3.5 " "7.4 " ...
##  $ underweight          : chr  "24.6 " "42.8 " "17.4 " "21.4 " ...
##  $ child_overweight     : chr  "1.5 " "0.8 " "7.2 " "4.5 " ...
##  $ women_bmi_low        : num  8.2 8.6 10 13.8 16.9 17.4 10.2 10.1 10.5 9.6 ...
##  $ women_obese          : num  39.1 35.9 39 27.2 28.8 23.8 44.4 45.3 40.6 46.4 ...
##  $ women_high_whr       : num  62.5 79.3 78.2 54 58 58 49.2 51.6 53.5 53.4 ...
##  $ child_anaemia        : chr  "37.7 " "30.4 " "43.4 " "59.6 " ...
##  $ women_nonpreg_anaemia: num  38.4 62.5 57.6 62.8 64.6 58.6 63.2 63.1 60.4 59.8 ...
##  $ women_preg_anaemia   : chr  "*" "*" "*" "*" ...
##  $ women_total_anaemia  : num  38.3 62.1 57.7 62.6 64 58 63 63 60.3 59.5 ...
##  $ teen_anaemia         : chr  "48.0 " "47.8 " "43.2 " "59.2 " ...
##  $ women_sugar_high     : num  7.4 7.2 7.5 8.2 6.2 6.1 7.5 7.3 7.9 7.8 ...
##  $ women_sugar_vhigh    : num  3.9 6.4 9.5 7.8 7 8.6 12.7 13.1 13.4 13 ...
##  $ women_sugar_elevated : num  13.1 16.7 18.4 17.4 14.3 17 21.7 23.8 23.3 22.7 ...
##  $ men_sugar_high       : num  9.6 9.1 9.3 6.8 5.8 7.3 9.2 7.2 8.5 10.7 ...
##  $ men_sugar_vhigh      : num  4.4 6.9 7.8 8.6 7.5 8.5 15.5 10.4 11.9 13.4 ...
##  $ men_sugar_elevated   : num  15.4 18.3 18.1 17.6 14.5 18.2 27.6 18.9 22.5 25.9 ...
##  $ women_bp_mild        : num  23.2 18.4 12.7 12.8 12.9 12.1 13 14.7 13.4 14.7 ...
##  $ women_bp_mod_sev     : num  8.5 4 4.9 5.9 6.6 5.9 6.6 6.1 4.3 5.9 ...
##  $ women_bp_elevated    : num  35.4 27.4 23 22.1 25.2 23.9 29 28.8 24 25.8 ...
##   [list output truncated]

#3 # Convert all columns except ‘district’ and ‘state’ to numeric, removing any non-numeric characters

# Convert every column except `district` and `state` to numeric (double).
#
# Some columns were read as text because values appear wrapped in
# parentheses, e.g. "(64.2)", or as a bare "*". Stripping "(", ")" and "*"
# and trimming whitespace leaves either a plain number or an empty string;
# as.numeric() turns the empty string into NA.
#
# Columns that are already double are skipped: pushing them through the
# string cleaner would be wasted work and a needless numeric -> text ->
# numeric round-trip. Any other non-double column (character, integer,
# logical) is still converted.
df <- df %>%
  mutate(across(
    where(~ !is.double(.x)) & !c(district, state),
    ~ as.numeric(str_trim(str_remove_all(.x, "[()*]")))
  ))

# Re-inspect the structure to confirm the converted column types.
str(df)
## 'data.frame':    706 obs. of  109 variables:
##  $ district             : chr  "Nicobars" "North & Middle Andaman " "South Andaman " "Srikakulam " ...
##  $ state                : chr  "Andaman & Nicobar Islands" "Andaman & Nicobar Islands" "Andaman & Nicobar Islands" "Andhra Pradesh" ...
##  $ hh_surveyed          : num  882 874 868 874 902 869 888 884 865 851 ...
##  $ women_surveyed       : num  764 789 844 780 853 818 824 841 820 807 ...
##  $ men_surveyed         : num  125 108 134 100 134 112 105 122 119 93 ...
##  $ female_schooling     : num  78 82.7 84.7 60 56 66.8 75.4 75.4 74 64.9 ...
##  $ pop_under_15         : num  23 19.8 21 20.7 20.6 21.4 20.5 21.5 20.4 22.4 ...
##  $ sex_ratio_total      : num  973 950 967 1140 1114 ...
##  $ sex_ratio_birth      : num  927 844 935 1163 898 ...
##  $ child_birth_reg      : num  98 100 96.5 95 95.4 90.5 93 93.5 96.4 92.3 ...
##  $ death_reg            : num  83.2 92.6 92.2 71 81.7 71.3 68.2 90.1 86.3 82.5 ...
##  $ hh_electricity       : num  97.9 93.2 99.6 99.9 99.5 99.6 98.8 99.3 99.6 99.2 ...
##  $ hh_water             : num  98.8 92.2 97.9 87.7 93.1 91.8 97.9 99.1 94.4 99.3 ...
##  $ hh_sanitation        : num  83.5 86.4 89.3 71.6 61.7 77.8 77.7 80.8 79.1 83.4 ...
##  $ hh_cooking_fuel      : num  56.9 61.3 91.9 74.7 60.3 72.9 80.3 86.8 89.8 91.7 ...
##  $ hh_iodized_salt      : num  99.4 99.9 99.7 76.5 85 82.2 81.2 83.4 87.5 85.8 ...
##  $ hh_insurance         : num  2.7 2.1 1.2 75.6 76.7 64.9 66.4 67.6 68.1 71.1 ...
##  $ child_preprimary     : num  29.5 30.1 50.8 0 25.2 5 11.2 2.7 15.1 7.6 ...
##  $ women_literate       : num  87.5 84 86.7 64.3 58.3 69.5 77.9 77 76.9 68.5 ...
##  $ women_10yr_schooling : num  53.5 41 57.5 42.5 37.6 46 43.2 46.5 46.2 32.6 ...
##  $ women_early_marriage : num  11.4 15.4 17.1 25.4 33.7 25.4 26 22.1 25.3 35.4 ...
##  $ births_3rd_order     : num  0 1.5 0.5 0 2.2 0 1.3 0.5 0 1 ...
##  $ teen_pregnancy       : num  1.8 3.8 2.8 5.5 12.7 9.5 6.2 13.4 9.6 20.7 ...
##  $ menstrual_hygiene    : num  100 100 98.2 78 72.5 85.7 71 84.4 92.6 88 ...
##  $ fp_any               : num  65.3 84.1 57.1 72.3 71.2 68 66.3 77.8 79.1 73.3 ...
##  $ fp_modern            : num  57.2 73.1 50.5 72.2 71.2 67.7 66.3 77.2 78.1 73.2 ...
##  $ fp_female_ster       : num  46.4 48.3 34 71 69.3 64.9 64.1 74.5 76.5 72.9 ...
##  $ fp_male_ster         : num  0 0.6 0 0.3 1.3 1.5 0.9 0.7 1 0 ...
##  $ fp_iud               : num  2.7 6.4 2.8 0.6 0 0.3 0.1 0.6 0 0.2 ...
##  $ fp_pill              : num  2 7.8 1.8 0 0 0 0.3 0.4 0 0 ...
##  $ fp_condom            : num  4.9 9.3 10.6 0.3 0.6 0.8 1.1 0.6 0.4 0.1 ...
##  $ fp_injectable        : num  1.2 0 0.3 0 0 0 0 0.4 0 0 ...
##  $ fp_unmet_total       : num  9.5 5.8 17.6 5.7 6.7 4.8 8 3 2.5 3.2 ...
##  $ fp_unmet_spacing     : num  3.3 1.3 8.6 3.6 4.7 2.4 4.4 1.8 1.4 1.7 ...
##  $ fp_hw_talk           : num  40.4 23.2 31.2 16 21.1 15.2 12.5 12.5 16.1 16.6 ...
##  $ fp_side_effects_info : num  49.4 83.2 88.2 45.8 36.4 35.3 32.4 28.8 27.7 28.2 ...
##  $ anc_1st_tri          : num  62.8 74.5 79.4 79.7 76.1 79.4 76.3 82.2 81.5 89.4 ...
##  $ anc_4plus            : num  71.7 79.2 85.9 78.4 71.4 58.6 51 62.7 73.3 62.5 ...
##  $ mother_tetanus       : num  78 91.1 92.1 94.4 91.3 88 87 95.3 95.8 90.3 ...
##  $ ifa_100d             : num  72.6 83.7 81 67.5 59.6 75 62.7 82.8 56.2 75.1 ...
##  $ ifa_180d             : num  43.9 24.1 61.9 35.3 32.4 40.1 32.3 43.7 39.2 42.5 ...
##  $ mcp_card             : num  97.9 99.2 98.9 100 98.8 97 91.4 100 96.4 96 ...
##  $ pnc_mother_2d        : num  85.1 92.5 88.1 90.8 83.9 84.8 91.9 94.8 88.3 94 ...
##  $ delivery_cost_public : num  2278 1904 3460 3479 1931 ...
##  $ home_birth_checkup   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ pnc_child_2d         : num  92.5 94.3 89.8 97.7 89.2 90.9 93.5 96.4 90.3 97.2 ...
##  $ inst_births          : num  97.8 97.7 99.5 97.9 99 95.3 96.6 98.7 98.9 98.6 ...
##  $ inst_births_public   : num  96.7 95 83.8 52.2 70.6 69.3 46 48.8 40.1 49.6 ...
##  $ home_birth_skilled   : num  0.8 0.7 0 0.5 0.5 0 2.2 0.7 0.5 0.9 ...
##  $ birth_skilled_total  : num  98.6 98.3 96.9 96.4 97.6 94.4 89.9 98.5 98 95.5 ...
##  $ birth_c_section      : num  11.5 12.9 37.1 57 41.3 26.5 52.2 55.7 66.1 53.8 ...
##  $ birth_c_sec_private  : num  NA NA 79.1 73.8 70.3 57.2 72 72 79.7 72.2 ...
##  $ birth_c_sec_public   : num  10.7 11.4 29.6 44.5 30.3 16.8 34.1 40.4 47.9 37 ...
##  $ vacc_full_all        : num  64.2 NA 76.3 82.8 76.8 76.5 59 80 90 76.2 ...
##  $ vacc_full_card       : num  94.1 NA 96.6 100 NA 93.5 67.3 87 97.2 100 ...
##  $ vacc_bcg             : num  80.4 NA 100 93.3 100 97.9 92.7 94.8 93.9 97.6 ...
##  $ vacc_polio3          : num  69.1 NA 79 82.8 76.8 76.5 66.3 80 90 76.2 ...
##  $ vacc_penta_dpt3      : num  71.9 NA 94.8 89.7 90.3 90.5 83.9 89 93.9 97.6 ...
##  $ vacc_measles1        : num  67.3 NA 81.7 93.3 92.5 95.1 76.7 91.7 90 92.5 ...
##  $ vacc_measles2        : num  20.7 NA 33.7 34.9 35 45.1 32.5 26.3 33.8 39.6 ...
##  $ vacc_rotavirus3      : num  3.1 NA 0 74.8 77.3 72.9 77.8 77.7 62.1 87.8 ...
##  $ vacc_penta_hepb3     : num  68.6 NA 85.3 89.7 83.6 79.6 86.7 89 93.9 97.6 ...
##  $ vacc_vita            : num  94.9 89.6 84 69.6 85.9 91.3 81.4 73.9 81 80.2 ...
##  $ vacc_facility_public : num  100 NA 93.1 97 100 83.8 85.3 100 100 97.4 ...
##  $ vacc_facility_private: num  0 NA 4.3 3 0 9.7 7.5 0 0 2.6 ...
##  $ child_diarrhea_prev  : num  5.7 4.5 6 11.9 7.5 8.1 13.3 2.7 7.8 10 ...
##  $ child_diarrhea_ors   : num  NA NA NA NA NA NA 72.9 NA NA NA ...
##  $ child_diarrhea_zinc  : num  NA NA NA NA NA NA 23 NA NA NA ...
##  $ child_diarrhea_hfac  : num  NA NA NA NA NA NA 69.4 NA NA NA ...
##  $ child_ari_prev       : num  1.8 7 0 1.3 1.4 2 2.2 1 2.4 1 ...
##  $ child_ari_hfac       : num  85.7 NA 77.3 79.7 83.5 72.3 63.7 71.9 45.8 NA ...
##  $ bf_early             : num  55.4 27.3 51.1 42.8 55.6 64.3 51.3 32.8 29.3 69.3 ...
##  $ bf_exclusive_6m      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ bf_solid_6_8m        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ diet_adequate_bf     : num  19.4 6.5 22.3 14 2.5 6.9 3 4.9 26.1 8.6 ...
##  $ diet_adequate_nonbf  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ diet_adequate_total  : num  18.7 5.9 23.5 16.1 1.8 11.8 10.2 3.5 23.4 11.2 ...
##  $ stunting             : num  21.6 27 21.1 19.7 36.4 31 23.1 31.4 29.8 23.8 ...
##  $ wasting              : num  15.7 27 12.6 19.5 19.2 21.5 14.3 11.7 14.3 17.8 ...
##  $ severe_wasting       : num  7.8 8.3 3.5 7.4 8.3 11.2 3.8 4.7 5.2 8.1 ...
##  $ underweight          : num  24.6 42.8 17.4 21.4 32.2 33.5 22.4 22.5 21.1 26.9 ...
##  $ child_overweight     : num  1.5 0.8 7.2 4.5 4.7 4.8 3.4 5.2 4.2 3.6 ...
##  $ women_bmi_low        : num  8.2 8.6 10 13.8 16.9 17.4 10.2 10.1 10.5 9.6 ...
##  $ women_obese          : num  39.1 35.9 39 27.2 28.8 23.8 44.4 45.3 40.6 46.4 ...
##  $ women_high_whr       : num  62.5 79.3 78.2 54 58 58 49.2 51.6 53.5 53.4 ...
##  $ child_anaemia        : num  37.7 30.4 43.4 59.6 66.7 72.6 66.8 62.3 65.7 59.3 ...
##  $ women_nonpreg_anaemia: num  38.4 62.5 57.6 62.8 64.6 58.6 63.2 63.1 60.4 59.8 ...
##  $ women_preg_anaemia   : num  NA NA NA NA NA NA NA NA NA 51.9 ...
##  $ women_total_anaemia  : num  38.3 62.1 57.7 62.6 64 58 63 63 60.3 59.5 ...
##  $ teen_anaemia         : num  48 47.8 43.2 59.2 73.9 58.9 65.2 66.8 59 54.6 ...
##  $ women_sugar_high     : num  7.4 7.2 7.5 8.2 6.2 6.1 7.5 7.3 7.9 7.8 ...
##  $ women_sugar_vhigh    : num  3.9 6.4 9.5 7.8 7 8.6 12.7 13.1 13.4 13 ...
##  $ women_sugar_elevated : num  13.1 16.7 18.4 17.4 14.3 17 21.7 23.8 23.3 22.7 ...
##  $ men_sugar_high       : num  9.6 9.1 9.3 6.8 5.8 7.3 9.2 7.2 8.5 10.7 ...
##  $ men_sugar_vhigh      : num  4.4 6.9 7.8 8.6 7.5 8.5 15.5 10.4 11.9 13.4 ...
##  $ men_sugar_elevated   : num  15.4 18.3 18.1 17.6 14.5 18.2 27.6 18.9 22.5 25.9 ...
##  $ women_bp_mild        : num  23.2 18.4 12.7 12.8 12.9 12.1 13 14.7 13.4 14.7 ...
##  $ women_bp_mod_sev     : num  8.5 4 4.9 5.9 6.6 5.9 6.6 6.1 4.3 5.9 ...
##  $ women_bp_elevated    : num  35.4 27.4 23 22.1 25.2 23.9 29 28.8 24 25.8 ...
##   [list output truncated]
# Open the converted data frame in the data viewer, then print its first
# six rows (the default row count for head()).
view(df)
head(df, n = 6L)
##                  district                     state hh_surveyed women_surveyed
## 1                Nicobars Andaman & Nicobar Islands         882            764
## 2 North & Middle Andaman  Andaman & Nicobar Islands         874            789
## 3          South Andaman  Andaman & Nicobar Islands         868            844
## 4             Srikakulam             Andhra Pradesh         874            780
## 5           Vizianagaram             Andhra Pradesh         902            853
## 6          Visakhapatnam             Andhra Pradesh         869            818
##   men_surveyed female_schooling pop_under_15 sex_ratio_total sex_ratio_birth
## 1          125             78.0         23.0             973             927
## 2          108             82.7         19.8             950             844
## 3          134             84.7         21.0             967             935
## 4          100             60.0         20.7            1140            1163
## 5          134             56.0         20.6            1114             898
## 6          112             66.8         21.4            1066             974
##   child_birth_reg death_reg hh_electricity hh_water hh_sanitation
## 1            98.0      83.2           97.9     98.8          83.5
## 2           100.0      92.6           93.2     92.2          86.4
## 3            96.5      92.2           99.6     97.9          89.3
## 4            95.0      71.0           99.9     87.7          71.6
## 5            95.4      81.7           99.5     93.1          61.7
## 6            90.5      71.3           99.6     91.8          77.8
##   hh_cooking_fuel hh_iodized_salt hh_insurance child_preprimary women_literate
## 1            56.9            99.4          2.7             29.5           87.5
## 2            61.3            99.9          2.1             30.1           84.0
## 3            91.9            99.7          1.2             50.8           86.7
## 4            74.7            76.5         75.6              0.0           64.3
## 5            60.3            85.0         76.7             25.2           58.3
## 6            72.9            82.2         64.9              5.0           69.5
##   women_10yr_schooling women_early_marriage births_3rd_order teen_pregnancy
## 1                 53.5                 11.4              0.0            1.8
## 2                 41.0                 15.4              1.5            3.8
## 3                 57.5                 17.1              0.5            2.8
## 4                 42.5                 25.4              0.0            5.5
## 5                 37.6                 33.7              2.2           12.7
## 6                 46.0                 25.4              0.0            9.5
##   menstrual_hygiene fp_any fp_modern fp_female_ster fp_male_ster fp_iud fp_pill
## 1             100.0   65.3      57.2           46.4          0.0    2.7     2.0
## 2             100.0   84.1      73.1           48.3          0.6    6.4     7.8
## 3              98.2   57.1      50.5           34.0          0.0    2.8     1.8
## 4              78.0   72.3      72.2           71.0          0.3    0.6     0.0
## 5              72.5   71.2      71.2           69.3          1.3    0.0     0.0
## 6              85.7   68.0      67.7           64.9          1.5    0.3     0.0
##   fp_condom fp_injectable fp_unmet_total fp_unmet_spacing fp_hw_talk
## 1       4.9           1.2            9.5              3.3       40.4
## 2       9.3           0.0            5.8              1.3       23.2
## 3      10.6           0.3           17.6              8.6       31.2
## 4       0.3           0.0            5.7              3.6       16.0
## 5       0.6           0.0            6.7              4.7       21.1
## 6       0.8           0.0            4.8              2.4       15.2
##   fp_side_effects_info anc_1st_tri anc_4plus mother_tetanus ifa_100d ifa_180d
## 1                 49.4        62.8      71.7           78.0     72.6     43.9
## 2                 83.2        74.5      79.2           91.1     83.7     24.1
## 3                 88.2        79.4      85.9           92.1     81.0     61.9
## 4                 45.8        79.7      78.4           94.4     67.5     35.3
## 5                 36.4        76.1      71.4           91.3     59.6     32.4
## 6                 35.3        79.4      58.6           88.0     75.0     40.1
##   mcp_card pnc_mother_2d delivery_cost_public home_birth_checkup pnc_child_2d
## 1     97.9          85.1                 2278                 NA         92.5
## 2     99.2          92.5                 1904                 NA         94.3
## 3     98.9          88.1                 3460                 NA         89.8
## 4    100.0          90.8                 3479                 NA         97.7
## 5     98.8          83.9                 1931                 NA         89.2
## 6     97.0          84.8                 2200                 NA         90.9
##   inst_births inst_births_public home_birth_skilled birth_skilled_total
## 1        97.8               96.7                0.8                98.6
## 2        97.7               95.0                0.7                98.3
## 3        99.5               83.8                0.0                96.9
## 4        97.9               52.2                0.5                96.4
## 5        99.0               70.6                0.5                97.6
## 6        95.3               69.3                0.0                94.4
##   birth_c_section birth_c_sec_private birth_c_sec_public vacc_full_all
## 1            11.5                  NA               10.7          64.2
## 2            12.9                  NA               11.4            NA
## 3            37.1                79.1               29.6          76.3
## 4            57.0                73.8               44.5          82.8
## 5            41.3                70.3               30.3          76.8
## 6            26.5                57.2               16.8          76.5
##   vacc_full_card vacc_bcg vacc_polio3 vacc_penta_dpt3 vacc_measles1
## 1           94.1     80.4        69.1            71.9          67.3
## 2             NA       NA          NA              NA            NA
## 3           96.6    100.0        79.0            94.8          81.7
## 4          100.0     93.3        82.8            89.7          93.3
## 5             NA    100.0        76.8            90.3          92.5
## 6           93.5     97.9        76.5            90.5          95.1
##   vacc_measles2 vacc_rotavirus3 vacc_penta_hepb3 vacc_vita vacc_facility_public
## 1          20.7             3.1             68.6      94.9                100.0
## 2            NA              NA               NA      89.6                   NA
## 3          33.7             0.0             85.3      84.0                 93.1
## 4          34.9            74.8             89.7      69.6                 97.0
## 5          35.0            77.3             83.6      85.9                100.0
## 6          45.1            72.9             79.6      91.3                 83.8
##   vacc_facility_private child_diarrhea_prev child_diarrhea_ors
## 1                   0.0                 5.7                 NA
## 2                    NA                 4.5                 NA
## 3                   4.3                 6.0                 NA
## 4                   3.0                11.9                 NA
## 5                   0.0                 7.5                 NA
## 6                   9.7                 8.1                 NA
##   child_diarrhea_zinc child_diarrhea_hfac child_ari_prev child_ari_hfac
## 1                  NA                  NA            1.8           85.7
## 2                  NA                  NA            7.0             NA
## 3                  NA                  NA            0.0           77.3
## 4                  NA                  NA            1.3           79.7
## 5                  NA                  NA            1.4           83.5
## 6                  NA                  NA            2.0           72.3
##   bf_early bf_exclusive_6m bf_solid_6_8m diet_adequate_bf diet_adequate_nonbf
## 1     55.4              NA            NA             19.4                  NA
## 2     27.3              NA            NA              6.5                  NA
## 3     51.1              NA            NA             22.3                  NA
## 4     42.8              NA            NA             14.0                  NA
## 5     55.6              NA            NA              2.5                  NA
## 6     64.3              NA            NA              6.9                  NA
##   diet_adequate_total stunting wasting severe_wasting underweight
## 1                18.7     21.6    15.7            7.8        24.6
## 2                 5.9     27.0    27.0            8.3        42.8
## 3                23.5     21.1    12.6            3.5        17.4
## 4                16.1     19.7    19.5            7.4        21.4
## 5                 1.8     36.4    19.2            8.3        32.2
## 6                11.8     31.0    21.5           11.2        33.5
##   child_overweight women_bmi_low women_obese women_high_whr child_anaemia
## 1              1.5           8.2        39.1           62.5          37.7
## 2              0.8           8.6        35.9           79.3          30.4
## 3              7.2          10.0        39.0           78.2          43.4
## 4              4.5          13.8        27.2           54.0          59.6
## 5              4.7          16.9        28.8           58.0          66.7
## 6              4.8          17.4        23.8           58.0          72.6
##   women_nonpreg_anaemia women_preg_anaemia women_total_anaemia teen_anaemia
## 1                  38.4                 NA                38.3         48.0
## 2                  62.5                 NA                62.1         47.8
## 3                  57.6                 NA                57.7         43.2
## 4                  62.8                 NA                62.6         59.2
## 5                  64.6                 NA                64.0         73.9
## 6                  58.6                 NA                58.0         58.9
##   women_sugar_high women_sugar_vhigh women_sugar_elevated men_sugar_high
## 1              7.4               3.9                 13.1            9.6
## 2              7.2               6.4                 16.7            9.1
## 3              7.5               9.5                 18.4            9.3
## 4              8.2               7.8                 17.4            6.8
## 5              6.2               7.0                 14.3            5.8
## 6              6.1               8.6                 17.0            7.3
##   men_sugar_vhigh men_sugar_elevated women_bp_mild women_bp_mod_sev
## 1             4.4               15.4          23.2              8.5
## 2             6.9               18.3          18.4              4.0
## 3             7.8               18.1          12.7              4.9
## 4             8.6               17.6          12.8              5.9
## 5             7.5               14.5          12.9              6.6
## 6             8.5               18.2          12.1              5.9
##   women_bp_elevated men_bp_mild men_bp_mod_sev men_bp_elevated
## 1              35.4        32.9           11.1            47.0
## 2              27.4        22.6            6.0            32.2
## 3              23.0        17.9            6.1            26.9
## 4              22.1        14.4            5.5            22.9
## 5              25.2        14.8            6.4            25.1
## 6              23.9        17.0            7.0            29.2
##   screening_cervical screening_breast screening_oral women_tobacco men_tobacco
## 1               13.4             13.2            5.4          63.5        76.8
## 2                1.7              0.3           15.8          46.8        70.5
## 3                1.3              0.7            8.0          19.6        50.8
## 4                1.0              0.2            3.8           7.1        21.3
## 5                4.9              0.6            7.3          11.4        21.5
## 6                1.7              0.7            4.1           6.3        22.8
##   women_alcohol men_alcohol
## 1          29.6        64.5
## 2           5.1        45.3
## 3           1.7        32.8
## 4           0.6        28.3
## 5           0.8        32.3
## 6           1.3        30.2

#4 # Check for missing values in the dataset

# Number of NA cells in each column of df (one named entry per column).
df %>%
  is.na() %>%
  colSums()
##              district                 state           hh_surveyed 
##                     0                     0                     0 
##        women_surveyed          men_surveyed      female_schooling 
##                     0                     0                     0 
##          pop_under_15       sex_ratio_total       sex_ratio_birth 
##                     0                     0                     0 
##       child_birth_reg             death_reg        hh_electricity 
##                     0                     1                     0 
##              hh_water         hh_sanitation       hh_cooking_fuel 
##                     0                     0                     0 
##       hh_iodized_salt          hh_insurance      child_preprimary 
##                     0                     0                     3 
##        women_literate  women_10yr_schooling  women_early_marriage 
##                     0                     0                     0 
##      births_3rd_order        teen_pregnancy     menstrual_hygiene 
##                     1                     0                     0 
##                fp_any             fp_modern        fp_female_ster 
##                     0                     0                     0 
##          fp_male_ster                fp_iud               fp_pill 
##                     0                     0                     0 
##             fp_condom         fp_injectable        fp_unmet_total 
##                     0                     0                     0 
##      fp_unmet_spacing            fp_hw_talk  fp_side_effects_info 
##                     0                     0                     2 
##           anc_1st_tri             anc_4plus        mother_tetanus 
##                     0                     0                     0 
##              ifa_100d              ifa_180d              mcp_card 
##                     0                     0                     0 
##         pnc_mother_2d  delivery_cost_public    home_birth_checkup 
##                     0                     1                   422 
##          pnc_child_2d           inst_births    inst_births_public 
##                     0                     0                     0 
##    home_birth_skilled   birth_skilled_total       birth_c_section 
##                     0                     0                     0 
##   birth_c_sec_private    birth_c_sec_public         vacc_full_all 
##                   150                     0                    13 
##        vacc_full_card              vacc_bcg           vacc_polio3 
##                    22                    13                    13 
##       vacc_penta_dpt3         vacc_measles1         vacc_measles2 
##                    13                    13                    13 
##       vacc_rotavirus3      vacc_penta_hepb3             vacc_vita 
##                    13                    13                     1 
##  vacc_facility_public vacc_facility_private   child_diarrhea_prev 
##                    16                    16                     0 
##    child_diarrhea_ors   child_diarrhea_zinc   child_diarrhea_hfac 
##                   492                   492                   492 
##        child_ari_prev        child_ari_hfac              bf_early 
##                     0                   224                     0 
##       bf_exclusive_6m         bf_solid_6_8m      diet_adequate_bf 
##                   261                   642                     5 
##   diet_adequate_nonbf   diet_adequate_total              stunting 
##                   643                     1                     0 
##               wasting        severe_wasting           underweight 
##                     0                     0                     0 
##      child_overweight         women_bmi_low           women_obese 
##                     0                     0                     0 
##        women_high_whr         child_anaemia women_nonpreg_anaemia 
##                     0                     0                     0 
##    women_preg_anaemia   women_total_anaemia          teen_anaemia 
##                   134                     0                     0 
##      women_sugar_high     women_sugar_vhigh  women_sugar_elevated 
##                     0                     0                     0 
##        men_sugar_high       men_sugar_vhigh    men_sugar_elevated 
##                     0                     0                     0 
##         women_bp_mild      women_bp_mod_sev     women_bp_elevated 
##                     0                     0                     0 
##           men_bp_mild        men_bp_mod_sev       men_bp_elevated 
##                     0                     0                     0 
##    screening_cervical      screening_breast        screening_oral 
##                     0                     0                     0 
##         women_tobacco           men_tobacco         women_alcohol 
##                     0                     0                     0 
##           men_alcohol 
##                     0
# Mean-impute every column other than the two identifier columns
# (district, state): each NA is swapped for that column's mean, which is
# computed from the non-missing values only.
df <- df %>%
  mutate(
    across(
      -c(district, state),
      function(values) coalesce(values, mean(values, na.rm = TRUE))
    )
  )
# Total number of NA cells remaining in df after the fill.
sum(is.na(df))
## [1] 0

#1.1 # Finding Mean Literacy by State: from Highest to Lowest

# Mean of women_literate within each state, sorted from the highest state
# average down to the lowest.
state_literacy <- df %>%
  group_by(state) %>%
  summarise(
    mean_literacy = mean(women_literate, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_literacy))
print(state_literacy)
## # A tibble: 36 × 2
##    state                       mean_literacy
##    <chr>                               <dbl>
##  1 "Kerala"                             97.9
##  2 " Lakshadweep "                      96.5
##  3 "Goa"                                93.1
##  4 "Mizoram"                            91.3
##  5 "Puducherry"                         91.1
##  6 "Himachal Pradesh"                   90.2
##  7 "Sikkim"                             86.9
##  8 "Andaman & Nicobar Islands"          86.1
##  9 "Meghalaya"                          86.0
## 10 "Tamil Nadu"                         85.7
## # ℹ 26 more rows

#1.2 # Finding State-wise Insurance Coverage: from Highest to Lowest

# Mean of hh_insurance within each state, sorted from the highest state
# average down to the lowest.
state_insurance <- df %>%
  group_by(state) %>%
  summarise(
    mean_insurance = mean(hh_insurance, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_insurance))
print(state_insurance)
## # A tibble: 36 × 2
##    state            mean_insurance
##    <chr>                     <dbl>
##  1 "Rajasthan"                88.4
##  2 "Chhattisgarh"             73.2
##  3 "Andhra Pradesh"           70.8
##  4 "Meghalaya"                68.3
##  5 "Tamil Nadu"               67.9
##  6 "Uttarakhand"              67.1
##  7 "Goa"                      66.2
##  8 "Telangana"                62.3
##  9 "Assam"                    61.0
## 10 " Lakshadweep "            60.1
## # ℹ 26 more rows

#1.3 # Identify High Anemia Zones: Top 10 states with the highest average child anemia rates

# Mean of child_anaemia within each state; only the ten states with the
# highest mean are kept, in descending order.
high_anemia_states <- df %>%
  group_by(state) %>%
  summarise(
    mean_child_anemia = mean(child_anaemia, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_child_anemia)) %>%
  slice_head(n = 10)
print("Top 10 High Anemia States (Children):")
## [1] "Top 10 High Anemia States (Children):"
print(high_anemia_states)
## # A tibble: 10 × 2
##    state                                  mean_child_anemia
##    <chr>                                              <dbl>
##  1 Ladakh                                              91.7
##  2 Gujarat                                             79.8
##  3 Dadra and Nagar Haveli & Daman and Diu              73.4
##  4 Jammu & Kashmir                                     73.2
##  5 Rajasthan                                           72.2
##  6 Madhya Pradesh                                      71.7
##  7 Maharastra                                          70.3
##  8 Telangana                                           70.2
##  9 Bihar                                               70.1
## 10 Assam                                               69.8

#1.4 # Correlation Check: Relationship between Clean_Fuel and Literacy_W

# Correlation between hh_cooking_fuel and women_literate using cor()'s
# default method, restricted to rows where both values are present.
correlation_value <- with(
  df,
  cor(hh_cooking_fuel, women_literate, use = "complete.obs")
)
print(paste("Pearson Correlation:", round(correlation_value, digits = 4)))
## [1] "Pearson Correlation: 0.4475"

What is the average literacy rate state-wise?

library(dplyr)

# Keep at most the first 109 columns, then re-apply the column names.
# NOTE(review): new_names is defined outside this section; this assumes it
# has exactly one entry per remaining column. Confirm before re-running.
if (ncol(df) > 109) {
  df <- df[, seq_len(109)]
}
colnames(df) <- new_names

# Per-state literacy profile: mean, row count, minimum and maximum of
# women_literate, sorted by the state mean (highest first).
state_literacy_analysis <- df %>%
  mutate(
    # Parse only when the column is not already numeric. Pushing a numeric
    # column through the digit-stripping regex is a lossy text round trip:
    # it would silently drop minus signs and exponent markers.
    women_literate = if (is.numeric(women_literate)) {
      women_literate
    } else {
      as.numeric(gsub("[^0-9.]", "", women_literate))
    }
  ) %>%
  group_by(state) %>%
  summarize(
    avg_literacy = mean(women_literate, na.rm = TRUE),
    district_count = n(),
    min_literacy = min(women_literate, na.rm = TRUE),
    max_literacy = max(women_literate, na.rm = TRUE)
  ) %>%
  arrange(desc(avg_literacy))

print(state_literacy_analysis)
## # A tibble: 36 × 5
##    state                   avg_literacy district_count min_literacy max_literacy
##    <chr>                          <dbl>          <int>        <dbl>        <dbl>
##  1 "Kerala"                        97.9             14         93.7         99.7
##  2 " Lakshadweep "                 96.5              1         96.5         96.5
##  3 "Goa"                           93.1              2         92.4         93.8
##  4 "Mizoram"                       91.3              8         76           99.7
##  5 "Puducherry"                    91.1              4         83.3         99.7
##  6 "Himachal Pradesh"              90.2             12         84.1         94.8
##  7 "Sikkim"                        86.9              4         81.4         90.5
##  8 "Andaman & Nicobar Isl…         86.1              3         84           87.5
##  9 "Meghalaya"                     86.0             11         76           93.7
## 10 "Tamil Nadu"                    85.7             32         77.4         98  
## # ℹ 26 more rows

#1.5 # Top 25 Sex Ratio Districts: Ranking districts by the Sex Ratio column.

# Order rows by sex_ratio_total (largest first) and keep the first 25.
# NOTE(review): the object is called top_5_* although it holds 25 rows; the
# name is left as-is in case it is referenced elsewhere in the file.
top_5_sex_ratio <- df %>%
  arrange(desc(sex_ratio_total)) %>%
  select(district, state, sex_ratio_total) %>%
  head(n = 25)
print(top_5_sex_ratio)
##             district                                  state sex_ratio_total
## 1               Diu  Dadra and Nagar Haveli & Daman and Diu            1332
## 2            Almora                             Uttarakhand            1331
## 3       Rudraprayag                             Uttarakhand            1242
## 4     Tehri Garhwal                             Uttarakhand            1236
## 5        Pratapgarh                           Uttar Pradesh            1229
## 6         Madhubani                                   Bihar            1227
## 7         Sivaganga                              Tamil Nadu            1223
## 8          Jagitial                               Telangana            1219
## 9        Perambalur                              Tamil Nadu            1210
## 10        Sitamarhi                                   Bihar            1209
## 11           Kannur                                  Kerala            1203
## 12       Kishanganj                                   Bihar            1199
## 13       Dharmapuri                              Tamil Nadu            1192
## 14      Lakshadweep                            Lakshadweep             1187
## 15  Purba Champaran                                   Bihar            1185
## 16 Sant Kabir Nagar                           Uttar Pradesh            1185
## 17           Palamu                               Jharkhand            1183
## 18         Hamirpur                        Himachal Pradesh            1182
## 19   Siddharthnagar                           Uttar Pradesh            1177
## 20   Pathanamthitta                                  Kerala            1174
## 21        Bageshwar                             Uttarakhand            1174
## 22           Kollam                                  Kerala            1171
## 23           Nirmal                               Telangana            1171
## 24           Amethi                           Uttar Pradesh            1170
## 25           Kodagu                               Karnataka            1168

#1.6 # Bottom 15 Immunization Areas: Identifying areas with low vaccination rates.

# Order rows by vacc_full_all (smallest first) and keep the first 15.
# NOTE(review): the object is called bottom_10_* although it holds 15 rows;
# the name is left as-is in case it is referenced elsewhere in the file.
bottom_10_vaccination <- df %>%
  arrange(vacc_full_all) %>%
  select(district, state, vacc_full_all) %>%
  head(n = 15)
print(bottom_10_vaccination)
##               district             state vacc_full_all
## 1            Udalguri              Assam          38.3
## 2              Ukhrul            Manipur          39.4
## 3            Tuensang           Nagaland          39.9
## 4               Wokha           Nagaland          42.8
## 5             Kiphire           Nagaland          42.8
## 6        Banas Kantha            Gujarat          43.5
## 7              Jhansi      Uttar Pradesh          44.5
## 8    North Garo Hills          Meghalaya          47.5
## 9  West Karbi Anglong              Assam          47.9
## 10      South Tripura            Tripura          48.5
## 11         East Siang  Arunachal Pradesh          48.8
## 12   East Khasi Hills          Meghalaya          49.1
## 13          Kokrajhar              Assam          51.1
## 14           Palakkad             Kerala          51.8
## 15           Bahraich      Uttar Pradesh          51.8

#1.7 # Gender Gap in Tobacco: Compare Tobacco_W vs Tobacco_M

# Compare male and female tobacco use row by row:
#   tobacco_ratio = men_tobacco / women_tobacco
#   tobacco_gap   = men_tobacco - women_tobacco
df_tobacco_ratio <- df %>%
  mutate(
    # A zero female rate would make the ratio Inf, and mean(na.rm = TRUE)
    # does not drop Inf, so one such row would make the average infinite and
    # push that row to the top of the ranking. Those rows get NA instead:
    # they are skipped by the mean and sort after every finite ratio.
    tobacco_ratio = if_else(
      women_tobacco > 0,
      men_tobacco / women_tobacco,
      NA_real_
    ),
    tobacco_gap = men_tobacco - women_tobacco
  ) %>%
  select(district, state, women_tobacco, men_tobacco, tobacco_ratio, tobacco_gap)
# Unweighted mean of the per-row ratios.
average_ratio <- mean(df_tobacco_ratio$tobacco_ratio, na.rm = TRUE)
# The 25 rows with the largest ratio.
# NOTE(review): the object is called top_10_* although it holds 25 rows; the
# name is left as-is in case it is referenced elsewhere in the file.
top_10_ratios <- df_tobacco_ratio %>%
  arrange(desc(tobacco_ratio)) %>%
  head(25)
cat("Average National Tobacco Ratio (Men:Women):", round(average_ratio, 2), "\n\n")
## Average National Tobacco Ratio (Men:Women): 8
print(top_10_ratios)
##                      district            state women_tobacco men_tobacco
## 1                   Faridkot            Punjab           0.1        18.6
## 2                   Firozpur            Punjab           0.1        12.6
## 3                 Tarn Taran            Punjab           0.1         6.8
## 4            Fatehgarh Sahib            Punjab           0.2        12.3
## 5                 Kapurthala            Punjab           0.2        11.6
## 6                       Moga            Punjab           0.3        16.9
## 7  Shahid Bhagat Singh Nagar            Punjab           0.2        10.9
## 8                    Muktsar            Punjab           0.4        20.8
## 9                 Hoshiarpur            Punjab           0.2        10.1
## 10                  Hamirpur  Himachal Pradesh           0.7        33.0
## 11                   Barnala            Punjab           0.3        14.0
## 12                   Fazilka            Punjab           0.4        17.5
## 13                     Mansa            Punjab           0.4        16.9
## 14                  Amritsar            Punjab           0.2         8.2
## 15                    Rohtas             Bihar           1.0        40.6
## 16                 Pathankot            Punjab           0.4        15.8
## 17                       Una  Himachal Pradesh           0.8        31.0
## 18                   Patiala            Punjab           0.3        11.2
## 19                     Jammu   Jammu & Kashmir           0.8        25.7
## 20                 Fatehabad           Haryana           0.8        23.5
## 21                    Ambala           Haryana           0.8        22.0
## 22                     Banka             Bihar           1.8        49.4
## 23                    Kangra  Himachal Pradesh           1.2        32.9
## 24                  Ludhiana            Punjab           0.5        13.5
## 25                   Kodarma         Jharkhand           1.4        37.3
##    tobacco_ratio tobacco_gap
## 1      186.00000        18.5
## 2      126.00000        12.5
## 3       68.00000         6.7
## 4       61.50000        12.1
## 5       58.00000        11.4
## 6       56.33333        16.6
## 7       54.50000        10.7
## 8       52.00000        20.4
## 9       50.50000         9.9
## 10      47.14286        32.3
## 11      46.66667        13.7
## 12      43.75000        17.1
## 13      42.25000        16.5
## 14      41.00000         8.0
## 15      40.60000        39.6
## 16      39.50000        15.4
## 17      38.75000        30.2
## 18      37.33333        10.9
## 19      32.12500        24.9
## 20      29.37500        22.7
## 21      27.50000        21.2
## 22      27.44444        47.6
## 23      27.41667        31.7
## 24      27.00000        13.0
## 25      26.64286        35.9

#1.8 # BMI Analysis: Find the average percentage of women with below-normal BMI per state.

# Mean of women_bmi_low within each state, sorted from the highest state
# average down to the lowest.
bmi_analysis <- df %>%
  group_by(state) %>%
  summarise(
    avg_women_bmi_low = mean(women_bmi_low, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_women_bmi_low))
print(bmi_analysis)
## # A tibble: 36 × 2
##    state                                  avg_women_bmi_low
##    <chr>                                              <dbl>
##  1 Jharkhand                                           26.9
##  2 Gujarat                                             26.0
##  3 Bihar                                               25.9
##  4 Chhattisgarh                                        25.0
##  5 Madhya Pradesh                                      23.5
##  6 Dadra and Nagar Haveli & Daman and Diu              21.9
##  7 Odisha                                              21.9
##  8 Maharastra                                          21.7
##  9 Rajasthan                                           20.2
## 10 Telangana                                           20.0
## # ℹ 26 more rows

#1.9 # Alcohol Consumption Ranking: Rank states by the average percentage of men who consume alcohol, with the women's average shown alongside.

# State means of women_alcohol and men_alcohol; the table is ordered by the
# men's average only (highest first).
alcohol_ranking <- df %>%
  group_by(state) %>%
  summarise(
    avg_women_alcohol = mean(women_alcohol, na.rm = TRUE),
    avg_men_alcohol = mean(men_alcohol, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_men_alcohol))
print(alcohol_ranking)
## # A tibble: 36 × 3
##    state                     avg_women_alcohol avg_men_alcohol
##    <chr>                                 <dbl>           <dbl>
##  1 Arunachal Pradesh                    23.9              53.0
##  2 Andaman & Nicobar Islands            12.1              47.5
##  3 Telangana                             7.4              45.5
##  4 Sikkim                               18.2              42.1
##  5 Chhattisgarh                          8.72             37.9
##  6 Goa                                   5.7              37.4
##  7 Jharkhand                             7.49             37.4
##  8 Manipur                               1.09             37.4
##  9 Meghalaya                             1.57             34.5
## 10 Himachal Pradesh                      0.675            33.8
## # ℹ 26 more rows

#1.10 # Urban vs Rural Proxy: Compare districts with high vs low Electricity.

# Label every row by electrification: hh_electricity of 95 or more goes to
# the "urban proxy" group, anything lower to the "rural proxy" group.
df_proxy <- df %>%
  mutate(
    area_proxy = if_else(
      hh_electricity >= 95,
      true = "High Electricity (Urban Proxy)",
      false = "Low Electricity (Rural Proxy)"
    )
  )
# Row count and the mean of five indicators within each proxy group.
urban_rural_comparison <- df_proxy %>%
  group_by(area_proxy) %>%
  summarise(
    count = n(),
    avg_literacy = mean(women_literate, na.rm = TRUE),
    avg_insurance = mean(hh_insurance, na.rm = TRUE),
    avg_stunting = mean(stunting, na.rm = TRUE),
    avg_clean_fuel = mean(hh_cooking_fuel, na.rm = TRUE),
    avg_child_anemia = mean(child_anaemia, na.rm = TRUE),
    .groups = "drop"
  )
print(urban_rural_comparison)
## # A tibble: 2 × 7
##   area_proxy        count avg_literacy avg_insurance avg_stunting avg_clean_fuel
##   <chr>             <int>        <dbl>         <dbl>        <dbl>          <dbl>
## 1 High Electricity…   588         75.8          41.3         32.2           57.6
## 2 Low Electricity …   118         66.9          35.1         40.2           36.7
## # ℹ 1 more variable: avg_child_anemia <dbl>

#Level 5: Feature Engineering & Visualizations #2.1 # 1. Composite Infrastructure Index (CII). The Operation: We aggregate four key household metrics—electricity, improved water, sanitation, and clean fuel—into a single CII score: \[CII = \frac{elect + water + sanit + fuel}{4}\] Why? Instead of looking at four separate graphs, this single feature tells us the overall “Modernization” level of a district. Visualization: A histogram of the CII to see whether development in India is normally distributed or heavily skewed.

# Composite Infrastructure Index: the plain average of the four household
# infrastructure columns, stored on df as `cii`.
df <- df %>%
  mutate(
    cii = (hh_electricity + hh_water + hh_sanitation + hh_cooking_fuel) / 4
  )
# Show the four inputs next to the resulting index for the first 20 rows.
df %>%
  select(
    district,
    state,
    hh_electricity,
    hh_water,
    hh_sanitation,
    hh_cooking_fuel,
    cii
  ) %>%
  head(n = 20)
##                      district                     state hh_electricity hh_water
## 1                    Nicobars Andaman & Nicobar Islands           97.9     98.8
## 2     North & Middle Andaman  Andaman & Nicobar Islands           93.2     92.2
## 3              South Andaman  Andaman & Nicobar Islands           99.6     97.9
## 4                 Srikakulam             Andhra Pradesh           99.9     87.7
## 5               Vizianagaram             Andhra Pradesh           99.5     93.1
## 6              Visakhapatnam             Andhra Pradesh           99.6     91.8
## 7              East Godavari             Andhra Pradesh           98.8     97.9
## 8              West Godavari             Andhra Pradesh           99.3     99.1
## 9                    Krishna             Andhra Pradesh           99.6     94.4
## 10                    Guntur             Andhra Pradesh           99.2     99.3
## 11                  Prakasam             Andhra Pradesh           99.3     98.6
## 12 Sri Potti Sriramulu Nello             Andhra Pradesh           99.1     95.6
## 13                    Y.S.R.             Andhra Pradesh           99.8     99.9
## 14                   Kurnool             Andhra Pradesh           99.9     97.9
## 15                 Anantapur             Andhra Pradesh           99.6     98.8
## 16                  Chittoor             Andhra Pradesh           99.7     98.5
## 17                    Tawang          Arunachal Pradesh           99.7     99.8
## 18               West Kameng          Arunachal Pradesh           97.5     99.8
## 19               East Kameng          Arunachal Pradesh           86.3     93.6
## 20                Papum Pare          Arunachal Pradesh           98.7     92.9
##    hh_sanitation hh_cooking_fuel    cii
## 1           83.5            56.9 84.275
## 2           86.4            61.3 83.275
## 3           89.3            91.9 94.675
## 4           71.6            74.7 83.475
## 5           61.7            60.3 78.650
## 6           77.8            72.9 85.525
## 7           77.7            80.3 88.675
## 8           80.8            86.8 91.500
## 9           79.1            89.8 90.725
## 10          83.4            91.7 93.400
## 11          78.4            90.8 91.775
## 12          81.8            84.6 90.275
## 13          85.4            94.3 94.850
## 14          73.5            78.1 87.350
## 15          71.3            86.4 89.025
## 16          74.6            86.6 89.850
## 17          81.7            88.1 92.325
## 18          88.9            88.9 93.775
## 19          75.6            49.5 76.250
## 20          75.7            81.3 87.150

#2.2 # Literacy Tiering (Categorical Binning): Convert the continuous women_literate (literacy) variable into three categorical “Tiers” (Low, Medium, High) using quantile-based cut points at the 33rd and 66th percentiles.

# Bin women_literate into three ordered labels. The cut points are the 0th,
# 33rd, 66th and 100th percentiles of the column itself; include.lowest keeps
# the minimum value inside the first bin.
df <- df %>%
  mutate(
    literacy_tier = cut(
      women_literate,
      breaks = quantile(
        women_literate,
        probs = c(0, 0.33, 0.66, 1),
        na.rm = TRUE
      ),
      labels = c("Low", "Medium", "High"),
      include.lowest = TRUE
    )
  )

# Show the raw value next to its assigned tier for the first 35 rows.
df %>%
  select(district, state, women_literate, literacy_tier) %>%
  head(n = 35)
##                      district                     state women_literate
## 1                    Nicobars Andaman & Nicobar Islands           87.5
## 2     North & Middle Andaman  Andaman & Nicobar Islands           84.0
## 3              South Andaman  Andaman & Nicobar Islands           86.7
## 4                 Srikakulam             Andhra Pradesh           64.3
## 5               Vizianagaram             Andhra Pradesh           58.3
## 6              Visakhapatnam             Andhra Pradesh           69.5
## 7              East Godavari             Andhra Pradesh           77.9
## 8              West Godavari             Andhra Pradesh           77.0
## 9                    Krishna             Andhra Pradesh           76.9
## 10                    Guntur             Andhra Pradesh           68.5
## 11                  Prakasam             Andhra Pradesh           62.8
## 12 Sri Potti Sriramulu Nello             Andhra Pradesh           70.5
## 13                    Y.S.R.             Andhra Pradesh           63.8
## 14                   Kurnool             Andhra Pradesh           57.0
## 15                 Anantapur             Andhra Pradesh           63.6
## 16                  Chittoor             Andhra Pradesh           69.3
## 17                    Tawang          Arunachal Pradesh           59.1
## 18               West Kameng          Arunachal Pradesh           73.7
## 19               East Kameng          Arunachal Pradesh           62.2
## 20                Papum Pare          Arunachal Pradesh           78.2
## 21           Upper Subansiri          Arunachal Pradesh           64.3
## 22               Upper Siang          Arunachal Pradesh           74.4
## 23                 Changlang          Arunachal Pradesh           77.6
## 24           Lower Subansiri          Arunachal Pradesh           76.8
## 25             Dibang Valley          Arunachal Pradesh           77.8
## 26       Lower Dibang Valley          Arunachal Pradesh           81.6
## 27                     Anjaw          Arunachal Pradesh           67.9
## 28                East Siang          Arunachal Pradesh           83.8
## 29                 Kra Daadi          Arunachal Pradesh           55.3
## 30              Kurung Kumey          Arunachal Pradesh           76.5
## 31                     Lohit          Arunachal Pradesh           73.4
## 32                  Longding          Arunachal Pradesh           71.9
## 33                    Namsai          Arunachal Pradesh           67.1
## 34                     Siang          Arunachal Pradesh           87.4
## 35                     Tirap          Arunachal Pradesh           75.1
##    literacy_tier
## 1           High
## 2           High
## 3           High
## 4            Low
## 5            Low
## 6            Low
## 7         Medium
## 8         Medium
## 9         Medium
## 10           Low
## 11           Low
## 12        Medium
## 13           Low
## 14           Low
## 15           Low
## 16           Low
## 17           Low
## 18        Medium
## 19           Low
## 20        Medium
## 21           Low
## 22        Medium
## 23        Medium
## 24        Medium
## 25        Medium
## 26          High
## 27           Low
## 28          High
## 29           Low
## 30        Medium
## 31        Medium
## 32        Medium
## 33           Low
## 34          High
## 35        Medium

3. Quick summary to check the distribution of the new tiers

# Number of rows that fall in each literacy tier.
table(df[["literacy_tier"]])
## 
##    Low Medium   High 
##    233    233    240

#2.3 # 3. Child Malnutrition “Burden” Score

# Burden score: the plain sum of the stunting, wasting and underweight
# columns, stored on df as `malnutrition_burden`.
df <- df %>%
  mutate(
    malnutrition_burden = stunting + wasting + underweight
  )
# The 35 rows with the highest burden score, with the three components shown.
df %>%
  arrange(desc(malnutrition_burden)) %>%
  select(district, state, stunting, wasting, underweight, malnutrition_burden) %>%
  head(n = 35)
##                district         state stunting wasting underweight
## 1  Pashchimi Singhbhum      Jharkhand     60.6    30.5        62.4
## 2                Dohad        Gujarat     55.3    27.8        53.0
## 3                Arwal          Bihar     45.6    36.8        52.9
## 4         Panch Mahals        Gujarat     47.1    35.7        51.9
## 5            Nandurbar     Maharastra     45.8    30.7        57.2
## 6            The Dangs        Gujarat     37.6    40.9        53.1
## 7                 Tapi        Gujarat     41.7    36.6        51.8
## 8            Karimganj          Assam     29.1    48.0        52.9
## 9            Jehanabad          Bihar     41.3    36.6        51.7
## 10            Adilabad      Telangana     45.7    29.5        52.0
## 11               Banda  Uttar Pradesh     51.0    25.7        49.8
## 12               Pakur      Jharkhand     51.3    23.6        51.4
## 13      Chhota Udaipur        Gujarat     48.6    28.4        48.1
## 14             Buldana     Maharastra     45.0    31.7        47.2
## 15             Aravali        Gujarat     47.1    29.0        47.2
## 16             Narmada        Gujarat     47.2    23.0        52.8
## 17          Aurangabad          Bihar     41.2    32.9        48.7
## 18               Dhule     Maharastra     37.6    38.9        46.0
## 19          Chandrapur     Maharastra     37.3    38.5        46.6
## 20 Saraikela-Kharsawan      Jharkhand     40.0    32.9        48.7
## 21              Araria          Bihar     49.9    23.9        47.8
## 22              Yadgir      Karnataka     57.6    17.7        45.2
## 23              Rohtas          Bihar     40.0    31.8        48.2
## 24             Bijapur   Chhattisgarh     53.8    20.0        46.1
## 25               Banka          Bihar     46.7    26.9        45.8
## 26     Kaimur (Bhabua)          Bihar     44.1    27.3        47.2
## 27           Mahisagar        Gujarat     43.4    26.2        49.0
## 28               Buxar          Bihar     39.6    33.2        45.3
## 29              Koppal      Karnataka     49.1    23.1        45.8
## 30             Nalanda          Bihar     42.6    27.8        46.7
## 31              Purnia          Bihar     43.5    25.8        47.1
## 32             Bhojpur          Bihar     40.7    31.3        44.0
## 33        Nabarangapur         Odisha     44.1    25.2        46.6
## 34             Katihar          Bihar     43.9    23.5        48.1
## 35           Zunheboto       Nagaland     44.0    26.9        44.5
##    malnutrition_burden
## 1                153.5
## 2                136.1
## 3                135.3
## 4                134.7
## 5                133.7
## 6                131.6
## 7                130.1
## 8                130.0
## 9                129.6
## 10               127.2
## 11               126.5
## 12               126.3
## 13               125.1
## 14               123.9
## 15               123.3
## 16               123.0
## 17               122.8
## 18               122.5
## 19               122.4
## 20               121.6
## 21               121.6
## 22               120.5
## 23               120.0
## 24               119.9
## 25               119.4
## 26               118.6
## 27               118.6
## 28               118.1
## 29               118.0
## 30               117.1
## 31               116.4
## 32               116.0
## 33               115.9
## 34               115.5
## 35               115.4
# Correlation (cor()'s default method) between cii and malnutrition_burden,
# restricted to rows where both values are present.
burden_correlation <- with(
  df,
  cor(cii, malnutrition_burden, use = "complete.obs")
)
cat(
  "Correlation between CII and Malnutrition Burden:",
  round(burden_correlation, digits = 4)
)
## Correlation between CII and Malnutrition Burden: -0.4915

#2.4 # 4. Public Health Reliance Ratio

# Public health reliance = inst_births_public / inst_births.
# na_if() turns a zero denominator into NA so that such a row yields NA
# rather than Inf/NaN; an Inf would otherwise sort to the top of the ranking
# below and turn its state's average into Inf.
df <- df %>%
  mutate(
    public_health_reliance = inst_births_public / na_if(inst_births, 0)
  )
cat("Top 35 Districts: Highest Public Health Reliance\n")
## Top 35 Districts: Highest Public Health Reliance
# Highest ratios first; arrange() places NA ratios at the end.
df %>%
  select(district, state, inst_births, inst_births_public, public_health_reliance) %>%
  arrange(desc(public_health_reliance)) %>%
  head(35)
##                    district                     state inst_births
## 1                   Kargil                     Ladakh        93.7
## 2         South Garo Hills                  Meghalaya        88.3
## 3    South West Garo Hills                  Meghalaya        83.7
## 4          East Garo Hills                  Meghalaya        73.8
## 5                   Tawang          Arunachal Pradesh        84.2
## 6                    Sukma               Chhattisgarh        81.2
## 7              Leh(Ladakh)                     Ladakh        96.4
## 8                 Kishtwar            Jammu & Kashmir        91.6
## 9               Malkangiri                     Odisha        90.7
## 10         Upper Subansiri          Arunachal Pradesh        77.0
## 11                 Nicobars Andaman & Nicobar Islands        97.8
## 12                 Dindori             Madhya Pradesh        77.6
## 13                Udhampur            Jammu & Kashmir        87.2
## 14                   Anjaw          Arunachal Pradesh        78.0
## 15             Upper Siang          Arunachal Pradesh        76.0
## 16               Kandhamal                     Odisha        93.9
## 17              Mayurbhanj                     Odisha        91.7
## 18             West Kameng          Arunachal Pradesh        93.6
## 19        North Garo Hills                  Meghalaya        61.5
## 20                Tuensang                   Nagaland        34.8
## 21                 Bijapur               Chhattisgarh        63.6
## 22                   Punch            Jammu & Kashmir        86.0
## 23                  Ramban            Jammu & Kashmir        80.4
## 24                  Badgam            Jammu & Kashmir        96.5
## 25                Longleng                   Nagaland        38.7
## 26             East Kameng          Arunachal Pradesh        76.0
## 27 South Salmara Mancachar                      Assam        71.7
## 28                   Siang          Arunachal Pradesh        81.1
## 29                Longding          Arunachal Pradesh        64.5
## 30  North & Middle Andaman  Andaman & Nicobar Islands        97.7
## 31                 Kiphire                   Nagaland        34.8
## 32                  Raisen             Madhya Pradesh        96.0
## 33                 Shahdol             Madhya Pradesh        85.6
## 34               Ganderbal            Jammu & Kashmir        98.0
## 35                  Dhalai                    Tripura        87.3
##    inst_births_public public_health_reliance
## 1                93.7              1.0000000
## 2                88.3              1.0000000
## 3                83.7              1.0000000
## 4                73.5              0.9959350
## 5                83.8              0.9952494
## 6                80.8              0.9950739
## 7                95.7              0.9927386
## 8                90.8              0.9912664
## 9                89.9              0.9911797
## 10               76.2              0.9896104
## 11               96.7              0.9887526
## 12               76.6              0.9871134
## 13               86.0              0.9862385
## 14               76.9              0.9858974
## 15               74.9              0.9855263
## 16               92.4              0.9840256
## 17               90.2              0.9836423
## 18               92.0              0.9829060
## 19               60.4              0.9821138
## 20               34.1              0.9798851
## 21               62.3              0.9795597
## 22               84.2              0.9790698
## 23               78.7              0.9788557
## 24               94.3              0.9772021
## 25               37.8              0.9767442
## 26               74.2              0.9763158
## 27               69.9              0.9748954
## 28               79.0              0.9741060
## 29               62.8              0.9736434
## 30               95.0              0.9723644
## 31               33.8              0.9712644
## 32               93.2              0.9708333
## 33               83.0              0.9696262
## 34               95.0              0.9693878
## 35               84.5              0.9679267

3. Display the districts with the lowest reliance (High Private Sector Reliance)

# Ascending sort: the districts with the smallest public share come first.
cat("\nBottom 35 Districts: Lowest Public Health Reliance (High Private Reliance)\n")
## 
## Bottom 35 Districts: Lowest Public Health Reliance (High Private Reliance)
df %>%
  arrange(public_health_reliance) %>%
  select(
    district, state,
    inst_births, inst_births_public, public_health_reliance
  ) %>%
  head(n = 35)
##                      district          state inst_births inst_births_public
## 1                   Prakasam  Andhra Pradesh        97.4               18.1
## 2                  Ernakulam          Kerala        99.1               23.2
## 3                      Patan         Gujarat        98.6               25.2
## 4                 Malappuram          Kerala       100.0               25.6
## 5               Panch Mahals         Gujarat        88.4               24.7
## 6                   Mahesana         Gujarat        97.3               27.4
## 7                  Mahisagar         Gujarat        93.0               26.6
## 8                   Palakkad          Kerala       100.0               29.6
## 9                  Kasaragod          Kerala       100.0               29.9
## 10                  Thrissur          Kerala       100.0               30.9
## 11              Banas Kantha         Gujarat        92.9               29.1
## 12                     Morbi         Gujarat        94.8               30.1
## 13                    Meerut   Uttar Pradesh        80.6               25.7
## 14                    Nirmal       Telangana        96.7               31.3
## 15                    Kannur          Kerala       100.0               33.6
## 16                Karimnagar       Telangana        98.4               34.2
## 17        Thiruvananthapuram          Kerala        99.5               34.7
## 18 Sri Potti Sriramulu Nello  Andhra Pradesh        97.0               34.9
## 19                    Bijnor   Uttar Pradesh        83.7               30.6
## 20            Pathanamthitta          Kerala        99.1               36.8
## 21       Gautam Buddha Nagar   Uttar Pradesh        86.6               32.4
## 22                   Wayanad          Kerala       100.0               37.6
## 23                Mancherial       Telangana        90.1               33.9
## 24                   Aravali         Gujarat        92.2               34.8
## 25                     Udupi       Karnataka        98.9               37.6
## 26       Jyotiba Phule Nagar   Uttar Pradesh        81.1               31.3
## 27          Rajanna Sircilla       Telangana        97.6               37.7
## 28               Gir Somnath         Gujarat        87.1               34.0
## 29           Devbhumi Dwarka         Gujarat        94.8               37.2
## 30                    Sangli      Maharastra        98.0               38.5
## 31             Kanniyakumari      Tamil Nadu       100.0               39.4
## 32                   Khammam       Telangana        97.9               39.0
## 33        Medchal-Malkajgiri       Telangana        98.3               39.4
## 34               Gandhinagar         Gujarat        97.8               39.5
## 35                   Krishna  Andhra Pradesh        98.9               40.1
##    public_health_reliance
## 1               0.1858316
## 2               0.2341070
## 3               0.2555781
## 4               0.2560000
## 5               0.2794118
## 6               0.2816033
## 7               0.2860215
## 8               0.2960000
## 9               0.2990000
## 10              0.3090000
## 11              0.3132400
## 12              0.3175105
## 13              0.3188586
## 14              0.3236815
## 15              0.3360000
## 16              0.3475610
## 17              0.3487437
## 18              0.3597938
## 19              0.3655914
## 20              0.3713421
## 21              0.3741339
## 22              0.3760000
## 23              0.3762486
## 24              0.3774403
## 25              0.3801820
## 26              0.3859433
## 27              0.3862705
## 28              0.3903559
## 29              0.3924051
## 30              0.3928571
## 31              0.3940000
## 32              0.3983657
## 33              0.4008138
## 34              0.4038855
## 35              0.4054601

4. State-level summary of Public Health Reliance

# Unweighted mean of the district-level ratio per state, highest first.
state_reliance <- summarize(
  group_by(df, state),
  avg_reliance = mean(public_health_reliance, na.rm = TRUE)
)
state_reliance <- arrange(state_reliance, desc(avg_reliance))
print(state_reliance)
## # A tibble: 36 × 2
##    state                     avg_reliance
##    <chr>                            <dbl>
##  1 Ladakh                           0.996
##  2 Arunachal Pradesh                0.953
##  3 Jammu & Kashmir                  0.948
##  4 Andaman & Nicobar Islands        0.934
##  5 Madhya Pradesh                   0.897
##  6 Meghalaya                        0.888
##  7 Tripura                          0.886
##  8 Assam                            0.883
##  9 Sikkim                           0.874
## 10 Mizoram                          0.869
## # ℹ 26 more rows

#2.5 # 5. The Gender “Risk Delta” (Behavioral Gap)

# Gender gap in tobacco use: men's rate minus women's rate, so a positive
# value means men use tobacco more than women.
df <- mutate(df, tobacco_gap = men_tobacco - women_tobacco)

2. Display the districts with the largest gender disparity in tobacco use

# Rank by the gap (largest first) and show the 35 leading rows.
cat("Top 35 Districts: Largest Gender Gap in Tobacco Use (Men > Women)\n")
## Top 35 Districts: Largest Gender Gap in Tobacco Use (Men > Women)
df %>%
  arrange(desc(tobacco_gap)) %>%
  select(
    district, state,
    women_tobacco, men_tobacco, tobacco_gap
  ) %>%
  head(n = 35)
##             district          state women_tobacco men_tobacco tobacco_gap
## 1           Sheohar           Bihar           4.5        56.6        52.1
## 2             Jamui           Bihar           2.8        53.6        50.8
## 3  North Garo Hills       Meghalaya           3.8        54.1        50.3
## 4            Mahoba   Uttar Pradesh          13.2        63.3        50.1
## 5        Chhatarpur  Madhya Pradesh          10.8        60.5        49.7
## 6             Satna  Madhya Pradesh           6.3        55.9        49.6
## 7         Darbhanga           Bihar           5.6        55.1        49.5
## 8         Sitamarhi           Bihar           6.4        55.4        49.0
## 9          Lalitpur   Uttar Pradesh           5.5        54.5        49.0
## 10        Madhubani           Bihar           5.0        53.9        48.9
## 11           Amreli         Gujarat           5.0        53.7        48.7
## 12        Sonbhadra   Uttar Pradesh           5.8        54.1        48.3
## 13        Tikamgarh  Madhya Pradesh           5.6        53.9        48.3
## 14       Samastipur           Bihar           3.5        51.7        48.2
## 15             Gaya           Bihar           3.7        51.9        48.2
## 16  Kaimur (Bhabua)           Bihar           2.8        50.8        48.0
## 17  East Garo Hills       Meghalaya           6.3        54.0        47.7
## 18             Rewa  Madhya Pradesh           7.6        55.3        47.7
## 19  Purba Champaran           Bihar           7.6        55.2        47.6
## 20            Banka           Bihar           1.8        49.4        47.6
## 21         Hamirpur   Uttar Pradesh          17.0        64.5        47.5
## 22            Morbi         Gujarat           7.3        54.7        47.4
## 23            Sidhi  Madhya Pradesh           7.4        54.5        47.1
## 24          Nalanda           Bihar           2.8        49.8        47.0
## 25            Dumka       Jharkhand           9.6        56.6        47.0
## 26       Chitrakoot   Uttar Pradesh          12.7        59.6        46.9
## 27           Rajkot         Gujarat           7.2        53.4        46.2
## 28         Longleng        Nagaland           7.9        54.0        46.1
## 29          Sitapur   Uttar Pradesh          13.8        59.7        45.9
## 30            Katni  Madhya Pradesh          11.7        57.4        45.7
## 31        Begusarai           Bihar           4.0        49.6        45.6
## 32           Hardoi   Uttar Pradesh           6.5        52.1        45.6
## 33         Jhalawar       Rajasthan           9.8        55.4        45.6
## 34            Arwal           Bihar           2.4        47.9        45.5
## 35              Mon        Nagaland           6.9        52.4        45.5

3. State-level summary of the average Gender Risk Delta

This shows where the behavioral gap between genders is most pronounced

# Unweighted per-state mean of the district-level gap, largest gap first.
state_tobacco_gap <- summarize(
  group_by(df, state),
  avg_tobacco_gap = mean(tobacco_gap, na.rm = TRUE)
)
state_tobacco_gap <- arrange(state_tobacco_gap, desc(avg_tobacco_gap))
print(state_tobacco_gap)
## # A tibble: 36 × 2
##    state                                  avg_tobacco_gap
##    <chr>                                            <dbl>
##  1 Bihar                                             44.1
##  2 Jharkhand                                         39.5
##  3 Nagaland                                          36.6
##  4 Madhya Pradesh                                    36.6
##  5 Uttar Pradesh                                     36.5
##  6 Rajasthan                                         35.8
##  7 Jammu & Kashmir                                   35.8
##  8 West Bengal                                       35.8
##  9 Gujarat                                           34.5
## 10 Dadra and Nagar Haveli & Daman and Diu            34.4
## # ℹ 26 more rows

#Plotting # P.1 # 1. Bar Charts (3 Different Types) # P.1.1 The Leaderboard (Horizontal Ranked Bar Chart) # Identifies the Top 20 states by average full vaccination coverage

# State-level leaderboard of average full vaccination coverage.
# State names are trimmed while grouping because the raw values can carry
# padding (the printed table shows " Lakshadweep "), which would otherwise
# end up in the axis labels.
state_vaccination <- df %>%
  group_by(state = trimws(state)) %>%
  summarize(avg_vacc_full = mean(vacc_full_all, na.rm = TRUE)) %>%
  arrange(desc(avg_vacc_full))

# Keep the 20 best states; the title below states the same number.
top_20_states_vacc <- state_vaccination %>%
  head(20)

# reorder() sorts the bars by coverage; coord_flip() makes them horizontal.
plot_state <- ggplot(top_20_states_vacc, aes(x = reorder(state, avg_vacc_full), y = avg_vacc_full)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top 20 States: Full Vaccination Leaderboard",
    subtitle = "State-level average of full vaccination coverage across states",
    x = "State / UT", 
    y = "Average Full Vaccination (%)"
  ) +
  theme_minimal()
print(plot_state)

print(top_20_states_vacc)
## # A tibble: 20 × 2
##    state                                    avg_vacc_full
##    <chr>                                            <dbl>
##  1 "Odisha"                                          91.8
##  2 "Dadra and Nagar Haveli & Daman and Diu"          90.4
##  3 "Tamil Nadu"                                      89.0
##  4 "Himachal Pradesh"                                88.4
##  5 "Ladakh"                                          88.2
##  6 "Sikkim"                                          87.9
##  7 "Jammu & Kashmir"                                 87.1
##  8 "West Bengal"                                     86.7
##  9 "Karnataka"                                       86.4
## 10 " Lakshadweep "                                   86.1
## 11 "Puducherry"                                      86  
## 12 "Uttarakhand"                                     83.6
## 13 "Chandigarh"                                      80.9
## 14 "Rajasthan"                                       80.4
## 15 "Chhattisgarh"                                    80.4
## 16 "Madhya Pradesh"                                  78.8
## 17 "Kerala"                                          78.8
## 18 "Telangana"                                       78.5
## 19 "Goa"                                             78.1
## 20 "Gujarat"                                         78.1

P.1.2 Gender Disparity (Grouped Bar Chart)

Compares tobacco use for men vs women in the Top 10 states (by household count)

# Pick the 10 states with the largest total number of surveyed households.
top_10_states_list <- df %>%
  group_by(state) %>%
  summarize(total_hh = sum(hh_surveyed, na.rm = TRUE)) %>%
  arrange(desc(total_hh)) %>%
  head(10) %>%
  pull(state)

# Average the district-level tobacco rates per state and reshape to one row
# per state and gender so the bars can be dodged by gender.
tobacco_comp <- df %>%
  filter(state %in% top_10_states_list) %>%
  group_by(state) %>%
  summarize(Women = mean(women_tobacco, na.rm = TRUE),
            Men = mean(men_tobacco, na.rm = TRUE)) %>%
  pivot_longer(cols = c(Women, Men), names_to = "Gender", values_to = "Tobacco_Use")

# The subtitle describes the actual selection rule (10 states, ranked by
# surveyed households) instead of claiming a top 5 by population.
plot_b <- ggplot(tobacco_comp, aes(x = state, y = Tobacco_Use, fill = Gender)) +
  geom_col(position = "dodge") +
  labs(title = "Gender Disparity in Tobacco Use",
       subtitle = "Comparison across the 10 states with the most surveyed households",
       x = "State", y = "Average Tobacco Use (%)") +
  scale_fill_manual(values = c("Women" = "#f8766d", "Men" = "#00bfc4")) +
  theme_minimal()
print(plot_b)

# P.1.3 Infrastructure Composition (Stacked Percentage Bar Chart) # Shows the mix of amenities across literacy tiers

library(dplyr)
library(tidyr)
library(ggplot2)

# Bucket districts by female literacy: below 50 is "Low", 50 up to (but not
# including) 75 is "Medium", 75 and above is "High". Missing literacy stays NA.
df <- df %>%
  mutate(
    literacy_tier = case_when(
      is.na(women_literate) ~ NA_character_,
      women_literate < 50 ~ "Low Literacy",
      women_literate < 75 ~ "Medium Literacy",
      TRUE ~ "High Literacy"
    )
  )

# Infrastructure composition per literacy tier.
# Step 1: mean of each amenity column within a tier.
# Step 2: long format (one row per tier and amenity).
# Step 3: each amenity's share of the tier's summed means; this is the value
#         printed as the label inside the stacked bars.
infra_comp <- df %>%
  filter(!is.na(literacy_tier)) %>%
  group_by(literacy_tier) %>%
  summarize(
    Electricity = mean(hh_electricity, na.rm = TRUE),
    Water = mean(hh_water, na.rm = TRUE),
    Sanitation = mean(hh_sanitation, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = -literacy_tier,
    names_to = "Amenity",
    values_to = "Value"
  ) %>%
  group_by(literacy_tier) %>%
  mutate(Percentage = Value / sum(Value)) %>%
  # Drop the grouping once the per-tier shares exist so later verbs on
  # infra_comp do not silently operate tier by tier.
  ungroup()

# 100% stacked bars: position "fill" rescales each tier to a total of 1, and
# the text layer uses position_fill() so every label sits in the middle of
# its own segment.
plot_c <- ggplot(
  data = infra_comp,
  mapping = aes(x = literacy_tier, y = Value, fill = Amenity)
) +
  geom_col(position = "fill") +
  geom_text(
    mapping = aes(label = paste0(round(Percentage * 100, 1), "%")),
    position = position_fill(vjust = 0.5),
    size = 4,
    fontface = "bold",
    color = "white"
  ) +
  scale_y_continuous(labels = function(x) paste0(x * 100, "%")) +
  labs(
    title = "Infrastructure Composition by Literacy Tier",
    subtitle = "Relative share of amenities",
    x = "Literacy Tier",
    y = "Proportion"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

print(plot_c)

# P.2 Histogram # P.2.1 Distribution of the cii (Composite Infrastructure Index).

# Composite Infrastructure Index: plain average of the four household
# amenity columns.
df <- df %>%
  mutate(cii = (hh_electricity + hh_water + hh_sanitation + hh_cooking_fuel) / 4)

# Density-scaled histogram with a kernel density overlay and a dashed line
# at the mean CII.
# - Line widths use `linewidth`; `size` for lines is deprecated since
#   ggplot2 3.4.0 and triggered a warning.
# - The mean is passed as a constant so a single reference line is drawn
#   instead of one identical line per data row.
plot_histogram <- ggplot(df, aes(x = cii)) +
  geom_histogram(aes(y = after_stat(density)), 
                 bins = 30, 
                 fill = "#69b3a2", 
                 color = "#e9ecef", 
                 alpha = 0.7) +
  geom_density(color = "#404080", linewidth = 1.2) +
  geom_vline(xintercept = mean(df$cii, na.rm = TRUE), 
             color = "red", 
             linetype = "dashed", 
             linewidth = 1) +
  labs(
    title = "Distribution of Composite Infrastructure Index (CII)",
    subtitle = "Histogram showing density and distribution shape across districts",
    x = "CII Score (Development Level)",
    y = "Density"
  ) +
  theme_minimal()
print(plot_histogram)

# P.3. Pie Chart # P.3.1 National Proportion of fuel (Clean Fuel) vs. Non-Clean Fuel households.

# Two-slice summary: the mean of hh_cooking_fuel over all rows ("Clean") and
# its complement to 100 ("Non_Clean"), reshaped to one row per slice.
national_fuel_stats <- df %>%
  summarize(
    Clean = mean(hh_cooking_fuel, na.rm = TRUE),
    Non_Clean = 100 - Clean
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Fuel_Type",
    values_to = "Percentage"
  )

# A pie chart is a single stacked bar drawn in polar coordinates.
plot_pie <- ggplot(
  data = national_fuel_stats,
  mapping = aes(x = "", y = Percentage, fill = Fuel_Type)
) +
  geom_col(width = 1, color = "white") +
  coord_polar("y", start = 0) +
  geom_text(
    mapping = aes(label = paste0(round(Percentage, 1), "%")),
    position = position_stack(vjust = 0.5),
    fontface = "bold",
    color = "white"
  ) +
  scale_fill_manual(values = c("Clean" = "#23c0d9", "Non_Clean" = "#b47fc9")) +
  labs(
    title = "National Proportion of Cooking Fuel",
    subtitle = "Clean Fuel vs. Non-Clean Fuel Households",
    fill = "Fuel Category"
  ) +
  theme_void()
print(plot_pie)

#P.4. Pair Plot # P.4.1 Correlation Matrix/Pair Grid of women_literate, cii, vacc_full_all, and malnutrition_burden.

library(GGally)
library(dplyr)

# Malnutrition burden: explicit sum of the three child-malnutrition rates.
# The columns are named directly instead of being collected with a regular
# expression, so an unrelated column whose name happens to contain "under",
# "wast" or "stunt" can never leak into the total, and the definition
# matches the one used for the scatter plot (stunting + wasting +
# underweight). A row with a missing component gets NA rather than a
# silently smaller total.
df$malnutrition_burden <- df$stunting + df$wasting + df$underweight

# Keep only complete rows of the four variables shown in the pair grid.
plot_data <- na.omit(df[, c("women_literate", "cii", "vacc_full_all", "malnutrition_burden")])

# Scatter plots, densities and pairwise correlations in one grid.
ggpairs(plot_data, progress = FALSE)

#P.5 Box Plot # P.5.1 Box Plot comparing anemia prevalence

# Reload the CSV with check.names = FALSE so the column headers are kept
# exactly as written in the file; the rename() that follows refers to them
# verbatim.
df <- read.csv(
  file = "C:\\Users\\asus\\Downloads\\dHealth.csv",
  check.names = FALSE
)

3. Data Cleaning & Feature Engineering

# Give the two raw survey columns short names, strip every character that is
# not a digit or a dot before converting to numeric (values that end up empty
# become NA), and cut literacy into three tiers at its 33% and 66% quantiles.
df <- df %>%
  rename(
    women_literate = `Women (age 15-49) who are literate4 (%)`,
    all_women_anaemic = `All women age 15-49 years who are anaemic22 (%)`
  ) %>%
  mutate(
    women_literate = as.numeric(gsub("[^0-9.]", "", women_literate)),
    all_women_anaemic = as.numeric(gsub("[^0-9.]", "", all_women_anaemic))
  ) %>%
  mutate(literacy_tier = cut(women_literate, 
                             breaks = quantile(women_literate, probs = c(0, 0.33, 0.66, 1), na.rm = TRUE),
                             labels = c("Low", "Medium", "High"),
                             include.lowest = TRUE))

# Rows without a tier or without an anaemia value are left out of the plot
# (df itself is not modified); otherwise ggplot2 draws an extra "NA" box on
# the x axis and warns about removed non-finite values.
# Box outlines hide their own outliers because the jitter layer already
# shows every district as a point.
ggplot(
  df %>% filter(!is.na(literacy_tier), !is.na(all_women_anaemic)),
  aes(x = literacy_tier, y = all_women_anaemic, fill = literacy_tier)
) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) + 
  geom_jitter(width = 0.2, alpha = 0.2, size = 1, color = "#000") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Women's Anemia Distribution by Literacy Tier",
    subtitle = "Analysis of health variance across education levels",
    x = "Literacy Tier (Women)",
    y = "Anemia Prevalence (%)"
  ) +
  theme_wsj() +
  theme(legend.position = "none")

#P.6. Line Chart # P.6.1 The Trend Profile (Ordered Line Chart)

# Reload the raw file, keep at most the first 109 columns and apply the
# short column names held in new_names.
df <- read.csv(
  file = "C:\\Users\\asus\\Downloads\\dHealth.csv",
  check.names = FALSE
)
if (ncol(df) > 109) {
  df <- df[, seq_len(109)]
}
colnames(df) <- new_names

# Every column except the two identifiers is stripped of all characters
# other than digits and dots and then converted to numeric.
df <- mutate(
  df,
  across(
    -c(district, state),
    function(col) as.numeric(gsub("[^0-9.]", "", col))
  )
)

# Per-state averages of female literacy and institutional births.
state_trends_a <- summarize(
  group_by(df, state),
  avg_lit = mean(women_literate, na.rm = TRUE),
  avg_inst = mean(inst_births, na.rm = TRUE)
)
message("Success! Data trimmed and state_trends_a created.")
## Success! Data trimmed and state_trends_a created.
# df already carries the new_names headers and state_trends_a was built
# immediately above from the same data, so both are reused here instead of
# being assigned a second time.
# States are ordered by average female literacy; group = 1 joins all states
# into a single line and coord_flip() puts the state names on the vertical
# axis.
plot_a <- ggplot(state_trends_a, aes(x = reorder(state, avg_lit), y = avg_inst, group = 1)) +
  geom_line(color = "#2c3e50", linewidth = 1) + 
  geom_point(color = "#e74c3c", size = 2) +
  coord_flip() + 
  labs(
    title = "The Path to Progress: Literacy vs. Institutional Births",
    subtitle = "States ranked from Lowest to Highest Female Literacy",
    x = "State (Ranked by Literacy)",
    y = "Average Institutional Births (%)"
  ) +
  theme_minimal()
print(plot_a)

# Same chart as the previous one with a different colour scheme (red line,
# dark points). state_trends_a has not changed since it was first built
# above, so it is reused rather than recomputed, and the column names of df
# are not assigned again.
plot_a <- ggplot(state_trends_a, aes(x = reorder(state, avg_lit), y = avg_inst, group = 1)) +
  geom_line(color = "#f01a1a", linewidth = 1) + 
  geom_point(color = "#1a0901", size = 2) +
  coord_flip() + 
  labs(
    title = "The Path to Progress: Literacy vs. Institutional Births",
    subtitle = "States ranked from Lowest to Highest Female Literacy",
    x = "State (Ranked by Literacy)",
    y = "Average Institutional Births (%)"
  ) +
  theme_minimal()
print(plot_a)

# Details: This line chart shows the relationship between female literacy and institutional births across states, with states ordered by literacy to reveal how education may influence healthcare utilization. For example, Kerala has the highest literacy and also a high rate of institutional births, while states with lower literacy tend to have fewer institutional births, highlighting the potential impact of education on health outcomes.

P.6.2 Dual-Metric Comparison (Multi-line Chart)

# Per-state means of the stunting and wasting columns.
state_trends_b <- summarize(
  group_by(df, state),
  avg_stunting = mean(stunting, na.rm = TRUE),
  avg_wasted = mean(wasting, na.rm = TRUE)
)

# One row per state and metric so that each metric becomes its own line.
malnutrition_long <- pivot_longer(
  state_trends_b,
  cols = c(avg_stunting, avg_wasted),
  names_to = "Malnutrition_Type",
  values_to = "Rate"
)

# Two lines over the states on the x axis; the state labels are rotated by
# 90 degrees so they do not overlap.
plot_b <- ggplot(
  data = malnutrition_long,
  mapping = aes(
    x = state,
    y = Rate,
    color = Malnutrition_Type,
    group = Malnutrition_Type
  )
) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_color_manual(
    values = c("avg_stunting" = "#d35400", "avg_wasted" = "#2980b9"),
    labels = c("avg_stunting" = "Stunting (Chronic)", "avg_wasted" = "Wasting (Acute)")
  ) +
  labs(
    title = "Chronic vs. Acute Malnutrition Trends",
    subtitle = "Comparing long-term stunting vs. short-term wasting across States",
    x = "State",
    y = "Prevalence Rate (%)",
    color = "Metric Type"
  ) +
  theme_wsj() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
print(plot_b)

# P.7 Scatter Plot # P.7.1 cii (Infrastructure) vs. malnutrition_burden.

# Add both composite indicators to df: the four-amenity average (cii) and
# the sum of the three malnutrition rates.
df <- mutate(
  df,
  cii = (hh_electricity + hh_water + hh_sanitation + hh_cooking_fuel) / 4,
  malnutrition_burden = stunting + wasting + underweight
)

# Bubble chart: position = infrastructure vs. malnutrition, bubble size =
# surveyed households, colour = female literacy. The `text` aesthetic only
# feeds the plotly tooltip.
p <- ggplot(
  data = df,
  mapping = aes(
    x = cii,
    y = malnutrition_burden,
    size = hh_surveyed,
    color = women_literate,
    text = paste("District:", district, "<br>State:", state)
  )
) +
  geom_point(alpha = 0.6) +
  scale_color_gradient(name = "Literacy (%)", low = "#ffeda0", high = "#800026") +
  labs(
    title = "Interactive Development-Health Nexus",
    x = "Composite Infrastructure Index (CII)",
    y = "Total Malnutrition Burden"
  ) +
  theme_minimal()

# Convert to an interactive widget that shows only the custom tooltip text.
interactive_plot <- ggplotly(p, tooltip = "text")
interactive_plot

K-MEANS CLUSTERING

#Identify which districts are “Doubly Disadvantaged” (Low education + Low healthcare access + High malnutrition) vs. those that are “Resilient” (Low education but High health outcomes).

# Discard any columns beyond the 109th, then re-apply new_names.
if (ncol(df) > 109) {
  df <- df[, seq_len(109)]
}
colnames(df) <- new_names

# Force the clustering inputs (and the household count) to numeric and keep
# only rows where all four clustering variables are present.
df_clean <- df %>%
  mutate(
    across(
      c(women_literate, stunting, underweight, anc_4plus, hh_surveyed),
      function(col) as.numeric(gsub("[^0-9.]", "", col))
    )
  ) %>%
  filter(
    !is.na(women_literate),
    !is.na(stunting),
    !is.na(underweight),
    !is.na(anc_4plus)
  )

# Uttar Pradesh and Bihar are always included; they are joined by the four
# remaining states with the most surveyed households.
target_states <- c("Uttar Pradesh", "Bihar")
other_states <- df_clean %>%
  filter(!(state %in% target_states)) %>%
  group_by(state) %>%
  summarize(total_hh = sum(hh_surveyed, na.rm = TRUE)) %>%
  arrange(desc(total_hh)) %>%
  head(n = 4) %>%
  pull(state)

top_6_states <- c(target_states, other_states)
df_top6 <- filter(df_clean, state %in% top_6_states)
# k-means (k = 3, 25 random starts) on the standardised literacy, stunting,
# underweight and ANC columns. The seed makes the run reproducible.
set.seed(123)
cluster_input <- scale(df_top6 %>% select(women_literate, stunting, underweight, anc_4plus))
km_res <- kmeans(cluster_input, centers = 3, nstart = 25)

# kmeans() numbers its clusters arbitrarily, so a label must be derived from
# what a cluster looks like, never from its id. Each cluster is scored by the
# sum of its standardised literacy and ANC centres: the lowest score becomes
# "High Vulnerability", the highest "Success Model", the one in between
# "Emerging".
center_score <- km_res$centers[, "women_literate"] + km_res$centers[, "anc_4plus"]
score_rank <- unname(rank(center_score, ties.method = "first"))
label_by_rank <- c(
  "High Vulnerability (Low Lit/ANC)",
  "Emerging (Moderate Profile)",
  "Success Model (High Lit/ANC)"
)

# The factor levels keep their original order (vulnerable, success,
# emerging) because the plot assigns its colours to the levels by position.
df_top6$cluster <- factor(
  label_by_rank[score_rank[km_res$cluster]],
  levels = c(
    "High Vulnerability (Low Lit/ANC)",
    "Success Model (High Lit/ANC)",
    "Emerging (Moderate Profile)"
  )
)
# Convex hull of every (state, cluster) point cloud in the literacy/ANC
# plane; chull() returns the row positions of the hull vertices within each
# group. The grouping is dropped afterwards so hull_data is a plain table.
hull_data <- df_top6 %>%
  group_by(state, cluster) %>%
  slice(chull(women_literate, anc_4plus)) %>%
  ungroup()

# One panel per state: shaded hulls underneath, districts as points on top.
# The `text` aesthetic is mapped on the point layer only (it feeds the
# plotly tooltip); ggplot2 itself does not know it and reports it as an
# unknown aesthetic.
# Colour and fill share one legend title so ggplot2 merges them into a
# single legend instead of showing a second one titled "cluster".
p <- ggplot(df_top6, aes(x = women_literate, y = anc_4plus, color = cluster, fill = cluster)) +
  geom_polygon(data = hull_data, alpha = 0.2, show.legend = FALSE) +
  geom_point(aes(text = paste0("District: ", district, "<br>State: ", state, "<br>Status: ", cluster)), 
             size = 1.3, alpha = 0.7) +
  facet_wrap(~state, ncol = 3) + # 2 rows of 3
  scale_color_manual(values = c("#d9534f", "#5cb85c", "#428bca")) +
  scale_fill_manual(values = c("#d9534f", "#5cb85c", "#428bca")) +
  theme_minimal() +
  labs(title = "District Vulnerability Profiles: 6 Key States",
       x = "Female Literacy (%)",
       y = "Mothers with 4+ ANC Visits (%)",
       color = "Cluster Definitions",
       fill = "Cluster Definitions") +
  theme(legend.position = "bottom",
        strip.text = element_text(face = "bold", size = 9),
        panel.spacing = unit(1.5, "lines"))
## Warning in geom_point(aes(text = paste0("District: ", district, "<br>State: ",
## : Ignoring unknown aesthetics: text
# Interactive version of the facet plot: horizontal legend centred below the
# panels, with extra bottom margin to make room for it.
layout(
  ggplotly(p, tooltip = "text"),
  showlegend = TRUE,
  legend = list(
    orientation = "h",
    x = 0.5,
    xanchor = "center",
    y = -0.15,
    font = list(size = 10)
  ),
  margin = list(l = 50, r = 50, b = 100, t = 80),
  autosize = TRUE
)

Details: This clustering analysis identifies three distinct groups of districts based on their female literacy, child stunting and underweight rates, and ANC (antenatal care) coverage. The “High Vulnerability” cluster includes districts with low literacy and low ANC visits, indicating areas that may require urgent intervention. The “Success Model” cluster represents districts that have achieved high literacy and high ANC coverage, serving as potential models for best practices. The “Emerging” cluster includes districts that are in transition, with moderate literacy and ANC rates, suggesting they may be on a positive trajectory but still need support to reach optimal outcomes. This analysis helps policymakers target resources effectively to improve health outcomes in vulnerable areas.

K-NEAREST NEIGHBORS (KNN) CLASSIFICATION

library(dplyr)
library(ggplot2)
library(plotly)

1. Clean and Rename Data

# Discard any columns beyond the 109th, then re-apply new_names.
if (ncol(df) > 109) {
  df <- df[, seq_len(109)]
}
colnames(df) <- new_names

# Force the target and the four predictors to numeric and keep only rows in
# which all five are present.
df_knn <- df %>%
  mutate(
    across(
      c(anc_4plus, women_literate, hh_electricity, hh_sanitation, hh_insurance),
      function(col) as.numeric(gsub("[^0-9.]", "", col))
    )
  ) %>%
  filter(
    complete.cases(
      anc_4plus, women_literate, hh_electricity, hh_sanitation, hh_insurance
    )
  )

2. Define the Target Label

# Binary target: districts at or above the median ANC rate count as
# "High Access", all others as "Low Access".
median_anc <- median(df_knn$anc_4plus, na.rm = TRUE)
df_knn[["access_label"]] <- with(
  df_knn,
  ifelse(anc_4plus >= median_anc, "High Access", "Low Access")
)

3. MANUAL DATA SPLIT (Replacing createDataPartition)

# Reproducible random split: 70% of the rows go to training, the remaining
# rows form the test set.
set.seed(123)
n_rows <- nrow(df_knn)
train_indices <- sample(seq_len(n_rows), size = 0.7 * n_rows)

train_data <- df_knn[train_indices, ]
test_data <- df_knn[-train_indices, ]

4. Feature Selection and Scaling

# Standardise the predictors. The test set is scaled with the centre and
# spread learned from the training set, so no information from the test
# rows enters the scaling.
features <- c("women_literate", "hh_electricity", "hh_sanitation", "hh_insurance")
train_scaled <- as.matrix(scale(train_data[, features]))
test_scaled <- as.matrix(
  scale(
    test_data[, features],
    center = attr(train_scaled, "scaled:center"),
    scale = attr(train_scaled, "scaled:scale")
  )
)

5. KNN Parameters

# Rule-of-thumb neighbourhood size: square root of the training-set size.
# With two classes an even k can end in a tied vote, and the voting step
# (which.max over table()) always settles a tie in favour of the label that
# sorts first ("High Access"). Bumping an even value to the next odd number
# removes that bias.
k_val <- round(sqrt(nrow(train_data)))
if (k_val %% 2 == 0) {
  k_val <- k_val + 1
}

6. Manual KNN Prediction Function

# Classify each row of `test_x` by majority vote among its k nearest
# training rows (Euclidean distance).
#
# Args:
#   train_x: numeric matrix of training features, one row per observation.
#   test_x:  numeric matrix of test features with the same columns as train_x.
#   train_y: vector of class labels, one per row of train_x.
#   k:       number of neighbours to consult; capped at nrow(train_x).
#
# Returns:
#   Character vector with one predicted label per row of test_x
#   (character(0) when test_x has no rows). A tied vote goes to the label
#   that sorts first, because which.max() returns the first maximum of the
#   table() counts.
predict_knn_manual <- function(train_x, test_x, train_y, k) {
  stopifnot(
    nrow(train_x) > 0,
    nrow(train_x) == length(train_y),
    ncol(train_x) == ncol(test_x)
  )
  # Never ask for more neighbours than there are training rows; indexing
  # past the end of order() would otherwise inject NA labels into the vote.
  k <- max(1L, min(as.integer(k), nrow(train_x)))

  # seq_len() yields an empty loop for zero test rows (1:0 would iterate over
  # c(1, 0)), and vapply() guarantees a character result of the right length.
  vapply(seq_len(nrow(test_x)), function(i) {
    # Euclidean distance from test row i to every training row
    diffs <- sweep(train_x, 2L, test_x[i, ], FUN = "-")
    distances <- sqrt(rowSums(diffs^2))
    # Labels of the k closest training rows
    neighbor_labels <- train_y[order(distances)[seq_len(k)]]
    # Majority vote
    names(which.max(table(neighbor_labels)))
  }, character(1))
}

7. Run Prediction

# Predict a label for every test row, then flag whether it matches the
# actual label.
test_data[["predicted"]] <- predict_knn_manual(
  train_x = train_scaled,
  test_x = test_scaled,
  train_y = train_data$access_label,
  k = k_val
)
test_data[["is_correct"]] <- with(
  test_data,
  ifelse(predicted == access_label, "Correct", "Misclassified")
)

8. Visualization

# Test districts in the literacy/sanitation plane: colour = predicted class,
# point shape = whether the prediction was right. The `text` aesthetic only
# feeds the plotly tooltip.
p <- ggplot(
  data = test_data,
  mapping = aes(
    x = women_literate,
    y = hh_sanitation,
    color = predicted,
    shape = is_correct,
    text = paste0(
      "District: ", district,
      "<br>Actual: ", access_label,
      "<br>Predicted: ", predicted
    )
  )
) +
  geom_point(size = 3, alpha = 0.8) +
  scale_color_manual(values = c("High Access" = "#2ecc71", "Low Access" = "#e74c3c")) +
  labs(
    title = "Manual KNN: Healthcare Access Prediction",
    subtitle = "Calculated using Euclidean Distance without external ML libraries",
    x = "Female Literacy (%)",
    y = "Improved Sanitation (%)"
  ) +
  theme_minimal()

# Interactive version with a horizontal legend centred below the plot.
layout(
  ggplotly(p, tooltip = "text"),
  legend = list(orientation = "h", x = 0.5, xanchor = "center", y = -0.2)
)