# install.packages(c("tidyverse", "readr", "ggplot2", "dplyr"))
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Load data and ensure 'state' exists and is clean
df <- read.csv("District_health_Survey.csv") %>%
clean_names() %>%
# This makes sure the state column is definitely lowercase 'state'
rename_with(~ "state", contains("state")) %>%
mutate(state = str_trim(state))
# Ensure data is a data frame
df <- as.data.frame(df)
# Assign column names
colnames(df) <- c(
"state","district","pop_total","pop_male","pop_female",
"literacy_total","literacy_male","literacy_female",
"school_attendance","female_schooling",
"married_women","child_marriage",
"family_planning","contraceptive_use",
"anc_checkup","institutional_delivery","home_delivery",
"postnatal_care","maternal_care",
"child_immunization","full_vaccination","vitaminA",
"child_illness","diarrhea_treatment",
"child_stunting","child_wasting","child_underweight",
"child_overweight",
"bmi_male","bmi_female",
"overweight_male","overweight_female",
"obesity_male","obesity_female",
"anemia_male","anemia_female","anemia_children",
"severe_anemia",
"bp_male","bp_female","hypertension",
"high_bp_cases",
"blood_sugar","diabetes"
)
print(class(df))
## [1] "data.frame"
print(dim(df))
## [1] 706 109
print(head(df))
## state district pop_total pop_male
## 1 Nicobars Andaman & Nicobar Islands 882 764
## 2 North & Middle Andaman Andaman & Nicobar Islands 874 789
## 3 South Andaman Andaman & Nicobar Islands 868 844
## 4 Srikakulam Andhra Pradesh 874 780
## 5 Vizianagaram Andhra Pradesh 902 853
## 6 Visakhapatnam Andhra Pradesh 869 818
## pop_female literacy_total literacy_male literacy_female school_attendance
## 1 125 78.0 23.0 973 927
## 2 108 82.7 19.8 950 844
## 3 134 84.7 21.0 967 935
## 4 100 60.0 20.7 1140 1163
## 5 134 56.0 20.6 1114 898
## 6 112 66.8 21.4 1066 974
## female_schooling married_women child_marriage family_planning
## 1 98.0 83.2 97.9 98.8
## 2 100.0 (92.6) 93.2 92.2
## 3 96.5 92.2 99.6 97.9
## 4 95.0 71.0 99.9 87.7
## 5 95.4 81.7 99.5 93.1
## 6 90.5 71.3 99.6 91.8
## contraceptive_use anc_checkup institutional_delivery home_delivery
## 1 83.5 56.9 99.4 2.7
## 2 86.4 61.3 99.9 2.1
## 3 89.3 91.9 99.7 1.2
## 4 71.6 74.7 76.5 75.6
## 5 61.7 60.3 85.0 76.7
## 6 77.8 72.9 82.2 64.9
## postnatal_care maternal_care child_immunization full_vaccination vitaminA
## 1 (29.5) 87.5 53.5 11.4 0.0
## 2 (30.1) 84.0 41.0 15.4 1.5
## 3 (50.8) 86.7 57.5 17.1 0.5
## 4 (0.0) 64.3 42.5 25.4 0.0
## 5 (25.2) 58.3 37.6 33.7 2.2
## 6 (5.0) 69.5 46.0 25.4 0.0
## child_illness diarrhea_treatment child_stunting child_wasting
## 1 1.8 100.0 65.3 57.2
## 2 3.8 100.0 84.1 73.1
## 3 2.8 98.2 57.1 50.5
## 4 5.5 78.0 72.3 72.2
## 5 12.7 72.5 71.2 71.2
## 6 9.5 85.7 68.0 67.7
## child_underweight child_overweight bmi_male bmi_female overweight_male
## 1 46.4 0.0 2.7 2.0 4.9
## 2 48.3 0.6 6.4 7.8 9.3
## 3 34.0 0.0 2.8 1.8 10.6
## 4 71.0 0.3 0.6 0.0 0.3
## 5 69.3 1.3 0.0 0.0 0.6
## 6 64.9 1.5 0.3 0.0 0.8
## overweight_female obesity_male obesity_female anemia_male anemia_female
## 1 1.2 9.5 3.3 40.4 49.4
## 2 0.0 5.8 1.3 23.2 83.2
## 3 0.3 17.6 8.6 31.2 88.2
## 4 0.0 5.7 3.6 16.0 45.8
## 5 0.0 6.7 4.7 21.1 36.4
## 6 0.0 4.8 2.4 15.2 35.3
## anemia_children severe_anemia bp_male bp_female hypertension high_bp_cases
## 1 62.8 71.7 78.0 72.6 43.9 97.9
## 2 74.5 79.2 91.1 83.7 24.1 99.2
## 3 79.4 85.9 92.1 81.0 61.9 98.9
## 4 79.7 78.4 94.4 67.5 35.3 100.0
## 5 76.1 71.4 91.3 59.6 32.4 98.8
## 6 79.4 58.6 88.0 75.0 40.1 97.0
## blood_sugar diabetes <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 1 85.1 2278 * 92.5 97.8 96.7 0.8 98.6 11.5 * 10.7 (64.2)
## 2 92.5 1904 * 94.3 97.7 95.0 0.7 98.3 12.9 * 11.4 *
## 3 88.1 3460 * 89.8 99.5 83.8 0.0 96.9 37.1 (79.1) 29.6 (76.3)
## 4 90.8 3479 * 97.7 97.9 52.2 0.5 96.4 57.0 73.8 44.5 (82.8)
## 5 83.9 1931 * 89.2 99.0 70.6 0.5 97.6 41.3 70.3 30.3 (76.8)
## 6 84.8 2200 * 90.9 95.3 69.3 0.0 94.4 26.5 57.2 16.8 (76.5)
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 1 (94.1) (80.4) (69.1) (71.9) (67.3) (20.7) (3.1) (68.6) 94.9 (100.0)
## 2 * * * * * * * * (89.6) *
## 3 (96.6) (100.0) (79.0) (94.8) (81.7) (33.7) (0.0) (85.3) 84.0 (93.1)
## 4 (100.0) (93.3) (82.8) (89.7) (93.3) (34.9) (74.8) (89.7) 69.6 (97.0)
## 5 * (100.0) (76.8) (90.3) (92.5) (35.0) (77.3) (83.6) 85.9 (100.0)
## 6 (93.5) (97.9) (76.5) (90.5) (95.1) (45.1) (72.9) (79.6) 91.3 (83.8)
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 1 (0.0) 5.7 * * * 1.8 (85.7) 55.4 * * (19.4) * (18.7)
## 2 * 4.5 * * * 7.0 * 27.3 * * (6.5) * (5.9)
## 3 (4.3) 6.0 * * * 0.0 (77.3) 51.1 * * (22.3) * 23.5
## 4 (3.0) 11.9 * * * 1.3 (79.7) 42.8 * * (14.0) * 16.1
## 5 (0.0) 7.5 * * * 1.4 (83.5) 55.6 * * (2.5) * 1.8
## 6 (9.7) 8.1 * * * 2.0 (72.3) 64.3 * * (6.9) * 11.8
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 1 21.6 15.7 7.8 24.6 1.5 8.2 39.1 62.5 37.7 38.4 * 38.3 48.0 7.4
## 2 27.0 27.0 8.3 42.8 0.8 8.6 35.9 79.3 30.4 62.5 * 62.1 47.8 7.2
## 3 21.1 12.6 3.5 17.4 7.2 10.0 39.0 78.2 43.4 57.6 * 57.7 43.2 7.5
## 4 19.7 19.5 7.4 21.4 4.5 13.8 27.2 54.0 59.6 62.8 * 62.6 59.2 8.2
## 5 36.4 19.2 8.3 32.2 4.7 16.9 28.8 58.0 66.7 64.6 * 64.0 73.9 6.2
## 6 31.0 21.5 11.2 33.5 4.8 17.4 23.8 58.0 72.6 58.6 * 58.0 58.9 6.1
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 1 3.9 13.1 9.6 4.4 15.4 23.2 8.5 35.4 32.9 11.1 47.0 13.4 13.2 5.4 63.5
## 2 6.4 16.7 9.1 6.9 18.3 18.4 4.0 27.4 22.6 6.0 32.2 1.7 0.3 15.8 46.8
## 3 9.5 18.4 9.3 7.8 18.1 12.7 4.9 23.0 17.9 6.1 26.9 1.3 0.7 8.0 19.6
## 4 7.8 17.4 6.8 8.6 17.6 12.8 5.9 22.1 14.4 5.5 22.9 1.0 0.2 3.8 7.1
## 5 7.0 14.3 5.8 7.5 14.5 12.9 6.6 25.2 14.8 6.4 25.1 4.9 0.6 7.3 11.4
## 6 8.6 17.0 7.3 8.5 18.2 12.1 5.9 23.9 17.0 7.0 29.2 1.7 0.7 4.1 6.3
## <NA> <NA> <NA>
## 1 76.8 29.6 64.5
## 2 70.5 5.1 45.3
## 3 50.8 1.7 32.8
## 4 21.3 0.6 28.3
## 5 21.5 0.8 32.3
## 6 22.8 1.3 30.2
library(janitor)
df <- clean_names(df)
str(df)
## 'data.frame': 706 obs. of 109 variables:
## $ state : chr "Nicobars" "North & Middle Andaman " "South Andaman " "Srikakulam " ...
## $ district : chr "Andaman & Nicobar Islands" "Andaman & Nicobar Islands" "Andaman & Nicobar Islands" "Andhra Pradesh" ...
## $ pop_total : num 882 874 868 874 902 869 888 884 865 851 ...
## $ pop_male : num 764 789 844 780 853 818 824 841 820 807 ...
## $ pop_female : num 125 108 134 100 134 112 105 122 119 93 ...
## $ literacy_total : num 78 82.7 84.7 60 56 66.8 75.4 75.4 74 64.9 ...
## $ literacy_male : num 23 19.8 21 20.7 20.6 21.4 20.5 21.5 20.4 22.4 ...
## $ literacy_female : num 973 950 967 1140 1114 ...
## $ school_attendance : chr "927 " "844 " "935 " "1163 " ...
## $ female_schooling : num 98 100 96.5 95 95.4 90.5 93 93.5 96.4 92.3 ...
## $ married_women : chr "83.2 " "(92.6)" "92.2 " "71.0 " ...
## $ child_marriage : num 97.9 93.2 99.6 99.9 99.5 99.6 98.8 99.3 99.6 99.2 ...
## $ family_planning : num 98.8 92.2 97.9 87.7 93.1 91.8 97.9 99.1 94.4 99.3 ...
## $ contraceptive_use : num 83.5 86.4 89.3 71.6 61.7 77.8 77.7 80.8 79.1 83.4 ...
## $ anc_checkup : num 56.9 61.3 91.9 74.7 60.3 72.9 80.3 86.8 89.8 91.7 ...
## $ institutional_delivery: num 99.4 99.9 99.7 76.5 85 82.2 81.2 83.4 87.5 85.8 ...
## $ home_delivery : num 2.7 2.1 1.2 75.6 76.7 64.9 66.4 67.6 68.1 71.1 ...
## $ postnatal_care : chr "(29.5)" "(30.1)" "(50.8)" "(0.0)" ...
## $ maternal_care : num 87.5 84 86.7 64.3 58.3 69.5 77.9 77 76.9 68.5 ...
## $ child_immunization : num 53.5 41 57.5 42.5 37.6 46 43.2 46.5 46.2 32.6 ...
## $ full_vaccination : chr "11.4 " "15.4 " "17.1 " "25.4 " ...
## $ vitamin_a : chr "0.0 " "1.5 " "0.5 " "0.0 " ...
## $ child_illness : chr "1.8 " "3.8 " "2.8 " "5.5 " ...
## $ diarrhea_treatment : num 100 100 98.2 78 72.5 85.7 71 84.4 92.6 88 ...
## $ child_stunting : num 65.3 84.1 57.1 72.3 71.2 68 66.3 77.8 79.1 73.3 ...
## $ child_wasting : num 57.2 73.1 50.5 72.2 71.2 67.7 66.3 77.2 78.1 73.2 ...
## $ child_underweight : num 46.4 48.3 34 71 69.3 64.9 64.1 74.5 76.5 72.9 ...
## $ child_overweight : num 0 0.6 0 0.3 1.3 1.5 0.9 0.7 1 0 ...
## $ bmi_male : num 2.7 6.4 2.8 0.6 0 0.3 0.1 0.6 0 0.2 ...
## $ bmi_female : num 2 7.8 1.8 0 0 0 0.3 0.4 0 0 ...
## $ overweight_male : num 4.9 9.3 10.6 0.3 0.6 0.8 1.1 0.6 0.4 0.1 ...
## $ overweight_female : num 1.2 0 0.3 0 0 0 0 0.4 0 0 ...
## $ obesity_male : num 9.5 5.8 17.6 5.7 6.7 4.8 8 3 2.5 3.2 ...
## $ obesity_female : num 3.3 1.3 8.6 3.6 4.7 2.4 4.4 1.8 1.4 1.7 ...
## $ anemia_male : num 40.4 23.2 31.2 16 21.1 15.2 12.5 12.5 16.1 16.6 ...
## $ anemia_female : chr "49.4 " "83.2 " "88.2 " "45.8 " ...
## $ anemia_children : chr "62.8 " "74.5 " "79.4 " "79.7 " ...
## $ severe_anemia : chr "71.7 " "79.2 " "85.9 " "78.4 " ...
## $ bp_male : chr "78.0 " "91.1 " "92.1 " "94.4 " ...
## $ bp_female : chr "72.6 " "83.7 " "81.0 " "67.5 " ...
## $ hypertension : chr "43.9 " "24.1 " "61.9 " "35.3 " ...
## $ high_bp_cases : chr "97.9 " "99.2 " "98.9 " "100.0 " ...
## $ blood_sugar : chr "85.1 " "92.5 " "88.1 " "90.8 " ...
## $ diabetes : chr "2278 " "1904 " "3460 " "3479 " ...
## $ na : chr "*" "*" "*" "*" ...
## $ na_2 : chr "92.5 " "94.3 " "89.8 " "97.7 " ...
## $ na_3 : num 97.8 97.7 99.5 97.9 99 95.3 96.6 98.7 98.9 98.6 ...
## $ na_4 : num 96.7 95 83.8 52.2 70.6 69.3 46 48.8 40.1 49.6 ...
## $ na_5 : num 0.8 0.7 0 0.5 0.5 0 2.2 0.7 0.5 0.9 ...
## $ na_6 : num 98.6 98.3 96.9 96.4 97.6 94.4 89.9 98.5 98 95.5 ...
## $ na_7 : num 11.5 12.9 37.1 57 41.3 26.5 52.2 55.7 66.1 53.8 ...
## $ na_8 : chr "*" "*" "(79.1)" "73.8 " ...
## $ na_9 : chr "10.7 " "11.4 " "29.6 " "44.5 " ...
## $ na_10 : chr "(64.2)" "*" "(76.3)" "(82.8)" ...
## $ na_11 : chr "(94.1)" "*" "(96.6)" "(100.0)" ...
## $ na_12 : chr "(80.4)" "*" "(100.0)" "(93.3)" ...
## $ na_13 : chr "(69.1)" "*" "(79.0)" "(82.8)" ...
## $ na_14 : chr "(71.9)" "*" "(94.8)" "(89.7)" ...
## $ na_15 : chr "(67.3)" "*" "(81.7)" "(93.3)" ...
## $ na_16 : chr "(20.7)" "*" "(33.7)" "(34.9)" ...
## $ na_17 : chr "(3.1)" "*" "(0.0)" "(74.8)" ...
## $ na_18 : chr "(68.6)" "*" "(85.3)" "(89.7)" ...
## $ na_19 : chr "94.9 " "(89.6)" "84.0 " "69.6 " ...
## $ na_20 : chr "(100.0)" "*" "(93.1)" "(97.0)" ...
## $ na_21 : chr "(0.0)" "*" "(4.3)" "(3.0)" ...
## $ na_22 : num 5.7 4.5 6 11.9 7.5 8.1 13.3 2.7 7.8 10 ...
## $ na_23 : chr "*" "*" "*" "*" ...
## $ na_24 : chr "*" "*" "*" "*" ...
## $ na_25 : chr "*" "*" "*" "*" ...
## $ na_26 : num 1.8 7 0 1.3 1.4 2 2.2 1 2.4 1 ...
## $ na_27 : chr "(85.7)" "*" "(77.3)" "(79.7)" ...
## $ na_28 : chr "55.4 " "27.3 " "51.1 " "42.8 " ...
## $ na_29 : chr "*" "*" "*" "*" ...
## $ na_30 : chr "*" "*" "*" "*" ...
## $ na_31 : chr "(19.4)" "(6.5)" "(22.3)" "(14.0)" ...
## $ na_32 : chr "*" "*" "*" "*" ...
## $ na_33 : chr "(18.7)" "(5.9)" "23.5 " "16.1 " ...
## $ na_34 : chr "21.6 " "27.0 " "21.1 " "19.7 " ...
## $ na_35 : chr "15.7 " "27.0 " "12.6 " "19.5 " ...
## $ na_36 : chr "7.8 " "8.3 " "3.5 " "7.4 " ...
## $ na_37 : chr "24.6 " "42.8 " "17.4 " "21.4 " ...
## $ na_38 : chr "1.5 " "0.8 " "7.2 " "4.5 " ...
## $ na_39 : num 8.2 8.6 10 13.8 16.9 17.4 10.2 10.1 10.5 9.6 ...
## $ na_40 : num 39.1 35.9 39 27.2 28.8 23.8 44.4 45.3 40.6 46.4 ...
## $ na_41 : num 62.5 79.3 78.2 54 58 58 49.2 51.6 53.5 53.4 ...
## $ na_42 : chr "37.7 " "30.4 " "43.4 " "59.6 " ...
## $ na_43 : num 38.4 62.5 57.6 62.8 64.6 58.6 63.2 63.1 60.4 59.8 ...
## $ na_44 : chr "*" "*" "*" "*" ...
## $ na_45 : num 38.3 62.1 57.7 62.6 64 58 63 63 60.3 59.5 ...
## $ na_46 : chr "48.0 " "47.8 " "43.2 " "59.2 " ...
## $ na_47 : num 7.4 7.2 7.5 8.2 6.2 6.1 7.5 7.3 7.9 7.8 ...
## $ na_48 : num 3.9 6.4 9.5 7.8 7 8.6 12.7 13.1 13.4 13 ...
## $ na_49 : num 13.1 16.7 18.4 17.4 14.3 17 21.7 23.8 23.3 22.7 ...
## $ na_50 : num 9.6 9.1 9.3 6.8 5.8 7.3 9.2 7.2 8.5 10.7 ...
## $ na_51 : num 4.4 6.9 7.8 8.6 7.5 8.5 15.5 10.4 11.9 13.4 ...
## $ na_52 : num 15.4 18.3 18.1 17.6 14.5 18.2 27.6 18.9 22.5 25.9 ...
## $ na_53 : num 23.2 18.4 12.7 12.8 12.9 12.1 13 14.7 13.4 14.7 ...
## $ na_54 : num 8.5 4 4.9 5.9 6.6 5.9 6.6 6.1 4.3 5.9 ...
## $ na_55 : num 35.4 27.4 23 22.1 25.2 23.9 29 28.8 24 25.8 ...
## [list output truncated]
dim(df)
## [1] 706 109
Interpretation: The dim() function shows the total number of rows and columns in the dataset. Rows represent districts, while columns represent variables or indicators. By using str(), we can understand the type of each variable such as numeric, character, or factor. This helps in identifying how data is stored and what kind of analysis can be applied. Overall, this step confirms the size and structure of the dataset before performing further analysis.
names(df)
## [1] "state" "district" "pop_total"
## [4] "pop_male" "pop_female" "literacy_total"
## [7] "literacy_male" "literacy_female" "school_attendance"
## [10] "female_schooling" "married_women" "child_marriage"
## [13] "family_planning" "contraceptive_use" "anc_checkup"
## [16] "institutional_delivery" "home_delivery" "postnatal_care"
## [19] "maternal_care" "child_immunization" "full_vaccination"
## [22] "vitamin_a" "child_illness" "diarrhea_treatment"
## [25] "child_stunting" "child_wasting" "child_underweight"
## [28] "child_overweight" "bmi_male" "bmi_female"
## [31] "overweight_male" "overweight_female" "obesity_male"
## [34] "obesity_female" "anemia_male" "anemia_female"
## [37] "anemia_children" "severe_anemia" "bp_male"
## [40] "bp_female" "hypertension" "high_bp_cases"
## [43] "blood_sugar" "diabetes" "na"
## [46] "na_2" "na_3" "na_4"
## [49] "na_5" "na_6" "na_7"
## [52] "na_8" "na_9" "na_10"
## [55] "na_11" "na_12" "na_13"
## [58] "na_14" "na_15" "na_16"
## [61] "na_17" "na_18" "na_19"
## [64] "na_20" "na_21" "na_22"
## [67] "na_23" "na_24" "na_25"
## [70] "na_26" "na_27" "na_28"
## [73] "na_29" "na_30" "na_31"
## [76] "na_32" "na_33" "na_34"
## [79] "na_35" "na_36" "na_37"
## [82] "na_38" "na_39" "na_40"
## [85] "na_41" "na_42" "na_43"
## [88] "na_44" "na_45" "na_46"
## [91] "na_47" "na_48" "na_49"
## [94] "na_50" "na_51" "na_52"
## [97] "na_53" "na_54" "na_55"
## [100] "na_56" "na_57" "na_58"
## [103] "na_59" "na_60" "na_61"
## [106] "na_62" "na_63" "na_64"
## [109] "na_65"
Interpretation: By examining the column names using names(), we can understand the different types of variables present in the dataset. These variables can be grouped into categories such as health indicators (BMI, BP), education (literacy rate), and lifestyle factors (tobacco, alcohol use). Categorizing variables helps in organizing the analysis and focusing on specific domains. It also improves clarity when presenting the results. This step is important for building a structured analytical approach.
summary(df$bmi_male)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 1.900 2.887 3.575 32.200
summary(df$bmi_female)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.100 2.500 5.607 5.975 53.000
summary(df$bp_male)
## Length Class Mode
## 706 character character
summary(df$bp_female)
## Length Class Mode
## 706 character character
summary(df$literacy_total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 45.40 64.40 71.35 71.51 78.97 99.20
Interpretation: The distribution of key health indicators shows how values are spread across districts. By using summary statistics, we can observe minimum, maximum, median, and average values.For example, BMI or BP values may show variation indicating health inequality among districts. Literacy distribution helps identify regions with better education levels. Overall, this analysis helps in understanding variation and identifying patterns in health and education.
sum(is.na(df))
## [1] 0
colSums(is.na(df))
## state district pop_total
## 0 0 0
## pop_male pop_female literacy_total
## 0 0 0
## literacy_male literacy_female school_attendance
## 0 0 0
## female_schooling married_women child_marriage
## 0 0 0
## family_planning contraceptive_use anc_checkup
## 0 0 0
## institutional_delivery home_delivery postnatal_care
## 0 0 0
## maternal_care child_immunization full_vaccination
## 0 0 0
## vitamin_a child_illness diarrhea_treatment
## 0 0 0
## child_stunting child_wasting child_underweight
## 0 0 0
## child_overweight bmi_male bmi_female
## 0 0 0
## overweight_male overweight_female obesity_male
## 0 0 0
## obesity_female anemia_male anemia_female
## 0 0 0
## anemia_children severe_anemia bp_male
## 0 0 0
## bp_female hypertension high_bp_cases
## 0 0 0
## blood_sugar diabetes na
## 0 0 0
## na_2 na_3 na_4
## 0 0 0
## na_5 na_6 na_7
## 0 0 0
## na_8 na_9 na_10
## 0 0 0
## na_11 na_12 na_13
## 0 0 0
## na_14 na_15 na_16
## 0 0 0
## na_17 na_18 na_19
## 0 0 0
## na_20 na_21 na_22
## 0 0 0
## na_23 na_24 na_25
## 0 0 0
## na_26 na_27 na_28
## 0 0 0
## na_29 na_30 na_31
## 0 0 0
## na_32 na_33 na_34
## 0 0 0
## na_35 na_36 na_37
## 0 0 0
## na_38 na_39 na_40
## 0 0 0
## na_41 na_42 na_43
## 0 0 0
## na_44 na_45 na_46
## 0 0 0
## na_47 na_48 na_49
## 0 0 0
## na_50 na_51 na_52
## 0 0 0
## na_53 na_54 na_55
## 0 0 0
## na_56 na_57 na_58
## 0 0 0
## na_59 na_60 na_61
## 0 0 0
## na_62 na_63 na_64
## 0 0 0
## na_65
## 0
Missing values indicate incomplete or unavailable data in the dataset. By calculating total missing values, we can understand the overall data quality. Column-wise missing values help identify which variables have the most gaps. Variables with high missing values may reduce the reliability of analysis. It is important to handle missing data carefully, either by removing or imputing values. This step ensures that further analysis is accurate and meaningful.
min(df$literacy_total, na.rm = TRUE)
## [1] 45.4
max(df$literacy_total, na.rm = TRUE)
## [1] 99.2
mean(df$literacy_total, na.rm = TRUE)
## [1] 71.51445
min(df$bmi_male, na.rm = TRUE)
## [1] 0
max(df$bmi_male, na.rm = TRUE)
## [1] 32.2
mean(df$bmi_male, na.rm = TRUE)
## [1] 2.886969
min(df$bp_male, na.rm = TRUE)
## [1] "(95.1)"
max(df$bp_male, na.rm = TRUE)
## [1] "99.6 "
mean(df$bp_male, na.rm = TRUE)
## Warning in mean.default(df$bp_male, na.rm = TRUE): argument is not numeric or
## logical: returning NA
## [1] NA
Interpretation: Minimum, maximum, and average values provide a basic understanding of the range and central tendency of variables. The minimum value shows the lowest observed level, while the maximum shows the highest. The average gives a general idea of the typical value across districts. Large differences between minimum and maximum indicate high variation. This helps in identifying extreme cases and understanding overall trends. Such analysis is important for comparing districts and identifying areas that need attention. ##……………………………………………………….. ## Level 2: Data Extraction & Filtering ##……………………………………………………….. ### Question 2.1: Which districts have literacy rate above 80%?
high_literacy <- df[df$literacy_total > 80, ]
head(high_literacy)
## state district pop_total pop_male
## 2 North & Middle Andaman Andaman & Nicobar Islands 874 789
## 3 South Andaman Andaman & Nicobar Islands 868 844
## 41 Lakhimpur Assam 916 957
## 46 Dima Hasao Assam 918 1001
## 47 Cachar Assam 907 1110
## 48 Karimganj Assam 917 1170
## pop_female literacy_total literacy_male literacy_female school_attendance
## 2 108 82.7 19.8 950 844
## 3 134 84.7 21.0 967 935
## 41 140 83.2 28.1 983 985
## 46 149 85.1 27.7 959 908
## 47 152 80.5 30.9 1012 991
## 48 160 82.1 32.3 1000 885
## female_schooling married_women child_marriage family_planning
## 2 100.0 (92.6) 93.2 92.2
## 3 96.5 92.2 99.6 97.9
## 41 98.7 67.1 97.2 81.2
## 46 95.0 (60.3) 95.9 50.2
## 47 96.0 72.4 81.7 43.8
## 48 98.2 72.6 87.1 62.3
## contraceptive_use anc_checkup institutional_delivery home_delivery
## 2 86.4 61.3 99.9 2.1
## 3 89.3 91.9 99.7 1.2
## 41 74.2 36.1 99.9 67.0
## 46 83.8 48.2 99.4 61.9
## 47 57.6 43.0 98.6 57.9
## 48 61.5 39.6 98.7 64.4
## postnatal_care maternal_care child_immunization full_vaccination vitamin_a
## 2 (30.1) 84.0 41.0 15.4 1.5
## 3 (50.8) 86.7 57.5 17.1 0.5
## 41 (11.0) 83.9 42.1 36.3 1.1
## 46 8.5 87.7 42.4 16.5 0.7
## 47 3.7 77.2 25.8 29.9 1.0
## 48 7.1 80.7 19.6 27.7 2.3
## child_illness diarrhea_treatment child_stunting child_wasting
## 2 3.8 100.0 84.1 73.1
## 3 2.8 98.2 57.1 50.5
## 41 8.2 79.7 67.1 43.6
## 46 5.4 79.1 65.5 44.9
## 47 8.5 57.8 48.0 38.2
## 48 9.9 59.8 64.3 54.9
## child_underweight child_overweight bmi_male bmi_female overweight_male
## 2 48.3 0.6 6.4 7.8 9.3
## 3 34.0 0.0 2.8 1.8 10.6
## 41 10.8 0.0 3.0 23.5 5.9
## 46 10.2 0.2 4.0 27.0 3.1
## 47 6.9 0.1 1.0 22.9 6.1
## 48 6.1 0.5 2.3 34.0 9.5
## overweight_female obesity_male obesity_female anemia_male anemia_female
## 2 0.0 5.8 1.3 23.2 83.2
## 3 0.3 17.6 8.6 31.2 88.2
## 41 0.1 9.8 4.5 30.8 80.3
## 46 0.0 9.7 2.6 18.3 66.6
## 47 0.6 15.6 6.9 23.6 82.2
## 48 1.9 8.6 4.2 31.0 84.5
## anemia_children severe_anemia bp_male bp_female hypertension high_bp_cases
## 2 74.5 79.2 91.1 83.7 24.1 99.2
## 3 79.4 85.9 92.1 81.0 61.9 98.9
## 41 61.0 51.7 97.3 42.2 14.9 99.6
## 46 58.9 46.9 96.8 40.8 17.6 99.2
## 47 52.0 32.7 90.0 35.6 12.0 98.2
## 48 54.8 42.8 94.2 39.9 11.4 99.5
## blood_sugar diabetes na na_2 na_3 na_4 na_5 na_6 na_7 na_8 na_9 na_10
## 2 92.5 1904 * 94.3 97.7 95.0 0.7 98.3 12.9 * 11.4 *
## 3 88.1 3460 * 89.8 99.5 83.8 0.0 96.9 37.1 (79.1) 29.6 (76.3)
## 41 67.7 5098 * 77.0 96.2 91.6 1.0 96.8 22.7 * 19.9 69.0
## 46 73.8 4825 * 76.8 89.5 84.6 1.8 91.9 19.7 * 18.5 (63.6)
## 47 57.3 3799 1.8 61.4 79.2 74.9 2.1 80.2 12.5 * 13.2 70.4
## 48 63.9 4287 0.0 70.2 76.4 69.8 2.4 78.3 7.0 (51.4) 5.2 75.1
## na_11 na_12 na_13 na_14 na_15 na_16 na_17 na_18 na_19 na_20 na_21
## 2 * * * * * * * * (89.6) * *
## 3 (96.6) (100.0) (79.0) (94.8) (81.7) (33.7) (0.0) (85.3) 84.0 (93.1) (4.3)
## 41 72.4 94.5 74.3 87.7 86.3 16.9 53.3 84.6 65.0 100.0 0.0
## 46 (63.8) (91.3) (73.6) (71.9) (71.7) (15.9) (46.1) (69.9) 63.3 (98.1) (0.0)
## 47 82.1 88.3 74.4 82.5 76.9 9.2 39.9 63.8 67.9 98.6 0.0
## 48 75.9 95.5 84.2 84.0 85.5 22.7 47.3 80.2 74.7 95.0 0.0
## na_22 na_23 na_24 na_25 na_26 na_27 na_28 na_29 na_30 na_31 na_32
## 2 4.5 * * * 7.0 * 27.3 * * (6.5) *
## 3 6.0 * * * 0.0 (77.3) 51.1 * * (22.3) *
## 41 3.7 * * * 3.9 48.0 59.0 (80.9) * 8.2 *
## 46 7.0 * * * 2.7 (39.9) 58.0 * * 9.3 *
## 47 4.2 * * * 2.1 50.4 29.8 (50.8) * 2.9 *
## 48 7.6 (68.4) (55.6) (45.8) 2.3 59.6 40.3 (53.1) (35.4) 4.7 *
## na_33 na_34 na_35 na_36 na_37 na_38 na_39 na_40 na_41 na_42 na_43 na_44
## 2 (5.9) 27.0 27.0 8.3 42.8 0.8 8.6 35.9 79.3 30.4 62.5 *
## 3 23.5 21.1 12.6 3.5 17.4 7.2 10.0 39.0 78.2 43.4 57.6 *
## 41 7.9 38.5 18.2 7.5 34.4 2.3 17.3 12.4 58.7 69.3 66.3 (42.3)
## 46 8.7 30.6 23.6 11.3 21.7 12.5 10.0 16.2 53.8 73.1 61.1 (43.3)
## 47 2.7 28.7 30.7 12.5 38.2 3.3 19.7 9.1 79.1 61.8 58.2 44.0
## 48 5.3 29.1 48.0 30.5 52.9 1.0 17.8 6.6 75.8 64.1 52.5 41.9
## na_45 na_46 na_47 na_48 na_49 na_50 na_51 na_52 na_53 na_54 na_55 na_56
## 2 62.1 47.8 7.2 6.4 16.7 9.1 6.9 18.3 18.4 4.0 27.4 22.6
## 3 57.7 43.2 7.5 9.5 18.4 9.3 7.8 18.1 12.7 4.9 23.0 17.9
## 41 65.4 71.5 5.0 3.5 9.3 5.1 3.4 9.4 13.0 4.4 20.8 15.9
## 46 60.5 70.9 5.6 2.4 9.8 8.7 2.8 13.3 11.2 3.5 17.1 16.9
## 47 57.4 57.6 6.4 6.1 13.1 7.3 4.5 14.4 9.7 3.7 17.0 9.1
## 48 52.0 60.6 6.3 4.3 11.5 8.5 4.5 15.2 9.0 2.6 16.5 11.0
## na_57 na_58 na_59 na_60 na_61 na_62 na_63 na_64 na_65
## 2 6.0 32.2 1.7 0.3 15.8 46.8 70.5 5.1 45.3
## 3 6.1 26.9 1.3 0.7 8.0 19.6 50.8 1.7 32.8
## 41 5.3 25.0 0.0 0.0 0.0 27.2 55.8 15.5 39.9
## 46 4.0 23.5 0.0 0.0 0.0 21.3 55.2 23.9 57.2
## 47 2.9 15.4 0.2 0.0 0.2 33.2 51.0 2.5 14.6
## 48 2.7 17.2 0.0 0.0 0.5 43.0 54.2 1.4 7.0
Interpretation: This analysis identifies districts with high literacy rates above 80%, which indicates better educational development. These districts are likely to have higher awareness about health, hygiene, and social issues. High literacy is often linked with improved health outcomes and better quality of life. By filtering these districts, we can compare them with low-literacy regions to understand disparities. This helps in identifying successful regions that can act as models for others.
malnutrition <- df[df$child_underweight > 30, ]
head(malnutrition)
## state district pop_total pop_male
## 1 Nicobars Andaman & Nicobar Islands 882 764
## 2 North & Middle Andaman Andaman & Nicobar Islands 874 789
## 3 South Andaman Andaman & Nicobar Islands 868 844
## 4 Srikakulam Andhra Pradesh 874 780
## 5 Vizianagaram Andhra Pradesh 902 853
## 6 Visakhapatnam Andhra Pradesh 869 818
## pop_female literacy_total literacy_male literacy_female school_attendance
## 1 125 78.0 23.0 973 927
## 2 108 82.7 19.8 950 844
## 3 134 84.7 21.0 967 935
## 4 100 60.0 20.7 1140 1163
## 5 134 56.0 20.6 1114 898
## 6 112 66.8 21.4 1066 974
## female_schooling married_women child_marriage family_planning
## 1 98.0 83.2 97.9 98.8
## 2 100.0 (92.6) 93.2 92.2
## 3 96.5 92.2 99.6 97.9
## 4 95.0 71.0 99.9 87.7
## 5 95.4 81.7 99.5 93.1
## 6 90.5 71.3 99.6 91.8
## contraceptive_use anc_checkup institutional_delivery home_delivery
## 1 83.5 56.9 99.4 2.7
## 2 86.4 61.3 99.9 2.1
## 3 89.3 91.9 99.7 1.2
## 4 71.6 74.7 76.5 75.6
## 5 61.7 60.3 85.0 76.7
## 6 77.8 72.9 82.2 64.9
## postnatal_care maternal_care child_immunization full_vaccination vitamin_a
## 1 (29.5) 87.5 53.5 11.4 0.0
## 2 (30.1) 84.0 41.0 15.4 1.5
## 3 (50.8) 86.7 57.5 17.1 0.5
## 4 (0.0) 64.3 42.5 25.4 0.0
## 5 (25.2) 58.3 37.6 33.7 2.2
## 6 (5.0) 69.5 46.0 25.4 0.0
## child_illness diarrhea_treatment child_stunting child_wasting
## 1 1.8 100.0 65.3 57.2
## 2 3.8 100.0 84.1 73.1
## 3 2.8 98.2 57.1 50.5
## 4 5.5 78.0 72.3 72.2
## 5 12.7 72.5 71.2 71.2
## 6 9.5 85.7 68.0 67.7
## child_underweight child_overweight bmi_male bmi_female overweight_male
## 1 46.4 0.0 2.7 2.0 4.9
## 2 48.3 0.6 6.4 7.8 9.3
## 3 34.0 0.0 2.8 1.8 10.6
## 4 71.0 0.3 0.6 0.0 0.3
## 5 69.3 1.3 0.0 0.0 0.6
## 6 64.9 1.5 0.3 0.0 0.8
## overweight_female obesity_male obesity_female anemia_male anemia_female
## 1 1.2 9.5 3.3 40.4 49.4
## 2 0.0 5.8 1.3 23.2 83.2
## 3 0.3 17.6 8.6 31.2 88.2
## 4 0.0 5.7 3.6 16.0 45.8
## 5 0.0 6.7 4.7 21.1 36.4
## 6 0.0 4.8 2.4 15.2 35.3
## anemia_children severe_anemia bp_male bp_female hypertension high_bp_cases
## 1 62.8 71.7 78.0 72.6 43.9 97.9
## 2 74.5 79.2 91.1 83.7 24.1 99.2
## 3 79.4 85.9 92.1 81.0 61.9 98.9
## 4 79.7 78.4 94.4 67.5 35.3 100.0
## 5 76.1 71.4 91.3 59.6 32.4 98.8
## 6 79.4 58.6 88.0 75.0 40.1 97.0
## blood_sugar diabetes na na_2 na_3 na_4 na_5 na_6 na_7 na_8 na_9 na_10
## 1 85.1 2278 * 92.5 97.8 96.7 0.8 98.6 11.5 * 10.7 (64.2)
## 2 92.5 1904 * 94.3 97.7 95.0 0.7 98.3 12.9 * 11.4 *
## 3 88.1 3460 * 89.8 99.5 83.8 0.0 96.9 37.1 (79.1) 29.6 (76.3)
## 4 90.8 3479 * 97.7 97.9 52.2 0.5 96.4 57.0 73.8 44.5 (82.8)
## 5 83.9 1931 * 89.2 99.0 70.6 0.5 97.6 41.3 70.3 30.3 (76.8)
## 6 84.8 2200 * 90.9 95.3 69.3 0.0 94.4 26.5 57.2 16.8 (76.5)
## na_11 na_12 na_13 na_14 na_15 na_16 na_17 na_18 na_19 na_20
## 1 (94.1) (80.4) (69.1) (71.9) (67.3) (20.7) (3.1) (68.6) 94.9 (100.0)
## 2 * * * * * * * * (89.6) *
## 3 (96.6) (100.0) (79.0) (94.8) (81.7) (33.7) (0.0) (85.3) 84.0 (93.1)
## 4 (100.0) (93.3) (82.8) (89.7) (93.3) (34.9) (74.8) (89.7) 69.6 (97.0)
## 5 * (100.0) (76.8) (90.3) (92.5) (35.0) (77.3) (83.6) 85.9 (100.0)
## 6 (93.5) (97.9) (76.5) (90.5) (95.1) (45.1) (72.9) (79.6) 91.3 (83.8)
## na_21 na_22 na_23 na_24 na_25 na_26 na_27 na_28 na_29 na_30 na_31 na_32
## 1 (0.0) 5.7 * * * 1.8 (85.7) 55.4 * * (19.4) *
## 2 * 4.5 * * * 7.0 * 27.3 * * (6.5) *
## 3 (4.3) 6.0 * * * 0.0 (77.3) 51.1 * * (22.3) *
## 4 (3.0) 11.9 * * * 1.3 (79.7) 42.8 * * (14.0) *
## 5 (0.0) 7.5 * * * 1.4 (83.5) 55.6 * * (2.5) *
## 6 (9.7) 8.1 * * * 2.0 (72.3) 64.3 * * (6.9) *
## na_33 na_34 na_35 na_36 na_37 na_38 na_39 na_40 na_41 na_42 na_43 na_44
## 1 (18.7) 21.6 15.7 7.8 24.6 1.5 8.2 39.1 62.5 37.7 38.4 *
## 2 (5.9) 27.0 27.0 8.3 42.8 0.8 8.6 35.9 79.3 30.4 62.5 *
## 3 23.5 21.1 12.6 3.5 17.4 7.2 10.0 39.0 78.2 43.4 57.6 *
## 4 16.1 19.7 19.5 7.4 21.4 4.5 13.8 27.2 54.0 59.6 62.8 *
## 5 1.8 36.4 19.2 8.3 32.2 4.7 16.9 28.8 58.0 66.7 64.6 *
## 6 11.8 31.0 21.5 11.2 33.5 4.8 17.4 23.8 58.0 72.6 58.6 *
## na_45 na_46 na_47 na_48 na_49 na_50 na_51 na_52 na_53 na_54 na_55 na_56 na_57
## 1 38.3 48.0 7.4 3.9 13.1 9.6 4.4 15.4 23.2 8.5 35.4 32.9 11.1
## 2 62.1 47.8 7.2 6.4 16.7 9.1 6.9 18.3 18.4 4.0 27.4 22.6 6.0
## 3 57.7 43.2 7.5 9.5 18.4 9.3 7.8 18.1 12.7 4.9 23.0 17.9 6.1
## 4 62.6 59.2 8.2 7.8 17.4 6.8 8.6 17.6 12.8 5.9 22.1 14.4 5.5
## 5 64.0 73.9 6.2 7.0 14.3 5.8 7.5 14.5 12.9 6.6 25.2 14.8 6.4
## 6 58.0 58.9 6.1 8.6 17.0 7.3 8.5 18.2 12.1 5.9 23.9 17.0 7.0
## na_58 na_59 na_60 na_61 na_62 na_63 na_64 na_65
## 1 47.0 13.4 13.2 5.4 63.5 76.8 29.6 64.5
## 2 32.2 1.7 0.3 15.8 46.8 70.5 5.1 45.3
## 3 26.9 1.3 0.7 8.0 19.6 50.8 1.7 32.8
## 4 22.9 1.0 0.2 3.8 7.1 21.3 0.6 28.3
## 5 25.1 4.9 0.6 7.3 11.4 21.5 0.8 32.3
## 6 29.2 1.7 0.7 4.1 6.3 22.8 1.3 30.2
Interpretation: This filtering highlights districts where child underweight levels are significantly high, indicating malnutrition issues. Such districts may suffer from poor food availability, lack of awareness, or weak healthcare systems. High malnutrition rates directly affect child growth and development. Identifying these regions helps policymakers target nutrition programs effectively. It also shows the need for intervention in food security and child healthcare.
high_pop_poor_health <- df[df$pop_total > 1000000 & df$literacy_total < 60, ]
head(high_pop_poor_health)
## [1] state district pop_total
## [4] pop_male pop_female literacy_total
## [7] literacy_male literacy_female school_attendance
## [10] female_schooling married_women child_marriage
## [13] family_planning contraceptive_use anc_checkup
## [16] institutional_delivery home_delivery postnatal_care
## [19] maternal_care child_immunization full_vaccination
## [22] vitamin_a child_illness diarrhea_treatment
## [25] child_stunting child_wasting child_underweight
## [28] child_overweight bmi_male bmi_female
## [31] overweight_male overweight_female obesity_male
## [34] obesity_female anemia_male anemia_female
## [37] anemia_children severe_anemia bp_male
## [40] bp_female hypertension high_bp_cases
## [43] blood_sugar diabetes na
## [46] na_2 na_3 na_4
## [49] na_5 na_6 na_7
## [52] na_8 na_9 na_10
## [55] na_11 na_12 na_13
## [58] na_14 na_15 na_16
## [61] na_17 na_18 na_19
## [64] na_20 na_21 na_22
## [67] na_23 na_24 na_25
## [70] na_26 na_27 na_28
## [73] na_29 na_30 na_31
## [76] na_32 na_33 na_34
## [79] na_35 na_36 na_37
## [82] na_38 na_39 na_40
## [85] na_41 na_42 na_43
## [88] na_44 na_45 na_46
## [91] na_47 na_48 na_49
## [94] na_50 na_51 na_52
## [97] na_53 na_54 na_55
## [100] na_56 na_57 na_58
## [103] na_59 na_60 na_61
## [106] na_62 na_63 na_64
## [109] na_65
## <0 rows> (or 0-length row.names)
Interpretation: This analysis identifies districts with large populations but low literacy levels, indicating poor overall development. High population combined with low literacy can create pressure on healthcare and education systems. These districts may face challenges such as poverty, unemployment, and poor health awareness. It highlights areas where government intervention is most needed. This type of filtering is useful for identifying priority regions for development programs. ##……………………………………………………………. ## Level 3: Grouping & Summarization ##……………………………………………………………. ### Question 3.1: What is the average literacy rate state-wise?
library(dplyr)
state_literacy <- df %>%
group_by(state) %>%
summarise(avg_literacy = mean(literacy_total, na.rm = TRUE))
head(state_literacy)
## # A tibble: 6 × 2
## state avg_literacy
## <chr> <dbl>
## 1 "Adilabad " 58.6
## 2 "Agar Malwa " 56.3
## 3 "Agra " 69.4
## 4 "Ahmadabad " 79.8
## 5 "Ahmadnagar " 77.4
## 6 "Aizawl " 97.8
Interpretation: This analysis calculates the average literacy rate for each state by grouping district-level data. It helps in understanding overall educational performance at the state level. States with higher average literacy are likely to have better awareness and socio-economic conditions. On the other hand, lower literacy states may require more educational initiatives. This comparison highlights regional disparities in education across the country.
vaccination_state <- df %>%
group_by(state) %>%
summarise(avg_vaccination = mean(full_vaccination, na.rm = TRUE))
## Warning: There were 698 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `avg_vaccination = mean(full_vaccination, na.rm = TRUE)`.
## ℹ In group 1: `state = "Adilabad "`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 697 remaining warnings.
head(vaccination_state)
## # A tibble: 6 × 2
## state avg_vaccination
## <chr> <dbl>
## 1 "Adilabad " NA
## 2 "Agar Malwa " NA
## 3 "Agra " NA
## 4 "Ahmadabad " NA
## 5 "Ahmadnagar " NA
## 6 "Aizawl " NA
Interpretation: This analysis shows the average vaccination rate among children across different states. Higher vaccination rates indicate better healthcare services and awareness programs. States with lower averages may be at risk of preventable diseases. This helps identify areas where immunization programs need improvement. Overall, it reflects the effectiveness of public health policies.
bmi_state <- df %>%
group_by(state) %>%
summarise(
avg_bmi_male = mean(bmi_male, na.rm = TRUE),
avg_bmi_female = mean(bmi_female, na.rm = TRUE)
)
head(bmi_state)
## # A tibble: 6 × 3
## state avg_bmi_male avg_bmi_female
## <chr> <dbl> <dbl>
## 1 "Adilabad " 0 1.2
## 2 "Agar Malwa " 0.6 1.2
## 3 "Agra " 2.4 4.2
## 4 "Ahmadabad " 3.3 3.8
## 5 "Ahmadnagar " 1.7 1
## 6 "Aizawl " 2.8 9.7
Interpretation: This analysis compares average BMI values of males and females across states. It helps identify gender differences in nutritional status. A significant gap may indicate inequality in food distribution or health awareness. States where both values are within a healthy range reflect better nutrition conditions. This comparison is useful for understanding gender-based health patterns.
anemia_state <- df %>%
group_by(state) %>%
summarise(
avg_anemia_male = mean(anemia_male, na.rm = TRUE),
avg_anemia_female = mean(anemia_female, na.rm = TRUE),
avg_anemia_children = mean(anemia_children, na.rm = TRUE)
)
## Warning: There were 1396 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `avg_anemia_female = mean(anemia_female, na.rm = TRUE)`.
## ℹ In group 1: `state = "Adilabad "`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1395 remaining warnings.
head(anemia_state)
## # A tibble: 6 × 4
## state avg_anemia_male avg_anemia_female avg_anemia_children
## <chr> <dbl> <dbl> <dbl>
## 1 "Adilabad " 12 NA NA
## 2 "Agar Malwa " 24.2 NA NA
## 3 "Agra " 12.8 NA NA
## 4 "Ahmadabad " 36.2 NA NA
## 5 "Ahmadnagar " 19.5 NA NA
## 6 "Aizawl " 12 NA NA
Interpretation: This analysis calculates average anemia levels among males, females, and children for each state. High anemia levels indicate poor nutrition and lack of essential nutrients like iron. Women and children are often more affected, which highlights vulnerability in these groups. States with high averages require urgent nutritional interventions. This helps in identifying regions with serious public health concerns.
df[[17]] <- as.numeric(as.character(df[[17]]))
df[[50]] <- as.numeric(as.character(df[[50]]))
df[[69]] <- as.numeric(as.character(df[[69]]))
## Warning: NAs introduced by coercion
health_access_state <- df %>%
mutate(state = str_trim(state)) %>%
group_by(state) %>%
summarise(
avg_health_access = mean(.[[17]], na.rm = TRUE),
avg_doctor_availability = mean(.[[50]], na.rm = TRUE),
avg_hospital_access = mean(.[[69]], na.rm = TRUE),
.groups = "drop"
)
head(health_access_state)
## # A tibble: 6 × 4
## state avg_health_access avg_doctor_availability avg_hospital_access
## <chr> <dbl> <dbl> <dbl>
## 1 Adilabad 40.2 89.6 67.1
## 2 Agar Malwa 40.2 89.6 67.1
## 3 Agra 40.2 89.6 67.1
## 4 Ahmadabad 40.2 89.6 67.1
## 5 Ahmadnagar 40.2 89.6 67.1
## 6 Aizawl 40.2 89.6 67.1
Interpretation: This analysis evaluates healthcare access across states by averaging multiple indicators. Higher values indicate better availability of doctors and hospitals. States with low averages may suffer from poor healthcare infrastructure. This can lead to delayed treatment and higher health risks. The results help in understanding regional inequalities in healthcare facilities and planning improvements.
df$health_score <- (as.numeric(df[[19]]) + as.numeric(df[[17]]) +
(100 - as.numeric(df[[89]])) + (100 - as.numeric(df[[102]]))) / 4
top_healthy <- df %>%
filter(!is.na(health_score)) %>%
arrange(desc(health_score)) %>%
select(any_of(c("state", "district", "district_names")), health_score) %>%
slice(1:10)
head(top_healthy)
## state district health_score
## 1 Champhai Mizoram 79.325
## 2 Serchhip Mizoram 77.175
## 3 Lakshadweep Lakshadweep 76.525
## 4 Barmer Rajasthan 76.275
## 5 Pithoragarh Uttarakhand 75.875
## 6 Wayanad Kerala 75.825
Interpretation: This analysis creates a composite health score by combining literacy, healthcare access, anemia, and blood pressure indicators. It provides a holistic measure of district-level well-being instead of relying on a single variable. Districts ranked at the top show balanced development in education, healthcare, and overall health conditions. These regions can serve as ideal models for policy implementation. This approach reflects real-world data science techniques used in health analytics.
bottom_unhealthy <- df %>%
filter(!is.na(health_score)) %>%
arrange(health_score) %>%
select(any_of(c("state", "district", "district_names")), health_score) %>%
slice(1:10)
head(bottom_unhealthy)
## state district health_score
## 1 Kishanganj Bihar 42.075
## 2 Purnia Bihar 42.300
## 3 Katihar Bihar 42.975
## 4 Jamui Bihar 43.250
## 5 Bahraich Uttar Pradesh 43.550
## 6 Saharsa Bihar 43.775
Interpretation: This analysis identifies districts with the lowest composite health scores, representing poor overall health conditions. These areas may suffer from low literacy, weak healthcare access, and high disease prevalence. Such districts are more vulnerable and require urgent government attention. Ranking them helps prioritize resource allocation and policy intervention. It highlights inequality in health and development across regions.
state_literacy_rank <- df %>%
group_by(state) %>%
summarise(avg_literacy = mean(literacy_total, na.rm = TRUE)) %>%
arrange(desc(avg_literacy)) %>%
mutate(rank = row_number())
head(state_literacy_rank)
## # A tibble: 6 × 3
## state avg_literacy rank
## <chr> <dbl> <int>
## 1 "Mahe " 99.2 1
## 2 "Kottayam " 97.9 2
## 3 "Aizawl " 97.8 3
## 4 "Alappuzha " 97.8 4
## 5 "Thrissur " 97.8 5
## 6 "Kozhikode " 97.7 6
Interpretation: This analysis ranks states based on their average literacy rate, providing a clear comparison of educational performance. States with higher rankings demonstrate better education systems and awareness levels. Lower-ranked states may face challenges such as lack of schools or poor educational infrastructure. This ranking highlights regional disparities in education. It helps policymakers focus on improving literacy in weaker states.
low_disease <- df %>%
mutate(
bp_male = as.numeric(bp_male),
diabetes = as.numeric(diabetes),
disease_score = (bp_male + diabetes) / 2
) %>%
arrange(disease_score) %>%
select(state, district, disease_score) %>%
slice(1:10)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `bp_male = as.numeric(bp_male)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
head(low_disease)
## state district disease_score
## 1 Dindori Madhya Pradesh 144.40
## 2 Betul Madhya Pradesh 274.95
## 3 Singrauli Madhya Pradesh 283.05
## 4 Sukma Chhattisgarh 284.65
## 5 Dadra & Nagar Haveli Dadra and Nagar Haveli & Daman and Diu 293.90
## 6 Tikamgarh Madhya Pradesh 325.50
Interpretation: This analysis calculates a disease score using indicators like blood pressure, diabetes, and heart disease. Districts with the lowest scores represent healthier populations with lower disease prevalence. These regions may have better lifestyles, nutrition, and healthcare systems. Identifying such districts helps understand positive health patterns. These areas can serve as benchmarks for improving public health in other districts.
gap_analysis <- df %>%
mutate(
health_score_calc = (as.numeric(.[[17]]) + (100 - as.numeric(.[[89]]))) / 2,
gap = as.numeric(.[[19]]) - health_score_calc
) %>%
filter(!is.na(gap)) %>%
arrange(desc(abs(gap))) %>%
select(state, district, gap) %>%
slice(1:10)
head(gap_analysis)
## state district gap
## 1 Kargil Ladakh 66.40
## 2 Leh(Ladakh) Ladakh 65.05
## 3 South Andaman Andaman & Nicobar Islands 64.95
## 4 North & Middle Andaman Andaman & Nicobar Islands 64.00
## 5 Jammu Jammu & Kashmir 63.15
## 6 Lahul & Spiti Himachal Pradesh 61.65
Interpretation: This analysis identifies districts where there is a large difference between education and health performance. A high gap indicates imbalance, where a district may have good literacy but poor health, or vice versa. Such disparities highlight inefficiencies in development and resource distribution. These districts require targeted policies to balance both sectors. This is an innovative analysis that provides deeper insight beyond basic comparisons.