#loading the dataset
library(readxl)
kdhscombined_file <- read_excel("kdhscombined-file.xlsx")
View(kdhscombined_file)
# selecting only the necessary columns
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
kdhscombined_file <- kdhscombined_file %>%
  mutate(Group = case_when(
    Characteristics %in% c("Lowest Wealth Quintile", "Second Wealth Quintile", 
                          "Middle Wealth Quintile", "Fourth Wealth Quintile", 
                          "Highest Wealth Quintile") ~ "Wealth",
    Characteristics %in% c("Rural Residence", "Urban Residence") ~ "Residence",
    Characteristics %in% c("No education", "Primary Education", 
                          "Secondary Education", "More than Secondary Education") ~ "Education",
    TRUE ~ NA_character_  
  ))
#pivoting the data
pivoted_data <- kdhscombined_file %>%
  filter(!is.na(Group)) %>%
  pivot_wider(
    names_from = Group,
    values_from = Number
  )
## Warning: Values from `Number` are not uniquely identified; output will contain
## list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
##   {data} |>
##   dplyr::summarise(n = dplyr::n(), .by = c(`Table Number`, `Table Title`,
##   Respondents, Characteristics, Group)) |>
##   dplyr::filter(n > 1L)
#converting the number column to numeric
kdhscombined_file <- kdhscombined_file %>%
  mutate(Number = as.numeric(Number))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Number = as.numeric(Number)`.
## Caused by warning:
## ! NAs introduced by coercion
#separating the pivoted the data
pivoted_data <- kdhscombined_file %>%
  pivot_wider(
    names_from = Characteristics,
    values_from = Number,
    values_fn = list(Number = mean)  
  )
#separating the selected columns
pivoted_data <- pivoted_data %>%
  select(`Lowest Wealth Quintile`, `Second Wealth Quintile`, `Third Wealth Quintile`, `Fourth Wealth Quintile`, `Highest Wealth Quintile`, `Rural Residence`, `Urban Residence`,`No education`, `Primary Education`, `Secondary Education`, `More than Secondary Education`)
#calculating the medians
medians <- pivoted_data %>%
  summarise(across(everything(), ~ median(.x, na.rm = TRUE)))
#calculating the means
means <- colMeans(pivoted_data, na.rm = TRUE)
View(means)
#calculating the standard deviation
pivoted_data %>%
  summarise(across(everything(), ~ sd(.x, na.rm = TRUE)))
## # A tibble: 1 × 11
##   `Lowest Wealth Quintile` `Second Wealth Quintile` `Third Wealth Quintile`
##                      <dbl>                    <dbl>                   <dbl>
## 1                    1915.                    1968.                   2630.
## # ℹ 8 more variables: `Fourth Wealth Quintile` <dbl>,
## #   `Highest Wealth Quintile` <dbl>, `Rural Residence` <dbl>,
## #   `Urban Residence` <dbl>, `No education` <dbl>, `Primary Education` <dbl>,
## #   `Secondary Education` <dbl>, `More than Secondary Education` <dbl>
#converting the character to numeric
pivoted_data <- pivoted_data %>%
  mutate(
    `Lowest Wealth Quintile` = as.numeric(`Lowest Wealth Quintile`),
    `Second Wealth Quintile` = as.numeric(`Second Wealth Quintile`),
    `Third Wealth Quintile` = as.numeric(`Third Wealth Quintile`),
    `Fourth Wealth Quintile` = as.numeric(`Fourth Wealth Quintile`),
    `Highest Wealth Quintile` = as.numeric(`Highest Wealth Quintile`),
    `Rural Residence` = as.numeric(`Rural Residence`),
    `Urban Residence` = as.numeric(`Urban Residence`),
    `No education` = as.numeric(`No education`),
    `Primary Education` = as.numeric(`Primary Education`),
    `Secondary Education` = as.numeric(`Secondary Education`),
    `More than Secondary Education` = as.numeric(`More than Secondary Education`)
  )
#creating assumed mortality data
library(tidyverse)

assumed_mortality <- tibble(
  group = c(
    "(`Lowest Wealth Quintile`",
    "(`Second Wealth Quintile`",
    "(`Third Wealth Quintile`",
    "(`Fourth Wealth Quintile`",
    "(`Highest Wealth Quintile`",
    "(`Rural Residence`",
    "(`Urban Residence`",
    "(`No education`",
    "(`Primary Education`",
    "(`Secondary Education`",
    "(`More than Secondary Education`"
  ),
  risk = c(
    0.25,  
    0.20,
    0.15,
    0.10,
    0.05,  # highest wealth
    0.18,  # rural
    0.09,  # urban
    0.30,  # no education
    0.20,
    0.10,
    0.05   # more than secondary
  )
)
#creating assumed mortality risk data
assumed_mortality <- tibble(
  group = c(
    "(`Lowest Wealth Quintile`",
    "(`Second Wealth Quintile`",
    "(`Third Wealth Quintile`",
    "(`Fourth Wealth Quintile`",
    "(`Highest Wealth Quintile`",
    "(`Rural Residence`",
    "(`Urban Residence`",
    "(`No education`",
    "(`Primary Education`",
    "(`Secondary Education`",
    "(`More than Secondary Education`"
  ),
  risk = c(
    0.25, 0.20, 0.15, 0.10, 0.05,
    0.18, 0.09,
    0.30, 0.20, 0.10, 0.05
  )
)

# Clean the group names
assumed_mortality <- assumed_mortality %>%
  mutate(group = str_replace_all(group, "[`\\(\\)]", ""))
#creating the group names with their risk
simulate_group <- function(group_name, risk, var_name, n = 1000) {
  df <- data.frame(
    group = rep(group_name, n),
    died = rbinom(n, 1, risk)
  )
  names(df)[1] <- var_name
  return(df)
}
#creating the wealth group
wealth_groups <- assumed_mortality$group[1:5]
wealth_risks <- assumed_mortality$risk[1:5]

wealth_sim <- bind_rows(lapply(1:5, function(i) {
  simulate_group(wealth_groups[i], wealth_risks[i], "wealth")
}))
#creating the residence group
residence_groups <- assumed_mortality$group[6:7]
residence_risks <- assumed_mortality$risk[6:7]

residence_sim <- bind_rows(lapply(1:2, function(i) {
  simulate_group(residence_groups[i], residence_risks[i], "residence")
}))
#creating the education group
education_groups <- assumed_mortality$group[8:11]
education_risks <- assumed_mortality$risk[8:11]

education_sim <- bind_rows(lapply(1:4, function(i) {
  simulate_group(education_groups[i], education_risks[i], "education")
}))
#combining the necessary groups
set.seed(123)
n <- 1000

simulated_data <- tibble(
  wealth = sample(wealth_groups, n, replace = TRUE),
  residence = sample(residence_groups, n, replace = TRUE),
  education = sample(education_groups, n, replace = TRUE)
)
# Step 1: Pivot the wide data to long format
long_data <- pivoted_data %>%
  pivot_longer(cols = everything(), names_to = "group", values_to = "percentage")

# Step 2: Join assumed mortality and rename risk columns per group type
wealth_groups <- assumed_mortality %>% filter(str_detect(group, "Wealth")) %>%
  rename(wealth_group = group, wealth_risk = risk)

residence_groups <- assumed_mortality %>% filter(str_detect(group, "Residence")) %>%
  rename(residence_group = group, residence_risk = risk)

education_groups <- assumed_mortality %>% filter(str_detect(group, "Education")) %>%
  rename(education_group = group, education_risk = risk)

# Step 3: Merge with long data
# This will split the data into three parts, each for one category
wealth_data <- long_data %>% 
  filter(group %in% wealth_groups$wealth_group) %>%
  left_join(wealth_groups, by = c("group" = "wealth_group"))

residence_data <- long_data %>% 
  filter(group %in% residence_groups$residence_group) %>%
  left_join(residence_groups, by = c("group" = "residence_group"))

education_data <- long_data %>% 
  filter(group %in% education_groups$education_group) %>%
  left_join(education_groups, by = c("group" = "education_group"))
#simulate mortality risk separately
wealth_data <- wealth_data %>%
  mutate(died = rbinom(n(), 1, wealth_risk))

residence_data <- residence_data %>%
  mutate(died = rbinom(n(), 1, residence_risk))

education_data <- education_data %>%
  mutate(died = rbinom(n(), 1, education_risk))
# Add a new variable to indicate the type of group
wealth_data <- wealth_data %>% mutate(group_type = "wealth")
residence_data <- residence_data %>% mutate(group_type = "residence")
education_data <- education_data %>% mutate(group_type = "education")

# Combine all into one dataset
simulated_data <- bind_rows(wealth_data, residence_data, education_data)
#analyze mortality risk
simulated_data %>%
  group_by(group_type, group) %>%
  summarise(mortality_rate = mean(died), .groups = "drop")
## # A tibble: 10 × 3
##    group_type group                         mortality_rate
##    <chr>      <chr>                                  <dbl>
##  1 education  More than Secondary Education         0.0452
##  2 education  Primary Education                     0.193 
##  3 education  Secondary Education                   0.0974
##  4 residence  Rural Residence                       0.181 
##  5 residence  Urban Residence                       0.0888
##  6 wealth     Fourth Wealth Quintile                0.0919
##  7 wealth     Highest Wealth Quintile               0.0444
##  8 wealth     Lowest Wealth Quintile                0.25  
##  9 wealth     Second Wealth Quintile                0.184 
## 10 wealth     Third Wealth Quintile                 0.131
# Convert group to factor for modeling
simulated_data <- simulated_data %>%
  mutate(group = as.factor(group))

# Logistic regression: probability of dying ~ group
model <- glm(died ~ group, data = simulated_data, family = "binomial")

summary(model)
## 
## Call:
## glm(formula = died ~ group, family = "binomial", data = simulated_data)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -2.29065    0.09660 -23.712  < 2e-16 ***
## groupHighest Wealth Quintile       -0.77863    0.16639  -4.679 2.88e-06 ***
## groupLowest Wealth Quintile         1.19204    0.11613  10.265  < 2e-16 ***
## groupMore than Secondary Education -0.76042    0.16548  -4.595 4.33e-06 ***
## groupPrimary Education              0.86096    0.11971   7.192 6.38e-13 ***
## groupRural Residence                0.77894    0.12080   6.448 1.13e-10 ***
## groupSecond Wealth Quintile         0.79984    0.12051   6.637 3.20e-11 ***
## groupSecondary Education            0.06365    0.13489   0.472   0.6370    
## groupThird Wealth Quintile          0.39711    0.12720   3.122   0.0018 ** 
## groupUrban Residence               -0.03791    0.13769  -0.275   0.7831    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9952.0  on 12839  degrees of freedom
## Residual deviance: 9455.8  on 12830  degrees of freedom
## AIC: 9475.8
## 
## Number of Fisher Scoring iterations: 5
#visualize the data
library(ggplot2)

simulated_data %>%
  group_by(group, group_type) %>%
  summarise(mortality_rate = mean(died)) %>%
  ggplot(aes(x = reorder(group, -mortality_rate), y = mortality_rate, fill = group_type)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Simulated Mortality Risk by Group",
       x = "Group", y = "Mortality Rate") +
  theme_minimal()
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.

#testing goodness of fit of the model using Akaike Information Criterion
aic_value <- AIC(model)
print(paste("Akaike Information Criterion (AIC):", aic_value))
## [1] "Akaike Information Criterion (AIC): 9475.81522015821"
#another model for AIC
# Fit the null model (just an intercept)
null_model <- glm(died ~ 1, data = simulated_data, family = "binomial")

# Fit the full logistic regression model (with predictors)
full_model <- glm(died ~ group,data=simulated_data, family = "binomial")

# Compare the AIC values
AIC(null_model)
## [1] 9954.024
AIC(full_model)
## [1] 9475.815
#Fitness of the model using deviance test
model <- glm(died ~ group, data = simulated_data, family = "binomial")

# Get deviance of the model
deviance(model)
## [1] 9455.815
#another model for deviance test
null_model <- glm(died ~ 1, data = simulated_data, family = "binomial")
full_model <- glm(died ~ group, data = simulated_data, family = "binomial")
anova(null_model, full_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: died ~ 1
## Model 2: died ~ group
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     12839     9952.0                          
## 2     12830     9455.8  9   496.21 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#anova test
null_model <- glm(died ~ group, data = simulated_data, family = "binomial")
anova(null_model, model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: died ~ group
## Model 2: died ~ group
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1     12830     9455.8                     
## 2     12830     9455.8  0        0