#loading the dataset
library(readxl)
kdhscombined_file <- read_excel("kdhscombined-file.xlsx")
View(kdhscombined_file)
# selecting only the necessary columns
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
kdhscombined_file <- kdhscombined_file %>%
mutate(Group = case_when(
Characteristics %in% c("Lowest Wealth Quintile", "Second Wealth Quintile",
"Middle Wealth Quintile", "Fourth Wealth Quintile",
"Highest Wealth Quintile") ~ "Wealth",
Characteristics %in% c("Rural Residence", "Urban Residence") ~ "Residence",
Characteristics %in% c("No education", "Primary Education",
"Secondary Education", "More than Secondary Education") ~ "Education",
TRUE ~ NA_character_
))
#pivoting the data
pivoted_data <- kdhscombined_file %>%
filter(!is.na(Group)) %>%
pivot_wider(
names_from = Group,
values_from = Number
)
## Warning: Values from `Number` are not uniquely identified; output will contain
## list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
## {data} |>
## dplyr::summarise(n = dplyr::n(), .by = c(`Table Number`, `Table Title`,
## Respondents, Characteristics, Group)) |>
## dplyr::filter(n > 1L)
#converting the number column to numeric
kdhscombined_file <- kdhscombined_file %>%
mutate(Number = as.numeric(Number))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Number = as.numeric(Number)`.
## Caused by warning:
## ! NAs introduced by coercion
#separating the pivoted the data
pivoted_data <- kdhscombined_file %>%
pivot_wider(
names_from = Characteristics,
values_from = Number,
values_fn = list(Number = mean)
)
#separating the selected columns
pivoted_data <- pivoted_data %>%
select(`Lowest Wealth Quintile`, `Second Wealth Quintile`, `Third Wealth Quintile`, `Fourth Wealth Quintile`, `Highest Wealth Quintile`, `Rural Residence`, `Urban Residence`,`No education`, `Primary Education`, `Secondary Education`, `More than Secondary Education`)
#calculating the medians
medians <- pivoted_data %>%
summarise(across(everything(), ~ median(.x, na.rm = TRUE)))
#calculating the means
means <- colMeans(pivoted_data, na.rm = TRUE)
View(means)
#calculating the standard deviation
pivoted_data %>%
summarise(across(everything(), ~ sd(.x, na.rm = TRUE)))
## # A tibble: 1 × 11
## `Lowest Wealth Quintile` `Second Wealth Quintile` `Third Wealth Quintile`
## <dbl> <dbl> <dbl>
## 1 1915. 1968. 2630.
## # ℹ 8 more variables: `Fourth Wealth Quintile` <dbl>,
## # `Highest Wealth Quintile` <dbl>, `Rural Residence` <dbl>,
## # `Urban Residence` <dbl>, `No education` <dbl>, `Primary Education` <dbl>,
## # `Secondary Education` <dbl>, `More than Secondary Education` <dbl>
#converting the character to numeric
pivoted_data <- pivoted_data %>%
mutate(
`Lowest Wealth Quintile` = as.numeric(`Lowest Wealth Quintile`),
`Second Wealth Quintile` = as.numeric(`Second Wealth Quintile`),
`Third Wealth Quintile` = as.numeric(`Third Wealth Quintile`),
`Fourth Wealth Quintile` = as.numeric(`Fourth Wealth Quintile`),
`Highest Wealth Quintile` = as.numeric(`Highest Wealth Quintile`),
`Rural Residence` = as.numeric(`Rural Residence`),
`Urban Residence` = as.numeric(`Urban Residence`),
`No education` = as.numeric(`No education`),
`Primary Education` = as.numeric(`Primary Education`),
`Secondary Education` = as.numeric(`Secondary Education`),
`More than Secondary Education` = as.numeric(`More than Secondary Education`)
)
#creating assumed mortality data
library(tidyverse)
assumed_mortality <- tibble(
group = c(
"(`Lowest Wealth Quintile`",
"(`Second Wealth Quintile`",
"(`Third Wealth Quintile`",
"(`Fourth Wealth Quintile`",
"(`Highest Wealth Quintile`",
"(`Rural Residence`",
"(`Urban Residence`",
"(`No education`",
"(`Primary Education`",
"(`Secondary Education`",
"(`More than Secondary Education`"
),
risk = c(
0.25,
0.20,
0.15,
0.10,
0.05, # highest wealth
0.18, # rural
0.09, # urban
0.30, # no education
0.20,
0.10,
0.05 # more than secondary
)
)
#creating assumed mortality risk data
assumed_mortality <- tibble(
group = c(
"(`Lowest Wealth Quintile`",
"(`Second Wealth Quintile`",
"(`Third Wealth Quintile`",
"(`Fourth Wealth Quintile`",
"(`Highest Wealth Quintile`",
"(`Rural Residence`",
"(`Urban Residence`",
"(`No education`",
"(`Primary Education`",
"(`Secondary Education`",
"(`More than Secondary Education`"
),
risk = c(
0.25, 0.20, 0.15, 0.10, 0.05,
0.18, 0.09,
0.30, 0.20, 0.10, 0.05
)
)
# Clean the group names
assumed_mortality <- assumed_mortality %>%
mutate(group = str_replace_all(group, "[`\\(\\)]", ""))
#creating the group names with their risk
simulate_group <- function(group_name, risk, var_name, n = 1000) {
df <- data.frame(
group = rep(group_name, n),
died = rbinom(n, 1, risk)
)
names(df)[1] <- var_name
return(df)
}
#creating the wealth group
wealth_groups <- assumed_mortality$group[1:5]
wealth_risks <- assumed_mortality$risk[1:5]
wealth_sim <- bind_rows(lapply(1:5, function(i) {
simulate_group(wealth_groups[i], wealth_risks[i], "wealth")
}))
#creating the residence group
residence_groups <- assumed_mortality$group[6:7]
residence_risks <- assumed_mortality$risk[6:7]
residence_sim <- bind_rows(lapply(1:2, function(i) {
simulate_group(residence_groups[i], residence_risks[i], "residence")
}))
#creating the education group
education_groups <- assumed_mortality$group[8:11]
education_risks <- assumed_mortality$risk[8:11]
education_sim <- bind_rows(lapply(1:4, function(i) {
simulate_group(education_groups[i], education_risks[i], "education")
}))
#combining the necessary groups
set.seed(123)
n <- 1000
simulated_data <- tibble(
wealth = sample(wealth_groups, n, replace = TRUE),
residence = sample(residence_groups, n, replace = TRUE),
education = sample(education_groups, n, replace = TRUE)
)
# Step 1: Pivot the wide data to long format
long_data <- pivoted_data %>%
pivot_longer(cols = everything(), names_to = "group", values_to = "percentage")
# Step 2: Join assumed mortality and rename risk columns per group type
wealth_groups <- assumed_mortality %>% filter(str_detect(group, "Wealth")) %>%
rename(wealth_group = group, wealth_risk = risk)
residence_groups <- assumed_mortality %>% filter(str_detect(group, "Residence")) %>%
rename(residence_group = group, residence_risk = risk)
education_groups <- assumed_mortality %>% filter(str_detect(group, "Education")) %>%
rename(education_group = group, education_risk = risk)
# Step 3: Merge with long data
# This will split the data into three parts, each for one category
wealth_data <- long_data %>%
filter(group %in% wealth_groups$wealth_group) %>%
left_join(wealth_groups, by = c("group" = "wealth_group"))
residence_data <- long_data %>%
filter(group %in% residence_groups$residence_group) %>%
left_join(residence_groups, by = c("group" = "residence_group"))
education_data <- long_data %>%
filter(group %in% education_groups$education_group) %>%
left_join(education_groups, by = c("group" = "education_group"))
#simulate mortality risk separately
wealth_data <- wealth_data %>%
mutate(died = rbinom(n(), 1, wealth_risk))
residence_data <- residence_data %>%
mutate(died = rbinom(n(), 1, residence_risk))
education_data <- education_data %>%
mutate(died = rbinom(n(), 1, education_risk))
# Add a new variable to indicate the type of group
wealth_data <- wealth_data %>% mutate(group_type = "wealth")
residence_data <- residence_data %>% mutate(group_type = "residence")
education_data <- education_data %>% mutate(group_type = "education")
# Combine all into one dataset
simulated_data <- bind_rows(wealth_data, residence_data, education_data)
#analyze mortality risk
simulated_data %>%
group_by(group_type, group) %>%
summarise(mortality_rate = mean(died), .groups = "drop")
## # A tibble: 10 × 3
## group_type group mortality_rate
## <chr> <chr> <dbl>
## 1 education More than Secondary Education 0.0452
## 2 education Primary Education 0.193
## 3 education Secondary Education 0.0974
## 4 residence Rural Residence 0.181
## 5 residence Urban Residence 0.0888
## 6 wealth Fourth Wealth Quintile 0.0919
## 7 wealth Highest Wealth Quintile 0.0444
## 8 wealth Lowest Wealth Quintile 0.25
## 9 wealth Second Wealth Quintile 0.184
## 10 wealth Third Wealth Quintile 0.131
# Convert group to factor for modeling
simulated_data <- simulated_data %>%
mutate(group = as.factor(group))
# Logistic regression: probability of dying ~ group
model <- glm(died ~ group, data = simulated_data, family = "binomial")
summary(model)
##
## Call:
## glm(formula = died ~ group, family = "binomial", data = simulated_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.29065 0.09660 -23.712 < 2e-16 ***
## groupHighest Wealth Quintile -0.77863 0.16639 -4.679 2.88e-06 ***
## groupLowest Wealth Quintile 1.19204 0.11613 10.265 < 2e-16 ***
## groupMore than Secondary Education -0.76042 0.16548 -4.595 4.33e-06 ***
## groupPrimary Education 0.86096 0.11971 7.192 6.38e-13 ***
## groupRural Residence 0.77894 0.12080 6.448 1.13e-10 ***
## groupSecond Wealth Quintile 0.79984 0.12051 6.637 3.20e-11 ***
## groupSecondary Education 0.06365 0.13489 0.472 0.6370
## groupThird Wealth Quintile 0.39711 0.12720 3.122 0.0018 **
## groupUrban Residence -0.03791 0.13769 -0.275 0.7831
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9952.0 on 12839 degrees of freedom
## Residual deviance: 9455.8 on 12830 degrees of freedom
## AIC: 9475.8
##
## Number of Fisher Scoring iterations: 5
#visualize the data
library(ggplot2)
simulated_data %>%
group_by(group, group_type) %>%
summarise(mortality_rate = mean(died)) %>%
ggplot(aes(x = reorder(group, -mortality_rate), y = mortality_rate, fill = group_type)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Simulated Mortality Risk by Group",
x = "Group", y = "Mortality Rate") +
theme_minimal()
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.

#testing goodness of fit of the model using Akaike Information Criterion
aic_value <- AIC(model)
print(paste("Akaike Information Criterion (AIC):", aic_value))
## [1] "Akaike Information Criterion (AIC): 9475.81522015821"
#another model for AIC
# Fit the null model (just an intercept)
null_model <- glm(died ~ 1, data = simulated_data, family = "binomial")
# Fit the full logistic regression model (with predictors)
full_model <- glm(died ~ group,data=simulated_data, family = "binomial")
# Compare the AIC values
AIC(null_model)
## [1] 9954.024
AIC(full_model)
## [1] 9475.815
#Fitness of the model using deviance test
model <- glm(died ~ group, data = simulated_data, family = "binomial")
# Get deviance of the model
deviance(model)
## [1] 9455.815
#another model for deviance test
null_model <- glm(died ~ 1, data = simulated_data, family = "binomial")
full_model <- glm(died ~ group, data = simulated_data, family = "binomial")
anova(null_model, full_model, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: died ~ 1
## Model 2: died ~ group
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 12839 9952.0
## 2 12830 9455.8 9 496.21 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#anova test
null_model <- glm(died ~ group, data = simulated_data, family = "binomial")
anova(null_model, model, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: died ~ group
## Model 2: died ~ group
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 12830 9455.8
## 2 12830 9455.8 0 0