Code
library(gtsummary)
library(gt)
library(here)
library(readxl)
library(labelled)
library(janitor)
library(sjPlot)
library(ggthemes)
library(CleanEasy)
library(tidyverse)You have been provided with 2 datasets (In excel) for a study around Kenya investigating the effect of a vaccine on diarrhoea caused by rotavirus. To protect children below the age of 5 years against diarrhoea, they need to receive 2 doses of the vaccine. Study the data well, and using any statistical software, import and merge the two data sets, and answer the following questions. You will be required to present your findings to the interviewing panel, so make sure all your work is saved.
library(gtsummary)
library(gt)
library(here)
library(readxl)
library(labelled)
library(janitor)
library(sjPlot)
library(ggthemes)
library(CleanEasy)
library(tidyverse)rota1 <- read_excel(here("Data","1645201251039_Rota1.xls"))
rota2 <- read_excel(here("Data","1645201247905_Rota2.xls"))Merge the two data sets and using the questions/hints below, make a short 5 minutes’ presentation on your findings.
The objective of the study is to evaluate the effectiveness of a rotavirus vaccine against diarrhoea in children <5 years of age in Kenya.
Null Hypothesis: Rotavirus vaccine has no preventive effect against diarrhea disease Alternative Hypothesis*: Rotavirus vaccine has a significant effect against diarrhea disease
Have a glimpse of the data structure before you start any cleaning to help identify data structural error/ data type errors
rota1 |> glimpse()Rows: 677
Columns: 3
$ Uniqueid <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
$ Dateofbirth <dttm> 2014-04-24, 2014-08-03, 2014-09-08, 2014-05-30, 2014-08-2~
$ Gender <dbl> 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2~
rota2 |> glimpse()Rows: 677
Columns: 10
$ Uniqueid <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16~
$ Dateofenrollment <dttm> 2015-02-05, 2015-04-13, 2015-07-09, 2015-07-13, 2015~
$ Height <chr> "78", "69.5", "66", "70", "75.8", "64", "73.5", "61.9~
$ Weight <chr> "9.2", "8", "6", "6", "8.6", "8", "8.1", "6", "63.1",~
$ Outcome <dbl> 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
$ Malaria <dbl> 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1,~
$ case <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,~
$ dose2_vaccinated <chr> NA, "1", "0", "1", "1", "1", "0", "1", "1", "1", "1",~
$ Hospitalname <chr> "Lwak", "Lwak", "Lwak", "Lwak", "Lwak", "Lwak", "Lwak~
$ vaccine_dose <dbl> NA, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2~
rota_set <- rota1 |> full_join(rota2) |>
# change variable data type (string to numeric)
# ***************************************************
mutate_at(vars(Height,Weight,dose2_vaccinated),as.numeric) |>
# change variable data type (dttm to date)
# ***************************************************
mutate_at(vars(Dateofbirth,Dateofenrollment),as.Date) |>
# change variable data type (string to factor)
# ***************************************************
mutate(Hospitalname = as.factor(Hospitalname)) |>
# Replace wrong data entries
#---------------------------------------------------
mutate(Weight = replace(Weight,Uniqueid == 134,20),
Weight = replace(Weight,Uniqueid == 147, 10)) |>
# Filter to inclusion ranges
# 1) Weight
filter(between(Weight,2,20)) |>
# 2) Height
filter(between(Height,30,100))Click on panel tab to view the histograms
##| layout-ncol: 2
#| column: page-inset-right
#| fig-width: 8
#| fig-height: 3
#| fig-cap: "Height Histogram Exploration"
#| fig-align: center
set_theme(base = theme_minimal())
rota_set |> select(Weight) |> sjPlot::plot_frq(type = "histogram", title = "Histogram Distribution of Weight",
show.mean = TRUE,show.sd = TRUE,mean.line.type = 2,
normal.curve = TRUE ) + theme(plot.title = element_text(hjust = 0.5))##| layout-ncol: 2
#| column: page-inset-right
#| fig-width: 8
#| fig-height: 3
#| fig-cap: "Weight Histogram Exploration"
#| fig-align: center
set_theme(base = theme_pander())
rota_set |> select(Height) |> sjPlot::plot_frq(type = "histogram", title = "Histogram Distribution of Height",
show.mean = TRUE,show.sd = TRUE,mean.line.type = 2,geom.colors = "steelblue",
normal.curve = TRUE ) + theme(plot.title = element_text(hjust = 0.5))Calculate age in months and generate 5 age groups as: <3 months, 3-5 months, 6-11 months, 12-23 months and >=24 months.
#: For the purpose of this analysis 30 days is used to represent 1 month
##| layout-ncol: 2
#| column: page-inset-right
#| fig-width: 8
#| fig-height: 3
#| fig-cap: "Distribution of Ages"
#| fig-align: center
rota_set |> mutate(age = round(as.numeric(Dateofenrollment - Dateofbirth)/30,0),
`Age Distribution` = case_when(age < 3 ~ "<3 months",
between(age,3,5) ~ "3-5 months",
between(age,6,11) ~ "6-11 months",
between(age,12,23) ~ "12-23 months",
age >= 24 ~ ">=24 months"),
`Age Distribution` = fct_relevel(`Age Distribution`,
"<3 months","3-5 months","6-11 months","12-23 months",">=24 months")) |>
select(`Age Distribution`) |> #sjPlot::plot_frq()
tabyl(`Age Distribution`) |> adorn_pct_formatting() |>
ggplot(aes(x = `Age Distribution`, y = n, fill = `Age Distribution`)) +
geom_col(width = 0.4,alpha = 0.5) +
geom_text(aes(label = str_c(n,"\n(",percent,")")),vjust = -.1) +
theme_minimal() + theme(legend.position = "none") +
labs(y = "Count",title = "Distribution of Under five Ages") +
scale_y_continuous(limits = c(0,350))##| layout-ncol: 2
#| column: page-inset-right
#| fig-width: 8
#| fig-height: 3
#| fig-cap: "Distribution of Ages"
#| fig-align: center
rota_set |> select(case,dose2_vaccinated) |>
set_value_labels(case = c("Case" = 1, "Control" = 0),
dose2_vaccinated = c("received 2 dose" = 1, "received 0 dose" = 0)) |>
rename(`Dose 2 vaccinated` = dose2_vaccinated, Case = case) |>
mutate_if(is.labelled,as_factor) |>
mutate(`Dose 2 vaccinated` = fct_rev(`Dose 2 vaccinated`),
Case = fct_rev(Case)) |>
tbl_cross(row = `Dose 2 vaccinated`,col = Case, percent = "column",
missing = "no",digits = c(0,1)) |> add_p() |> bold_labels()| Case | Total | p-value1 | ||
|---|---|---|---|---|
| Case | Control | |||
| Dose 2 vaccinated | <0.001 | |||
| received 2 dose | 49 (59.8%) | 305 (82.2%) | 354 (78.1%) | |
| received 0 dose | 33 (40.2%) | 66 (17.8%) | 99 (21.9%) | |
| Total | 82 (100.0%) | 371 (100.0%) | 453 (100.0%) | |
| 1 Pearson’s Chi-squared test | ||||
rota_set |> mutate(age = round(as.numeric(Dateofenrollment - Dateofbirth)/30,0),
`Age Distribution` = case_when(age < 3 ~ "<3 months",
between(age,3,5) ~ "3-5 months",
between(age,6,11) ~ "6-11 months",
between(age,12,23) ~ "12-23 months",
age >= 24 ~ ">=24 months"),
`Age Distribution` = fct_relevel(`Age Distribution`,
"<3 months","3-5 months","6-11 months","12-23 months",">=24 months")) |>
select(case,dose2_vaccinated,Malaria,`Age Distribution`) |>
set_value_labels(case = c("Case" = 1, "Control" = 0),
dose2_vaccinated = c("received 2 dose" = 1, "received 0 dose" = 0)) |>
rename(`Dose 2 vaccinated` = dose2_vaccinated, Case = case) |>
mutate(`Dose 2 vaccinated` = as_factor(`Dose 2 vaccinated`)) %>%
glm(Case ~ `Dose 2 vaccinated` + `Age Distribution`,data = .,family = binomial()) |>
tbl_regression(exponentiate = T,show_single_row = c(`Dose 2 vaccinated`),
add_estimate_to_reference_rows = TRUE)| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| Dose 2 vaccinated | 0.25 | 0.15, 0.44 | <0.001 |
| Age Distribution | |||
| <3 months | 1.00 | — | |
| 3-5 months | 2.57 | 0.33, 53.4 | 0.4 |
| 6-11 months | 11.1 | 2.02, 209 | 0.024 |
| 12-23 months | 6.13 | 1.09, 116 | 0.092 |
| >=24 months | 3.91 | 0.32, 91.9 | 0.3 |
| 1 OR = Odds Ratio, CI = Confidence Interval | |||
2 by 2 Epidemiological Table
Using 2 by 2 epidemiological table of case and control to check the Odds & Odds Ratio(OR) among exposed and non exposed.
data <- rota_set |> select(case,dose2_vaccinated) |>
set_value_labels(case = c("Case" = 1, "Control" = 0),
dose2_vaccinated = c("received 2 dose" = 1, "received 0 dose" = 0)) |>
rename(`Dose 2 vaccinated` = dose2_vaccinated, Case = case) |>
mutate_if(is.labelled,as_factor) |>
mutate(`Dose 2 vaccinated` = fct_rev(`Dose 2 vaccinated`),
Case = fct_rev(Case))
epiR::epi.2by2(table(data$`Dose 2 vaccinated`,data$Case),method = "case.control",interpret = F) Outcome + Outcome - Total Odds
Exposed + 49 305 354 0.16 (0.12 to 0.21)
Exposed - 33 66 99 0.50 (0.32 to 0.74)
Total 82 371 453 0.22 (0.17 to 0.28)
Point estimates and 95% CIs:
-------------------------------------------------------------------
Exposure odds ratio 0.32 (0.19, 0.54)
Attrib fraction (est) in the exposed (%) -210.28 (-435.79, -78.77)
Attrib fraction (est) in the population (%) -126.22 (-218.67, -60.59)
-------------------------------------------------------------------
Uncorrected chi2 test that OR = 1: chi2(1) = 19.826 Pr>chi2 = <0.001
Fisher exact test that OR = 1: Pr>chi2 = <0.001
Wald confidence limits
CI: confidence interval