St Mark’s Maternity Hospital Project

Joy Winner Emmanuel

2025-01-22

INTRODUCTION

This presentation covers data exploration and visualization, followed by data cleaning and preprocessing, of the St Mark’s Maternity Hospital case data.

Data Exploration and Visualization

  1. Load and explore a dataset using RStudio.
  2. Create visualizations (e.g., histograms, scatter plots, bar charts) to understand data distribution.
  3. Perform principal component analysis (PCA) or clustering (k-means) on a dataset.

Data Cleaning and Preprocessing

  1. Handle missing values in a dataset using various methods (e.g., mean imputation, median imputation).
  2. Remove duplicates and outliers from a dataset.
  3. Transform categorical variables into numerical variables.
# Set the working directory to a fixed, machine-specific Windows folder.
# NOTE(review): this hard-coded absolute path only resolves on the author's
# machine — consider a project-relative path instead.
setwd("C:\\Users\\USER\\Documents\\R")
# Load the tidyverse (attaches readr, dplyr and ggplot2, among others).
library(tidyverse)
# Print per-column summary statistics for the raw data.
# NOTE(review): `st_mark` is not created anywhere in the visible code;
# presumably it is read in (e.g. with readr) in a step not shown — confirm.
summary(st_mark)
##    case_id            generation    date_infection         date_onset        
##  Length:422         Min.   : 2.00   Min.   :2014-04-26   Min.   :2014-05-01  
##  Class :character   1st Qu.:13.00   1st Qu.:2014-09-06   1st Qu.:2014-09-15  
##  Mode  :character   Median :16.00   Median :2014-10-16   Median :2014-10-24  
##                     Mean   :16.67   Mean   :2014-10-26   Mean   :2014-11-03  
##                     3rd Qu.:20.00   3rd Qu.:2014-12-16   3rd Qu.:2014-12-20  
##                     Max.   :37.00   Max.   :2015-04-17   Max.   :2015-04-27  
##                                     NA's   :152                              
##  date_hospitalisation  date_outcome          outcome         
##  Min.   :2014-05-05   Min.   :2014-05-23   Length:422        
##  1st Qu.:2014-09-17   1st Qu.:2014-09-25   Class :character  
##  Median :2014-10-24   Median :2014-11-06   Mode  :character  
##  Mean   :2014-11-05   Mean   :2014-11-13                     
##  3rd Qu.:2014-12-20   3rd Qu.:2014-12-28                     
##  Max.   :2015-04-29   Max.   :2015-05-10                     
##                       NA's   :79                             
##     gender               age          age_unit           age_years    
##  Length:422         Min.   : 0.00   Length:422         Min.   : 0.00  
##  Class :character   1st Qu.: 6.00   Class :character   1st Qu.: 6.00  
##  Mode  :character   Median :12.00   Mode  :character   Median :12.00  
##                     Mean   :15.12                      Mean   :15.03  
##                     3rd Qu.:22.00                      3rd Qu.:22.00  
##                     Max.   :59.00                      Max.   :59.00  
##                     NA's   :5                          NA's   :5      
##    age_cat            age_cat5           hospital              lon        
##  Length:422         Length:422         Length:422         Min.   :-13.27  
##  Class :character   Class :character   Class :character   1st Qu.:-13.25  
##  Mode  :character   Mode  :character   Mode  :character   Median :-13.23  
##                                                           Mean   :-13.23  
##                                                           3rd Qu.:-13.22  
##                                                           Max.   :-13.21  
##                                                                           
##       lat          infector            source              wt_kg      
##  Min.   :8.448   Length:422         Length:422         Min.   :-4.00  
##  1st Qu.:8.461   Class :character   Class :character   1st Qu.:40.00  
##  Median :8.469   Mode  :character   Mode  :character   Median :54.50  
##  Mean   :8.470                                         Mean   :52.12  
##  3rd Qu.:8.479                                         3rd Qu.:65.00  
##  Max.   :8.490                                         Max.   :95.00  
##                                                                       
##      ht_cm          ct_blood        fever              chills         
##  Min.   : 12.0   Min.   :16.00   Length:422         Length:422        
##  1st Qu.: 83.5   1st Qu.:20.00   Class :character   Class :character  
##  Median :128.0   Median :22.00   Mode  :character   Mode  :character  
##  Mean   :121.6   Mean   :21.29                                        
##  3rd Qu.:156.0   3rd Qu.:23.00                                        
##  Max.   :266.0   Max.   :25.00                                        
##                                                                       
##     cough              aches              vomit                temp      
##  Length:422         Length:422         Length:422         Min.   :35.60  
##  Class :character   Class :character   Class :character   1st Qu.:38.30  
##  Mode  :character   Mode  :character   Mode  :character   Median :38.90  
##                                                           Mean   :38.63  
##                                                           3rd Qu.:39.30  
##                                                           Max.   :40.40  
##                                                           NA's   :11     
##  time_admission         bmi          days_onset_hosp 
##  Length:422        Min.   :-100.00   Min.   : 0.000  
##  Class1:hms        1st Qu.:  24.87   1st Qu.: 1.000  
##  Class2:difftime   Median :  33.59   Median : 1.000  
##  Mode  :numeric    Mean   :  51.04   Mean   : 2.036  
##                    3rd Qu.:  53.27   3rd Qu.: 3.000  
##                    Max.   : 428.06   Max.   :18.000  
## 
# Open the raw data in the data viewer.
# NOTE(review): View() is meant for interactive sessions — confirm it behaves
# as intended when this document is knitted.
View(st_mark)
# Count non-missing (FALSE) and missing (TRUE) cells over the whole table.
# This is one overall tally; it does not show which columns hold the NAs.
table(is.na(st_mark))
## 
## FALSE  TRUE 
## 11876   784

Observations from the dataset

Many values are missing in the columns listed in the code below, so those columns are removed:

library(dplyr)
# Build the working copy (st_mark_new) from st_mark, leaving out the nine
# columns named here.
st_mark_new <- select(
  st_mark,
  -c(
    "date_infection", "date_onset", "date_hospitalisation", "date_outcome",
    "age_years", "age_cat5", "hospital", "infector", "source"
  )
)
View(st_mark_new)
# Tabulate the units recorded in the "age_unit" column.
table(st_mark_new[["age_unit"]])
## 
## months  years 
##      2    420
# Express every age in years: ages recorded in months are divided by 12,
# and their unit label is switched to "years" to match.
st_mark_new <- mutate(
  st_mark_new,
  age = ifelse(age_unit == "months", age / 12, age),
  age_unit = ifelse(age_unit == "months", "years", age_unit)
)
# Re-tabulate "age_unit" to confirm that a single unit remains.
table(st_mark_new[["age_unit"]])
## 
## years 
##   422
# Distribution of age (and its NA count) before imputation
summary(st_mark_new[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    6.00   12.00   15.03   22.00   59.00       5
# Fill the missing ages with the median of the observed ages.
st_mark_new$age <- replace(
  st_mark_new$age,
  is.na(st_mark_new$age),
  median(st_mark_new$age, na.rm = TRUE)
)
summary(st_mark_new[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    6.00   12.00   14.99   22.00   59.00
# Distribution of "temp" before imputation; its missing values are then
# filled with the median of the observed temperatures.
summary(st_mark_new[["temp"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   35.60   38.30   38.90   38.63   39.30   40.40      11
st_mark_new$temp <- replace(
  st_mark_new$temp,
  is.na(st_mark_new$temp),
  median(st_mark_new$temp, na.rm = TRUE)
)
# Fill missing "time_admission" values with the column median as well.
st_mark_new$time_admission <- replace(
  st_mark_new$time_admission,
  is.na(st_mark_new$time_admission),
  median(st_mark_new$time_admission, na.rm = TRUE)
)
# Keep only the rows that are complete in every column except "age_cat",
# then view the updated data frame.
# The columns to check are picked with a logical mask rather than
# -which(...): if "age_cat" were ever absent, which() would return
# integer(0), and a negated empty index selects no columns at all instead
# of all of them. drop = FALSE keeps the selection a data frame even when
# only one column is left.
st_mark_new <- st_mark_new[
  complete.cases(
    st_mark_new[, names(st_mark_new) != "age_cat", drop = FALSE]
  ),
]

View(st_mark_new)

Checking for outliers in age, ht_cm, bmi, days_onset_hosp, wt_kg, and temp using box plots

# Make bmi and wt_kg non-negative by replacing each value with its
# absolute value.
# NOTE(review): this assumes a negative sign is an entry error and the
# magnitude is correct — confirm against the data source.
st_mark_new[c("bmi", "wt_kg")] <- lapply(st_mark_new[c("bmi", "wt_kg")], abs)
# One box plot per numeric variable, stored as j1..j6 so they can be
# arranged on a single page afterwards.

# Age
j1 <- ggplot(data = st_mark_new, mapping = aes(x = age)) +
  geom_boxplot(fill = "skyblue") +
  ggtitle("Age Distribution")

# Height (ht_cm)
j2 <- ggplot(data = st_mark_new, mapping = aes(x = ht_cm)) +
  geom_boxplot(fill = "lightgreen") +
  ggtitle("Height Distribution")

# BMI
j3 <- ggplot(data = st_mark_new, mapping = aes(x = bmi)) +
  geom_boxplot(fill = "salmon") +
  ggtitle("BMI Distribution")

# Days from onset to hospital (days_onset_hosp)
j4 <- ggplot(data = st_mark_new, mapping = aes(x = days_onset_hosp)) +
  geom_boxplot(fill = "lightcoral") +
  ggtitle("Days from Onset to Hospital")

# Weight (wt_kg)
j5 <- ggplot(data = st_mark_new, mapping = aes(x = wt_kg)) +
  geom_boxplot(fill = "yellow") +
  ggtitle("Weight Distribution")

# Temperature (temp)
j6 <- ggplot(data = st_mark_new, mapping = aes(x = temp)) +
  geom_boxplot(fill = "lightblue") +
  ggtitle("Temperature Distribution")

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Lay the six box plots out on one page, three per row.
grid.arrange(grobs = list(j1, j2, j3, j4, j5, j6), ncol = 3)

Data Visualization

# Grouped bar chart: number of cases per gender, split by outcome
ggplot(data = st_mark_new, mapping = aes(x = gender, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Recover" = "skyblue", "Death" = "gold")) +
  ggtitle("Gender vs Outcome") +
  xlab("Gender") +
  ylab("Frequency") +
  theme_minimal()

# Bar chart of the overall outcome counts
ggplot(data = st_mark_new, mapping = aes(x = outcome)) +
  geom_bar(fill = "orange") +  # bars are filled orange
  ggtitle("Outcome Distribution") +
  xlab("Outcome") +
  ylab("Count") +
  theme_minimal()

# Turn age_cat into a factor whose levels run from the youngest band to the
# oldest, so the bars are drawn in numeric rather than alphabetical order.
# Any age_cat value that is not one of these seven labels becomes NA.
st_mark_new[["age_cat"]] <- factor(
  x = st_mark_new[["age_cat"]],
  levels = c("0-4", "5-9", "10-14", "15-19", "20-29", "30-49", "50-69")
)

# Confirm the level order
levels(st_mark_new[["age_cat"]])
## [1] "0-4"   "5-9"   "10-14" "15-19" "20-29" "30-49" "50-69"
# Grouped bar chart: number of cases per age category, split by outcome
ggplot(data = st_mark_new, mapping = aes(x = age_cat, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Recover" = "blue", "Death" = "gold")) +
  ggtitle("Distribution of Age Category vs Outcome") +
  xlab("Age Category") +
  ylab("Frequency") +
  theme_minimal()

Relationship between the number of days from onset to hospitalisation and the outcome

# Grouped bar chart: number of cases per value of days_onset_hosp, split by
# outcome
ggplot(
  data = st_mark_new,
  mapping = aes(x = days_onset_hosp, fill = outcome)
) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Recover" = "blue", "Death" = "red")) +
  ggtitle("Distribution of Days_onset_hosp vs Outcome") +
  xlab("days_onset_hosp") +
  ylab("Frequency") +
  theme_minimal()

Comparing each symptom with outcome

# One grouped bar chart per symptom, each split by outcome, stored as
# SY1..SY5 and then arranged on a single page.

# Fever
SY1 <- ggplot(data = st_mark_new, mapping = aes(x = fever, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Recover" = "blue", "Death" = "red")) +
  ggtitle("Fever vs Outcome") +
  xlab("fever") +
  ylab("Frequency") +
  theme_minimal()

# Chills
SY2 <- ggplot(data = st_mark_new, mapping = aes(x = chills, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Recover" = "gold", "Death" = "red")) +
  ggtitle("Chills vs Outcome") +
  xlab("chills") +
  ylab("Frequency") +
  theme_minimal()

# Cough
SY3 <- ggplot(data = st_mark_new, mapping = aes(x = cough, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Recover" = "lightblue", "Death" = "yellow")) +
  ggtitle("Cough vs Outcome") +
  xlab("cough") +
  ylab("Frequency") +
  theme_minimal()

# Aches
SY4 <- ggplot(data = st_mark_new, mapping = aes(x = aches, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Recover" = "lightgreen", "Death" = "red")) +
  ggtitle("Aches vs Outcome") +
  xlab("aches") +
  ylab("Frequency") +
  theme_minimal()

# Vomiting
SY5 <- ggplot(data = st_mark_new, mapping = aes(x = vomit, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Recover" = "blue", "Death" = "orange")) +
  ggtitle("Vomit vs Outcome") +
  xlab("vomit") +
  ylab("Frequency") +
  theme_minimal()

# Three plots per row
grid.arrange(grobs = list(SY1, SY2, SY3, SY4, SY5), ncol = 3)

library(corrplot)
# Numeric columns to include in the correlation matrix
sel_columns <- st_mark_new[
  c("generation", "age", "wt_kg", "ht_cm", "ct_blood", "temp")
]
# Pairwise correlations, using only rows with no missing values
cor_matrix <- cor(x = sel_columns, use = "complete.obs")

# Draw the matrix as a coloured grid on a red-white-blue scale of 200
# shades, with each coefficient printed in black inside its cell.
corrplot(
  corr = cor_matrix,
  method = "color",
  col = colorRampPalette(c("red", "white", "blue"))(200),
  addCoef.col = "black",
  number.cex = 0.7,
  tl.col = "black",
  tl.cex = 0.8
)

# Scatter plot: weight (wt_kg) on the y-axis against age on the x-axis

ggplot(data = st_mark_new, mapping = aes(x = age, y = wt_kg)) +
  geom_point(color = "blue", size = 2) +
  ggtitle("Scatter Plot of Weight vs Age") +
  xlab("Age") +
  ylab("Weight") +
  theme_minimal()  # minimal theme for an uncluttered plot

# Recode the categorical columns as numbers: each listed column is turned
# into a factor and then replaced by its level codes (1, 2, ...).
cols_to_convert <- c(
  "gender", "fever", "chills", "outcome", "cough", "aches", "vomit"
)
st_mark_new[cols_to_convert] <- lapply(
  st_mark_new[cols_to_convert],
  function(column) as.numeric(as.factor(column))
)

View(st_mark_new)