St Mark Maternity

Dorcas Olanike Agboola

2024-12-10

Introduction

# setwd("C:/Users/USER/Desktop/ICAMMDA/Data Analytics/")
library(readr)

# Read the St. Mark's Maternity Hospital case records from CSV.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file location exists.
Maternity <- read_csv("C:/Users/USER/Desktop/ICAMMDA/Data Analytics/St. Mark's Maternity Hospital (SMMH).csv")

# View() opens a data viewer and is only useful in an interactive session,
# so skip it when the script is run or knitted non-interactively.
if (interactive()) {
  View(Maternity)
}

# Per-column overview: type, range / quartiles and NA counts.
summary(Maternity)
##    case_id            generation    date_infection         date_onset        
##  Length:422         Min.   : 2.00   Min.   :2014-04-26   Min.   :2014-05-01  
##  Class :character   1st Qu.:13.00   1st Qu.:2014-09-06   1st Qu.:2014-09-15  
##  Mode  :character   Median :16.00   Median :2014-10-16   Median :2014-10-24  
##                     Mean   :16.67   Mean   :2014-10-26   Mean   :2014-11-03  
##                     3rd Qu.:20.00   3rd Qu.:2014-12-16   3rd Qu.:2014-12-20  
##                     Max.   :37.00   Max.   :2015-04-17   Max.   :2015-04-27  
##                                     NA's   :152                              
##  date_hospitalisation  date_outcome          outcome         
##  Min.   :2014-05-05   Min.   :2014-05-23   Length:422        
##  1st Qu.:2014-09-17   1st Qu.:2014-09-25   Class :character  
##  Median :2014-10-24   Median :2014-11-06   Mode  :character  
##  Mean   :2014-11-05   Mean   :2014-11-13                     
##  3rd Qu.:2014-12-20   3rd Qu.:2014-12-28                     
##  Max.   :2015-04-29   Max.   :2015-05-10                     
##                       NA's   :79                             
##     gender               age          age_unit           age_years    
##  Length:422         Min.   : 0.00   Length:422         Min.   : 0.00  
##  Class :character   1st Qu.: 6.00   Class :character   1st Qu.: 6.00  
##  Mode  :character   Median :12.00   Mode  :character   Median :12.00  
##                     Mean   :15.12                      Mean   :15.03  
##                     3rd Qu.:22.00                      3rd Qu.:22.00  
##                     Max.   :59.00                      Max.   :59.00  
##                     NA's   :5                          NA's   :5      
##    age_cat            age_cat5           hospital              lon        
##  Length:422         Length:422         Length:422         Min.   :-13.27  
##  Class :character   Class :character   Class :character   1st Qu.:-13.25  
##  Mode  :character   Mode  :character   Mode  :character   Median :-13.23  
##                                                           Mean   :-13.23  
##                                                           3rd Qu.:-13.22  
##                                                           Max.   :-13.21  
##                                                                           
##       lat          infector            source              wt_kg      
##  Min.   :8.448   Length:422         Length:422         Min.   :-4.00  
##  1st Qu.:8.461   Class :character   Class :character   1st Qu.:40.00  
##  Median :8.469   Mode  :character   Mode  :character   Median :54.50  
##  Mean   :8.470                                         Mean   :52.12  
##  3rd Qu.:8.479                                         3rd Qu.:65.00  
##  Max.   :8.490                                         Max.   :95.00  
##                                                                       
##      ht_cm          ct_blood        fever              chills         
##  Min.   : 12.0   Min.   :16.00   Length:422         Length:422        
##  1st Qu.: 83.5   1st Qu.:20.00   Class :character   Class :character  
##  Median :128.0   Median :22.00   Mode  :character   Mode  :character  
##  Mean   :121.6   Mean   :21.29                                        
##  3rd Qu.:156.0   3rd Qu.:23.00                                        
##  Max.   :266.0   Max.   :25.00                                        
##                                                                       
##     cough              aches              vomit                temp      
##  Length:422         Length:422         Length:422         Min.   :35.60  
##  Class :character   Class :character   Class :character   1st Qu.:38.30  
##  Mode  :character   Mode  :character   Mode  :character   Median :38.90  
##                                                           Mean   :38.63  
##                                                           3rd Qu.:39.30  
##                                                           Max.   :40.40  
##                                                           NA's   :11     
##  time_admission         bmi          days_onset_hosp 
##  Length:422        Min.   :-100.00   Min.   : 0.000  
##  Class1:hms        1st Qu.:  24.87   1st Qu.: 1.000  
##  Class2:difftime   Median :  33.59   Median : 1.000  
##  Mode  :numeric    Mean   :  51.04   Mean   : 2.036  
##                    3rd Qu.:  53.27   3rd Qu.: 3.000  
##                    Max.   : 428.06   Max.   :18.000  
## 

Observation from the dataset

It was observed that

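  1. There are missing values in several columns (for example date_infection, date_outcome, age, age_years and temp).
  2. Some age_unit values are in months instead of years.
  3. The age column carries the same information as the age_years column.
  4. The age_cat column carries the same information as the age_cat5 column.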

Remove the columns that are not needed for this analysis

# Drop the date, hospital and transmission-chain columns that the rest of the
# analysis does not use. Assigning NULL to a column removes it.
for (unused_col in c("date_infection", "date_outcome", "date_onset",
                     "date_hospitalisation", "hospital", "infector",
                     "source")) {
  Maternity[[unused_col]] <- NULL
}

# View() is interactive-only; skip it in non-interactive runs.
if (interactive()) {
  View(Maternity)
}

### age_years is treated as a duplicate of age, and age_cat5 as a duplicate
### of age_cat, so only age and age_cat are kept.
for (duplicate_col in c("age_years", "age_cat5")) {
  Maternity[[duplicate_col]] <- NULL
}

#### Tabulate age_unit to see how many ages are recorded in months vs. years.
table(Maternity$age_unit)
## 
## months  years 
##      2    420
## The ages in rows 48 and 214 were recorded in months
### Express every age in whole years.
### Ages recorded in months are divided by 12 and relabelled as "years";
### all ages are then rounded to the nearest whole number in the same step.
library(dplyr)
Maternity <- Maternity %>%
  mutate(
    # age_unit is still the original value here, so the test below is valid.
    age = round(ifelse(age_unit == "months", age / 12, age)),
    age_unit = ifelse(age_unit == "months", "years", age_unit)
  )

Check which rows have NA in the age, temp and time_admission columns

### List the row indices that have NA in the age, temp and time_admission
### columns. which(is.na(x)) returns the positions of the missing entries.
# Rows with a missing age.
na_age <- which(is.na(Maternity$age))
na_age
## [1]  55 173 244 288 296
# Rows with a missing temperature.
na_temp <- which(is.na(Maternity$temp))
na_temp
##  [1]  79  80  81  82 416 417 418 419 420 421 422
# Rows with a missing admission time.
na_time_ad<- which(is.na(Maternity$time_admission))
na_time_ad
##  [1]   3   4   7  13  16  17  31  62  82  83  85 107 115 118 126 130 138 156 184
## [20] 195 201 215 233 240 244 248 249 257 260 262 264 274 278 280 281 298 300 309
## [39] 317 331 335 344 345 351 381 396 411

The missing values were replaced with median

### Replace the missing age, temp and time_admission values with the column
### median. na.rm = TRUE makes median() ignore the NAs being filled in.
Maternity$age[is.na(Maternity$age)] <- median(Maternity$age, na.rm = TRUE)
Maternity$temp[is.na(Maternity$temp)] <- median(Maternity$temp, na.rm = TRUE)
Maternity$time_admission[is.na(Maternity$time_admission)] <-
  median(Maternity$time_admission, na.rm = TRUE)

# View() is interactive-only; skip it in non-interactive runs.
if (interactive()) {
  View(Maternity)
}

# Drop every row that still has an NA in any column other than age_cat.
# Columns are selected with a logical name comparison rather than
# -which(...): if "age_cat" were ever absent, -which() would yield an empty
# index and select zero columns instead of all of them.
Maternity <- Maternity[
  complete.cases(Maternity[, names(Maternity) != "age_cat"]),
]

Find the row of bmi that is negative and take the absolute value.

# Locate the rows whose bmi is negative.
negative_rowsbmi <- which(Maternity$bmi < 0)
negative_rowsbmi
## [1] 73
# Only one bmi value is negative, which is most likely a typing error, so it
# is made positive by taking the absolute value. The index from which() is
# reused: it never contains NA, so the assignment stays valid even if bmi
# were to contain missing values.
Maternity$bmi[negative_rowsbmi] <- abs(Maternity$bmi[negative_rowsbmi])

# View() is interactive-only; skip it in non-interactive runs.
if (interactive()) {
  View(Maternity)
}

One of the weights is negative, so we take its absolute value

# Make the negative weight positive; non-negative weights are unaffected.
Maternity$wt_kg <- abs(Maternity$wt_kg)

Data Visualization

library(ggplot2)

# Bar chart: number of cases per gender.
ggplot(data = Maternity, mapping = aes(x = gender, fill = gender)) +
  geom_bar() +
  scale_fill_manual(values = c("m" = "green", "f" = "red")) +
  labs(
    title = " Frequency representation of gender",
    x = "Gender",
    y = "Frequency"
  )

# Bar chart: number of cases per outcome.
ggplot(data = Maternity, mapping = aes(x = outcome, fill = outcome)) +
  geom_bar() +
  scale_fill_manual(values = c("Death" = "black", "Recover" = "blue")) +
  labs(
    title = " Frequency representation of outcome",
    x = "outcome",
    y = "Frequency"
  )

# Dodged bars of age by gender, coloured by outcome.
# geom_col() draws one bar per row at its raw age value.
# NOTE(review): rows sharing a gender/outcome group appear to be drawn on top
# of each other, so each visible bar would show the largest age in its group.
# The title previously called this a scatter plot of sex; it is a bar chart.
ggplot(Maternity, aes(x = gender, y = age, fill = outcome)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("Death" = "black", "Recover" = "blue")) +
  labs(
    title = "Bar chart of age by gender and outcome",
    x = "gender",
    y = "age"
  )

# Jittered scatter of age for every gender/outcome combination.
# The jitter layer maps x to interaction(gender, outcome), so the axis breaks
# are strings such as "f.Death" and "m.Recover". A label vector keyed only by
# "m" and "f" never matches those breaks, so a labelling function is used to
# rewrite the gender prefix, giving "Female / Death", "Male / Recover", etc.
ggplot(Maternity, aes(x = gender, y = age, color = outcome)) +
  geom_jitter(aes(x = interaction(gender, outcome)), width = 0.2) +
  scale_x_discrete(
    labels = function(breaks) {
      breaks <- sub("^m\\.", "Male / ", breaks)
      sub("^f\\.", "Female / ", breaks)
    }
  ) +
  scale_color_manual(values = c("Death" = "red", "Recover" = "blue")) +
  labs(
    title = "Gender Group",
    x = "Gender",
    y = "Age",
    color = "Outcome"
  ) +
  theme_minimal()

# Dodged bar chart: number of cases at each age, split by gender.
ggplot(data = Maternity, mapping = aes(x = age, fill = gender)) +
  geom_bar(position = "dodge", colour = "black") +
  scale_fill_manual(values = c("f" = "pink", "m" = "green"))

# Min / quartiles / mean / max of the cleaned age column.
summary(Maternity$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     6.0    12.0    15.5    23.0    59.0

Distribution of Age and Generation

# Line graph of generation against age.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# `size` is kept so the call behaves the same on older versions.
ggplot(data = Maternity, mapping = aes(x = age, y = generation)) +
  geom_line(size = 0.7, colour = "purple") +
  labs(title = " Line Graph", x = "age", y = "generation")

# Min / quartiles / mean / max of the cleaned age column.
summary(Maternity$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     6.0    12.0    15.5    23.0    59.0

Boxplots representations to view outliers

# One horizontal boxplot per numeric variable, used to spot outliers.
ol1 <- ggplot(Maternity, aes(x = generation)) + geom_boxplot(fill = "green")
ol2 <- ggplot(Maternity, aes(x = age)) + geom_boxplot(fill = "green")
ol3 <- ggplot(Maternity, aes(x = wt_kg)) + geom_boxplot(fill = "green")
ol4 <- ggplot(Maternity, aes(x = ht_cm)) + geom_boxplot(fill = "green")
ol5 <- ggplot(Maternity, aes(x = ct_blood)) + geom_boxplot(fill = "green")
ol6 <- ggplot(Maternity, aes(x = temp)) + geom_boxplot(fill = "green")
ol7 <- ggplot(Maternity, aes(x = days_onset_hosp)) +
  geom_boxplot(fill = "green")
ol8 <- ggplot(Maternity, aes(x = bmi)) + geom_boxplot(fill = "green")

# Lay the eight boxplots out on a three-column grid.
library(gridExtra)
grid.arrange(ol1, ol2, ol3, ol4, ol5, ol6, ol7, ol8, ncol = 3)

Distribution of Gender by weight

# Bars of wt_kg per gender; each row contributes its raw weight, so bars
# stack to the total weight recorded for that gender.
ggplot(data = Maternity, mapping = aes(x = gender, y = wt_kg)) +
  geom_col(fill = "blue")

Distribution of Gender by Height

# Bars of ht_cm per gender; each row contributes its raw height, so bars
# stack to the total height recorded for that gender.
ggplot(data = Maternity, mapping = aes(x = gender, y = ht_cm)) +
  geom_col(fill = "blue")

Distribution of Fever

# Bar chart: number of cases with and without fever.
ggplot(data = Maternity, mapping = aes(x = fever, fill = fever)) +
  geom_bar() +
  scale_fill_manual(values = c("no" = "orange", "yes" = "chocolate"))

# Min / quartiles / mean / max of the cleaned age column.
summary(Maternity$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     6.0    12.0    15.5    23.0    59.0

Distribution of Chills

# Bar chart: number of cases with and without chills.
ggplot(data = Maternity, mapping = aes(x = chills, fill = chills)) +
  geom_bar() +
  scale_fill_manual(values = c("no" = "skyblue", "yes" = "gold"))

# Min / quartiles / mean / max of the cleaned age column.
summary(Maternity$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     6.0    12.0    15.5    23.0    59.0

Distribution by Aches

# Bar chart: number of cases with and without aches.
ggplot(data = Maternity, mapping = aes(x = aches, fill = aches)) +
  geom_bar() +
  scale_fill_manual(values = c("no" = "gold", "yes" = "green"))

# Min / quartiles / mean / max of the cleaned age column.
summary(Maternity$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     6.0    12.0    15.5    23.0    59.0

Symptom Distribution by Gender

# Dodged bar charts of each symptom (yes / no) within each gender.
# All five charts share the same yes/no colour palette.
gender_symptom_colours <- c("no" = "gold", "yes" = "chocolate")

# Fever by gender.
G1 <- ggplot(data = Maternity, mapping = aes(x = gender, fill = fever)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = gender_symptom_colours) +
  labs(
    title = "Distribution of Fever by Gender",
    x = "Gender", y = "Frequency", fill = "Fever"
  )

# Chills by gender.
G2 <- ggplot(data = Maternity, mapping = aes(x = gender, fill = chills)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = gender_symptom_colours) +
  labs(
    title = "Distribution of Chills by Gender",
    x = "Gender", y = "Frequency", fill = "Chills"
  )

# Vomiting by gender.
G3 <- ggplot(data = Maternity, mapping = aes(x = gender, fill = vomit)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = gender_symptom_colours) +
  labs(
    title = "Distribution of  vomit by Gender",
    x = "Gender", y = "Frequency", fill = " vomit"
  )

# Aches by gender.
G4 <- ggplot(data = Maternity, mapping = aes(x = gender, fill = aches)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = gender_symptom_colours) +
  labs(
    title = "Distribution of   aches by Gender",
    x = "Gender", y = "Frequency", fill = "  aches"
  )

# Cough by gender.
G5 <- ggplot(data = Maternity, mapping = aes(x = gender, fill = cough)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = gender_symptom_colours) +
  labs(
    title = "Distribution of cough by Gender",
    x = "Gender", y = "Frequency", fill = "cough"
  )

# Arrange the five charts on a two-column grid.
library(gridExtra)
grid.arrange(G1, G2, G3, G4, G5, ncol = 2)

Symptoms Distribution by Outcome

# Dodged bar charts of each symptom (yes / no) within each outcome.
# All five charts share the same yes/no colour palette.
outcome_symptom_colours <- c("no" = "gold", "yes" = "chocolate")

# Aches by outcome.
OT1 <- ggplot(data = Maternity, mapping = aes(x = outcome, fill = aches)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = outcome_symptom_colours) +
  labs(
    title = "Distribution of aches by outcome",
    x = "outcome", y = "Frequency", fill = "  aches"
  )

# Chills by outcome.
OT2 <- ggplot(data = Maternity, mapping = aes(x = outcome, fill = chills)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = outcome_symptom_colours) +
  labs(
    title = "Distribution of Chills by outcome",
    x = "outcome", y = "Frequency", fill = "Chills"
  )

# Cough by outcome.
OT3 <- ggplot(data = Maternity, mapping = aes(x = outcome, fill = cough)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = outcome_symptom_colours) +
  labs(
    title = "Distribution of cough by outcome",
    x = "outcome", y = "Frequency", fill = " cough"
  )

# Fever by outcome.
OT4 <- ggplot(data = Maternity, mapping = aes(x = outcome, fill = fever)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = outcome_symptom_colours) +
  labs(
    title = "Distribution of fever by outcome",
    x = "outcome", y = "Frequency", fill = "fever"
  )

# Vomiting by outcome.
OT5 <- ggplot(data = Maternity, mapping = aes(x = outcome, fill = vomit)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = outcome_symptom_colours) +
  labs(
    title = "Distribution of vomit by outcome",
    x = "outcome", y = "Frequency", fill = " vomit"
  )

# Arrange the five charts on a two-column grid.
library(gridExtra)
grid.arrange(OT1, OT2, OT3, OT4, OT5, ncol = 2)

Symptoms with Age

library(gridExtra)

# Bars of age against each symptom (yes / no), one chart per symptom.
# geom_col() plots each row's raw age value.
# NOTE(review): rows in the same symptom group appear to be drawn on top of
# each other, so each visible bar would show the largest age in that group.

# Age by aches.
A1 <- ggplot(Maternity, aes(x = aches, y = age, fill = aches)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("no" = "gold", "yes" = "chocolate")) +
  labs(
    title = "Distribution of age by aches",
    x = "aches", y = "Age", fill = " aches"
  )

# Age by chills.
A2 <- ggplot(Maternity, aes(x = chills, y = age, fill = chills)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("no" = "gold", "yes" = "chocolate")) +
  labs(
    title = "Distribution of age by chills",
    x = "chills", y = "Age", fill = " chills"
  )

# Age by cough.
A3 <- ggplot(Maternity, aes(x = cough, y = age, fill = cough)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("no" = "gold", "yes" = "chocolate")) +
  labs(
    title = "Distribution of age by cough",
    x = "cough", y = "Age", fill = " cough"
  )

# Age by fever.
A4 <- ggplot(Maternity, aes(x = fever, y = age, fill = fever)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("no" = "gold", "yes" = "chocolate")) +
  labs(
    title = "Distribution of age by fever",
    x = "fever", y = "Age", fill = " fever"
  )

# Age by vomiting.
A5 <- ggplot(Maternity, aes(x = vomit, y = age, fill = vomit)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("no" = "gold", "yes" = "chocolate")) +
  labs(
    title = "Distribution of age by vomit",
    x = "vomit", y = "Age", fill = " vomit"
  )

# The grid needs a whole number of columns; the earlier value of 3.5 only
# worked because it was silently truncated.
grid.arrange(A1, A2, A3, A4, A5, ncol = 3)

Since age_cat contains a small set of repeated categories, we convert it to an ordered factor

# Turn age_cat into an ordered factor so the age bands sort from youngest to
# oldest instead of alphabetically.
Maternity$age_cat <- ordered(
  Maternity$age_cat,
  levels = c("0-4", "5-9", "10-14", "15-19", "20-29", "30-49", "50-69")
)
str(Maternity$age_cat)
##  Ord.factor w/ 7 levels "0-4"<"5-9"<"10-14"<..: 4 3 4 1 3 6 1 3 6 1 ...
levels(Maternity$age_cat)
## [1] "0-4"   "5-9"   "10-14" "15-19" "20-29" "30-49" "50-69"
# Bar chart: number of cases in each age band.
ggplot(data = Maternity, mapping = aes(x = age_cat)) +
  geom_bar(fill = "chocolate") +
  labs(
    title = "Age Category Distribution",
    x = "Age Category",
    y = "Frequency"
  )

library(ggplot2)

# Dodged bar chart: number of deaths and recoveries within each gender.
ggplot(data = Maternity, mapping = aes(x = gender, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Death" = "gold", "Recover" = "chocolate")) +
  labs(
    title = "Gender Distribution with Outcome",
    x = "gender",
    y = "Frequency"
  )

library(ggplot2)

# (Re)apply the ordered age bands so this chunk also works on its own.
Maternity$age_cat <- ordered(
  Maternity$age_cat,
  levels = c("0-4", "5-9", "10-14", "15-19", "20-29", "30-49", "50-69")
)
str(Maternity$age_cat)
##  Ord.factor w/ 7 levels "0-4"<"5-9"<"10-14"<..: 4 3 4 1 3 6 1 3 6 1 ...
# Dodged bar chart: deaths and recoveries side by side within each age band.
ggplot(data = Maternity, mapping = aes(x = age_cat, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Death" = "gold", "Recover" = "chocolate")) +
  labs(
    title = "Age_cat Distribution with Outcome",
    x = "Age_cat",
    y = "Frequency"
  ) +
  # Tilt the age-band labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(ggplot2)

# (Re)apply the ordered age bands so this chunk also works on its own.
Maternity$age_cat <- ordered(
  Maternity$age_cat,
  levels = c("0-4", "5-9", "10-14", "15-19", "20-29", "30-49", "50-69")
)
str(Maternity$age_cat)
##  Ord.factor w/ 7 levels "0-4"<"5-9"<"10-14"<..: 4 3 4 1 3 6 1 3 6 1 ...
# Stacked bar chart: deaths and recoveries stacked within each age band.
ggplot(data = Maternity, mapping = aes(x = age_cat, fill = outcome)) +
  geom_bar() +
  scale_fill_manual(values = c("Death" = "gold", "Recover" = "chocolate")) +
  labs(
    title = "Age_cat Distribution with Outcome",
    x = "Age_cat",
    y = "Frequency"
  ) +
  # Tilt the age-band labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(ggplot2)

# Dodged bar chart: case counts per onset-to-hospitalisation delay (days),
# with deaths and recoveries side by side.
ggplot(data = Maternity, mapping = aes(x = days_onset_hosp, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Death" = "gold", "Recover" = "chocolate")) +
  labs(
    title = "Days_onset_hosp Distribution with Outcome",
    x = "Days_onset_hosp",
    y = "Frequency"
  )

  #theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)

# Stacked bar chart: case counts per onset-to-hospitalisation delay (days),
# with deaths and recoveries stacked.
ggplot(data = Maternity, mapping = aes(x = days_onset_hosp, fill = outcome)) +
  geom_bar() +
  scale_fill_manual(values = c("Death" = "gold", "Recover" = "chocolate")) +
  labs(
    title = "Days_onset_hosp Distribution with Outcome",
    x = "Days_onset_hosp",
    y = "Frequency"
  )

library(corrplot)

# Numeric measurements whose pairwise correlations are of interest.
sel_columns <- Maternity[
  ,
  c("generation", "age", "wt_kg", "ht_cm", "ct_blood", "temp")
]

# Correlation matrix computed over rows with no missing values in these
# columns.
cor_matrix <- cor(sel_columns, use = "complete.obs")

# Colour-coded correlation matrix on a 200-step red-white-blue palette, with
# the coefficients printed in each cell.
corrplot(
  cor_matrix,
  method = "color",
  addCoef.col = "black", # colour of the printed coefficients
  tl.col = "black",      # colour of the variable labels
  tl.cex = 0.8,          # size of the variable labels
  number.cex = 0.9,      # size of the printed coefficients
  col = colorRampPalette(c("red", "white", "blue"))(200)
)

To plot Age with respect to weight

# Scatter plot of age against weight; stored in W and drawn later alongside
# the height plot.
W <- ggplot(data = Maternity, mapping = aes(x = wt_kg, y = age)) +
  geom_point(size = 1, colour = "red")

# Summary of the cleaned data set (after column removal and NA handling).
summary(Maternity)
##    case_id            generation      outcome             gender         
##  Length:305         Min.   : 3.00   Length:305         Length:305        
##  Class :character   1st Qu.:12.00   Class :character   Class :character  
##  Mode  :character   Median :16.00   Mode  :character   Mode  :character  
##                     Mean   :16.62                                        
##                     3rd Qu.:21.00                                        
##                     Max.   :37.00                                        
##                                                                          
##       age         age_unit          age_cat        lon              lat       
##  Min.   : 0.0   Length:305         0-4  :59   Min.   :-13.27   Min.   :8.448  
##  1st Qu.: 6.0   Class :character   5-9  :62   1st Qu.:-13.25   1st Qu.:8.461  
##  Median :12.0   Mode  :character   10-14:54   Median :-13.23   Median :8.468  
##  Mean   :15.5                      15-19:35   Mean   :-13.23   Mean   :8.469  
##  3rd Qu.:23.0                      20-29:56   3rd Qu.:-13.22   3rd Qu.:8.479  
##  Max.   :59.0                      30-49:35   Max.   :-13.21   Max.   :8.490  
##                                    50-69: 4                                   
##      wt_kg           ht_cm          ct_blood        fever          
##  Min.   : 4.00   Min.   : 12.0   Min.   :16.00   Length:305        
##  1st Qu.:40.00   1st Qu.: 85.0   1st Qu.:20.00   Class :character  
##  Median :54.00   Median :129.0   Median :22.00   Mode  :character  
##  Mean   :52.24   Mean   :122.1   Mean   :21.23                     
##  3rd Qu.:66.00   3rd Qu.:157.0   3rd Qu.:23.00                     
##  Max.   :95.00   Max.   :266.0   Max.   :25.00                     
##                                                                    
##     chills             cough              aches              vomit          
##  Length:305         Length:305         Length:305         Length:305        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##       temp       time_admission         bmi         days_onset_hosp 
##  Min.   :35.60   Length:305        Min.   : 12.31   Min.   : 0.000  
##  1st Qu.:38.40   Class1:hms        1st Qu.: 24.65   1st Qu.: 1.000  
##  Median :38.90   Class2:difftime   Median : 33.57   Median : 1.000  
##  Mean   :38.71   Mode  :numeric    Mean   : 52.33   Mean   : 2.151  
##  3rd Qu.:39.30                     3rd Qu.: 53.02   3rd Qu.: 3.000  
##  Max.   :40.40                     Max.   :428.06   Max.   :18.000  
## 

To plot Age with respect to Height

# Scatter plot of age against height.
H <- ggplot(data = Maternity, mapping = aes(x = ht_cm, y = age)) +
  geom_point(size = 1, colour = "brown")

# Show the weight and height scatter plots side by side.
grid.arrange(W, H, ncol = 2)

It was observed that both height and weight are positively related to age: an increase in age is associated with an increase in height and weight.

# Bars of days_onset_hosp per outcome; each row contributes its raw value, so
# bars stack to the total number of days for that outcome.
ggplot(
  data = Maternity,
  mapping = aes(x = outcome, y = days_onset_hosp, fill = outcome)
) +
  geom_col()

# Dodged bar chart: case counts per onset-to-hospitalisation delay (days),
# with deaths and recoveries side by side.
ggplot(data = Maternity, mapping = aes(x = days_onset_hosp, fill = outcome)) +
  geom_bar(position = "dodge", colour = "white") +
  scale_fill_manual(values = c("Death" = "purple", "Recover" = "brown")) +
  labs(
    title = "Distribution of days_onset_hosp by Outcome",
    x = "days_onset_hosp",
    y = "Frequency",
    fill = "  Outcome"
  )