Central Hospital

Dorcas Olanike Agboola

2024-12-10

Introduction

Load the dataset using the library and find the summary of the dataset.

# Import the Central Hospital line list from disk with readr.
library(readr)
CHospital <- read_csv(
  file = "C:/Users/USER/Desktop/ICAMMDA/Data Analytics/Central Hospital.csv"
)
# Open the data viewer, then report the table size as (rows, columns).
View(x = CHospital)
dim(CHospital)
## [1] 454  30
# Per-column summary statistics (range, quartiles and NA counts).
summary(object = CHospital)
##    case_id            generation    date_infection         date_onset        
##  Length:454         Min.   : 2.00   Min.   :2014-04-23   Min.   :2014-05-01  
##  Class :character   1st Qu.:13.00   1st Qu.:2014-09-07   1st Qu.:2014-09-20  
##  Mode  :character   Median :16.00   Median :2014-10-12   Median :2014-10-23  
##                     Mean   :16.88   Mean   :2014-10-22   Mean   :2014-11-05  
##                     3rd Qu.:20.00   3rd Qu.:2014-11-22   3rd Qu.:2014-12-15  
##                     Max.   :36.00   Max.   :2015-04-22   Max.   :2015-04-28  
##                                     NA's   :154                              
##  date_hospitalisation  date_outcome          outcome         
##  Min.   :2014-05-06   Min.   :2014-05-11   Length:454        
##  1st Qu.:2014-09-23   1st Qu.:2014-10-03   Class :character  
##  Median :2014-10-25   Median :2014-11-06   Mode  :character  
##  Mean   :2014-11-07   Mean   :2014-11-18                     
##  3rd Qu.:2014-12-17   3rd Qu.:2014-12-31                     
##  Max.   :2015-04-28   Max.   :2015-05-17                     
##                       NA's   :74                             
##     gender               age         age_unit           age_years    
##  Length:454         Min.   : 0.0   Length:454         Min.   : 0.00  
##  Class :character   1st Qu.: 7.0   Class :character   1st Qu.: 7.00  
##  Mode  :character   Median :15.0   Mode  :character   Median :15.00  
##                     Mean   :17.4                      Mean   :17.38  
##                     3rd Qu.:24.0                      3rd Qu.:24.00  
##                     Max.   :87.0                      Max.   :87.00  
##                     NA's   :9                         NA's   :9      
##    age_cat            age_cat5           hospital              lon        
##  Length:454         Length:454         Length:454         Min.   :-13.27  
##  Class :character   Class :character   Class :character   1st Qu.:-13.25  
##  Mode  :character   Mode  :character   Mode  :character   Median :-13.23  
##                                                           Mean   :-13.23  
##                                                           3rd Qu.:-13.22  
##                                                           Max.   :-13.21  
##                                                                           
##       lat          infector            source              wt_kg       
##  Min.   :8.448   Length:454         Length:454         Min.   : -2.00  
##  1st Qu.:8.460   Class :character   Class :character   1st Qu.: 43.00  
##  Median :8.468   Mode  :character   Mode  :character   Median : 57.50  
##  Mean   :8.469                                         Mean   : 55.13  
##  3rd Qu.:8.479                                         3rd Qu.: 67.00  
##  Max.   :8.490                                         Max.   :103.00  
##                                                                        
##      ht_cm          ct_blood        fever              chills         
##  Min.   : 15.0   Min.   :17.00   Length:454         Length:454        
##  1st Qu.: 97.0   1st Qu.:20.00   Class :character   Class :character  
##  Median :135.0   Median :22.00   Mode  :character   Mode  :character  
##  Mean   :129.7   Mean   :21.19                                        
##  3rd Qu.:161.0   3rd Qu.:22.00                                        
##  Max.   :335.0   Max.   :25.00                                        
##                                                                       
##     cough              aches              vomit                temp     
##  Length:454         Length:454         Length:454         Min.   :35.7  
##  Class :character   Class :character   Class :character   1st Qu.:37.8  
##  Mode  :character   Mode  :character   Mode  :character   Median :38.8  
##                                                           Mean   :38.5  
##                                                           3rd Qu.:39.2  
##                                                           Max.   :40.4  
##                                                           NA's   :3     
##  time_admission         bmi         days_onset_hosp 
##  Length:454        Min.   :-41.32   Min.   : 0.000  
##  Class1:hms        1st Qu.: 24.10   1st Qu.: 1.000  
##  Class2:difftime   Median : 32.23   Median : 1.000  
##  Mode  :numeric    Mean   : 44.57   Mean   : 1.852  
##                    3rd Qu.: 47.71   3rd Qu.: 2.000  
##                    Max.   :370.37   Max.   :12.000  
## 

Observation from the dataset

It was observed that

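  1. There are missing values in the following columns: date_infection, date_outcome, age, age_years, temp and time_admission
  2. Some age_unit values are in months instead of years
  3. The age column duplicates the age_years column
  4. The age_cat column duplicates the age_cat5 column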

I removed the columns that are not needed to answer the questions in this work.

# Drop the columns that are not used in the rest of the analysis.
# `date_outcome` used to be "removed" with `CHospitaldate_outcome <- NULL`
# (missing `$`), which only created a stray variable and left the column in
# the data. It is now dropped together with the other unused columns.
unused_cols <- c(
  "date_infection", "date_outcome", "date_onset", "date_hospitalisation",
  "hospital", "infector", "source"
)
CHospital <- CHospital[setdiff(names(CHospital), unused_cols)]
View(CHospital)

# `age_years` and `age_cat5` are treated by this analysis as redundant with
# `age` and `age_cat`, so only the latter pair is kept.
# NOTE(review): `age_unit` shows that some ages are recorded in months, so
# `age` and `age_years` may differ for those rows - confirm that `age` is the
# right column to keep.
redundant_cols <- c("age_years", "age_cat5")
CHospital <- CHospital[setdiff(names(CHospital), redundant_cols)]

I checked which rows have NA in the age, temp and time_admission columns.

### Locate the rows holding NA in the age, temp and time_admission columns
na_age <- which(is.na(CHospital[["age"]]))
print(na_age)
## [1]  69 158 250 279 312 322 325 360 363
na_temp <- which(is.na(CHospital[["temp"]]))
print(na_temp)
## [1] 452 453 454
na_time_ad <- which(is.na(CHospital[["time_admission"]]))
print(na_time_ad)
##  [1]  13  18  20  21  27  32  45  54  64  65  70  72  74  91  92 103 107 110 125
## [20] 128 169 171 172 187 192 195 199 206 209 220 228 229 251 253 261 274 286 287
## [39] 291 300 304 305 306 307 314 335 341 346 354 362 363 369 378 379 389 394 409
## [58] 411 420 435 437 438 443 446 449 454

The missing values in age, temp and time_admission were replaced with the median of the respective column.

### Replace the missing values in each column with that column's median
# `na.rm` is passed by name so the median is computed on the observed values
# without relying on argument position.
CHospital$age[is.na(CHospital$age)] <- median(CHospital$age, na.rm = TRUE)
CHospital$temp[is.na(CHospital$temp)] <- median(CHospital$temp, na.rm = TRUE)
CHospital$time_admission[is.na(CHospital$time_admission)] <-
  median(CHospital$time_admission, na.rm = TRUE)
View(CHospital)
# Drop every row that still has an NA in any column other than `age_cat`.
# setdiff() replaces the former `-which(names(...) == "age_cat")`: a negative
# index built from an empty which() selects no columns at all instead of
# all of them.
cols_to_check <- setdiff(names(CHospital), "age_cat")
CHospital <- CHospital[complete.cases(CHospital[, cols_to_check]), ]

Identify the row of bmi that is negative and take the absolute value.

# Find the rows where `bmi` is negative
negative_rowsbmi <- which(CHospital$bmi < 0)
negative_rowsbmi
## integer(0)
# A negative `bmi` is not physically meaningful and is treated as a sign
# (typing) error, so the sign is dropped. abs() is applied to the whole
# column: non-negative values are unchanged and, unlike an assignment through
# a logical index, it does not fail when the column contains NA.
CHospital$bmi <- abs(CHospital$bmi)
View(CHospital)

One of the weights is negative; since weight cannot be negative, we take its absolute value.

# Weight cannot be negative, so any negative sign is treated as a typing
# error. abs() on the whole column leaves non-negative values untouched and
# stays valid if the column ever contains NA.
CHospital$wt_kg <- abs(CHospital$wt_kg)

Data Visualization

# Bar chart: number of patients per gender, one colour per gender.
library(ggplot2)
ggplot(data = CHospital, mapping = aes(x = gender, fill = gender)) +
  geom_bar() +
  scale_fill_manual(values = c("m" = "green", "f" = "red")) +
  labs(
    title = " Frequency representation of gender",
    x = "Gender",
    y = "Frequency"
  )

# Bar chart: number of patients per outcome (Death / Recover).
library(ggplot2)
ggplot(data = CHospital, mapping = aes(x = outcome, fill = outcome)) +
  geom_bar() +
  scale_fill_manual(values = c("Death" = "black", "Recover" = "blue")) +
  labs(
    title = " Frequency representation of outcome",
    x = "outcome",
    y = "Frequency"
  )

# Age drawn as bars per gender, dodged by outcome.
# geom_col() is the identity-stat form of geom_bar(): bar height is the raw
# `age` value of each row.
ggplot(data = CHospital, mapping = aes(x = gender, y = age, fill = outcome)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("Death" = "black", "Recover" = "blue")) +
  labs(
    title = " Barchart representation of gender",
    x = "gender",
    y = "age"
  )

# Jittered age values, one strip per gender/outcome combination.
# The x axis is built from interaction(gender, outcome), whose levels look
# like "f.Death" or "m.Recover". Labels keyed only by "m"/"f" never match
# those breaks and are ignored, so the labels are keyed by the interaction
# levels instead. Outcome is already shown by the point colour.
ggplot(
  CHospital,
  aes(x = interaction(gender, outcome), y = age, color = outcome)
) +
  geom_jitter(width = 0.2) +
  scale_x_discrete(labels = c(
    "f.Death" = "Female", "m.Death" = "Male",
    "f.Recover" = "Female", "m.Recover" = "Male"
  )) +
  scale_color_manual(values = c("Death" = "red", "Recover" = "blue")) +
  labs(
    title = "Gender Group",
    x = "Gender",
    y = "Age",
    color = "Outcome"
  ) +
  theme_minimal()

# Count of patients at each age, with the two genders drawn side by side.
ggplot(data = CHospital, mapping = aes(x = age, fill = gender)) +
  geom_bar(position = "dodge", colour = "black") +
  scale_fill_manual(values = c("f" = "pink", "m" = "green"))

#labs(title=" Scatter Plot representation of Sex",x= "gender", y= "age")
# Five-number summary (plus mean) of the age column.
summary(object = CHospital$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     7.0    15.0    17.9    24.0    73.0

Distribution of Age and Generation

# Line plot of generation against age.
# `linewidth` is used for the line thickness: since ggplot2 3.4.0 the `size`
# argument is deprecated for line geoms and emits a warning.
ggplot(CHospital, aes(x = age, y = generation)) +
  geom_line(linewidth = 0.7, colour = "purple") +
  labs(title = " Line Graph", x = "age", y = "generation")

# Five-number summary (plus mean) of the age column.
summary(CHospital$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     7.0    15.0    17.9    24.0    73.0

Boxplots representations to view outliers

# One horizontal boxplot per numeric variable, used to eyeball outliers.
# The same green boxplot layer is added to each per-variable plot.
green_box <- geom_boxplot(fill = "green")
ol1 <- ggplot(data = CHospital, mapping = aes(x = generation)) + green_box
ol2 <- ggplot(data = CHospital, mapping = aes(x = age)) + green_box
ol3 <- ggplot(data = CHospital, mapping = aes(x = wt_kg)) + green_box
ol4 <- ggplot(data = CHospital, mapping = aes(x = ht_cm)) + green_box
ol5 <- ggplot(data = CHospital, mapping = aes(x = ct_blood)) + green_box
ol6 <- ggplot(data = CHospital, mapping = aes(x = temp)) + green_box
ol7 <- ggplot(data = CHospital, mapping = aes(x = days_onset_hosp)) + green_box
ol8 <- ggplot(data = CHospital, mapping = aes(x = bmi)) + green_box
# Lay the eight plots out on a grid three columns wide.
library(gridExtra)
grid.arrange(ol1, ol2, ol3, ol4, ol5, ol6, ol7, ol8, ncol = 3)

Distribution of Gender by weight

# Bars of raw weight values per gender (identity stat, so the rows of each
# gender are stacked on top of one another).
ggplot(data = CHospital, mapping = aes(x = gender, y = wt_kg)) +
  geom_col(fill = "blue")

Distribution of Gender by Height

# Bars of raw height values per gender (identity stat, so the rows of each
# gender are stacked on top of one another).
ggplot(data = CHospital, mapping = aes(x = gender, y = ht_cm)) +
  geom_col(fill = "blue")

Distribution of Fever

# Number of patients with and without fever.
ggplot(data = CHospital, mapping = aes(x = fever, fill = fever)) +
  geom_bar() +
  scale_fill_manual(values = c("no" = "orange", "yes" = "chocolate"))

#labs(title=" Line Graph",x= "age", y= "generation")
# Five-number summary (plus mean) of the age column.
summary(object = CHospital$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     7.0    15.0    17.9    24.0    73.0

Distribution of Chills

# Number of patients with and without chills.
ggplot(data = CHospital, mapping = aes(x = chills, fill = chills)) +
  geom_bar() +
  scale_fill_manual(values = c("no" = "skyblue", "yes" = "gold"))

#labs(title=" Line Graph",x= "age", y= "Frequency")
# Five-number summary (plus mean) of the age column.
summary(object = CHospital$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     7.0    15.0    17.9    24.0    73.0

Distribution by Aches

# Number of patients with and without aches.
ggplot(data = CHospital, mapping = aes(x = aches, fill = aches)) +
  geom_bar() +
  scale_fill_manual(values = c("no" = "gold", "yes" = "green"))

#labs(title=" Line Graph",x= "age", y= "Frequency")
# Five-number summary (plus mean) of the age column.
summary(object = CHospital$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     7.0    15.0    17.9    24.0    73.0

Distribution of symptoms by Gender

# Build one dodged bar chart of a yes/no symptom split by gender.
#   mapping      - aes() mapping gender to x and the symptom to fill
#   plot_title   - title shown above the plot
#   legend_title - title of the fill legend
symptom_by_gender <- function(mapping, plot_title, legend_title) {
  ggplot(CHospital, mapping) +
    geom_bar(position = "dodge") +
    scale_fill_manual(values = c("no" = "gold", "yes" = "chocolate")) +
    labs(
      title = plot_title,
      x = "Gender",
      y = "Frequency",
      fill = legend_title
    )
}

G1 <- symptom_by_gender(
  aes(x = gender, fill = fever),
  "Distribution of Fever by Gender", "Fever"
)
G2 <- symptom_by_gender(
  aes(x = gender, fill = chills),
  "Distribution of Chills by Gender", "Chills"
)
G3 <- symptom_by_gender(
  aes(x = gender, fill = vomit),
  "Distribution of  vomit by Gender", " vomit"
)
G4 <- symptom_by_gender(
  aes(x = gender, fill = aches),
  "Distribution of   aches by Gender", "  aches"
)
G5 <- symptom_by_gender(
  aes(x = gender, fill = cough),
  "Distribution of cough by Gender", "cough"
)

# Show the five symptom plots on a grid two columns wide.
library(gridExtra)
grid.arrange(G1, G2, G3, G4, G5, ncol = 2)

Distribution of symptoms by Outcome

# Build one dodged bar chart of a yes/no symptom split by outcome.
#   mapping      - aes() mapping outcome to x and the symptom to fill
#   plot_title   - title shown above the plot
#   legend_title - title of the fill legend
symptom_by_outcome <- function(mapping, plot_title, legend_title) {
  ggplot(CHospital, mapping) +
    geom_bar(position = "dodge") +
    scale_fill_manual(values = c("no" = "red", "yes" = "green")) +
    labs(
      title = plot_title,
      x = "outcome",
      y = "Frequency",
      fill = legend_title
    )
}

OT1 <- symptom_by_outcome(
  aes(x = outcome, fill = aches),
  "Distribution of aches", "  aches"
)
OT2 <- symptom_by_outcome(
  aes(x = outcome, fill = chills),
  "Distribution of Chills by outcome", "Chills"
)
OT3 <- symptom_by_outcome(
  aes(x = outcome, fill = cough),
  "Distribution of cough by outcome", " cough"
)
OT4 <- symptom_by_outcome(
  aes(x = outcome, fill = fever),
  "Distribution of fever by outcome", "fever"
)
OT5 <- symptom_by_outcome(
  aes(x = outcome, fill = vomit),
  "Distribution of vomit by outcome", " vomit"
)
# Show the five symptom plots on a grid two columns wide.
library(gridExtra)
grid.arrange(OT1, OT2, OT3, OT4, OT5, ncol = 2)

Symptoms with Age

library(gridExtra)

# Build one bar chart of raw age values (identity stat) split by a yes/no
# symptom.
#   mapping - aes() mapping the symptom to x and fill, and age to y
#   symptom - symptom name used in the title, x-axis label and legend title
age_by_symptom <- function(mapping, symptom) {
  ggplot(CHospital, mapping) +
    geom_bar(stat = "identity", position = "dodge") +
    scale_fill_manual(values = c("no" = "gold", "yes" = "chocolate")) +
    labs(
      title = paste("Distribution of age by", symptom),
      x = symptom,
      y = "Age",
      fill = paste0(" ", symptom)
    )
}

A1 <- age_by_symptom(aes(x = aches, y = age, fill = aches), "aches")
A2 <- age_by_symptom(aes(x = chills, y = age, fill = chills), "chills")
A3 <- age_by_symptom(aes(x = cough, y = age, fill = cough), "cough")
A4 <- age_by_symptom(aes(x = fever, y = age, fill = fever), "fever")
A5 <- age_by_symptom(aes(x = vomit, y = age, fill = vomit), "vomit")

# The number of grid columns must be a whole number; the previous value of
# 3.5 is replaced by 3 so the layout no longer depends on how a fractional
# column count happens to be handled.
grid.arrange(A1, A2, A3, A4, A5, ncol = 3)

Since the age_cat values repeat across rows (it is a categorical variable), we convert it to an ordered factor.

# Store `age_cat` as an ordered factor so the age bands sort by age rather
# than alphabetically.
# factor() turns any value that is not listed in `levels` into NA. Ages above
# 69 occur in the data (the maximum age after cleaning is 73), so a top band
# is included to keep those patients instead of silently dropping them to NA.
# NOTE(review): "70+" is the assumed spelling of the top band - confirm with
# unique(CHospital$age_cat) before relying on it.
age_bands <- c(
  "0-4", "5-9", "10-14", "15-19", "20-29", "30-49", "50-69", "70+"
)
CHospital$age_cat <- factor(CHospital$age_cat, levels = age_bands, ordered = TRUE)
str(CHospital$age_cat)
##  Ord.factor w/ 7 levels "0-4"<"5-9"<"10-14"<..: 1 6 5 3 1 3 5 6 5 6 ...
# List the age bands in their factor order.
levels(x = CHospital$age_cat)
## [1] "0-4"   "5-9"   "10-14" "15-19" "20-29" "30-49" "50-69"
# Bar chart: number of patients in each age band.
ggplot(data = CHospital, mapping = aes(x = age_cat)) +
  geom_bar(fill = "chocolate") +
  labs(
    title = "Age Category Distribution",
    x = "Age Category",
    y = "Frequency"
  )

# Patients per gender, with Death and Recover drawn side by side.
library(ggplot2)
ggplot(data = CHospital, mapping = aes(x = gender, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Death" = "gold", "Recover" = "chocolate")) +
  labs(
    title = "Gender Distribution with Outcome",
    x = "gender",
    y = "Frequency"
  )

library(ggplot2)
# `age_cat` was already converted to an ordered factor earlier in the script.
# The conversion is not repeated here: re-applying factor() with a second
# hand-written copy of the level list did nothing to the data and only added
# another list to keep in sync.
str(CHospital$age_cat)
##  Ord.factor w/ 7 levels "0-4"<"5-9"<"10-14"<..: 1 6 5 3 1 3 5 6 5 6 ...
# Patients per age band, with Death and Recover drawn side by side; the
# x-axis labels are rotated 45 degrees so the bands do not overlap.
ggplot(CHospital, aes(x = age_cat, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Death" = "gold", "Recover" = "chocolate")) +
  labs(title = "Age_cat Distribution with Outcome", x = "Age_cat", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(ggplot2)
# `age_cat` was already converted to an ordered factor earlier in the script.
# The conversion is not repeated here: re-applying factor() with another
# hand-written copy of the level list did nothing to the data and only added
# another list to keep in sync.
str(CHospital$age_cat)
##  Ord.factor w/ 7 levels "0-4"<"5-9"<"10-14"<..: 1 6 5 3 1 3 5 6 5 6 ...
# Patients per age band, with Death and Recover stacked in one bar; the
# x-axis labels are rotated 45 degrees so the bands do not overlap.
ggplot(CHospital, aes(x = age_cat, fill = outcome)) +
  geom_bar() +
  scale_fill_manual(values = c("Death" = "gold", "Recover" = "chocolate")) +
  labs(title = "Age_cat Distribution with Outcome", x = "Age_cat", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Patients per days_onset_hosp value, with Death and Recover side by side.
library(ggplot2)
ggplot(data = CHospital, mapping = aes(x = days_onset_hosp, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("Death" = "gold", "Recover" = "chocolate")) +
  labs(
    title = "Days_onset_hosp Distribution with Outcome",
    x = "Days_onset_hosp",
    y = "Frequency"
  )

  #theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Patients per days_onset_hosp value, with Death and Recover stacked.
library(ggplot2)
ggplot(data = CHospital, mapping = aes(x = days_onset_hosp, fill = outcome)) +
  geom_bar() +
  scale_fill_manual(values = c("Death" = "gold", "Recover" = "chocolate")) +
  labs(
    title = "Days_onset_hosp Distribution with Outcome",
    x = "Days_onset_hosp",
    y = "Frequency"
  )

library(corrplot)
# Keep only the numeric measurements that go into the correlation matrix.
sel_columns <- CHospital[
  ,
  c("generation", "age", "wt_kg", "ht_cm", "ct_blood", "temp")
]
# Pairwise correlations, computed on the rows that are complete across all
# selected columns.
cor_matrix <- cor(x = sel_columns, use = "complete.obs")

# Draw the matrix as coloured tiles on a red-white-blue ramp of 200 steps,
# with the coefficient printed in each tile.
corrplot(
  cor_matrix,
  method = "color",
  addCoef.col = "black", # colour of the printed coefficients
  tl.col = "black",      # colour of the variable labels
  tl.cex = 0.8,          # size of the variable labels
  number.cex = 0.9,      # size of the printed coefficients
  col = colorRampPalette(c("red", "white", "blue"))(200)
)

Plot representing Age with respect to weight

# Scatter plot of age against weight, stored in W rather than printed here.
W <- ggplot(data = CHospital, mapping = aes(x = wt_kg, y = age)) +
  geom_point(size = 1, colour = "red")
# Per-column summary of the cleaned data.
summary(object = CHospital)
##    case_id            generation     date_outcome          outcome         
##  Length:268         Min.   : 2.00   Min.   :2014-05-11   Length:268        
##  Class :character   1st Qu.:13.00   1st Qu.:2014-10-01   Class :character  
##  Mode  :character   Median :17.00   Median :2014-11-07   Mode  :character  
##                     Mean   :17.02   Mean   :2014-11-20                     
##                     3rd Qu.:21.00   3rd Qu.:2015-01-02                     
##                     Max.   :35.00   Max.   :2015-05-14                     
##                                                                            
##     gender               age         age_unit            age_cat  
##  Length:268         Min.   : 0.0   Length:268         20-29  :61  
##  Class :character   1st Qu.: 7.0   Class :character   5-9    :49  
##  Mode  :character   Median :15.0   Mode  :character   10-14  :41  
##                     Mean   :17.9                      0-4    :39  
##                     3rd Qu.:24.0                      30-49  :39  
##                     Max.   :73.0                      (Other):38  
##                                                       NA's   : 1  
##       lon              lat            wt_kg            ht_cm      
##  Min.   :-13.27   Min.   :8.450   Min.   :  3.00   Min.   : 30.0  
##  1st Qu.:-13.25   1st Qu.:8.459   1st Qu.: 44.75   1st Qu.:100.8  
##  Median :-13.23   Median :8.468   Median : 57.50   Median :135.5  
##  Mean   :-13.23   Mean   :8.469   Mean   : 56.12   Mean   :131.7  
##  3rd Qu.:-13.22   3rd Qu.:8.479   3rd Qu.: 67.00   3rd Qu.:160.0  
##  Max.   :-13.21   Max.   :8.490   Max.   :100.00   Max.   :281.0  
##                                                                   
##     ct_blood       fever              chills             cough          
##  Min.   :17.0   Length:268         Length:268         Length:268        
##  1st Qu.:20.0   Class :character   Class :character   Class :character  
##  Median :22.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :21.2                                                           
##  3rd Qu.:22.0                                                           
##  Max.   :25.0                                                           
##                                                                         
##     aches              vomit                temp      time_admission   
##  Length:268         Length:268         Min.   :35.9   Length:268       
##  Class :character   Class :character   1st Qu.:38.3   Class1:hms       
##  Mode  :character   Mode  :character   Median :38.9   Class2:difftime  
##                                        Mean   :38.6   Mode  :numeric   
##                                        3rd Qu.:39.2                    
##                                        Max.   :40.4                    
##                                                                        
##       bmi         days_onset_hosp 
##  Min.   : 10.89   Min.   : 0.000  
##  1st Qu.: 24.43   1st Qu.: 1.000  
##  Median : 32.38   Median : 1.000  
##  Mean   : 41.77   Mean   : 1.843  
##  3rd Qu.: 46.94   3rd Qu.: 2.250  
##  Max.   :231.57   Max.   :11.000  
## 

To plot Age with respect to Height

# Scatter plot of age against height, shown next to the weight plot W.
H <- ggplot(data = CHospital, mapping = aes(x = ht_cm, y = age)) +
  geom_point(size = 1, colour = "brown")
grid.arrange(W, H, ncol = 2)

It was observed that both height and weight are positively related to age: an increase in age is accompanied by an increase in height and weight.

# Bars of raw days_onset_hosp values per outcome (identity stat, so the rows
# of each outcome are stacked on top of one another).
ggplot(
  data = CHospital,
  mapping = aes(x = outcome, y = days_onset_hosp, fill = outcome)
) +
  geom_col()

# Patients per days_onset_hosp value, Death and Recover side by side, with
# white bar outlines.
ggplot(data = CHospital, mapping = aes(x = days_onset_hosp, fill = outcome)) +
  geom_bar(position = "dodge", colour = "white") +
  scale_fill_manual(values = c("Death" = "purple", "Recover" = "brown")) +
  labs(
    title = "Distribution of days_onset_hosp by Outcome",
    x = "days_onset_hosp",
    y = "Frequency",
    fill = "  Outcome"
  )