Joy Winner Emmanuel
2025-01-22
This is a presentation which includes data exploration and visualization, data cleaning and preprocessing of the world standard population by sex.
Data Exploration and Visualization
Data Cleaning and Preprocessing
## case_id generation date_infection date_onset
## Length:422 Min. : 2.00 Min. :2014-04-26 Min. :2014-05-01
## Class :character 1st Qu.:13.00 1st Qu.:2014-09-06 1st Qu.:2014-09-15
## Mode :character Median :16.00 Median :2014-10-16 Median :2014-10-24
## Mean :16.67 Mean :2014-10-26 Mean :2014-11-03
## 3rd Qu.:20.00 3rd Qu.:2014-12-16 3rd Qu.:2014-12-20
## Max. :37.00 Max. :2015-04-17 Max. :2015-04-27
## NA's :152
## date_hospitalisation date_outcome outcome
## Min. :2014-05-05 Min. :2014-05-23 Length:422
## 1st Qu.:2014-09-17 1st Qu.:2014-09-25 Class :character
## Median :2014-10-24 Median :2014-11-06 Mode :character
## Mean :2014-11-05 Mean :2014-11-13
## 3rd Qu.:2014-12-20 3rd Qu.:2014-12-28
## Max. :2015-04-29 Max. :2015-05-10
## NA's :79
## gender age age_unit age_years
## Length:422 Min. : 0.00 Length:422 Min. : 0.00
## Class :character 1st Qu.: 6.00 Class :character 1st Qu.: 6.00
## Mode :character Median :12.00 Mode :character Median :12.00
## Mean :15.12 Mean :15.03
## 3rd Qu.:22.00 3rd Qu.:22.00
## Max. :59.00 Max. :59.00
## NA's :5 NA's :5
## age_cat age_cat5 hospital lon
## Length:422 Length:422 Length:422 Min. :-13.27
## Class :character Class :character Class :character 1st Qu.:-13.25
## Mode :character Mode :character Mode :character Median :-13.23
## Mean :-13.23
## 3rd Qu.:-13.22
## Max. :-13.21
##
## lat infector source wt_kg
## Min. :8.448 Length:422 Length:422 Min. :-4.00
## 1st Qu.:8.461 Class :character Class :character 1st Qu.:40.00
## Median :8.469 Mode :character Mode :character Median :54.50
## Mean :8.470 Mean :52.12
## 3rd Qu.:8.479 3rd Qu.:65.00
## Max. :8.490 Max. :95.00
##
## ht_cm ct_blood fever chills
## Min. : 12.0 Min. :16.00 Length:422 Length:422
## 1st Qu.: 83.5 1st Qu.:20.00 Class :character Class :character
## Median :128.0 Median :22.00 Mode :character Mode :character
## Mean :121.6 Mean :21.29
## 3rd Qu.:156.0 3rd Qu.:23.00
## Max. :266.0 Max. :25.00
##
## cough aches vomit temp
## Length:422 Length:422 Length:422 Min. :35.60
## Class :character Class :character Class :character 1st Qu.:38.30
## Mode :character Mode :character Mode :character Median :38.90
## Mean :38.63
## 3rd Qu.:39.30
## Max. :40.40
## NA's :11
## time_admission bmi days_onset_hosp
## Length:422 Min. :-100.00 Min. : 0.000
## Class1:hms 1st Qu.: 24.87 1st Qu.: 1.000
## Class2:difftime Median : 33.59 Median : 1.000
## Mode :numeric Mean : 51.04 Mean : 2.036
## 3rd Qu.: 53.27 3rd Qu.: 3.000
## Max. : 428.06 Max. :18.000
##
##
## FALSE TRUE
## 11876 784
There are lots of missing values in the columns listed below:
The date_infection, source, and infector columns will be removed from the dataset because they have too many missing values.
The date_onset and date_hospitalisation columns will be removed because the days_onset_hosp column already records the number of days between them.
library(dplyr)

# Build the working copy (st_mark_new) from st_mark with the columns named
# below dropped; every other column is carried over unchanged.
st_mark_new <- select(
  st_mark,
  -c(
    "date_infection",
    "date_onset",
    "date_hospitalisation",
    "date_outcome",
    "age_years",
    "age_cat5",
    "hospital",
    "infector",
    "source"
  )
)

# Count how many records report their age in each unit.
table(st_mark_new$age_unit)
##
## months years
## 2 420
# Express every age in years: records whose unit is "months" get their age
# divided by 12 and their unit relabelled "years". age is listed first because
# it must be computed from the original, not yet relabelled, age_unit.
st_mark_new <- mutate(
  st_mark_new,
  age = ifelse(age_unit == "months", age / 12, age),
  age_unit = ifelse(age_unit == "months", "years", age_unit)
)
##
## years
## 422
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 6.00 12.00 15.03 22.00 59.00 5
# Fill the missing ages with the median of the observed (non-missing) ages.
st_mark_new$age <- replace(
  st_mark_new$age,
  is.na(st_mark_new$age),
  median(st_mark_new$age, na.rm = TRUE)
)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 6.00 12.00 14.99 22.00 59.00
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 35.60 38.30 38.90 38.63 39.30 40.40 11
# One box plot per numeric variable. Each plot object is only stored
# (j1 ... j6); nothing is drawn by these assignments.

# Age
j1 <- ggplot(data = st_mark_new, mapping = aes(x = age)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Age Distribution")

# Height (ht_cm)
j2 <- ggplot(data = st_mark_new, mapping = aes(x = ht_cm)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Height Distribution")

# BMI
j3 <- ggplot(data = st_mark_new, mapping = aes(x = bmi)) +
  geom_boxplot(fill = "salmon") +
  labs(title = "BMI Distribution")

# Days from onset to hospital (days_onset_hosp)
j4 <- ggplot(data = st_mark_new, mapping = aes(x = days_onset_hosp)) +
  geom_boxplot(fill = "lightcoral") +
  labs(title = "Days from Onset to Hospital")

# Weight (wt_kg)
j5 <- ggplot(data = st_mark_new, mapping = aes(x = wt_kg)) +
  geom_boxplot(fill = "yellow") +
  labs(title = "Weight Distribution")

# Temperature (temp)
j6 <- ggplot(data = st_mark_new, mapping = aes(x = temp)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Temperature Distribution")

# gridExtra supplies grid.arrange(), which lays several plots out on one page.
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Side-by-side bars: number of records per gender, split by outcome.
ggplot(data = st_mark_new, mapping = aes(x = gender, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c(Recover = "skyblue", Death = "gold")) +
  labs(x = "Gender", y = "Frequency", title = "Gender vs Outcome") +
  theme_minimal()

# Number of records per outcome (bars filled in orange).
ggplot(data = st_mark_new, mapping = aes(x = outcome)) +
  geom_bar(fill = "orange") +
  labs(x = "Outcome", y = "Count", title = "Outcome Distribution") +
  theme_minimal()
# Arrange age categories in natural numeric order
# Turn age_cat into a factor with explicitly ordered levels so that plots show
# the bands youngest to oldest rather than in alphabetical order. Any value
# not among these levels would become NA.
st_mark_new <- mutate(
  st_mark_new,
  age_cat = factor(
    age_cat,
    levels = c("0-4", "5-9", "10-14", "15-19", "20-29", "30-49", "50-69")
  )
)

# Confirm the resulting level order.
levels(st_mark_new$age_cat)
## [1] "0-4" "5-9" "10-14" "15-19" "20-29" "30-49" "50-69"
# Each statement below starts on its own line. Previously a new expression
# followed directly after `theme_minimal()` on the same line, which R cannot
# parse.

# Number of records per age category, split by outcome (side-by-side bars).
ggplot(st_mark_new, aes(x = age_cat, fill = outcome)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Distribution of Age Category vs Outcome",
    x = "Age Category",
    y = "Frequency"
  ) +
  scale_fill_manual(values = c("Recover" = "blue", "Death" = "gold")) +
  theme_minimal()

# Number of records per value of days_onset_hosp, split by outcome.
ggplot(st_mark_new, aes(x = days_onset_hosp, fill = outcome)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Distribution of Days_onset_hosp vs Outcome",
    x = "days_onset_hosp",
    y = "Frequency"
  ) +
  scale_fill_manual(values = c("Recover" = "blue", "Death" = "red")) +
  theme_minimal()

# Symptom panel 1 of 5: fever vs outcome. Stored in SY1 rather than drawn.
SY1 <- ggplot(st_mark_new, aes(x = fever, fill = outcome)) +
  geom_bar(position = "dodge") +
  labs(title = "Fever vs Outcome", x = "fever", y = "Frequency") +
  scale_fill_manual(values = c("Recover" = "blue", "Death" = "red")) +
  theme_minimal()
# Symptom panels 2-5: one side-by-side bar chart per symptom, split by
# outcome. Each plot is stored and only drawn by grid.arrange() below.

# Chills vs outcome
SY2 <- ggplot(st_mark_new, aes(x = chills, fill = outcome)) +
  geom_bar(position = "dodge") +
  labs(title = "Chills vs Outcome", x = "chills", y = "Frequency") +
  scale_fill_manual(values = c("Recover" = "gold", "Death" = "red")) +
  theme_minimal()

# Cough vs outcome
SY3 <- ggplot(st_mark_new, aes(x = cough, fill = outcome)) +
  geom_bar(position = "dodge") +
  labs(title = "Cough vs Outcome", x = "cough", y = "Frequency") +
  scale_fill_manual(values = c("Recover" = "lightblue", "Death" = "yellow")) +
  theme_minimal()

# Aches vs outcome
SY4 <- ggplot(st_mark_new, aes(x = aches, fill = outcome)) +
  geom_bar(position = "dodge") +
  labs(title = "Aches vs Outcome", x = "aches", y = "Frequency") +
  scale_fill_manual(values = c("Recover" = "lightgreen", "Death" = "red")) +
  theme_minimal()

# Vomit vs outcome
SY5 <- ggplot(st_mark_new, aes(x = vomit, fill = outcome)) +
  geom_bar(position = "dodge") +
  labs(title = "Vomit vs Outcome", x = "vomit", y = "Frequency") +
  scale_fill_manual(values = c("Recover" = "blue", "Death" = "orange")) +
  theme_minimal()

# Draw the five symptom panels on one page, three per row.
grid.arrange(SY1, SY2, SY3, SY4, SY5, ncol = 3)

# corrplot is loaded as a separate statement: it was previously fused onto the
# end of the grid.arrange() line, which R cannot parse.
library(corrplot)
# Numeric variables to correlate.
sel_columns <- st_mark_new[
  c("generation", "age", "wt_kg", "ht_cm", "ct_blood", "temp")
]

# Pairwise correlations, computed only on rows that have no missing value in
# any of the selected columns.
cor_matrix <- cor(x = sel_columns, use = "complete.obs")

# Colour-coded correlation matrix (red to white to blue over 200 shades) with
# the coefficient printed in each cell.
corrplot(
  cor_matrix,
  method = "color",
  col = colorRampPalette(c("red", "white", "blue"))(200),
  addCoef.col = "black",
  number.cex = 0.7,
  tl.col = "black",
  tl.cex = 0.8
)

# Weight against age, one blue point per record, on a minimal theme.
ggplot(data = st_mark_new, mapping = aes(x = age, y = wt_kg)) +
  geom_point(size = 2, color = "blue") +
  labs(x = "Age", y = "Weight", title = "Scatter Plot of Weight vs Age") +
  theme_minimal()
# Convert categorical variables to numerical variables
# Recode the listed categorical columns as numeric codes, in place.
# Each column is converted to a factor and replaced by its level index, so for
# a character column the codes follow the sorted order of its distinct values
# (1 = first level); missing values stay NA. The original labels are not kept.
cols_to_convert <- c(
  "gender", "fever", "chills", "outcome", "cough", "aches", "vomit"
)
st_mark_new[cols_to_convert] <- lapply(
  st_mark_new[cols_to_convert],
  function(x) as.numeric(as.factor(x))
)

# View() opens a data viewer, which only makes sense in an interactive
# session; skip it when the script is run or rendered non-interactively.
if (interactive()) {
  View(st_mark_new)
}