Loading libraries

# Load the readxl package
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ purrr   1.0.1     ✔ tibble  3.2.1
## ✔ readr   2.1.4     ✔ tidyr   1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(leaflet)
library(tidyr)
library(scales) 
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

Loading Data

# Specify the path to your Excel file
excel_file <- "COVID_example_data.xlsx"

# Load data from the Excel file
data <- read_excel(excel_file)

Data Inspection

# Set the options to display all rows and columns
options(max.print = 10000)

# Display the data frame using kable
knitr::kable(head(data))
PID reprt_creationdt_FALSE case_dob_FALSE case_age case_gender case_race case_eth case_zip Contact_id sym_startdt_FALSE sym_fever sym_subjfever sym_myalgia sym_losstastesmell sym_sorethroat sym_cough sym_headache sym_resolved sym_resolveddt_FALSE contact_household hospitalized hosp_admidt_FALSE hosp_dischdt_FALSE died died_covid died_dt_FALSE confirmed_case covid_dx pos_sampledt_FALSE latitude_JITT longitude_JITT
3a85e6992a5ac52f 2020-03-22 2004-11-08 16 Male WHITE NON-HISPANIC/LATINO 30308 Yes-Symptomatic 2020-03-20 Yes Yes No NA Yes Yes Yes No, still symptomatic NA Yes No NA NA No No NA Yes Confirmed 2020-03-22 33.77665 -84.38569
c6b5281d5fc50b96 2020-02-01 1964-06-07 57 Male WHITE NON-HISPANIC/LATINO 30308 Yes-Symptomatic 2020-01-28 No No Yes NA No Yes No No, still symptomatic NA No No NA NA No No NA Yes Confirmed 2020-02-01 33.78051 -84.38947
53495ad0dca4e22c 2020-02-10 1944-04-06 77 Female BLACK NON-HISPANIC/LATINO 30315 Yes-Symptomatic 2020-02-10 Yes NA Yes NA Yes Yes NA No, still symptomatic NA NA Yes 2020-02-08 NA No No NA Yes Confirmed 2020-02-10 33.73023 -84.38425
2948a265da0d081b 2020-03-20 1964-06-25 57 Female BLACK NON-HISPANIC/LATINO 30213 Yes-Symptomatic 2021-05-19 No Yes Yes NA Yes Yes Yes No, still symptomatic NA No NA NA NA No No NA Yes Confirmed 2021-01-17 33.55507 -84.62885
a5524aadd1ca0458 2020-02-26 1964-12-21 56 Male WHITE NOT SPECIFIED 30004 Yes-Symptomatic 2020-02-20 Yes Yes Yes NA No Yes No No, still symptomatic NA No Yes 2020-02-26 NA NA NA NA Yes Confirmed 2020-02-25 34.10609 -84.27454
db14eeabe531fab7 2020-02-11 1956-06-21 65 Male BLACK NON-HISPANIC/LATINO 30314 Yes-Symptomatic 2020-01-17 Yes Yes No NA Unk Yes Unk Yes, date specified below 2020-02-21 No Yes 2020-01-27 2020-02-21 Yes Yes 2020-02-21 Yes Confirmed 2020-02-20 33.75937 -84.42582
# View the data structure
knitr::kable(str(data))
## tibble [82,101 × 31] (S3: tbl_df/tbl/data.frame)
##  $ PID                   : chr [1:82101] "3a85e6992a5ac52f" "c6b5281d5fc50b96" "53495ad0dca4e22c" "2948a265da0d081b" ...
##  $ reprt_creationdt_FALSE: POSIXct[1:82101], format: "2020-03-22" "2020-02-01" ...
##  $ case_dob_FALSE        : POSIXct[1:82101], format: "2004-11-08" "1964-06-07" ...
##  $ case_age              : num [1:82101] 16 57 77 57 56 65 47 61 36 42 ...
##  $ case_gender           : chr [1:82101] "Male" "Male" "Female" "Female" ...
##  $ case_race             : chr [1:82101] "WHITE" "WHITE" "BLACK" "BLACK" ...
##  $ case_eth              : chr [1:82101] "NON-HISPANIC/LATINO" "NON-HISPANIC/LATINO" "NON-HISPANIC/LATINO" "NON-HISPANIC/LATINO" ...
##  $ case_zip              : num [1:82101] 30308 30308 30315 30213 30004 ...
##  $ Contact_id            : chr [1:82101] "Yes-Symptomatic" "Yes-Symptomatic" "Yes-Symptomatic" "Yes-Symptomatic" ...
##  $ sym_startdt_FALSE     : POSIXct[1:82101], format: "2020-03-20" "2020-01-28" ...
##  $ sym_fever             : chr [1:82101] "Yes" "No" "Yes" "No" ...
##  $ sym_subjfever         : chr [1:82101] "Yes" "No" NA "Yes" ...
##  $ sym_myalgia           : chr [1:82101] "No" "Yes" "Yes" "Yes" ...
##  $ sym_losstastesmell    : chr [1:82101] NA NA NA NA ...
##  $ sym_sorethroat        : chr [1:82101] "Yes" "No" "Yes" "Yes" ...
##  $ sym_cough             : chr [1:82101] "Yes" "Yes" "Yes" "Yes" ...
##  $ sym_headache          : chr [1:82101] "Yes" "No" NA "Yes" ...
##  $ sym_resolved          : chr [1:82101] "No, still symptomatic" "No, still symptomatic" "No, still symptomatic" "No, still symptomatic" ...
##  $ sym_resolveddt_FALSE  : POSIXct[1:82101], format: NA NA ...
##  $ contact_household     : chr [1:82101] "Yes" "No" NA "No" ...
##  $ hospitalized          : chr [1:82101] "No" "No" "Yes" NA ...
##  $ hosp_admidt_FALSE     : POSIXct[1:82101], format: NA NA ...
##  $ hosp_dischdt_FALSE    : POSIXct[1:82101], format: NA NA ...
##  $ died                  : chr [1:82101] "No" "No" "No" "No" ...
##  $ died_covid            : chr [1:82101] "No" "No" "No" "No" ...
##  $ died_dt_FALSE         : POSIXct[1:82101], format: NA NA ...
##  $ confirmed_case        : chr [1:82101] "Yes" "Yes" "Yes" "Yes" ...
##  $ covid_dx              : chr [1:82101] "Confirmed" "Confirmed" "Confirmed" "Confirmed" ...
##  $ pos_sampledt_FALSE    : POSIXct[1:82101], format: "2020-03-22" "2020-02-01" ...
##  $ latitude_JITT         : num [1:82101] 33.8 33.8 33.7 33.6 34.1 ...
##  $ longitude_JITT        : num [1:82101] -84.4 -84.4 -84.4 -84.6 -84.3 ...
# Calculate the percentage of missing values per variable
missing_percent <- colSums(is.na(data)) / nrow(data) * 100

# Create a data frame with variable names and their missing percentages
missing_data <- data.frame(Variable = names(data), Missing_Percentage = missing_percent)

# Sort the data frame by missing percentage in descending order
missing_data <- missing_data[order(-missing_data$Missing_Percentage), ]

# Render the data frame using knitr::kable()
knitr::kable(missing_data)
Variable Missing_Percentage
died_dt_FALSE died_dt_FALSE 97.9208536
hosp_dischdt_FALSE hosp_dischdt_FALSE 95.7357401
hosp_admidt_FALSE hosp_admidt_FALSE 93.9269924
sym_resolveddt_FALSE sym_resolveddt_FALSE 80.1439690
sym_losstastesmell sym_losstastesmell 61.7824387
died_covid died_covid 51.5243420
sym_resolved sym_resolved 51.5145979
sym_subjfever sym_subjfever 46.1723974
sym_startdt_FALSE sym_startdt_FALSE 45.6510883
died died 44.8618165
contact_household contact_household 44.7461054
hospitalized hospitalized 39.5634645
sym_sorethroat sym_sorethroat 39.2699236
Contact_id Contact_id 39.2260752
sym_myalgia sym_myalgia 39.1432504
sym_headache sym_headache 38.9983070
sym_cough sym_cough 38.5257183
sym_fever sym_fever 38.4611637
case_race case_race 3.2033715
case_eth case_eth 3.1351628
longitude_JITT longitude_JITT 0.2436024
pos_sampledt_FALSE pos_sampledt_FALSE 0.1485975
latitude_JITT latitude_JITT 0.1144931
case_gender case_gender 0.0767348
case_dob_FALSE case_dob_FALSE 0.0584646
case_age case_age 0.0584646
case_zip case_zip 0.0158342
confirmed_case confirmed_case 0.0109621
PID PID 0.0000000
reprt_creationdt_FALSE reprt_creationdt_FALSE 0.0000000
covid_dx covid_dx 0.0000000

Data Pre-Processing

# Convert date columns to Date objects
data$reprt_creationdt_FALSE <- as.Date(data$reprt_creationdt_FALSE)
data$sym_startdt_FALSE <- as.Date(data$sym_startdt_FALSE)
data$sym_resolveddt_FALSE <- as.Date(data$sym_resolveddt_FALSE)
data$hosp_admidt_FALSE <- as.Date(data$hosp_admidt_FALSE)
data$hosp_dischdt_FALSE <- as.Date(data$hosp_dischdt_FALSE)
data$died_dt_FALSE <- as.Date(data$died_dt_FALSE)

Data Analysis

Demographic Analyses

COVID Cases by Gender & Age

# Creating a subset of the data where confirmed_cases is "Yes"
subset_data <- subset(data, confirmed_case == "Yes")

# Define the range for the x-axis labels
x_axis_breaks <- seq(0, 100, by = 10)  

# Distribution by Age and Gender of New_Confirmed Cases
ggplot(subset_data, aes(x = case_age, fill = case_gender)) +
  geom_histogram(binwidth = 1, position = "dodge", na.rm = TRUE) +
  labs(title = "Distribution of Cases by Age and Gender",
       x = "Age",
       y = "Count",
       fill = "Gender") +
  scale_x_continuous(breaks = x_axis_breaks, limits = c(0, max(subset_data$case_age))) +  # Set x-axis breaks and limits
  scale_fill_manual(values = c("Male" = "blue", "Female" = "orange")) +
  theme(legend.position = "top", legend.justification = c(1, 1))

  • Higher number of females between ages (20-30) have contracted the Virus relative to Males

COVID Cases by ethnicity

# COVID cases by ethnicity

ggplot(data, aes(x = case_eth)) +
  geom_bar(fill = "blue") +
  labs(title = "Distribution of Cases by Ethnicity",
       x = "Ethnicity",
       y = "Count") +
  theme(plot.title = element_text(hjust = 0.5))

  • The Non_Hispanic/Latino has been hardest hit by the epidemic

COVID Cases by Race

# COVID cases by race

ggplot(data, aes(x = case_race)) +
  geom_bar(fill = "blue") +
  labs(title = "Distribution of Cases by Race",
       x = "Race",
       y = "Count") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 4))  

  • Black and white people have been hit the most by this epidemic

What are the most commonly reported symptoms among COVID-19 patients?

Symptoms and Severity

# Filter data for confirmed cases
confirmed_cases <- data %>%
  filter(confirmed_case == "Yes")

# Calculate the most commonly reported symptoms as a percentage of the grand total
symptoms_percentage <- confirmed_cases %>%
  select(sym_fever, sym_subjfever, sym_myalgia, sym_losstastesmell, sym_sorethroat, sym_cough, sym_headache) %>%
  summarise_all(~ sum(. == "Yes", na.rm = TRUE) / n()) %>%
  gather(symptom, percentage) %>%
  arrange(desc(percentage))

# Format the percentage column as percentages with two decimal places
symptoms_percentage$percentage <- percent(symptoms_percentage$percentage, accuracy = 0.01)

# Print the most commonly reported symptoms as a percentage of the grand total
cat("Most commonly reported symptoms among COVID-19 patients as a percentage of the grand total:\n")
## Most commonly reported symptoms among COVID-19 patients as a percentage of the grand total:
print(symptoms_percentage)
## # A tibble: 7 × 2
##   symptom            percentage
##   <chr>              <chr>     
## 1 sym_cough          26.73%    
## 2 sym_headache       26.40%    
## 3 sym_myalgia        23.78%    
## 4 sym_fever          18.42%    
## 5 sym_losstastesmell 15.51%    
## 6 sym_subjfever      15.47%    
## 7 sym_sorethroat     15.24%
  • One in four subjects confirmed to have contracted the virus exhibited symptoms such as headache, myalgia, or a cough
# Categorizing Cases by Severity:
confirmed_cases <- confirmed_cases %>%
  mutate(
    Severity = case_when(
      died_covid == "Yes" ~ "Fatal",
      hospitalized == "Yes" & is.na(sym_resolveddt_FALSE) ~ "Severe",
      hospitalized == "Yes" & !is.na(sym_resolveddt_FALSE) ~ "Moderate",
      hospitalized == "No" & !is.na(sym_resolveddt_FALSE) ~ "Mild",
      TRUE ~ "Unknown"
    )
  )
# Distribution of COVID Cases by Severity

# Calculate the percentage of each Severity level
confirmed_cases <- confirmed_cases %>%
  group_by(Severity) %>%
  summarise(Count = n()) %>%
  mutate(Percentage = (Count / sum(Count)) * 100)

# Create the bar chart with percentages and labels
ggplot(confirmed_cases, aes(x = Severity, y = Percentage)) +
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(aes(label = sprintf("%.2f%%", Percentage)), vjust = -0.5, size = 3) +
  labs(title = "Distribution of COVID Cases by Severity (Percentage)",
       x = "Severity",
       y = "Percentage") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 7))

  • Less than 5% of all confirmed cases lead to death

Geospatial Analysis

# Filter data for confirmed cases
confirmed_cases <- data %>%
  filter(confirmed_case == "Yes")

# Drop missing values from 'longitude_JITT' and 'latitude_JITT' columns
confirmed_cases <- confirmed_cases[complete.cases(confirmed_cases[c("longitude_JITT", "latitude_JITT")]), ]
# Create a leaflet map with the sampled data
leaflet() %>%
  addTiles() %>%  # Add a basic map background
  addCircleMarkers(
    data = confirmed_cases,  # Use the sampled data
    lng = ~longitude_JITT,
    lat = ~latitude_JITT,
    color = "red",
    radius = 4
  )