Loading libraries
# Load the readxl package
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ purrr 1.0.1 ✔ tibble 3.2.1
## ✔ readr 2.1.4 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(leaflet)
library(tidyr)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
Loading Data
# Specify the path to your Excel file
excel_file <- "COVID_example_data.xlsx"
# Load data from the Excel file
data <- read_excel(excel_file)
Data Inspection
# Set the options to display all rows and columns
options(max.print = 10000)
# Display the data frame using kable
knitr::kable(head(data))
| 3a85e6992a5ac52f |
2020-03-22 |
2004-11-08 |
16 |
Male |
WHITE |
NON-HISPANIC/LATINO |
30308 |
Yes-Symptomatic |
2020-03-20 |
Yes |
Yes |
No |
NA |
Yes |
Yes |
Yes |
No, still symptomatic |
NA |
Yes |
No |
NA |
NA |
No |
No |
NA |
Yes |
Confirmed |
2020-03-22 |
33.77665 |
-84.38569 |
| c6b5281d5fc50b96 |
2020-02-01 |
1964-06-07 |
57 |
Male |
WHITE |
NON-HISPANIC/LATINO |
30308 |
Yes-Symptomatic |
2020-01-28 |
No |
No |
Yes |
NA |
No |
Yes |
No |
No, still symptomatic |
NA |
No |
No |
NA |
NA |
No |
No |
NA |
Yes |
Confirmed |
2020-02-01 |
33.78051 |
-84.38947 |
| 53495ad0dca4e22c |
2020-02-10 |
1944-04-06 |
77 |
Female |
BLACK |
NON-HISPANIC/LATINO |
30315 |
Yes-Symptomatic |
2020-02-10 |
Yes |
NA |
Yes |
NA |
Yes |
Yes |
NA |
No, still symptomatic |
NA |
NA |
Yes |
2020-02-08 |
NA |
No |
No |
NA |
Yes |
Confirmed |
2020-02-10 |
33.73023 |
-84.38425 |
| 2948a265da0d081b |
2020-03-20 |
1964-06-25 |
57 |
Female |
BLACK |
NON-HISPANIC/LATINO |
30213 |
Yes-Symptomatic |
2021-05-19 |
No |
Yes |
Yes |
NA |
Yes |
Yes |
Yes |
No, still symptomatic |
NA |
No |
NA |
NA |
NA |
No |
No |
NA |
Yes |
Confirmed |
2021-01-17 |
33.55507 |
-84.62885 |
| a5524aadd1ca0458 |
2020-02-26 |
1964-12-21 |
56 |
Male |
WHITE |
NOT SPECIFIED |
30004 |
Yes-Symptomatic |
2020-02-20 |
Yes |
Yes |
Yes |
NA |
No |
Yes |
No |
No, still symptomatic |
NA |
No |
Yes |
2020-02-26 |
NA |
NA |
NA |
NA |
Yes |
Confirmed |
2020-02-25 |
34.10609 |
-84.27454 |
| db14eeabe531fab7 |
2020-02-11 |
1956-06-21 |
65 |
Male |
BLACK |
NON-HISPANIC/LATINO |
30314 |
Yes-Symptomatic |
2020-01-17 |
Yes |
Yes |
No |
NA |
Unk |
Yes |
Unk |
Yes, date specified below |
2020-02-21 |
No |
Yes |
2020-01-27 |
2020-02-21 |
Yes |
Yes |
2020-02-21 |
Yes |
Confirmed |
2020-02-20 |
33.75937 |
-84.42582 |
# View the data structure
knitr::kable(str(data))
## tibble [82,101 × 31] (S3: tbl_df/tbl/data.frame)
## $ PID : chr [1:82101] "3a85e6992a5ac52f" "c6b5281d5fc50b96" "53495ad0dca4e22c" "2948a265da0d081b" ...
## $ reprt_creationdt_FALSE: POSIXct[1:82101], format: "2020-03-22" "2020-02-01" ...
## $ case_dob_FALSE : POSIXct[1:82101], format: "2004-11-08" "1964-06-07" ...
## $ case_age : num [1:82101] 16 57 77 57 56 65 47 61 36 42 ...
## $ case_gender : chr [1:82101] "Male" "Male" "Female" "Female" ...
## $ case_race : chr [1:82101] "WHITE" "WHITE" "BLACK" "BLACK" ...
## $ case_eth : chr [1:82101] "NON-HISPANIC/LATINO" "NON-HISPANIC/LATINO" "NON-HISPANIC/LATINO" "NON-HISPANIC/LATINO" ...
## $ case_zip : num [1:82101] 30308 30308 30315 30213 30004 ...
## $ Contact_id : chr [1:82101] "Yes-Symptomatic" "Yes-Symptomatic" "Yes-Symptomatic" "Yes-Symptomatic" ...
## $ sym_startdt_FALSE : POSIXct[1:82101], format: "2020-03-20" "2020-01-28" ...
## $ sym_fever : chr [1:82101] "Yes" "No" "Yes" "No" ...
## $ sym_subjfever : chr [1:82101] "Yes" "No" NA "Yes" ...
## $ sym_myalgia : chr [1:82101] "No" "Yes" "Yes" "Yes" ...
## $ sym_losstastesmell : chr [1:82101] NA NA NA NA ...
## $ sym_sorethroat : chr [1:82101] "Yes" "No" "Yes" "Yes" ...
## $ sym_cough : chr [1:82101] "Yes" "Yes" "Yes" "Yes" ...
## $ sym_headache : chr [1:82101] "Yes" "No" NA "Yes" ...
## $ sym_resolved : chr [1:82101] "No, still symptomatic" "No, still symptomatic" "No, still symptomatic" "No, still symptomatic" ...
## $ sym_resolveddt_FALSE : POSIXct[1:82101], format: NA NA ...
## $ contact_household : chr [1:82101] "Yes" "No" NA "No" ...
## $ hospitalized : chr [1:82101] "No" "No" "Yes" NA ...
## $ hosp_admidt_FALSE : POSIXct[1:82101], format: NA NA ...
## $ hosp_dischdt_FALSE : POSIXct[1:82101], format: NA NA ...
## $ died : chr [1:82101] "No" "No" "No" "No" ...
## $ died_covid : chr [1:82101] "No" "No" "No" "No" ...
## $ died_dt_FALSE : POSIXct[1:82101], format: NA NA ...
## $ confirmed_case : chr [1:82101] "Yes" "Yes" "Yes" "Yes" ...
## $ covid_dx : chr [1:82101] "Confirmed" "Confirmed" "Confirmed" "Confirmed" ...
## $ pos_sampledt_FALSE : POSIXct[1:82101], format: "2020-03-22" "2020-02-01" ...
## $ latitude_JITT : num [1:82101] 33.8 33.8 33.7 33.6 34.1 ...
## $ longitude_JITT : num [1:82101] -84.4 -84.4 -84.4 -84.6 -84.3 ...
# Calculate the percentage of missing values per variable
missing_percent <- colSums(is.na(data)) / nrow(data) * 100
# Create a data frame with variable names and their missing percentages
missing_data <- data.frame(Variable = names(data), Missing_Percentage = missing_percent)
# Sort the data frame by missing percentage in descending order
missing_data <- missing_data[order(-missing_data$Missing_Percentage), ]
# Render the data frame using knitr::kable()
knitr::kable(missing_data)
| died_dt_FALSE |
died_dt_FALSE |
97.9208536 |
| hosp_dischdt_FALSE |
hosp_dischdt_FALSE |
95.7357401 |
| hosp_admidt_FALSE |
hosp_admidt_FALSE |
93.9269924 |
| sym_resolveddt_FALSE |
sym_resolveddt_FALSE |
80.1439690 |
| sym_losstastesmell |
sym_losstastesmell |
61.7824387 |
| died_covid |
died_covid |
51.5243420 |
| sym_resolved |
sym_resolved |
51.5145979 |
| sym_subjfever |
sym_subjfever |
46.1723974 |
| sym_startdt_FALSE |
sym_startdt_FALSE |
45.6510883 |
| died |
died |
44.8618165 |
| contact_household |
contact_household |
44.7461054 |
| hospitalized |
hospitalized |
39.5634645 |
| sym_sorethroat |
sym_sorethroat |
39.2699236 |
| Contact_id |
Contact_id |
39.2260752 |
| sym_myalgia |
sym_myalgia |
39.1432504 |
| sym_headache |
sym_headache |
38.9983070 |
| sym_cough |
sym_cough |
38.5257183 |
| sym_fever |
sym_fever |
38.4611637 |
| case_race |
case_race |
3.2033715 |
| case_eth |
case_eth |
3.1351628 |
| longitude_JITT |
longitude_JITT |
0.2436024 |
| pos_sampledt_FALSE |
pos_sampledt_FALSE |
0.1485975 |
| latitude_JITT |
latitude_JITT |
0.1144931 |
| case_gender |
case_gender |
0.0767348 |
| case_dob_FALSE |
case_dob_FALSE |
0.0584646 |
| case_age |
case_age |
0.0584646 |
| case_zip |
case_zip |
0.0158342 |
| confirmed_case |
confirmed_case |
0.0109621 |
| PID |
PID |
0.0000000 |
| reprt_creationdt_FALSE |
reprt_creationdt_FALSE |
0.0000000 |
| covid_dx |
covid_dx |
0.0000000 |
Data Pre-Processing
# Convert date columns to Date objects
data$reprt_creationdt_FALSE <- as.Date(data$reprt_creationdt_FALSE)
data$sym_startdt_FALSE <- as.Date(data$sym_startdt_FALSE)
data$sym_resolveddt_FALSE <- as.Date(data$sym_resolveddt_FALSE)
data$hosp_admidt_FALSE <- as.Date(data$hosp_admidt_FALSE)
data$hosp_dischdt_FALSE <- as.Date(data$hosp_dischdt_FALSE)
data$died_dt_FALSE <- as.Date(data$died_dt_FALSE)
Data Analysis
Epidemiological Trends
# Sort the data by date
t_data <- data %>% arrange(reprt_creationdt_FALSE)
# Calculate daily new confirmed cases
t_data <- t_data %>% group_by(week = week(reprt_creationdt_FALSE), year = year(reprt_creationdt_FALSE)) %>%
summarise(new_confirmed_cases = sum(confirmed_case == "Yes", na.rm = TRUE), .groups = 'drop')
# Sort the data by "year" and "week" in ascending order
t_data <- t_data %>%
arrange(year, week)
# Convert the data to a time series
ts_data <- ts(t_data$new_confirmed_cases, frequency = 52) # Assuming 52 weeks in a year
# Create a continuous line graph combining year and week
t_data$year_week <- factor(paste(t_data$year, t_data$week), levels = unique(paste(t_data$year, t_data$week)))
ggplot(t_data, aes(x = year_week, y = new_confirmed_cases, group = 1)) +
geom_line(color = "red") +
labs(title = "Number of New Confirmed Cases by Year and Week",
x = "Year and Week",
y = "New Confirmed Cases") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_x_discrete(breaks = t_data$year_week[seq(1, length(t_data$year_week), by = 5)],
labels = t_data$year_week[seq(1, length(t_data$year_week), by = 5)]) +
theme(plot.title = element_text(hjust = 0.5))

- The number of cases is on a decline steep decline from the peek seen
at the start of 2021
Demographic Analyses
COVID Cases by Gender & Age
# Creating a subset of the data where confirmed_cases is "Yes"
subset_data <- subset(data, confirmed_case == "Yes")
# Define the range for the x-axis labels
x_axis_breaks <- seq(0, 100, by = 10)
# Distribution by Age and Gender of New_Confirmed Cases
ggplot(subset_data, aes(x = case_age, fill = case_gender)) +
geom_histogram(binwidth = 1, position = "dodge", na.rm = TRUE) +
labs(title = "Distribution of Cases by Age and Gender",
x = "Age",
y = "Count",
fill = "Gender") +
scale_x_continuous(breaks = x_axis_breaks, limits = c(0, max(subset_data$case_age))) + # Set x-axis breaks and limits
scale_fill_manual(values = c("Male" = "blue", "Female" = "orange")) +
theme(legend.position = "top", legend.justification = c(1, 1))

- Higher number of females between ages (20-30) have contracted the
Virus relative to Males
COVID Cases by ethnicity
# COVID cases by ethnicity
ggplot(data, aes(x = case_eth)) +
geom_bar(fill = "blue") +
labs(title = "Distribution of Cases by Ethnicity",
x = "Ethnicity",
y = "Count") +
theme(plot.title = element_text(hjust = 0.5))

- The Non_Hispanic/Latino has been hardest hit by the epidemic
COVID Cases by Race
# COVID cases by race
ggplot(data, aes(x = case_race)) +
geom_bar(fill = "blue") +
labs(title = "Distribution of Cases by Race",
x = "Race",
y = "Count") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size = 4))

- Black and white people have been hit the most by this epidemic
What are the most commonly reported symptoms among COVID-19
patients?
Symptoms and Severity
# Filter data for confirmed cases
confirmed_cases <- data %>%
filter(confirmed_case == "Yes")
# Calculate the most commonly reported symptoms as a percentage of the grand total
symptoms_percentage <- confirmed_cases %>%
select(sym_fever, sym_subjfever, sym_myalgia, sym_losstastesmell, sym_sorethroat, sym_cough, sym_headache) %>%
summarise_all(~ sum(. == "Yes", na.rm = TRUE) / n()) %>%
gather(symptom, percentage) %>%
arrange(desc(percentage))
# Format the percentage column as percentages with two decimal places
symptoms_percentage$percentage <- percent(symptoms_percentage$percentage, accuracy = 0.01)
# Print the most commonly reported symptoms as a percentage of the grand total
cat("Most commonly reported symptoms among COVID-19 patients as a percentage of the grand total:\n")
## Most commonly reported symptoms among COVID-19 patients as a percentage of the grand total:
print(symptoms_percentage)
## # A tibble: 7 × 2
## symptom percentage
## <chr> <chr>
## 1 sym_cough 26.73%
## 2 sym_headache 26.40%
## 3 sym_myalgia 23.78%
## 4 sym_fever 18.42%
## 5 sym_losstastesmell 15.51%
## 6 sym_subjfever 15.47%
## 7 sym_sorethroat 15.24%
- One in four subjects confirmed to have contracted the virus
exhibited symptoms such as headache, myalgia, or a cough
# Categorizing Cases by Severity:
confirmed_cases <- confirmed_cases %>%
mutate(
Severity = case_when(
died_covid == "Yes" ~ "Fatal",
hospitalized == "Yes" & is.na(sym_resolveddt_FALSE) ~ "Severe",
hospitalized == "Yes" & !is.na(sym_resolveddt_FALSE) ~ "Moderate",
hospitalized == "No" & !is.na(sym_resolveddt_FALSE) ~ "Mild",
TRUE ~ "Unknown"
)
)
# Distribution of COVID Cases by Severity
# Calculate the percentage of each Severity level
confirmed_cases <- confirmed_cases %>%
group_by(Severity) %>%
summarise(Count = n()) %>%
mutate(Percentage = (Count / sum(Count)) * 100)
# Create the bar chart with percentages and labels
ggplot(confirmed_cases, aes(x = Severity, y = Percentage)) +
geom_bar(stat = "identity", fill = "blue") +
geom_text(aes(label = sprintf("%.2f%%", Percentage)), vjust = -0.5, size = 3) +
labs(title = "Distribution of COVID Cases by Severity (Percentage)",
x = "Severity",
y = "Percentage") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size = 7))

- Less than 5% of all confirmed cases lead to death
Geospatial Analysis
# Filter data for confirmed cases
confirmed_cases <- data %>%
filter(confirmed_case == "Yes")
# Drop missing values from 'longitude_JITT' and 'latitude_JITT' columns
confirmed_cases <- confirmed_cases[complete.cases(confirmed_cases[c("longitude_JITT", "latitude_JITT")]), ]
# Create a leaflet map with the sampled data
leaflet() %>%
addTiles() %>% # Add a basic map background
addCircleMarkers(
data = confirmed_cases, # Use the sampled data
lng = ~longitude_JITT,
lat = ~latitude_JITT,
color = "red",
radius = 4
)