Weekly COVID OutBreak Situation

Loading libraries

# Load the readxl package
library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(ggplot2)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ purrr   1.0.1     ✔ tibble  3.2.1
## ✔ readr   2.1.4     ✔ tidyr   1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(leaflet)
library(tidyr)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

Loading Data

# Specify the path to your Excel file
excel_file <- "COVID_example_data.xlsx"

# Load data from the Excel file
data <- read_excel(excel_file)

Data Inspection

# Set the options to display all rows and columns
options(max.print = 10000)

# Display the data frame using kable
knitr::kable(head(data))

PID	reprt_creationdt_FALSE	case_dob_FALSE	case_age	case_gender	case_race	case_eth	case_zip	Contact_id	sym_startdt_FALSE	sym_fever	sym_subjfever	sym_myalgia	sym_losstastesmell	sym_sorethroat	sym_cough	sym_headache	sym_resolved	sym_resolveddt_FALSE	contact_household	hospitalized	hosp_admidt_FALSE	hosp_dischdt_FALSE	died	died_covid	died_dt_FALSE	confirmed_case	covid_dx	pos_sampledt_FALSE	latitude_JITT	longitude_JITT
3a85e6992a5ac52f	2020-03-22	2004-11-08	16	Male	WHITE	NON-HISPANIC/LATINO	30308	Yes-Symptomatic	2020-03-20	Yes	Yes	No	NA	Yes	Yes	Yes	No, still symptomatic	NA	Yes	No	NA	NA	No	No	NA	Yes	Confirmed	2020-03-22	33.77665	-84.38569
c6b5281d5fc50b96	2020-02-01	1964-06-07	57	Male	WHITE	NON-HISPANIC/LATINO	30308	Yes-Symptomatic	2020-01-28	No	No	Yes	NA	No	Yes	No	No, still symptomatic	NA	No	No	NA	NA	No	No	NA	Yes	Confirmed	2020-02-01	33.78051	-84.38947
53495ad0dca4e22c	2020-02-10	1944-04-06	77	Female	BLACK	NON-HISPANIC/LATINO	30315	Yes-Symptomatic	2020-02-10	Yes	NA	Yes	NA	Yes	Yes	NA	No, still symptomatic	NA	NA	Yes	2020-02-08	NA	No	No	NA	Yes	Confirmed	2020-02-10	33.73023	-84.38425
2948a265da0d081b	2020-03-20	1964-06-25	57	Female	BLACK	NON-HISPANIC/LATINO	30213	Yes-Symptomatic	2021-05-19	No	Yes	Yes	NA	Yes	Yes	Yes	No, still symptomatic	NA	No	NA	NA	NA	No	No	NA	Yes	Confirmed	2021-01-17	33.55507	-84.62885
a5524aadd1ca0458	2020-02-26	1964-12-21	56	Male	WHITE	NOT SPECIFIED	30004	Yes-Symptomatic	2020-02-20	Yes	Yes	Yes	NA	No	Yes	No	No, still symptomatic	NA	No	Yes	2020-02-26	NA	NA	NA	NA	Yes	Confirmed	2020-02-25	34.10609	-84.27454
db14eeabe531fab7	2020-02-11	1956-06-21	65	Male	BLACK	NON-HISPANIC/LATINO	30314	Yes-Symptomatic	2020-01-17	Yes	Yes	No	NA	Unk	Yes	Unk	Yes, date specified below	2020-02-21	No	Yes	2020-01-27	2020-02-21	Yes	Yes	2020-02-21	Yes	Confirmed	2020-02-20	33.75937	-84.42582

# View the data structure
knitr::kable(str(data))

## tibble [82,101 × 31] (S3: tbl_df/tbl/data.frame)
##  $ PID                   : chr [1:82101] "3a85e6992a5ac52f" "c6b5281d5fc50b96" "53495ad0dca4e22c" "2948a265da0d081b" ...
##  $ reprt_creationdt_FALSE: POSIXct[1:82101], format: "2020-03-22" "2020-02-01" ...
##  $ case_dob_FALSE        : POSIXct[1:82101], format: "2004-11-08" "1964-06-07" ...
##  $ case_age              : num [1:82101] 16 57 77 57 56 65 47 61 36 42 ...
##  $ case_gender           : chr [1:82101] "Male" "Male" "Female" "Female" ...
##  $ case_race             : chr [1:82101] "WHITE" "WHITE" "BLACK" "BLACK" ...
##  $ case_eth              : chr [1:82101] "NON-HISPANIC/LATINO" "NON-HISPANIC/LATINO" "NON-HISPANIC/LATINO" "NON-HISPANIC/LATINO" ...
##  $ case_zip              : num [1:82101] 30308 30308 30315 30213 30004 ...
##  $ Contact_id            : chr [1:82101] "Yes-Symptomatic" "Yes-Symptomatic" "Yes-Symptomatic" "Yes-Symptomatic" ...
##  $ sym_startdt_FALSE     : POSIXct[1:82101], format: "2020-03-20" "2020-01-28" ...
##  $ sym_fever             : chr [1:82101] "Yes" "No" "Yes" "No" ...
##  $ sym_subjfever         : chr [1:82101] "Yes" "No" NA "Yes" ...
##  $ sym_myalgia           : chr [1:82101] "No" "Yes" "Yes" "Yes" ...
##  $ sym_losstastesmell    : chr [1:82101] NA NA NA NA ...
##  $ sym_sorethroat        : chr [1:82101] "Yes" "No" "Yes" "Yes" ...
##  $ sym_cough             : chr [1:82101] "Yes" "Yes" "Yes" "Yes" ...
##  $ sym_headache          : chr [1:82101] "Yes" "No" NA "Yes" ...
##  $ sym_resolved          : chr [1:82101] "No, still symptomatic" "No, still symptomatic" "No, still symptomatic" "No, still symptomatic" ...
##  $ sym_resolveddt_FALSE  : POSIXct[1:82101], format: NA NA ...
##  $ contact_household     : chr [1:82101] "Yes" "No" NA "No" ...
##  $ hospitalized          : chr [1:82101] "No" "No" "Yes" NA ...
##  $ hosp_admidt_FALSE     : POSIXct[1:82101], format: NA NA ...
##  $ hosp_dischdt_FALSE    : POSIXct[1:82101], format: NA NA ...
##  $ died                  : chr [1:82101] "No" "No" "No" "No" ...
##  $ died_covid            : chr [1:82101] "No" "No" "No" "No" ...
##  $ died_dt_FALSE         : POSIXct[1:82101], format: NA NA ...
##  $ confirmed_case        : chr [1:82101] "Yes" "Yes" "Yes" "Yes" ...
##  $ covid_dx              : chr [1:82101] "Confirmed" "Confirmed" "Confirmed" "Confirmed" ...
##  $ pos_sampledt_FALSE    : POSIXct[1:82101], format: "2020-03-22" "2020-02-01" ...
##  $ latitude_JITT         : num [1:82101] 33.8 33.8 33.7 33.6 34.1 ...
##  $ longitude_JITT        : num [1:82101] -84.4 -84.4 -84.4 -84.6 -84.3 ...

# Calculate the percentage of missing values per variable
missing_percent <- colSums(is.na(data)) / nrow(data) * 100

# Create a data frame with variable names and their missing percentages
missing_data <- data.frame(Variable = names(data), Missing_Percentage = missing_percent)

# Sort the data frame by missing percentage in descending order
missing_data <- missing_data[order(-missing_data$Missing_Percentage), ]

# Render the data frame using knitr::kable()
knitr::kable(missing_data)

	Variable	Missing_Percentage
died_dt_FALSE	died_dt_FALSE	97.9208536
hosp_dischdt_FALSE	hosp_dischdt_FALSE	95.7357401
hosp_admidt_FALSE	hosp_admidt_FALSE	93.9269924
sym_resolveddt_FALSE	sym_resolveddt_FALSE	80.1439690
sym_losstastesmell	sym_losstastesmell	61.7824387
died_covid	died_covid	51.5243420
sym_resolved	sym_resolved	51.5145979
sym_subjfever	sym_subjfever	46.1723974
sym_startdt_FALSE	sym_startdt_FALSE	45.6510883
died	died	44.8618165
contact_household	contact_household	44.7461054
hospitalized	hospitalized	39.5634645
sym_sorethroat	sym_sorethroat	39.2699236
Contact_id	Contact_id	39.2260752
sym_myalgia	sym_myalgia	39.1432504
sym_headache	sym_headache	38.9983070
sym_cough	sym_cough	38.5257183
sym_fever	sym_fever	38.4611637
case_race	case_race	3.2033715
case_eth	case_eth	3.1351628
longitude_JITT	longitude_JITT	0.2436024
pos_sampledt_FALSE	pos_sampledt_FALSE	0.1485975
latitude_JITT	latitude_JITT	0.1144931
case_gender	case_gender	0.0767348
case_dob_FALSE	case_dob_FALSE	0.0584646
case_age	case_age	0.0584646
case_zip	case_zip	0.0158342
confirmed_case	confirmed_case	0.0109621
PID	PID	0.0000000
reprt_creationdt_FALSE	reprt_creationdt_FALSE	0.0000000
covid_dx	covid_dx	0.0000000

Data Pre-Processing

# Convert date columns to Date objects
data$reprt_creationdt_FALSE <- as.Date(data$reprt_creationdt_FALSE)
data$sym_startdt_FALSE <- as.Date(data$sym_startdt_FALSE)
data$sym_resolveddt_FALSE <- as.Date(data$sym_resolveddt_FALSE)
data$hosp_admidt_FALSE <- as.Date(data$hosp_admidt_FALSE)
data$hosp_dischdt_FALSE <- as.Date(data$hosp_dischdt_FALSE)
data$died_dt_FALSE <- as.Date(data$died_dt_FALSE)

Data Analysis

Epidemiological Trends

# Sort the data by date
t_data <- data %>% arrange(reprt_creationdt_FALSE)


# Calculate daily new confirmed cases
t_data <- t_data %>% group_by(week = week(reprt_creationdt_FALSE), year = year(reprt_creationdt_FALSE)) %>%
  summarise(new_confirmed_cases = sum(confirmed_case == "Yes", na.rm = TRUE), .groups = 'drop')

# Sort the data by "year" and "week" in ascending order
t_data <- t_data %>%
  arrange(year, week)

# Convert the data to a time series
ts_data <- ts(t_data$new_confirmed_cases, frequency = 52)  # Assuming 52 weeks in a year

# Create a continuous line graph combining year and week
t_data$year_week <- factor(paste(t_data$year, t_data$week), levels = unique(paste(t_data$year, t_data$week)))

ggplot(t_data, aes(x = year_week, y = new_confirmed_cases, group = 1)) +
  geom_line(color = "red") +
  labs(title = "Number of New Confirmed Cases by Year and Week",
       x = "Year and Week",
       y = "New Confirmed Cases") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_x_discrete(breaks = t_data$year_week[seq(1, length(t_data$year_week), by = 5)],
                   labels = t_data$year_week[seq(1, length(t_data$year_week), by = 5)]) +
  theme(plot.title = element_text(hjust = 0.5))

The number of cases is on a decline steep decline from the peek seen at the start of 2021

Demographic Analyses

COVID Cases by Gender & Age

# Creating a subset of the data where confirmed_cases is "Yes"
subset_data <- subset(data, confirmed_case == "Yes")

# Define the range for the x-axis labels
x_axis_breaks <- seq(0, 100, by = 10)  

# Distribution by Age and Gender of New_Confirmed Cases
ggplot(subset_data, aes(x = case_age, fill = case_gender)) +
  geom_histogram(binwidth = 1, position = "dodge", na.rm = TRUE) +
  labs(title = "Distribution of Cases by Age and Gender",
       x = "Age",
       y = "Count",
       fill = "Gender") +
  scale_x_continuous(breaks = x_axis_breaks, limits = c(0, max(subset_data$case_age))) +  # Set x-axis breaks and limits
  scale_fill_manual(values = c("Male" = "blue", "Female" = "orange")) +
  theme(legend.position = "top", legend.justification = c(1, 1))

Higher number of females between ages (20-30) have contracted the Virus relative to Males

COVID Cases by ethnicity

# COVID cases by ethnicity

ggplot(data, aes(x = case_eth)) +
  geom_bar(fill = "blue") +
  labs(title = "Distribution of Cases by Ethnicity",
       x = "Ethnicity",
       y = "Count") +
  theme(plot.title = element_text(hjust = 0.5))

The Non_Hispanic/Latino has been hardest hit by the epidemic

COVID Cases by Race

# COVID cases by race

ggplot(data, aes(x = case_race)) +
  geom_bar(fill = "blue") +
  labs(title = "Distribution of Cases by Race",
       x = "Race",
       y = "Count") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 4))

Black and white people have been hit the most by this epidemic

What are the most commonly reported symptoms among COVID-19 patients?

Symptoms and Severity

# Filter data for confirmed cases
confirmed_cases <- data %>%
  filter(confirmed_case == "Yes")

# Calculate the most commonly reported symptoms as a percentage of the grand total
symptoms_percentage <- confirmed_cases %>%
  select(sym_fever, sym_subjfever, sym_myalgia, sym_losstastesmell, sym_sorethroat, sym_cough, sym_headache) %>%
  summarise_all(~ sum(. == "Yes", na.rm = TRUE) / n()) %>%
  gather(symptom, percentage) %>%
  arrange(desc(percentage))

# Format the percentage column as percentages with two decimal places
symptoms_percentage$percentage <- percent(symptoms_percentage$percentage, accuracy = 0.01)

# Print the most commonly reported symptoms as a percentage of the grand total
cat("Most commonly reported symptoms among COVID-19 patients as a percentage of the grand total:\n")

## Most commonly reported symptoms among COVID-19 patients as a percentage of the grand total:

print(symptoms_percentage)

## # A tibble: 7 × 2
##   symptom            percentage
##   <chr>              <chr>     
## 1 sym_cough          26.73%    
## 2 sym_headache       26.40%    
## 3 sym_myalgia        23.78%    
## 4 sym_fever          18.42%    
## 5 sym_losstastesmell 15.51%    
## 6 sym_subjfever      15.47%    
## 7 sym_sorethroat     15.24%

One in four subjects confirmed to have contracted the virus exhibited symptoms such as headache, myalgia, or a cough

# Categorizing Cases by Severity:
confirmed_cases <- confirmed_cases %>%
  mutate(
    Severity = case_when(
      died_covid == "Yes" ~ "Fatal",
      hospitalized == "Yes" & is.na(sym_resolveddt_FALSE) ~ "Severe",
      hospitalized == "Yes" & !is.na(sym_resolveddt_FALSE) ~ "Moderate",
      hospitalized == "No" & !is.na(sym_resolveddt_FALSE) ~ "Mild",
      TRUE ~ "Unknown"
    )
  )

# Distribution of COVID Cases by Severity

# Calculate the percentage of each Severity level
confirmed_cases <- confirmed_cases %>%
  group_by(Severity) %>%
  summarise(Count = n()) %>%
  mutate(Percentage = (Count / sum(Count)) * 100)

# Create the bar chart with percentages and labels
ggplot(confirmed_cases, aes(x = Severity, y = Percentage)) +
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(aes(label = sprintf("%.2f%%", Percentage)), vjust = -0.5, size = 3) +
  labs(title = "Distribution of COVID Cases by Severity (Percentage)",
       x = "Severity",
       y = "Percentage") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 7))

Less than 5% of all confirmed cases lead to death

Geospatial Analysis

# Filter data for confirmed cases
confirmed_cases <- data %>%
  filter(confirmed_case == "Yes")

# Drop missing values from 'longitude_JITT' and 'latitude_JITT' columns
confirmed_cases <- confirmed_cases[complete.cases(confirmed_cases[c("longitude_JITT", "latitude_JITT")]), ]

# Create a leaflet map with the sampled data
leaflet() %>%
  addTiles() %>%  # Add a basic map background
  addCircleMarkers(
    data = confirmed_cases,  # Use the sampled data
    lng = ~longitude_JITT,
    lat = ~latitude_JITT,
    color = "red",
    radius = 4
  )

Weekly COVID OutBreak Situation

Kiprop Amos

2023-10-27

Loading libraries

Loading Data

Data Inspection

Data Pre-Processing

Data Analysis

Epidemiological Trends

Demographic Analyses

COVID Cases by Gender & Age

COVID Cases by ethnicity

COVID Cases by Race

What are the most commonly reported symptoms among COVID-19 patients?

Symptoms and Severity

Geospatial Analysis