Introduction

This project performs advanced exploratory data analysis of global COVID-19 data using R.

Load Libraries & Data

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tibble' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.3
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
covid_data <- read.csv("covid_data.csv")
nrow(covid_data)
## [1] 429435

Data Cleaning

ncol(covid_data)
## [1] 67
dim(covid_data)
## [1] 429435     67
covid_clean <- covid_data %>%
  select(location, continent, date, total_cases, new_cases, total_deaths, total_vaccinations)

covid_clean$date <- as.Date(covid_clean$date)

covid_clean <- covid_clean %>%
  filter(!is.na(total_cases), !is.na(total_deaths))

Global Trend

global_trend <- covid_clean %>%
  group_by(date) %>%
  summarise(total_cases = sum(total_cases, na.rm = TRUE))

ggplot(global_trend, aes(x=date, y=total_cases)) +
  geom_line(color="blue", size=1) +
  labs(title="Global COVID-19 Cases Over Time",
       x="Date", y="Total Cases") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

New Cases Trend

ggplot(covid_clean, aes(x=date, y=new_cases)) +
  geom_line(color="red", size=1) +
  labs(title="Daily New COVID-19 Cases",
       x="Date", y="New Cases") +
  theme_light()

Death Trend

ggplot(covid_clean, aes(x=date, y=total_deaths)) +
  geom_line(color="black", size=1) +
  labs(title="Total Deaths Over Time",
       x="Date", y="Deaths") +
  theme_classic()

Log Scale Trend

ggplot(global_trend, aes(x=date, y=total_cases)) +
  geom_line(color="darkgreen", size=1) +
  scale_y_log10() +
  labs(title="Log Scale COVID Cases") +
  theme_minimal()

Highest Cases Day

covid_clean %>%
  filter(new_cases == max(new_cases, na.rm = TRUE))
##   location continent       date total_cases new_cases total_deaths
## 1    World           2022-12-25   704630839  44236227      6695683
##   total_vaccinations
## 1        13156309935

Top Countries

top_countries <- covid_clean %>%
  group_by(location) %>%
  summarise(max_cases = max(total_cases, na.rm = TRUE)) %>%
  arrange(desc(max_cases)) %>%
  slice(1:10)

ggplot(top_countries, aes(x=reorder(location, max_cases), y=max_cases, fill=location)) +
  geom_col() +
  coord_flip() +
  labs(title="Top 10 Countries by Cases") +
  theme_minimal() +
  theme(legend.position="none")

Country Comparison

selected <- covid_clean %>%
  filter(location %in% c("India","United States","Brazil"))

ggplot(selected, aes(x=date, y=total_vaccinations, color=location)) +
  geom_line(size=1) +
  labs(title="Vaccination Trend Comparison") +
  theme_minimal()
## Warning: Removed 2051 rows containing missing values or values outside the scale range
## (`geom_line()`).

Death Rate

covid_clean <- covid_clean %>%
  mutate(death_rate = total_deaths / total_cases)

Correlation

cor_data <- covid_clean %>%
  select(total_cases, new_cases, total_deaths, total_vaccinations)

cor_matrix <- cor(na.omit(cor_data))
cor_matrix
##                    total_cases new_cases total_deaths total_vaccinations
## total_cases          1.0000000 0.1226220    0.9568953          0.9148502
## new_cases            0.1226220 1.0000000    0.1508531          0.1367483
## total_deaths         0.9568953 0.1508531    1.0000000          0.8673348
## total_vaccinations   0.9148502 0.1367483    0.8673348          1.0000000

Heatmap

cor_melt <- reshape2::melt(cor_matrix)

ggplot(cor_melt, aes(Var1, Var2, fill=value)) +
  geom_tile() +
  scale_fill_gradient2(low="blue", high="red", mid="white", midpoint=0) +
  labs(title="Correlation Heatmap") +
  theme_minimal()

Scatter Plot

ggplot(covid_clean, aes(total_cases, total_deaths)) +
  geom_point(color="purple", alpha=0.4) +
  labs(title="Cases vs Deaths") +
  theme_minimal()

Continent Analysis

continent_data <- covid_clean %>%
  group_by(continent) %>%
  summarise(total_cases = sum(total_cases, na.rm = TRUE))

ggplot(continent_data, aes(continent, total_cases, fill=continent)) +
  geom_col() +
  labs(title="COVID Cases by Continent") +
  theme_minimal() +
  theme(legend.position="none")

Death Rate by Continent

continent_death <- covid_clean %>%
  group_by(continent) %>%
  summarise(avg_death_rate = mean(death_rate, na.rm = TRUE))

ggplot(continent_death, aes(continent, avg_death_rate, fill=continent)) +
  geom_col() +
  labs(title="Death Rate by Continent") +
  theme_minimal() +
  theme(legend.position="none")

Linear Regression

lm_model1 <- lm(total_deaths ~ total_cases, data = covid_clean)
summary(lm_model1)
## 
## Call:
## lm(formula = total_deaths ~ total_cases, data = covid_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1184650   -12704   -12628   -11002  2829851 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.263e+04  2.267e+02   55.74   <2e-16 ***
## total_cases 9.317e-03  4.995e-06 1865.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 143500 on 411802 degrees of freedom
## Multiple R-squared:  0.8942, Adjusted R-squared:  0.8942 
## F-statistic: 3.479e+06 on 1 and 411802 DF,  p-value: < 2.2e-16
ggplot(covid_clean, aes(total_cases, total_deaths)) +
  geom_point(alpha=0.3) +
  geom_smooth(method="lm", color="blue") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Multiple Linear Regression

lm_model2 <- lm(total_deaths ~ total_cases + new_cases + total_vaccinations, data = covid_clean)
summary(lm_model2)
## 
## Call:
## lm(formula = total_deaths ~ total_cases + new_cases + total_vaccinations, 
##     data = covid_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3646755   -39675   -37350   -16728  2852416 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.821e+04  9.615e+02   39.74   <2e-16 ***
## total_cases         1.001e-02  2.622e-05  381.74   <2e-16 ***
## new_cases           6.770e-02  2.048e-03   33.06   <2e-16 ***
## total_vaccinations -2.658e-05  1.274e-06  -20.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 252000 on 73528 degrees of freedom
##   (338272 observations deleted due to missingness)
## Multiple R-squared:  0.9173, Adjusted R-squared:  0.9173 
## F-statistic: 2.718e+05 on 3 and 73528 DF,  p-value: < 2.2e-16

Predictions

covid_clean$predicted_deaths <- predict(lm_model1)

Residual Analysis

covid_clean$residuals <- residuals(lm_model1)

ggplot(covid_clean, aes(predicted_deaths, residuals)) +
  geom_point(alpha=0.3, color="red") +
  geom_hline(yintercept=0, linetype="dashed") +
  theme_minimal()

Residual Distribution

ggplot(covid_clean, aes(residuals)) +
  geom_histogram(bins=30, fill="blue") +
  theme_minimal()

Additional Correlations

cor(covid_clean$total_cases, covid_clean$total_deaths, use="complete.obs")
## [1] 0.9456006
cor(covid_clean$total_cases, covid_clean$total_vaccinations, use="complete.obs")
## [1] 0.9148434

Pair Plot

pairs(cor_data)

Polynomial Regression

poly_model <- lm(total_deaths ~ poly(total_cases, 2), data = covid_clean)
summary(poly_model)
## 
## Call:
## lm(formula = total_deaths ~ poly(total_cases, 2), data = covid_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1299289    -9600    -9497    -8257  2713620 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            8.126e+04  2.215e+02  366.81   <2e-16 ***
## poly(total_cases, 2)1  2.677e+08  1.422e+05 1883.21   <2e-16 ***
## poly(total_cases, 2)2 -1.270e+07  1.422e+05  -89.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 142200 on 411801 degrees of freedom
## Multiple R-squared:  0.8962, Adjusted R-squared:  0.8962 
## F-statistic: 1.777e+06 on 2 and 411801 DF,  p-value: < 2.2e-16
ggplot(covid_clean, aes(total_cases, total_deaths)) +
  geom_point(alpha=0.3) +
  stat_smooth(method="lm", formula = y ~ poly(x, 2), color="darkgreen") +
  theme_minimal()

Higher Degree Polynomial

poly_model3 <- lm(total_deaths ~ poly(total_cases, 3), data = covid_clean)
summary(poly_model3)
## 
## Call:
## lm(formula = total_deaths ~ poly(total_cases, 3), data = covid_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1127453    -2897    -2443    -2098  2840667 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            8.126e+04  2.115e+02  384.24   <2e-16 ***
## poly(total_cases, 3)1  2.677e+08  1.357e+05 1972.70   <2e-16 ***
## poly(total_cases, 3)2 -1.270e+07  1.357e+05  -93.62   <2e-16 ***
## poly(total_cases, 3)3  2.717e+07  1.357e+05  200.17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 135700 on 411800 degrees of freedom
## Multiple R-squared:  0.9054, Adjusted R-squared:  0.9054 
## F-statistic: 1.313e+06 on 3 and 411800 DF,  p-value: < 2.2e-16

Model Comparison

AIC(lm_model1, poly_model, poly_model3)
##             df      AIC
## lm_model1    3 10948435
## poly_model   4 10940527
## poly_model3  5 10902291

R-Squared

summary(lm_model1)$r.squared
## [1] 0.8941605
summary(poly_model)$r.squared
## [1] 0.8961742
summary(poly_model3)$r.squared
## [1] 0.9053809

—————- LOG REGRESSION (FINAL FIX) —————-

Create Log Data

log_data <- covid_clean %>%
  filter(total_cases > 0, total_deaths > 0) %>%
  drop_na(total_cases, total_deaths)

# Create Log Variables

log_data <- log_data %>%
  mutate(
    log_cases = log(total_cases),
    log_deaths = log(total_deaths))
# Build Log Regression Model
log_model <- lm(log_deaths ~ log_cases, data = log_data)
summary(log_model)
## 
## Call:
## lm(formula = log_deaths ~ log_cases, data = log_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4241 -0.5429  0.1320  0.7054 11.2170 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.1990672  0.0072302  -580.8   <2e-16 ***
## log_cases    0.9739806  0.0005935  1641.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.102 on 359420 degrees of freedom
## Multiple R-squared:  0.8822, Adjusted R-squared:  0.8822 
## F-statistic: 2.693e+06 on 1 and 359420 DF,  p-value: < 2.2e-16

#Plot Log Regression

  ggplot(log_data, aes(log_cases, log_deaths)) +
  geom_point(color="darkblue", alpha=0.4) +
  geom_smooth(method="lm", color="red", size=1) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Correlation by Continent

log_data <- covid_clean %>%
  filter(total_cases > 0, total_deaths > 0)

log_model <- lm(log(total_deaths) ~ log(total_cases), data = log_data)
summary(log_model)  
## 
## Call:
## lm(formula = log(total_deaths) ~ log(total_cases), data = log_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4241 -0.5429  0.1320  0.7054 11.2170 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -4.1990672  0.0072302  -580.8   <2e-16 ***
## log(total_cases)  0.9739806  0.0005935  1641.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.102 on 359420 degrees of freedom
## Multiple R-squared:  0.8822, Adjusted R-squared:  0.8822 
## F-statistic: 2.693e+06 on 1 and 359420 DF,  p-value: < 2.2e-16

#Correlated Log Regression

ggplot(log_data, aes(log(total_cases), log(total_deaths))) +
  geom_point(alpha=0.3) +
  geom_smooth(method="lm", color="purple") +
  theme_minimal()

Vaccination Regression

vacc_model <- lm(total_cases ~ total_vaccinations, data = covid_clean)
summary(vacc_model)
## 
## Call:
## lm(formula = total_cases ~ total_vaccinations, data = covid_clean)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -216248590   -3114841   -2854194    -876755  187699875 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        3.115e+06  1.347e+05   23.13   <2e-16 ***
## total_vaccinations 4.438e-02  7.224e-05  614.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35450000 on 73540 degrees of freedom
##   (338262 observations deleted due to missingness)
## Multiple R-squared:  0.8369, Adjusted R-squared:  0.8369 
## F-statistic: 3.775e+05 on 1 and 73540 DF,  p-value: < 2.2e-16
ggplot(covid_clean, aes(total_vaccinations, total_cases)) +
  geom_point(alpha=0.3, color="orange") +
  geom_smooth(method="lm") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 338262 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 338262 rows containing missing values or values outside the scale range
## (`geom_point()`).

Conclusion

The analysis shows increasing trends in COVID-19 cases and deaths, strong relationships between variables, and variation across countries and continents.