---
title: "Exploratory Data Analysis of Global COVID-19 Trends using R"
author: "Amit Kumar Dubey"
date: "2026-05-04"
output:
  html_document:
    toc: true
    toc_float: true
    theme: cosmo
---

Load Libraries

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.3

## Warning: package 'ggplot2' was built under R version 4.5.3

## Warning: package 'tibble' was built under R version 4.5.3

## Warning: package 'tidyr' was built under R version 4.5.3

## Warning: package 'readr' was built under R version 4.5.3

## Warning: package 'purrr' was built under R version 4.5.3

## Warning: package 'dplyr' was built under R version 4.5.3

## Warning: package 'forcats' was built under R version 4.5.3

## Warning: package 'lubridate' was built under R version 4.5.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(reshape2)

## Warning: package 'reshape2' was built under R version 4.5.3

## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

Load Data

# Read the raw COVID-19 table from the working directory into a data frame.
covid_data <- read.csv(file = "covid_data.csv")

Data Cleaning

# Keep only the columns used in this analysis, parse `date` as a Date, and
# retain rows whose total_cases and total_deaths are both non-missing and
# finite.
covid_clean <- covid_data %>%
  select(
    location, continent, date,
    total_cases, new_cases, total_deaths, total_vaccinations
  ) %>%
  mutate(date = as.Date(date)) %>%
  filter(
    if_all(
      c(total_cases, total_deaths),
      \(x) !is.na(x) & is.finite(x)
    )
  )

Basic Info

# Number of rows and columns remaining after cleaning.
c(nrow(covid_clean), ncol(covid_clean))

## [1] 411804      7

Global Trend

# Sum cumulative cases over every location for each date.
# NOTE(review): if covid_data.csv also contains aggregate rows (e.g. "World"
# or continent totals, typically with a blank `continent`), this sum counts
# the same cases more than once -- confirm against the data source.
global_trend <- summarise(
  group_by(covid_clean, date),
  total_cases = sum(total_cases, na.rm = TRUE)
)

# Line chart of the summed cumulative cases over time.
ggplot(data = global_trend, mapping = aes(x = date, y = total_cases)) +
  geom_line() +
  theme_minimal()

Scatter Plot

# Scatter of cumulative deaths against cumulative cases, one point per row of
# covid_clean; semi-transparent points to reduce overplotting.
ggplot(
  data = covid_clean,
  mapping = aes(x = total_cases, y = total_deaths)
) +
  geom_point(colour = "darkred", alpha = 0.5) +
  theme_minimal()

Correlation

# Numeric columns to correlate.
cor_data <- select(
  covid_clean,
  total_cases, new_cases, total_deaths, total_vaccinations
)

# Pearson correlation matrix on complete rows only (rows with any NA in the
# four columns are dropped first).
cor_data %>%
  na.omit() %>%
  cor()

##                    total_cases new_cases total_deaths total_vaccinations
## total_cases          1.0000000 0.1226220    0.9568953          0.9148502
## new_cases            0.1226220 1.0000000    0.1508531          0.1367483
## total_deaths         0.9568953 0.1508531    1.0000000          0.8673348
## total_vaccinations   0.9148502 0.1367483    0.8673348          1.0000000

---------------- LINEAR REGRESSION ----------------

# Simple linear regression of cumulative deaths on cumulative cases, pooled
# over all rows of covid_clean.
lm_model <- lm(
  formula = total_deaths ~ total_cases,
  data = covid_clean
)

# Coefficients, residual summary, R-squared and overall F-test.
summary(lm_model)

## 
## Call:
## lm(formula = total_deaths ~ total_cases, data = covid_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1184650   -12704   -12628   -11002  2829851 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.263e+04  2.267e+02   55.74   <2e-16 ***
## total_cases 9.317e-03  4.995e-06 1865.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 143500 on 411802 degrees of freedom
## Multiple R-squared:  0.8942, Adjusted R-squared:  0.8942 
## F-statistic: 3.479e+06 on 1 and 411802 DF,  p-value: < 2.2e-16

Regression Plot

# Scatter of deaths vs. cases with the fitted straight line overlaid.
# `linewidth` (not `size`) controls line thickness: using `size` for lines is
# deprecated since ggplot2 3.4.0 and raises a deprecation warning.
ggplot(covid_clean, aes(total_cases, total_deaths)) +
  geom_point(color = "steelblue", alpha = 0.4) +
  geom_smooth(method = "lm", color = "orange", linewidth = 1) +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_smooth()` using formula = 'y ~ x'

Extract F-Statistic

# Pull the overall F-statistic (value, numerator df, denominator df) out of
# the linear model summary and display it.
f_stat <- summary(lm_model)[["fstatistic"]]
print(f_stat)

##   value   numdf   dendf 
## 3479014       1  411802

P-value of Model

# Upper-tail probability of the F distribution at the observed statistic,
# i.e. the p-value of the overall model F-test.
pf(
  q = f_stat[1],
  df1 = f_stat[2],
  df2 = f_stat[3],
  lower.tail = FALSE
)

## value 
##     0

---------------- POLYNOMIAL REGRESSION ----------------

# Quadratic regression of deaths on cases using the degree-2 polynomial basis
# produced by poly().
poly_model <- lm(
  formula = total_deaths ~ poly(total_cases, 2),
  data = covid_clean
)

summary(poly_model)

## 
## Call:
## lm(formula = total_deaths ~ poly(total_cases, 2), data = covid_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1299289    -9600    -9497    -8257  2713620 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            8.126e+04  2.215e+02  366.81   <2e-16 ***
## poly(total_cases, 2)1  2.677e+08  1.422e+05 1883.21   <2e-16 ***
## poly(total_cases, 2)2 -1.270e+07  1.422e+05  -89.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 142200 on 411801 degrees of freedom
## Multiple R-squared:  0.8962, Adjusted R-squared:  0.8962 
## F-statistic: 1.777e+06 on 2 and 411801 DF,  p-value: < 2.2e-16

# Scatter of deaths vs. cases with the fitted quadratic curve overlaid.
# `linewidth` (not `size`) controls line thickness: using `size` for lines is
# deprecated since ggplot2 3.4.0.
ggplot(covid_clean, aes(total_cases, total_deaths)) +
  geom_point(color = "purple", alpha = 0.4) +
  stat_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "darkgreen",
    linewidth = 1
  ) +
  theme_minimal()

---------------- LOG REGRESSION (FIXED) ----------------

# log() needs strictly positive input, so keep only rows where both
# cumulative counts are above zero.
log_data <- filter(covid_clean, total_cases > 0 & total_deaths > 0)

# Log-log regression: the slope is the elasticity of deaths with respect to
# cases.
log_model <- lm(
  formula = log(total_deaths) ~ log(total_cases),
  data = log_data
)

summary(log_model)

## 
## Call:
## lm(formula = log(total_deaths) ~ log(total_cases), data = log_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4241 -0.5429  0.1320  0.7054 11.2170 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -4.1990672  0.0072302  -580.8   <2e-16 ***
## log(total_cases)  0.9739806  0.0005935  1641.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.102 on 359420 degrees of freedom
## Multiple R-squared:  0.8822, Adjusted R-squared:  0.8822 
## F-statistic: 2.693e+06 on 1 and 359420 DF,  p-value: < 2.2e-16

# Log-log scatter of deaths vs. cases with the fitted straight line overlaid.
# `linewidth` (not `size`) controls line thickness: using `size` for lines is
# deprecated since ggplot2 3.4.0.
ggplot(log_data, aes(log(total_cases), log(total_deaths))) +
  geom_point(color = "brown", alpha = 0.4) +
  geom_smooth(method = "lm", color = "cyan", linewidth = 1) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

---------------- MODEL COMPARISON ----------------

# AIC values are only comparable between models fitted to the same
# observations and on the same response scale. lm_model and poly_model use
# every row of covid_clean and model total_deaths, whereas log_model uses the
# strictly positive subset (log_data) and models log(total_deaths).

# Refit the raw-scale models on log_data so all three share the same rows.
lm_model_pos <- update(lm_model, data = log_data)
poly_model_pos <- update(poly_model, data = log_data)

aic_table <- AIC(lm_model_pos, poly_model_pos, log_model)

# Express log_model's AIC on the total_deaths scale. For z = log(y) the
# log-likelihood of y equals that of z minus sum(log(y)), so the AIC
# increases by 2 * sum(log(y)).
aic_table["log_model", "AIC"] <-
  aic_table["log_model", "AIC"] + 2 * sum(log(log_data$total_deaths))

aic_table

## Warning in AIC.default(lm_model, poly_model, log_model): models are not all
## fitted to the same number of observations

##            df      AIC
## lm_model    3 10948435
## poly_model  4 10940527
## log_model   3  1089854

R-Squared Comparison

# Share of variance in total_deaths explained by the linear model.
summary(lm_model)[["r.squared"]]

## [1] 0.8941605

# Share of variance in total_deaths explained by the quadratic model.
summary(poly_model)[["r.squared"]]

## [1] 0.8961742

# Share of variance in log(total_deaths) explained by the log-log model
# (measured on the log scale, unlike the two values above).
summary(log_model)[["r.squared"]]

## [1] 0.8822466

---------------- MODEL EFFICIENCY TEST ----------------

# Overall significance test for lm_model, done two equivalent ways: comparing
# the observed F-statistic with the critical value, and via the p-value.
# Each statement is on its own line and strings use straight quotes so the
# section parses as R.

# Extract F-statistic components
f_stat <- summary(lm_model)$fstatistic

F_calculated <- f_stat[["value"]]
df1 <- f_stat[["numdf"]]
df2 <- f_stat[["dendf"]]

F_calculated
df1
df2

# Critical F-value at 95% confidence
F_critical <- qf(0.95, df1, df2)
F_critical

# Decision Rule
if (F_calculated > F_critical) {
  print("Model is statistically significant (Reject H0)")
} else {
  print("Model is NOT statistically significant (Fail to Reject H0)")
}

# P-value check
model_p_value <- pf(F_calculated, df1, df2, lower.tail = FALSE)
model_p_value

if (model_p_value < 0.05) {
  print("Model is significant based on p-value")
} else {
  print("Model is NOT significant based on p-value")
}

---------------- MODEL ACCURACY ----------------

# Accuracy measures for lm_model. The assignment and the display of `rmse`
# are separate statements so the section parses as R.

# Adjusted R-Squared (better measure)
summary(lm_model)$adj.r.squared

# RMSE (Root Mean Square Error): square root of the mean squared residual,
# in the same units as total_deaths.
rmse <- sqrt(mean(residuals(lm_model)^2))
rmse

Conclusion

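Linear, polynomial (quadratic), and log-log regression models were successfully fitted. All three show a strong relationship between total cases and total deaths, with R-squared values of about 0.894, 0.896, and 0.882 respectively; note that the log-log value is measured on the log scale, so it is not directly comparable with the other two.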