---
title: "Exploratory Data Analysis of Global COVID-19 Trends using R"
author: "Amit Kumar Dubey"
date: "2026-05-04"
output:
  html_document:
    toc: true
    toc_float: true
    theme: cosmo
---
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tibble' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.3
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
# Load the raw data set from the working directory.
covid_data <- read.csv(file = "covid_data.csv")

# Keep only the columns used in this analysis, parse the date column, and
# retain the rows whose cumulative case and death counts are usable numbers.
# is.finite() is FALSE for NA, NaN and +/-Inf, so a single filter covers both
# the missing-value check and the infinite-value check.
covid_clean <- covid_data %>%
  select(
    location, continent, date,
    total_cases, new_cases, total_deaths, total_vaccinations
  ) %>%
  mutate(date = as.Date(date)) %>%
  filter(is.finite(total_cases), is.finite(total_deaths))

# Report the number of rows and columns left after cleaning.
dim(covid_clean)
## [1] 411804 7
# Per-date sum of cumulative cases over every location present on that date.
# NOTE(review): if covid_data.csv also contains aggregate rows (for example a
# "World" or per-continent location), they are summed in here as well and
# would inflate the total -- confirm against the data set.
global_trend <- summarise(
  group_by(covid_clean, date),
  total_cases = sum(total_cases, na.rm = TRUE)
)

# Line chart of the summed cumulative cases over time.
ggplot(data = global_trend, mapping = aes(x = date, y = total_cases)) +
  geom_line() +
  theme_minimal()
# Scatter plot of cumulative deaths against cumulative cases, drawing one
# semi-transparent point per row of covid_clean.
ggplot(
  data = covid_clean,
  mapping = aes(x = total_cases, y = total_deaths)
) +
  geom_point(colour = "darkred", alpha = 0.5) +
  theme_minimal()
# Numeric measures to correlate with each other.
cor_data <- covid_clean %>%
  select(total_cases, new_cases, total_deaths, total_vaccinations)

# Pearson correlation matrix, computed only on rows where all four measures
# are present (rows containing any NA are dropped first).
cor_data %>%
  na.omit() %>%
  cor()
## total_cases new_cases total_deaths total_vaccinations
## total_cases 1.0000000 0.1226220 0.9568953 0.9148502
## new_cases 0.1226220 1.0000000 0.1508531 0.1367483
## total_deaths 0.9568953 0.1508531 1.0000000 0.8673348
## total_vaccinations 0.9148502 0.1367483 0.8673348 1.0000000
# Simple linear regression of cumulative deaths on cumulative cases.
# NOTE(review): the rows are presumably repeated cumulative observations per
# location and date, so they are unlikely to be independent; treat the
# reported standard errors and p-values with caution.
lm_model <- lm(
  formula = total_deaths ~ total_cases,
  data = covid_clean
)
summary(object = lm_model)
##
## Call:
## lm(formula = total_deaths ~ total_cases, data = covid_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1184650 -12704 -12628 -11002 2829851
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.263e+04 2.267e+02 55.74 <2e-16 ***
## total_cases 9.317e-03 4.995e-06 1865.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 143500 on 411802 degrees of freedom
## Multiple R-squared: 0.8942, Adjusted R-squared: 0.8942
## F-statistic: 3.479e+06 on 1 and 411802 DF, p-value: < 2.2e-16
# Scatter plot of cumulative deaths against cumulative cases with the
# least-squares line (and its confidence band) overlaid.
ggplot(covid_clean, aes(total_cases, total_deaths)) +
  geom_point(color = "steelblue", alpha = 0.4) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and triggers a lifecycle warning.
  geom_smooth(method = "lm", color = "orange", linewidth = 1) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Pull the overall F statistic of lm_model together with its numerator and
# denominator degrees of freedom.
f_stat <- summary(lm_model)[["fstatistic"]]
f_stat
## value numdf dendf
## 3479014 1 411802
# Upper-tail probability of that F value, i.e. the p-value of the overall
# F-test. It prints as 0 below, presumably because the probability is too
# small to be represented as a double.
pf(
  q = f_stat["value"],
  df1 = f_stat["numdf"],
  df2 = f_stat["dendf"],
  lower.tail = FALSE
)
## value
## 0
# Quadratic regression of cumulative deaths on cumulative cases. poly() with
# its default settings builds orthogonal polynomial terms, so the reported
# coefficients are not on the raw total_cases / total_cases^2 scale.
poly_model <- lm(
  formula = total_deaths ~ poly(total_cases, 2),
  data = covid_clean
)
summary(object = poly_model)
##
## Call:
## lm(formula = total_deaths ~ poly(total_cases, 2), data = covid_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1299289 -9600 -9497 -8257 2713620
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.126e+04 2.215e+02 366.81 <2e-16 ***
## poly(total_cases, 2)1 2.677e+08 1.422e+05 1883.21 <2e-16 ***
## poly(total_cases, 2)2 -1.270e+07 1.422e+05 -89.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 142200 on 411801 degrees of freedom
## Multiple R-squared: 0.8962, Adjusted R-squared: 0.8962
## F-statistic: 1.777e+06 on 2 and 411801 DF, p-value: < 2.2e-16
# Scatter plot of cumulative deaths against cumulative cases with the fitted
# second-degree polynomial curve overlaid.
ggplot(covid_clean, aes(total_cases, total_deaths)) +
  geom_point(color = "purple", alpha = 0.4) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0.
  stat_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "darkgreen",
    linewidth = 1
  ) +
  theme_minimal()
# Logarithms need strictly positive values, so keep only the rows that have
# both a positive case count and a positive death count.
log_data <- covid_clean %>%
  filter(total_cases > 0 & total_deaths > 0)

# Log-log regression: log of cumulative deaths on log of cumulative cases.
log_model <- lm(
  formula = log(total_deaths) ~ log(total_cases),
  data = log_data
)
summary(object = log_model)
##
## Call:
## lm(formula = log(total_deaths) ~ log(total_cases), data = log_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4241 -0.5429 0.1320 0.7054 11.2170
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.1990672 0.0072302 -580.8 <2e-16 ***
## log(total_cases) 0.9739806 0.0005935 1641.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.102 on 359420 degrees of freedom
## Multiple R-squared: 0.8822, Adjusted R-squared: 0.8822
## F-statistic: 2.693e+06 on 1 and 359420 DF, p-value: < 2.2e-16
# Scatter plot on the log-log scale with the fitted least-squares line
# overlaid.
ggplot(log_data, aes(log(total_cases), log(total_deaths))) +
  geom_point(color = "brown", alpha = 0.4) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0.
  geom_smooth(method = "lm", color = "cyan", linewidth = 1) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# AIC values are only comparable between models fitted to the same response
# on the same observations. lm_model and poly_model both model total_deaths
# on all of covid_clean, so they can be compared directly.
AIC(lm_model, poly_model)
## df AIC
## lm_model 3 10948435
## poly_model 4 10940527

# log_model differs in two ways: its response is log(total_deaths), and it is
# fitted only to the strictly positive rows in log_data. To put all three
# models on a common footing:
#   * refit the two raw-scale models on log_data (same observations), and
#   * add the Jacobian term 2 * sum(log(y)) to log_model's AIC, which
#     converts its log-likelihood from the log(y) scale to the y scale.
lm_model_pos <- update(lm_model, data = log_data)
poly_model_pos <- update(poly_model, data = log_data)
log_aic_raw_scale <- AIC(log_model) + 2 * sum(log(log_data$total_deaths))

data.frame(
  model = c("lm_model", "poly_model", "log_model"),
  AIC = c(AIC(lm_model_pos), AIC(poly_model_pos), log_aic_raw_scale)
)
# Share of the variance in each model's own response that the fit explains.
# log_model's response is log(total_deaths), so its value is measured on the
# log scale rather than on the raw-deaths scale used by the other two.
summary(lm_model)[["r.squared"]]
## [1] 0.8941605
summary(poly_model)[["r.squared"]]
## [1] 0.8961742
summary(log_model)[["r.squared"]]
## [1] 0.8822466
# Overall F-test for lm_model, carried out two equivalent ways (critical
# value and p-value), followed by two goodness-of-fit figures.
f_stat <- summary(lm_model)$fstatistic

# Observed F statistic with its numerator and denominator degrees of freedom.
F_calculated <- f_stat[1]
df1 <- f_stat[2]
df2 <- f_stat[3]

F_calculated
df1
df2

# Critical value: the 95th percentile of the F(df1, df2) distribution,
# i.e. the rejection threshold at the 5% significance level.
F_critical <- qf(0.95, df1, df2)
F_critical

if (F_calculated > F_critical) {
  print("Model is statistically significant (Reject H0)")
} else {
  print("Model is NOT statistically significant (Fail to Reject H0)")
}

# p-value: probability of an F value at least this large under H0.
model_p_value <- pf(F_calculated, df1, df2, lower.tail = FALSE)
model_p_value

if (model_p_value < 0.05) {
  print("Model is significant based on p-value")
} else {
  print("Model is NOT significant based on p-value")
}

# Adjusted R-squared of the linear model.
summary(lm_model)$adj.r.squared

# Root mean squared error of the linear model's residuals, in the same units
# as total_deaths.
rmse <- sqrt(mean(residuals(lm_model)^2))
rmse
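Linear, quadratic polynomial, and log-log regression models were fitted to relate total deaths to total cases. All three show a strong positive association between the two variables (R² ≈ 0.89, 0.90, and 0.88 respectively), although the log-log R² is measured on the log scale and is therefore not directly comparable with the other two.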