This project performs advanced exploratory data analysis of global COVID-19 data using R.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tibble' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.3
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
covid_data <- read.csv("covid_data.csv")
nrow(covid_data)
## [1] 429435
ncol(covid_data)
## [1] 67
dim(covid_data)
## [1] 429435 67
covid_clean <- covid_data %>%
select(location, continent, date, total_cases, new_cases, total_deaths, total_vaccinations)
covid_clean$date <- as.Date(covid_clean$date)
covid_clean <- covid_clean %>%
filter(!is.na(total_cases), !is.na(total_deaths))
global_trend <- covid_clean %>%
group_by(date) %>%
summarise(total_cases = sum(total_cases, na.rm = TRUE))
ggplot(global_trend, aes(x=date, y=total_cases)) +
geom_line(color="blue", size=1) +
labs(title="Global COVID-19 Cases Over Time",
x="Date", y="Total Cases") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(covid_clean, aes(x=date, y=new_cases)) +
geom_line(color="red", size=1) +
labs(title="Daily New COVID-19 Cases",
x="Date", y="New Cases") +
theme_light()
ggplot(covid_clean, aes(x=date, y=total_deaths)) +
geom_line(color="black", size=1) +
labs(title="Total Deaths Over Time",
x="Date", y="Deaths") +
theme_classic()
ggplot(global_trend, aes(x=date, y=total_cases)) +
geom_line(color="darkgreen", size=1) +
scale_y_log10() +
labs(title="Log Scale COVID Cases") +
theme_minimal()
covid_clean %>%
filter(new_cases == max(new_cases, na.rm = TRUE))
## location continent date total_cases new_cases total_deaths
## 1 World 2022-12-25 704630839 44236227 6695683
## total_vaccinations
## 1 13156309935
top_countries <- covid_clean %>%
group_by(location) %>%
summarise(max_cases = max(total_cases, na.rm = TRUE)) %>%
arrange(desc(max_cases)) %>%
slice(1:10)
ggplot(top_countries, aes(x=reorder(location, max_cases), y=max_cases, fill=location)) +
geom_col() +
coord_flip() +
labs(title="Top 10 Countries by Cases") +
theme_minimal() +
theme(legend.position="none")
selected <- covid_clean %>%
filter(location %in% c("India","United States","Brazil"))
ggplot(selected, aes(x=date, y=total_vaccinations, color=location)) +
geom_line(size=1) +
labs(title="Vaccination Trend Comparison") +
theme_minimal()
## Warning: Removed 2051 rows containing missing values or values outside the scale range
## (`geom_line()`).
covid_clean <- covid_clean %>%
mutate(death_rate = total_deaths / total_cases)
cor_data <- covid_clean %>%
select(total_cases, new_cases, total_deaths, total_vaccinations)
cor_matrix <- cor(na.omit(cor_data))
cor_matrix
## total_cases new_cases total_deaths total_vaccinations
## total_cases 1.0000000 0.1226220 0.9568953 0.9148502
## new_cases 0.1226220 1.0000000 0.1508531 0.1367483
## total_deaths 0.9568953 0.1508531 1.0000000 0.8673348
## total_vaccinations 0.9148502 0.1367483 0.8673348 1.0000000
cor_melt <- reshape2::melt(cor_matrix)
ggplot(cor_melt, aes(Var1, Var2, fill=value)) +
geom_tile() +
scale_fill_gradient2(low="blue", high="red", mid="white", midpoint=0) +
labs(title="Correlation Heatmap") +
theme_minimal()
ggplot(covid_clean, aes(total_cases, total_deaths)) +
geom_point(color="purple", alpha=0.4) +
labs(title="Cases vs Deaths") +
theme_minimal()
continent_data <- covid_clean %>%
group_by(continent) %>%
summarise(total_cases = sum(total_cases, na.rm = TRUE))
ggplot(continent_data, aes(continent, total_cases, fill=continent)) +
geom_col() +
labs(title="COVID Cases by Continent") +
theme_minimal() +
theme(legend.position="none")
continent_death <- covid_clean %>%
group_by(continent) %>%
summarise(avg_death_rate = mean(death_rate, na.rm = TRUE))
ggplot(continent_death, aes(continent, avg_death_rate, fill=continent)) +
geom_col() +
labs(title="Death Rate by Continent") +
theme_minimal() +
theme(legend.position="none")
lm_model1 <- lm(total_deaths ~ total_cases, data = covid_clean)
summary(lm_model1)
##
## Call:
## lm(formula = total_deaths ~ total_cases, data = covid_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1184650 -12704 -12628 -11002 2829851
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.263e+04 2.267e+02 55.74 <2e-16 ***
## total_cases 9.317e-03 4.995e-06 1865.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 143500 on 411802 degrees of freedom
## Multiple R-squared: 0.8942, Adjusted R-squared: 0.8942
## F-statistic: 3.479e+06 on 1 and 411802 DF, p-value: < 2.2e-16
ggplot(covid_clean, aes(total_cases, total_deaths)) +
geom_point(alpha=0.3) +
geom_smooth(method="lm", color="blue") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
lm_model2 <- lm(total_deaths ~ total_cases + new_cases + total_vaccinations, data = covid_clean)
summary(lm_model2)
##
## Call:
## lm(formula = total_deaths ~ total_cases + new_cases + total_vaccinations,
## data = covid_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3646755 -39675 -37350 -16728 2852416
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.821e+04 9.615e+02 39.74 <2e-16 ***
## total_cases 1.001e-02 2.622e-05 381.74 <2e-16 ***
## new_cases 6.770e-02 2.048e-03 33.06 <2e-16 ***
## total_vaccinations -2.658e-05 1.274e-06 -20.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252000 on 73528 degrees of freedom
## (338272 observations deleted due to missingness)
## Multiple R-squared: 0.9173, Adjusted R-squared: 0.9173
## F-statistic: 2.718e+05 on 3 and 73528 DF, p-value: < 2.2e-16
covid_clean$predicted_deaths <- predict(lm_model1)
covid_clean$residuals <- residuals(lm_model1)
ggplot(covid_clean, aes(predicted_deaths, residuals)) +
geom_point(alpha=0.3, color="red") +
geom_hline(yintercept=0, linetype="dashed") +
theme_minimal()
ggplot(covid_clean, aes(residuals)) +
geom_histogram(bins=30, fill="blue") +
theme_minimal()
cor(covid_clean$total_cases, covid_clean$total_deaths, use="complete.obs")
## [1] 0.9456006
cor(covid_clean$total_cases, covid_clean$total_vaccinations, use="complete.obs")
## [1] 0.9148434
pairs(cor_data)
poly_model <- lm(total_deaths ~ poly(total_cases, 2), data = covid_clean)
summary(poly_model)
##
## Call:
## lm(formula = total_deaths ~ poly(total_cases, 2), data = covid_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1299289 -9600 -9497 -8257 2713620
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.126e+04 2.215e+02 366.81 <2e-16 ***
## poly(total_cases, 2)1 2.677e+08 1.422e+05 1883.21 <2e-16 ***
## poly(total_cases, 2)2 -1.270e+07 1.422e+05 -89.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 142200 on 411801 degrees of freedom
## Multiple R-squared: 0.8962, Adjusted R-squared: 0.8962
## F-statistic: 1.777e+06 on 2 and 411801 DF, p-value: < 2.2e-16
ggplot(covid_clean, aes(total_cases, total_deaths)) +
geom_point(alpha=0.3) +
stat_smooth(method="lm", formula = y ~ poly(x, 2), color="darkgreen") +
theme_minimal()
poly_model3 <- lm(total_deaths ~ poly(total_cases, 3), data = covid_clean)
summary(poly_model3)
##
## Call:
## lm(formula = total_deaths ~ poly(total_cases, 3), data = covid_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1127453 -2897 -2443 -2098 2840667
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.126e+04 2.115e+02 384.24 <2e-16 ***
## poly(total_cases, 3)1 2.677e+08 1.357e+05 1972.70 <2e-16 ***
## poly(total_cases, 3)2 -1.270e+07 1.357e+05 -93.62 <2e-16 ***
## poly(total_cases, 3)3 2.717e+07 1.357e+05 200.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 135700 on 411800 degrees of freedom
## Multiple R-squared: 0.9054, Adjusted R-squared: 0.9054
## F-statistic: 1.313e+06 on 3 and 411800 DF, p-value: < 2.2e-16
AIC(lm_model1, poly_model, poly_model3)
## df AIC
## lm_model1 3 10948435
## poly_model 4 10940527
## poly_model3 5 10902291
summary(lm_model1)$r.squared
## [1] 0.8941605
summary(poly_model)$r.squared
## [1] 0.8961742
summary(poly_model3)$r.squared
## [1] 0.9053809
log_data <- covid_clean %>%
filter(total_cases > 0, total_deaths > 0) %>%
drop_na(total_cases, total_deaths)
# Create Log Variables
log_data <- log_data %>%
mutate(
log_cases = log(total_cases),
log_deaths = log(total_deaths))
# Build Log Regression Model
log_model <- lm(log_deaths ~ log_cases, data = log_data)
summary(log_model)
##
## Call:
## lm(formula = log_deaths ~ log_cases, data = log_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4241 -0.5429 0.1320 0.7054 11.2170
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.1990672 0.0072302 -580.8 <2e-16 ***
## log_cases 0.9739806 0.0005935 1641.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.102 on 359420 degrees of freedom
## Multiple R-squared: 0.8822, Adjusted R-squared: 0.8822
## F-statistic: 2.693e+06 on 1 and 359420 DF, p-value: < 2.2e-16
#Plot Log Regression
ggplot(log_data, aes(log_cases, log_deaths)) +
geom_point(color="darkblue", alpha=0.4) +
geom_smooth(method="lm", color="red", size=1) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
log_data <- covid_clean %>%
filter(total_cases > 0, total_deaths > 0)
log_model <- lm(log(total_deaths) ~ log(total_cases), data = log_data)
summary(log_model)
##
## Call:
## lm(formula = log(total_deaths) ~ log(total_cases), data = log_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4241 -0.5429 0.1320 0.7054 11.2170
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.1990672 0.0072302 -580.8 <2e-16 ***
## log(total_cases) 0.9739806 0.0005935 1641.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.102 on 359420 degrees of freedom
## Multiple R-squared: 0.8822, Adjusted R-squared: 0.8822
## F-statistic: 2.693e+06 on 1 and 359420 DF, p-value: < 2.2e-16
#Correlated Log Regression
ggplot(log_data, aes(log(total_cases), log(total_deaths))) +
geom_point(alpha=0.3) +
geom_smooth(method="lm", color="purple") +
theme_minimal()
vacc_model <- lm(total_cases ~ total_vaccinations, data = covid_clean)
summary(vacc_model)
##
## Call:
## lm(formula = total_cases ~ total_vaccinations, data = covid_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -216248590 -3114841 -2854194 -876755 187699875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.115e+06 1.347e+05 23.13 <2e-16 ***
## total_vaccinations 4.438e-02 7.224e-05 614.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35450000 on 73540 degrees of freedom
## (338262 observations deleted due to missingness)
## Multiple R-squared: 0.8369, Adjusted R-squared: 0.8369
## F-statistic: 3.775e+05 on 1 and 73540 DF, p-value: < 2.2e-16
ggplot(covid_clean, aes(total_vaccinations, total_cases)) +
geom_point(alpha=0.3, color="orange") +
geom_smooth(method="lm") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 338262 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 338262 rows containing missing values or values outside the scale range
## (`geom_point()`).
The analysis shows increasing trends in COVID-19 cases and deaths, strong relationships between variables, and variation across countries and continents.