library(stringr)
library(dplyr)
library(covid19.analytics)
library(lubridate)
library(leaps)
library(zoo)
library(glmnet)
library(caTools)
library(olsrr)
library(pls)
set.seed(1)
options(max.print=999999)
dataset <- read.csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv", stringsAsFactors = T) %>%
select(-c(continent, location, weekly_icu_admissions, weekly_icu_admissions_per_million, weekly_hosp_admissions,
weekly_hosp_admissions_per_million, hosp_patients_per_million, total_boosters_per_hundred, continent,
location, weekly_icu_admissions, weekly_icu_admissions_per_million, weekly_hosp_admissions,
weekly_hosp_admissions_per_million, icu_patients_per_million, handwashing_facilities, total_deaths_per_million,
new_deaths_per_million, new_tests, excess_mortality_cumulative, total_vaccinations_per_hundred,
people_fully_vaccinated_per_hundred, new_vaccinations_smoothed_per_million, total_cases_per_million,
new_people_vaccinated_smoothed_per_hundred, excess_mortality_cumulative_absolute, new_cases_per_million,
excess_mortality_cumulative_per_million, excess_mortality_cumulative, excess_mortality, total_tests,
people_vaccinated_per_hundred, new_deaths_smoothed_per_million, new_cases_smoothed_per_million,
male_smokers, female_smokers, cardiovasc_death_rate, new_tests_smoothed, new_tests_smoothed_per_thousand,
new_people_vaccinated_smoothed, new_cases_smoothed, new_deaths_smoothed, new_vaccinations_smoothed)) %>%
filter(iso_code %in% c("USA", "IND", "BRA", "RUS", "GBR", "FRA", "TUR", "ITA", "COL", "DEU"))
dataset$iso_code <- factor(dataset$iso_code)
dataset[is.na(dataset)] = 0
dataset$date_year <- substr(dataset$date, 1, 4)
dataset$date_year <- as.factor(dataset$date_year)
dataset$date_month <- substr(dataset$date, 6, 7)
dataset$date_month <- as.factor(dataset$date_month)
dataset$date_day <- substr(dataset$date, 9, 10)
dataset$date_day <- as.factor(dataset$date_day)
dataset$total_cases_thousand <- dataset$total_cases / 1000
dataset$new_cases_thousand <- dataset$new_cases / 1000
dataset$new_deaths_thousand <- dataset$new_deaths / 1000
dataset$icu_patients_thousand <- dataset$icu_patients / 1000
dataset$hosp_patients_thousand <- dataset$hosp_patients / 1000
dataset$total_vaccinations_thousand <- dataset$total_vaccinations / 1000
dataset$people_vaccinated_thousand <- dataset$people_vaccinated / 1000
dataset$people_fully_vaccinated_thousand <- dataset$people_fully_vaccinated / 1000
dataset$total_boosters_thousand <- dataset$total_boosters / 1000
dataset$new_vaccinations_thousand <- dataset$new_vaccinations / 1000
dataset$population_thousand <- dataset$population / 1000
dataset$total_deaths_thousand <- dataset$total_deaths / 1000
dataset <- subset(dataset, select = -c(total_cases, new_cases, new_deaths, icu_patients, hosp_patients,
total_vaccinations, people_vaccinated, people_fully_vaccinated,
total_boosters, new_vaccinations, population, total_deaths, date))
dataset <- na.omit(dataset)
rownames(dataset) <- 1:nrow(dataset)
split <- sample.split(dataset, SplitRatio = 0.8)
train <- subset(dataset, split == T)
test <- subset(dataset, split == F)
model1 = glm(new_deaths_thousand ~ ., data=train)
results <- summary(model1)
# results
pvals <- data.frame(results$coefficients)
pvals <- filter(pvals, pvals$Pr...t.. < 0.05)
print(rownames(pvals))
## [1] "(Intercept)" "iso_codeCOL"
## [3] "iso_codeDEU" "iso_codeFRA"
## [5] "iso_codeGBR" "iso_codeIND"
## [7] "iso_codeITA" "iso_codeRUS"
## [9] "iso_codeTUR" "iso_codeUSA"
## [11] "reproduction_rate" "total_tests_per_thousand"
## [13] "positive_rate" "tests_per_case"
## [15] "tests_unitspeople tested" "tests_unitstests performed"
## [17] "stringency_index" "date_month03"
## [19] "date_month07" "date_month08"
## [21] "date_month09" "date_month10"
## [23] "date_month11" "date_month12"
## [25] "total_cases_thousand" "new_cases_thousand"
## [27] "icu_patients_thousand" "hosp_patients_thousand"
## [29] "total_vaccinations_thousand" "people_vaccinated_thousand"
## [31] "people_fully_vaccinated_thousand" "total_boosters_thousand"
## [33] "new_vaccinations_thousand" "total_deaths_thousand"
mlr_mse <- mean(model1$residuals^2)
mlr_predictions <- predict(model1, newdata = test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
head(mlr_predictions)
## 4 6 7 18 20 21
## 0.28451533 0.27212697 0.26488335 0.09767576 0.12510784 0.16675626
MLR MSE is shown as:
mlr_mse
## [1] 0.1076209
MLR shows significant variables are: new_tests_per_thousand, new_tests_smoothed_per_thousand, new_people_vaccinated_smoothed, new_cases_thousand, new_vaccinations_thousand, total_deaths_thousand, new_cases_smoothed_thousand, and new_deaths_smoothed_thousand.
x = model.matrix(new_deaths_thousand~., dataset)[,-1]
# model.matrix()[,-1] is for taking off the intercept
y = dataset$new_deaths_thousand # create the response vector
train = sample(6700, 5360) # 80% training
test = (-train) # 20% test
ridge.mod = glmnet(x[train,], y[train], alpha = 0) # fit ridge regression on training data
cv.out = cv.glmnet(x[train,], y[train], alpha = 0) # run cross validation to find the best lambda
plot(cv.out)
names(cv.out)
## [1] "lambda" "cvm" "cvsd" "cvup" "cvlo"
## [6] "nzero" "call" "name" "glmnet.fit" "lambda.min"
## [11] "lambda.1se" "index"
bestlam = cv.out$lambda.min
Best/Min. Lambda is shown as:
bestlam
## [1] 0.04972917
ridge.mod = glmnet(x[train,], y[train], alpha = 0,lambda = bestlam) # alpha = 0 indicates ridge regression method
ridge.pred = predict(ridge.mod, s = bestlam, newx = x[test,])
head(ridge.pred)
## s1
## 2 0.2625860
## 5 0.2311630
## 8 0.2604567
## 10 0.2450709
## 12 0.2696403
## 13 0.2843285
y.test = y[test] # response vector in the test data
ridge_mse <- mean((ridge.pred - y.test)^2)
Test MSE for the best lambda is shown as:
ridge_mse
## [1] 0.1250746
ridge.out = glmnet(x, y, alpha =0)
ridge_results <- predict(ridge.out, type = "coefficients", s = bestlam)#[1:20,]
# ridge_results
ridge_results <- as.matrix(ridge_results)
ridge_coef <- data.frame(ridge_results)
ridge_coef <- cbind(Coefficients = rownames(ridge_coef), ridge_coef)
rownames(ridge_coef) <- 1:nrow(ridge_coef)
ridge_coef <- filter(ridge_coef, ridge_coef$s1 < 1) # 1 is used since coefficients typically can not go to absolute zero
ridge_coef <- filter(ridge_coef, ridge_coef$s1 >= 0)
ridge_coef
## Coefficients s1
## 1 (Intercept) 9.339840e-01
## 2 iso_codeFRA 1.466479e-02
## 3 iso_codeUSA 6.983576e-02
## 4 positive_rate 5.912046e-01
## 5 tests_unitstests performed 1.166082e-01
## 6 tests_unitsunits unclear 0.000000e+00
## 7 stringency_index 3.532478e-03
## 8 date_year2021 4.233123e-02
## 9 date_month02 6.031554e-04
## 10 date_month04 2.286625e-02
## 11 date_month05 3.335466e-02
## 12 date_month06 6.871334e-03
## 13 date_month12 5.687551e-03
## 14 date_day02 7.442083e-03
## 15 date_day05 1.148983e-02
## 16 date_day09 2.089970e-02
## 17 date_day10 2.575950e-02
## 18 date_day12 2.068724e-02
## 19 date_day13 3.024748e-03
## 20 date_day15 1.549985e-02
## 21 date_day16 2.277378e-02
## 22 date_day18 4.715723e-04
## 23 date_day19 4.947867e-03
## 24 date_day20 1.545013e-02
## 25 date_day22 2.856118e-03
## 26 date_day23 9.162999e-03
## 27 date_day27 3.351535e-03
## 28 date_day29 1.517134e-02
## 29 date_day30 1.016048e-02
## 30 total_cases_thousand 5.429685e-06
## 31 new_cases_thousand 8.972875e-03
## 32 hosp_patients_thousand 1.373104e-03
## 33 people_vaccinated_thousand 7.531897e-08
## 34 new_vaccinations_thousand 3.790903e-05
## 35 population_thousand 1.551060e-08
## 36 total_deaths_thousand 4.179705e-04
Ridge results generates more estimators of the regression coefficients with reduced variability. Estimators that were reduced to zero are certain iso_codes and tests_units_unclear.
set.seed(1)
lasso.mod = glmnet(x[train,], y[train], alpha = 1) # alpha = 1 indicates lasso regression method
cv.out = cv.glmnet(x[train,], y[train], alpha = 1)
plot(cv.out)
bestlam = cv.out$lambda.min
Best/Min. Lambda is shown as:
bestlam
## [1] 0.0003508294
lasso.mod = glmnet(x[train,], y[train], alpha = 1,lambda = bestlam)
lasso.pred = predict(lasso.mod, s = bestlam, newx = x[test ,])
head(lasso.pred)
## s1
## 2 0.2962766
## 5 0.2711268
## 8 0.2966374
## 10 0.2845742
## 12 0.3027936
## 13 0.3225116
lasso_mse <- mean((lasso.pred - y.test)^2)
Test MSE for the best lambda is shown as:
lasso_mse
## [1] 0.1212268
out = glmnet(x, y, alpha = 1)
plot(out)
lasso.coef = predict(out, type ="coefficients", s = bestlam)#[1:20,]
# lasso.coef
lasso.coef <- as.matrix(lasso.coef)
lasso_coef <- data.frame(lasso.coef)
lasso_coef <- cbind(Coefficients = rownames(lasso_coef), lasso_coef)
rownames(lasso_coef) <- 1:nrow(lasso_coef)
lasso_coef <- filter(lasso_coef, lasso_coef$s1 == 0)
lasso_coef
## Coefficients s1
## 1 iso_codeDEU 0
## 2 iso_codeGBR 0
## 3 iso_codeIND 0
## 4 iso_codeUSA 0
## 5 tests_unitsunits unclear 0
## 6 median_age 0
## 7 aged_65_older 0
## 8 extreme_poverty 0
## 9 diabetes_prevalence 0
## 10 life_expectancy 0
## 11 human_development_index 0
## 12 date_day04 0
## 13 date_day08 0
## 14 date_day28 0
## 15 people_vaccinated_thousand 0
## 16 population_thousand 0
Estimators that were reduced to zero are: certain iso_codes, reproduction_rate, tests_per_case, tests_unitstests, stringency_index, population_density, median_age, aged_65_older, aged_70_older, gdp_per_capita, extreme_poverty, diabetes_prevalence, life_expectancy, human_development_index, date_year, date_month, date_day, total_cases_thousand, icu_patients_thousand, hosp_patients_thousand, people_vaccinated_thousand, population_thousand, new_people_vaccinated_smoothed_thousand, and new_vaccinations_smoothed_thousand.
mlr_mse
## [1] 0.1076209
ridge_mse
## [1] 0.1250746
lasso_mse
## [1] 0.1212268
MLR and Lasso are approximately even for MSE.
dataset <- subset(dataset, select = -c(iso_code, tests_units, date_year, date_month, date_day))
x=model.matrix(new_deaths_thousand~.,dataset)[,-1]
y=dataset$new_deaths_thousand
set.seed(1)
train=sample (1: nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]
pcr.fit=pcr(total_deaths_thousand~., data=dataset, scale=TRUE, validation ="CV")
summary (pcr.fit)
## Data: X dimension: 6730 27
## Y dimension: 6730 1
## Fit method: svdpc
## Number of components considered: 27
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 166.4 153.3 93.46 92.70 86.62 86.63 86.61
## adjCV 166.4 153.3 93.45 92.69 86.61 86.62 86.60
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 85.79 84.69 84.51 84.05 79.69 76.42 76.28
## adjCV 85.78 84.68 84.50 84.05 79.66 76.39 76.27
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## CV 71.76 69.78 67.74 47.77 46.73 41.06 40.32
## adjCV 71.74 69.77 67.74 47.76 46.74 41.05 40.30
## 21 comps 22 comps 23 comps 24 comps 25 comps 26 comps 27 comps
## CV 39.36 39.11 39.11 34.75 34.71 34.72 34.83
## adjCV 39.35 39.10 39.10 34.74 34.70 34.68 34.69
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## X 29.76 47.74 58.64 66.02 72.41 76.99
## total_deaths_thousand 15.18 68.53 69.06 72.99 73.00 73.02
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps
## X 80.66 84.07 87.27 89.72 91.88 93.83
## total_deaths_thousand 73.55 74.23 74.35 74.66 77.22 79.05
## 13 comps 14 comps 15 comps 16 comps 17 comps
## X 95.61 96.76 97.64 98.35 98.85
## total_deaths_thousand 79.12 81.52 82.54 83.57 91.81
## 18 comps 19 comps 20 comps 21 comps 22 comps
## X 99.24 99.59 99.74 99.87 99.98
## total_deaths_thousand 92.17 93.97 94.19 94.45 94.53
## 23 comps 24 comps 25 comps 26 comps 27 comps
## X 100.00 100.00 100.0 100.0 100.00
## total_deaths_thousand 94.53 95.67 95.7 95.7 95.71
validationplot(pcr.fit, val.type="MSEP")
set.seed(1)
pcr.fit=pcr(total_deaths_thousand~., data=dataset, subset=train, scale=TRUE, validation ="CV")
validationplot(pcr.fit, val.type="MSEP")
pcr.pred=predict (pcr.fit, x[test ,], ncomp =22)
mean((pcr.pred -y.test)^2)
## [1] 22219738577
pcr.fit=pcr(y~x, scale=TRUE, ncomp=22)
summary(pcr.fit)
## Data: X dimension: 6730 27
## Y dimension: 6730 1
## Fit method: svdpc
## Number of components considered: 22
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 30.094 49.71 59.49 66.95 73.24 77.77 81.36 84.78
## y 8.597 16.61 41.16 42.90 45.31 46.30 47.34 47.61
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 87.61 90.02 92.31 94.16 95.68 96.97 97.91
## y 54.20 55.62 56.28 57.47 68.21 68.41 68.65
## 16 comps 17 comps 18 comps 19 comps 20 comps 21 comps 22 comps
## X 98.70 99.12 99.49 99.66 99.79 99.90 99.98
## y 68.65 69.75 69.84 70.38 70.42 70.44 70.56