library(stringr)
library(dplyr)
library(covid19.analytics)
library(lubridate)
library(leaps)
library(zoo)
library(glmnet)
library(caTools)
library(olsrr)
library(pls)
set.seed(1)
options(max.print=999999)

Data Import/Unused Variable Removal/Filter of Top 10 Countries

dataset <- read.csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv", stringsAsFactors = T) %>%
  select(-c(continent, location, weekly_icu_admissions, weekly_icu_admissions_per_million, weekly_hosp_admissions,
            weekly_hosp_admissions_per_million, hosp_patients_per_million, total_boosters_per_hundred, continent,
            location, weekly_icu_admissions, weekly_icu_admissions_per_million, weekly_hosp_admissions,
            weekly_hosp_admissions_per_million, icu_patients_per_million, handwashing_facilities, total_deaths_per_million,
            new_deaths_per_million, new_tests, excess_mortality_cumulative, total_vaccinations_per_hundred,
            people_fully_vaccinated_per_hundred, new_vaccinations_smoothed_per_million, total_cases_per_million,
            new_people_vaccinated_smoothed_per_hundred, excess_mortality_cumulative_absolute, new_cases_per_million,
            excess_mortality_cumulative_per_million, excess_mortality_cumulative, excess_mortality, total_tests,
            people_vaccinated_per_hundred, new_deaths_smoothed_per_million, new_cases_smoothed_per_million,
            male_smokers, female_smokers, cardiovasc_death_rate, new_tests_smoothed, new_tests_smoothed_per_thousand,
            new_people_vaccinated_smoothed, new_cases_smoothed, new_deaths_smoothed, new_vaccinations_smoothed)) %>%
  filter(iso_code %in% c("USA", "IND", "BRA", "RUS", "GBR", "FRA", "TUR", "ITA", "COL", "DEU"))

Keep only the current factor levels

dataset$iso_code <- factor(dataset$iso_code)

Change NA’s to Zero

dataset[is.na(dataset)] = 0

Change Date to separate columns

dataset$date_year <- substr(dataset$date, 1, 4)
dataset$date_year <- as.factor(dataset$date_year)
dataset$date_month <- substr(dataset$date, 6, 7)
dataset$date_month <- as.factor(dataset$date_month)
dataset$date_day <- substr(dataset$date, 9, 10)
dataset$date_day <- as.factor(dataset$date_day)

Change Regular Variables to thousands

dataset$total_cases_thousand <- dataset$total_cases / 1000
dataset$new_cases_thousand <- dataset$new_cases / 1000
dataset$new_deaths_thousand <- dataset$new_deaths / 1000
dataset$icu_patients_thousand <- dataset$icu_patients / 1000
dataset$hosp_patients_thousand <- dataset$hosp_patients / 1000
dataset$total_vaccinations_thousand <- dataset$total_vaccinations / 1000
dataset$people_vaccinated_thousand <- dataset$people_vaccinated / 1000
dataset$people_fully_vaccinated_thousand <- dataset$people_fully_vaccinated / 1000
dataset$total_boosters_thousand <- dataset$total_boosters / 1000
dataset$new_vaccinations_thousand <- dataset$new_vaccinations / 1000
dataset$population_thousand <- dataset$population / 1000
dataset$total_deaths_thousand <- dataset$total_deaths / 1000

Removal of non-thousands

dataset <- subset(dataset, select = -c(total_cases, new_cases, new_deaths, icu_patients, hosp_patients,
                                       total_vaccinations, people_vaccinated, people_fully_vaccinated,
                                       total_boosters, new_vaccinations, population, total_deaths, date))

Remove NAs leftover and resetting ID column

dataset <- na.omit(dataset)
rownames(dataset) <- 1:nrow(dataset)

MLR Model Significant Variable View

split <- sample.split(dataset, SplitRatio = 0.8)
train <- subset(dataset, split == T)
test <- subset(dataset, split == F)
model1 = glm(new_deaths_thousand ~ ., data=train)
results <- summary(model1)
# results
pvals <- data.frame(results$coefficients)
pvals <- filter(pvals, pvals$Pr...t.. < 0.05)
print(rownames(pvals))
##  [1] "(Intercept)"                      "iso_codeCOL"                     
##  [3] "iso_codeDEU"                      "iso_codeFRA"                     
##  [5] "iso_codeGBR"                      "iso_codeIND"                     
##  [7] "iso_codeITA"                      "iso_codeRUS"                     
##  [9] "iso_codeTUR"                      "iso_codeUSA"                     
## [11] "reproduction_rate"                "total_tests_per_thousand"        
## [13] "positive_rate"                    "tests_per_case"                  
## [15] "tests_unitspeople tested"         "tests_unitstests performed"      
## [17] "stringency_index"                 "date_month03"                    
## [19] "date_month07"                     "date_month08"                    
## [21] "date_month09"                     "date_month10"                    
## [23] "date_month11"                     "date_month12"                    
## [25] "total_cases_thousand"             "new_cases_thousand"              
## [27] "icu_patients_thousand"            "hosp_patients_thousand"          
## [29] "total_vaccinations_thousand"      "people_vaccinated_thousand"      
## [31] "people_fully_vaccinated_thousand" "total_boosters_thousand"         
## [33] "new_vaccinations_thousand"        "total_deaths_thousand"
mlr_mse <- mean(model1$residuals^2)

MLR predictions of future values

mlr_predictions <- predict(model1, newdata = test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
head(mlr_predictions)
##          4          6          7         18         20         21 
## 0.28451533 0.27212697 0.26488335 0.09767576 0.12510784 0.16675626

MLR MSE is shown as:

mlr_mse
## [1] 0.1076209

MLR shows significant variables are: new_tests_per_thousand, new_tests_smoothed_per_thousand, new_people_vaccinated_smoothed, new_cases_thousand, new_vaccinations_thousand, total_deaths_thousand, new_cases_smoothed_thousand, and new_deaths_smoothed_thousand.

Ridge/Lasso Model

x = model.matrix(new_deaths_thousand~., dataset)[,-1]
# model.matrix()[,-1] is for taking off the intercept
y = dataset$new_deaths_thousand # create the response vector
train = sample(6700, 5360) # 80% training
test = (-train) # 20% test

Ridge Regression

ridge.mod = glmnet(x[train,], y[train], alpha = 0) # fit ridge regression on training data
cv.out = cv.glmnet(x[train,], y[train], alpha = 0) # run cross validation to find the best lambda
plot(cv.out)

names(cv.out)
##  [1] "lambda"     "cvm"        "cvsd"       "cvup"       "cvlo"      
##  [6] "nzero"      "call"       "name"       "glmnet.fit" "lambda.min"
## [11] "lambda.1se" "index"
bestlam = cv.out$lambda.min

Best/Min. Lambda is shown as:

bestlam
## [1] 0.04972917
ridge.mod = glmnet(x[train,], y[train], alpha = 0,lambda = bestlam) # alpha = 0 indicates ridge regression method
ridge.pred = predict(ridge.mod, s = bestlam, newx = x[test,])

Predictions for ridge regression model

head(ridge.pred)
##           s1
## 2  0.2625860
## 5  0.2311630
## 8  0.2604567
## 10 0.2450709
## 12 0.2696403
## 13 0.2843285
y.test = y[test] # response vector in the test data
ridge_mse <- mean((ridge.pred - y.test)^2)

Test MSE for the best lambda is shown as:

ridge_mse
## [1] 0.1250746
ridge.out = glmnet(x, y, alpha =0)
ridge_results <- predict(ridge.out, type = "coefficients", s = bestlam)#[1:20,]
# ridge_results
ridge_results <- as.matrix(ridge_results)
ridge_coef <- data.frame(ridge_results)
ridge_coef <- cbind(Coefficients = rownames(ridge_coef), ridge_coef)
rownames(ridge_coef) <- 1:nrow(ridge_coef)
ridge_coef <- filter(ridge_coef, ridge_coef$s1 < 1) # 1 is used since coefficients typically can not go to absolute zero
ridge_coef <- filter(ridge_coef, ridge_coef$s1 >= 0)
ridge_coef
##                  Coefficients           s1
## 1                 (Intercept) 9.339840e-01
## 2                 iso_codeFRA 1.466479e-02
## 3                 iso_codeUSA 6.983576e-02
## 4               positive_rate 5.912046e-01
## 5  tests_unitstests performed 1.166082e-01
## 6    tests_unitsunits unclear 0.000000e+00
## 7            stringency_index 3.532478e-03
## 8               date_year2021 4.233123e-02
## 9                date_month02 6.031554e-04
## 10               date_month04 2.286625e-02
## 11               date_month05 3.335466e-02
## 12               date_month06 6.871334e-03
## 13               date_month12 5.687551e-03
## 14                 date_day02 7.442083e-03
## 15                 date_day05 1.148983e-02
## 16                 date_day09 2.089970e-02
## 17                 date_day10 2.575950e-02
## 18                 date_day12 2.068724e-02
## 19                 date_day13 3.024748e-03
## 20                 date_day15 1.549985e-02
## 21                 date_day16 2.277378e-02
## 22                 date_day18 4.715723e-04
## 23                 date_day19 4.947867e-03
## 24                 date_day20 1.545013e-02
## 25                 date_day22 2.856118e-03
## 26                 date_day23 9.162999e-03
## 27                 date_day27 3.351535e-03
## 28                 date_day29 1.517134e-02
## 29                 date_day30 1.016048e-02
## 30       total_cases_thousand 5.429685e-06
## 31         new_cases_thousand 8.972875e-03
## 32     hosp_patients_thousand 1.373104e-03
## 33 people_vaccinated_thousand 7.531897e-08
## 34  new_vaccinations_thousand 3.790903e-05
## 35        population_thousand 1.551060e-08
## 36      total_deaths_thousand 4.179705e-04

Ridge results generates more estimators of the regression coefficients with reduced variability. Estimators that were reduced to zero are certain iso_codes and tests_units_unclear.

Lasso

set.seed(1)
lasso.mod = glmnet(x[train,], y[train], alpha = 1) # alpha = 1 indicates lasso regression method
cv.out = cv.glmnet(x[train,], y[train], alpha = 1)
plot(cv.out)

bestlam = cv.out$lambda.min

Best/Min. Lambda is shown as:

bestlam
## [1] 0.0003508294
lasso.mod = glmnet(x[train,], y[train], alpha = 1,lambda = bestlam)
lasso.pred = predict(lasso.mod, s = bestlam, newx = x[test ,])

Predictions for lasso regression model

head(lasso.pred)
##           s1
## 2  0.2962766
## 5  0.2711268
## 8  0.2966374
## 10 0.2845742
## 12 0.3027936
## 13 0.3225116
lasso_mse <- mean((lasso.pred - y.test)^2)

Test MSE for the best lambda is shown as:

lasso_mse
## [1] 0.1212268
out = glmnet(x, y, alpha = 1)
plot(out)

lasso.coef = predict(out, type ="coefficients", s = bestlam)#[1:20,]
# lasso.coef
lasso.coef <- as.matrix(lasso.coef)
lasso_coef <- data.frame(lasso.coef)
lasso_coef <- cbind(Coefficients = rownames(lasso_coef), lasso_coef)
rownames(lasso_coef) <- 1:nrow(lasso_coef)
lasso_coef <- filter(lasso_coef, lasso_coef$s1 == 0)
lasso_coef
##                  Coefficients s1
## 1                 iso_codeDEU  0
## 2                 iso_codeGBR  0
## 3                 iso_codeIND  0
## 4                 iso_codeUSA  0
## 5    tests_unitsunits unclear  0
## 6                  median_age  0
## 7               aged_65_older  0
## 8             extreme_poverty  0
## 9         diabetes_prevalence  0
## 10            life_expectancy  0
## 11    human_development_index  0
## 12                 date_day04  0
## 13                 date_day08  0
## 14                 date_day28  0
## 15 people_vaccinated_thousand  0
## 16        population_thousand  0

Estimators that were reduced to zero are: certain iso_codes, reproduction_rate, tests_per_case, tests_unitstests, stringency_index, population_density, median_age, aged_65_older, aged_70_older, gdp_per_capita, extreme_poverty, diabetes_prevalence, life_expectancy, human_development_index, date_year, date_month, date_day, total_cases_thousand, icu_patients_thousand, hosp_patients_thousand, people_vaccinated_thousand, population_thousand, new_people_vaccinated_smoothed_thousand, and new_vaccinations_smoothed_thousand.

Compare Ridge Results to MLR

mlr_mse
## [1] 0.1076209
ridge_mse
## [1] 0.1250746
lasso_mse
## [1] 0.1212268

MLR and Lasso are approximately even for MSE.

PCR Code

dataset <- subset(dataset, select = -c(iso_code, tests_units, date_year, date_month, date_day))
x=model.matrix(new_deaths_thousand~.,dataset)[,-1]
y=dataset$new_deaths_thousand

set.seed(1)
train=sample (1: nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]

pcr.fit=pcr(total_deaths_thousand~., data=dataset, scale=TRUE,  validation ="CV")
summary (pcr.fit)
## Data:    X dimension: 6730 27 
##  Y dimension: 6730 1
## Fit method: svdpc
## Number of components considered: 27
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           166.4    153.3    93.46    92.70    86.62    86.63    86.61
## adjCV        166.4    153.3    93.45    92.69    86.61    86.62    86.60
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       85.79    84.69    84.51     84.05     79.69     76.42     76.28
## adjCV    85.78    84.68    84.50     84.05     79.66     76.39     76.27
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps  20 comps
## CV        71.76     69.78     67.74     47.77     46.73     41.06     40.32
## adjCV     71.74     69.77     67.74     47.76     46.74     41.05     40.30
##        21 comps  22 comps  23 comps  24 comps  25 comps  26 comps  27 comps
## CV        39.36     39.11     39.11     34.75     34.71     34.72     34.83
## adjCV     39.35     39.10     39.10     34.74     34.70     34.68     34.69
## 
## TRAINING: % variance explained
##                        1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## X                        29.76    47.74    58.64    66.02    72.41    76.99
## total_deaths_thousand    15.18    68.53    69.06    72.99    73.00    73.02
##                        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps
## X                        80.66    84.07    87.27     89.72     91.88     93.83
## total_deaths_thousand    73.55    74.23    74.35     74.66     77.22     79.05
##                        13 comps  14 comps  15 comps  16 comps  17 comps
## X                         95.61     96.76     97.64     98.35     98.85
## total_deaths_thousand     79.12     81.52     82.54     83.57     91.81
##                        18 comps  19 comps  20 comps  21 comps  22 comps
## X                         99.24     99.59     99.74     99.87     99.98
## total_deaths_thousand     92.17     93.97     94.19     94.45     94.53
##                        23 comps  24 comps  25 comps  26 comps  27 comps
## X                        100.00    100.00     100.0     100.0    100.00
## total_deaths_thousand     94.53     95.67      95.7      95.7     95.71
validationplot(pcr.fit, val.type="MSEP")

set.seed(1)
pcr.fit=pcr(total_deaths_thousand~., data=dataset, subset=train, scale=TRUE,  validation ="CV")
validationplot(pcr.fit, val.type="MSEP")

pcr.pred=predict (pcr.fit, x[test ,], ncomp =22)
mean((pcr.pred -y.test)^2)
## [1] 22219738577
pcr.fit=pcr(y~x, scale=TRUE, ncomp=22)
summary(pcr.fit)
## Data:    X dimension: 6730 27 
##  Y dimension: 6730 1
## Fit method: svdpc
## Number of components considered: 22
## TRAINING: % variance explained
##    1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X   30.094    49.71    59.49    66.95    73.24    77.77    81.36    84.78
## y    8.597    16.61    41.16    42.90    45.31    46.30    47.34    47.61
##    9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X    87.61     90.02     92.31     94.16     95.68     96.97     97.91
## y    54.20     55.62     56.28     57.47     68.21     68.41     68.65
##    16 comps  17 comps  18 comps  19 comps  20 comps  21 comps  22 comps
## X     98.70     99.12     99.49     99.66     99.79     99.90     99.98
## y     68.65     69.75     69.84     70.38     70.42     70.44     70.56