Install Packages

Install the required libraries

library(broom) # convert analysis objects from R into tidy tibbles
library(tidyverse) # tidy data
library(psych) # EDA
library(countrycode) # get the continents
library(lubridate) # adjust the date variable
library(usmap) # find out the US states
library(readxl) # read MS excel file
library(corrplot) #for visualization of correlation
library(leaps) # regsubsets
library(car) # ncvTest
library(corrplot) # multicollinearity plot
library(performance) # multicollinearity table

Web Scraping COVID-19 Data

The data source is from Johns Hopkins University. Here is the link

url_in <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"

Data Scraping

+ time_series_covid19_confirmed_global.csv
+ time_series_covid19_deaths_global.csv
+ time_series_covid19_confirmed_US.csv
+ time_series_covid19_deaths_US.csv

df <- tibble(file_names = c("time_series_covid19_confirmed_global.csv",
                            "time_series_covid19_deaths_global.csv",
                            "time_series_covid19_confirmed_US.csv",
                            "time_series_covid19_deaths_US.csv")) -> df

Data Mapping

Using Regex for mapping

df %>%
  mutate(url = str_c(url_in, file_names, sep = "")) -> df

df %>%
  mutate(data = map(url, ~read_csv(., na = ""))) -> df

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   iso2 = col_character(),
##   iso3 = col_character(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Combined_Key = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   iso2 = col_character(),
##   iso3 = col_character(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Combined_Key = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

df %>%
  mutate(case_types = as.factor(str_extract(file_names, "[:alpha:]*_[gU][:alpha:]*"))) -> 
  df
# alpha = Any letter, [A-Za-z]
# reference: https://www.petefreitag.com/cheatsheets/regex/character-classes/

df %>%
  dplyr::select(case_types, data) -> df

Clean Data

df %>%
  mutate(vars = map(df$data, names)) -> df
# map(df$vars, ~unlist(.)[1:15]) for checking

fix_names <- function(df, pattern, rePattern){
  stopifnot(is.data.frame(df), is.character(pattern), is.character(rePattern))
  names(df) <- str_replace_all(names(df), pattern, rePattern)
  return(df)
}

df %>%
  mutate(data = map(data, ~fix_names(., "([ey])/", "\\1_")),
         data = map(data, ~fix_names(., "Admin2", "County")),
         data = map(data, ~fix_names(., "Long_", "Long")),
         data = map_if(data, str_detect(df$case_types, "US"),
                   ~dplyr::select(., -c("UID", "iso2", "iso3", 
                                 "code3", "FIPS", "Combined_Key"))),
         data = map_if(data, str_detect(df$case_types, "global"),
                      ~mutate(., County = "NA")),
         data = map_if(data, !str_detect(df$case_types, "deaths_US"),
                      ~mutate(., Population = 0)),
         data = map(data, ~unite(., "Country_State", 
                                 c("Country_Region", "Province_State"),
                                 remove = FALSE, na.rm = TRUE,
                                 sep = "_"))
         ) -> df

df %>%
  mutate(vars = map(df$data, names)) -> df # synchronize the vars correspondingly
# map(df$vars, ~unlist(.)) # for checking

df %>%
  mutate(data = map(data, ~pivot_longer(data = ., cols = contains("/"),
                                        names_to = "Date",
                                        values_to = "dailyValues",
                                        names_transform = list(Date = mdy)))
         ) -> df
# df$data <- map(df$data, names) # synchronize the vars correspondingly
# map(df$vars, ~unlist(.)) # for checking 

# crate a function to fix in type of Date
mdyDate <- function(df, varsDate){
  # stopifnot(is.data.frame(df), is.character(varsDate))
  df[[varsDate]] <- ymd(df[[varsDate]])
  return(df)
}

df %>%
  mutate(data = map(data, ~mdyDate(., "Date"))) -> df_long

# str(df_long) # check the data set

Add Continents and fix NAs

df_long %>%
  mutate(data = map(data, ~mutate(., Continent = countrycode(Country_Region,
                                               origin = "country.name",
                                               destination = "continent")))
         ) -> df_long

df_long %>%
  mutate(data = map(data, ~mutate(., Continent = case_when(
                                               Country_Region == "Diamond Princess" ~ "Asia",
                                               Country_Region == "Kosovo" ~ "Americas",
                                               Country_Region == "MS Zaandam" ~ "Europe",
                                               TRUE ~ Continent)
                                  ))) -> df_long

map(df_long$data, ~unique(.$Continent))

## [[1]]
## [1] "Asia"     "Europe"   "Africa"   "Americas" "Oceania"  NA        
## 
## [[2]]
## [1] "Asia"     "Europe"   "Africa"   "Americas" "Oceania"  NA        
## 
## [[3]]
## [1] "Americas"
## 
## [[4]]
## [1] "Americas"

Unnest the Data Frames

# 1
df_long %>%
  unnest(cols = data) %>%
  ungroup() -> df_all

# 2
remove(df, df_long)

# 3
df_all %>%
  dplyr::select(-vars) -> df_all

Get World Population Data

source: UN source

# 1
df_pop <- read_csv("../data/WPP2019_TotalPopulation.csv")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   LocID = col_double(),
##   Location = col_character(),
##   PopTotal = col_double(),
##   PopDensity = col_double()
## )

# summarize(df_pop, across(everything(), ~sum(is.na(.)))) # check NAs

# 2 
semi_join(df_pop, df_all, by = c("Location" = "Country_Region")) -> df_pop

# 3
df_pop %>% 
  mutate(rank_p = rank(-PopTotal, na.last = TRUE),
         rank_d = rank(-PopDensity, na.last = TRUE),
         PopTotal = (PopTotal*1000)) -> df_pop

df_all %>%
  inner_join(df_pop, by = c("Country_Region" = "Location")) -> df_all

Tidy Data

Because COVID-19 data is a time series data, we only focus on 2020/01/22 - 2021/01/22 for our experiment.

#df_all %>% 
  #filter(case_types == "confirmed_US") %>%
  #select(Date, Province_State, County, dailyValues) %>%
  #tail()

# extract one year
df_all %>%
  filter(case_types == "confirmed_US" & as_date(Date) <= as_date("2021-01-22") | case_types == "deaths_US" & as_date(Date) <= as_date("2021-01-22")) -> covid

names(covid)

##  [1] "case_types"     "Country_State"  "Province_State" "Country_Region"
##  [5] "Lat"            "Long"           "County"         "Population"    
##  [9] "Date"           "dailyValues"    "Continent"      "LocID"         
## [13] "PopTotal"       "PopDensity"     "rank_p"         "rank_d"

Find out each US state using usmap

state_map <- us_map(regions = "states")
state_map %>%
  distinct(full) %>%
  rename("Province_State" = "full") -> USstates

Obtain the number of confirmed cases for each state

covid %>%
  filter(case_types == "confirmed_US" & as_date(Date) == as_date("2021-01-22")) %>%
  dplyr::select(Province_State, County, dailyValues) %>%
  group_by(Province_State) %>%
  tally(dailyValues) %>%
  right_join(USstates) %>%
  rename("confirmed" = "n") -> confirmed

## Joining, by = "Province_State"

Obtain the number of death cases for each state

covid %>%
  filter(case_types == "deaths_US" & as_date(Date) == as_date("2021-01-22")) %>%
   dplyr::select(Province_State, County, dailyValues) %>%
  group_by(Province_State) %>%
  tally(dailyValues) %>%
  right_join(USstates) %>%
  rename("deaths" = "n") -> deathes

## Joining, by = "Province_State"

full_join(confirmed, deathes) -> covid

## Joining, by = "Province_State"

Read 2019 American community survey estimate by race by state

Source: https://www.governing.com/now/State-Population-By-Race-Ethnicity-Data.html

race <- read_csv("../data/2019_state_community_by_race.csv")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   State = col_character(),
##   `American Indian and Alaska Native alone` = col_double(),
##   `Asian alone` = col_double(),
##   `Black or African American alone` = col_double(),
##   `Native Hawaiian and Other Pacific Islander alone` = col_double(),
##   `Some other race alone` = col_double(),
##   `Total Population` = col_double(),
##   `Two or more races` = col_double(),
##   `White alone` = col_double()
## )

race %>% 
  rename(Province_State = State) -> race

covid %>% 
  left_join(race, by = "Province_State") %>%
  rename(American_Indian_and_Alaska_Native_alone = "American Indian and Alaska Native alone",
         Asian_alone = "Asian alone",
         Black_or_African_American_alone = "Black or African American alone",
         Native_Hawaiian_and_Other_Pacific_Islander_alone = "Native Hawaiian and Other Pacific Islander alone",
         Some_other_race_alone = "Some other race alone",
         Total_Population = "Total Population",
         Two_or_more_races = "Two or more races",
         White_alone = "White alone") -> covid

#race %>% 
  #anti_join(covidForRegression, by = "Province_State") #-->Check if there are some diff value of state

# The 2020 American Community Survey (ACS) 1-year estimates will be released on September 23, 2021.
# Since the 2020 survey does not yet release, we use the 2019 survey here

Read 2021 Household income

This data is tided fromWorld Population Review, and the original, and source is from US Census

householdIncome2021 <- read_csv("../data/MedianHouseholdIncome2021.csv")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   State = col_character(),
##   HouseholdIncome = col_double()
## )

householdIncome2021

householdIncome2021  %>% 
  rename(Province_State = State) -> householdIncome2021

covid %>% 
  left_join(householdIncome2021, by = "Province_State") -> covid

# Recheck
#covid %>%
  #dplyr::select(Province_State, HouseholdIncome) %>%
   #dplyr::arrange(desc(HouseholdIncome))

Exploratory Data Analysis

Data Analysis and Visualization

str(covid)

## tibble [51 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Province_State                                  : chr [1:51] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ confirmed                                       : num [1:51] 436087 52926 708041 281382 3147207 ...
##  $ deaths                                          : num [1:51] 6486 254 12001 4549 36615 ...
##  $ American_Indian_and_Alaska_Native_alone         : num [1:51] 23265 115544 332273 17216 321112 ...
##  $ Asian_alone                                     : num [1:51] 66129 43678 241721 46078 5865435 ...
##  $ Black_or_African_American_alone                 : num [1:51] 1319551 22551 343729 467468 2282144 ...
##  $ Native_Hawaiian_and_Other_Pacific_Islander_alone: num [1:51] 1892 9923 14168 12829 155871 ...
##  $ Some_other_race_alone                           : num [1:51] 74451 12602 364442 75590 5424558 ...
##  $ Total_Population                                : num [1:51] 4903185 731545 7278717 3017804 39512223 ...
##  $ Two_or_more_races                               : num [1:51] 91522 57476 280574 83603 1978145 ...
##  $ White_alone                                     : num [1:51] 3326375 469771 5701810 2315020 23484958 ...
##  $ HouseholdIncome                                 : num [1:51] 50536 77640 58945 47597 75235 ...

covid %>% head()

Top 5 Confirmed Covid-19 Cases (Jan. 22, 2020 to Jan. 22, 2021)

covid %>%
   dplyr::select(1, 2) %>%
  arrange(desc(confirmed)) %>%
  head(5)

Top 5 Death Cases (Jan. 22, 2020 to Jan. 22, 2021)

covid %>%
   dplyr::select(1, 3) %>%
  arrange(desc(deaths)) %>%
  head(5)

Total Population vs Top 5 Median Income

covid %>%
  dplyr::arrange(desc(HouseholdIncome)) %>%
  head(5) %>%
  ggplot(aes(x = Total_Population, y = HouseholdIncome, color = Province_State)) +
  geom_point() +
  theme_bw() +
  labs(x = "Total Population", y = "2021 Median Income", 
       title = "Total Population vs Top 5 Median Income")

In order to run the linear regression model smoothly, we deleted the Province Statevariable from our covid data and saved it as covidForRegression dataframe. The reason of removing these two is :

The Province State variable has 51 different categorical types, and each observation has its own type, making it difficult to run a regression model.
We refer to the existing paper1 & 2 and select some similar independent variables as predictors.

covid %>%
  dplyr::select(-Province_State) -> covidForRegression
names(covidForRegression)

##  [1] "confirmed"                                       
##  [2] "deaths"                                          
##  [3] "American_Indian_and_Alaska_Native_alone"         
##  [4] "Asian_alone"                                     
##  [5] "Black_or_African_American_alone"                 
##  [6] "Native_Hawaiian_and_Other_Pacific_Islander_alone"
##  [7] "Some_other_race_alone"                           
##  [8] "Total_Population"                                
##  [9] "Two_or_more_races"                               
## [10] "White_alone"                                     
## [11] "HouseholdIncome"

Descriptive Statistics

describe(covidForRegression)

Linear Model Assumptions

Multicollinearity

Plot

correlation <- cor(covidForRegression) 
corrplot(correlation, method="color", addCoef.col = "black", number.cex = 0.5, type = "lower")

Table

Check the inversely related values of Tolerance and VIF. Tolerance has to be > 0.10 and VIF < 10. If these stipulated are not fulfilled, multicollinearity is at hand.

reg <- lm(confirmed ~ ., data = covidForRegression)
check_collinearity(reg)

## Warning: Model matrix is rank deficient. VIFs may not be sensible.

Remediation - Removing Highly Correlated Predictors

Remove highly correlated predictors from the model. If you have two or more factors with a high VIF, remove one from the model.
According to the result, we firstly remove the high correlation variables: Some_other_race_alone, Total_Population, and Two_or_more_races, and keep the majority of race variables.
After we removed, now the independent variables have very low correlation with each other.

covidForRegression %>%
  dplyr::select(-Total_Population, -Two_or_more_races, -Some_other_race_alone) -> covidForRegression2
names(covidForRegression2)

## [1] "confirmed"                                       
## [2] "deaths"                                          
## [3] "American_Indian_and_Alaska_Native_alone"         
## [4] "Asian_alone"                                     
## [5] "Black_or_African_American_alone"                 
## [6] "Native_Hawaiian_and_Other_Pacific_Islander_alone"
## [7] "White_alone"                                     
## [8] "HouseholdIncome"

reg <- lm(confirmed ~ ., data = covidForRegression2)
check_collinearity(reg)

Independence

We would need to know more from the data providers to really assess this. We will assume it holds.

Linearity

The lack of fit F test works only with simple linear regression so we see the residual plots. As for the residuals versus fitted plot below, there may be no pattern indicating non-linearity in the data, but we attempt to remove some potential outliers.

par(mfrow = c(2,2))
# summary(reg)
plot(reg)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Remediation - Removing Influential Outliers

Get other diagnostics measures

Take away the largest values of cooks and leverage. That is, observations 5 (California), is no longer present.

leverage <- hatvalues(reg)
student <- rstudent(reg)
dfs <- dffits(reg)
cooksd <- cooks.distance(reg)
data.frame(confirmed = covidForRegression2$confirmed, fitted = reg$fitted,
           residual = reg$residual, leverage, student, dffits = dfs, cooksd) -> diag

par(mfrow=c(2,2))
plot(leverage,type='h')
abline(h=0)
plot(student,type='h')
abline(h=0)
plot(dfs,type='h',ylab='dffit')
abline(h=0)
plot(cooksd,type='h')
abline(h=0)

diag %>%
  arrange(desc(leverage)) %>%
  head(3)

diag %>%
  arrange(desc(cooksd)) %>%
  head(3)

Look at the Residual vs Fitted plot; linearity has occurred.

reg1 <- lm(confirmed ~ ., data = covidForRegression2[-5,])
par(mfrow=c(2,2))
plot(reg1)

Homoscedasticity

With a small p-value, we have evidence that the variances are non-constant

# test homoscedasticity
ncvTest(reg1)

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 18.35783, Df = 1, p = 1.8307e-05

Remediation - Square Root of Y

With a high p-value of 0.84229, there is no evidence of non-constant variance.

covidForRegression2 %>%
  mutate(confirmed = sqrt(confirmed)) -> covidForRegressionSQRT

reg2 <- lm(confirmed ~ ., data = covidForRegressionSQRT[-5,])
ncvTest(reg2)

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.03886796, Df = 1, p = 0.84371

Normality

The p-value of the Shapiro-Wilk Test 0.1822 is greater than \(\alpha\) 0.05 so the data is follow a normal distribution.

# test normality
shapiro.test(rstudent(reg2))

## 
##  Shapiro-Wilk normality test
## 
## data:  rstudent(reg2)
## W = 0.96754, p-value = 0.1834

Linear Model Selection

Our final mission is to select the fewest predictors and the determine by the lowest mean squared error in the linear model.

Adding all variables as full model first. We can see that only three variables in the linear model are significant at the beginning, which is deaths, Black_or_African_American_alone, and White_alone.

summary(reg2)

## 
## Call:
## lm(formula = confirmed ~ ., data = covidForRegressionSQRT[-5, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -222.17  -76.67   10.27   68.87  164.56 
## 
## Coefficients:
##                                                    Estimate Std. Error t value
## (Intercept)                                       3.624e+02  1.103e+02   3.286
## deaths                                            1.191e-02  5.482e-03   2.173
## American_Indian_and_Alaska_Native_alone           3.500e-04  2.184e-04   1.603
## Asian_alone                                      -1.804e-04  1.218e-04  -1.482
## Black_or_African_American_alone                   5.843e-05  2.727e-05   2.143
## Native_Hawaiian_and_Other_Pacific_Islander_alone  1.018e-05  7.940e-04   0.013
## White_alone                                       4.287e-05  8.550e-06   5.014
## HouseholdIncome                                  -1.027e-03  1.640e-03  -0.626
##                                                  Pr(>|t|)    
## (Intercept)                                       0.00206 ** 
## deaths                                            0.03546 *  
## American_Indian_and_Alaska_Native_alone           0.11652    
## Asian_alone                                       0.14590    
## Black_or_African_American_alone                   0.03797 *  
## Native_Hawaiian_and_Other_Pacific_Islander_alone  0.98983    
## White_alone                                      1.02e-05 ***
## HouseholdIncome                                   0.53449    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 98.64 on 42 degrees of freedom
## Multiple R-squared:  0.9041, Adjusted R-squared:  0.8882 
## F-statistic: 56.58 on 7 and 42 DF,  p-value: < 2.2e-16

Exhaustive Search

Using algorithm to select the best model exhaustively. Also, to use all X-variables available, change the nvmax option. Because I am too lazy to count variables, I entered a much larger number, such as my favorite number 69 to do it.

reg_fitExhaustive <- regsubsets(confirmed ~ ., data = covidForRegressionSQRT[-5,], nvmax = 69)
summary(reg_fitExhaustive)

## Subset selection object
## Call: regsubsets.formula(confirmed ~ ., data = covidForRegressionSQRT[-5, 
##     ], nvmax = 69)
## 7 Variables  (and intercept)
##                                                  Forced in Forced out
## deaths                                               FALSE      FALSE
## American_Indian_and_Alaska_Native_alone              FALSE      FALSE
## Asian_alone                                          FALSE      FALSE
## Black_or_African_American_alone                      FALSE      FALSE
## Native_Hawaiian_and_Other_Pacific_Islander_alone     FALSE      FALSE
## White_alone                                          FALSE      FALSE
## HouseholdIncome                                      FALSE      FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: exhaustive
##          deaths American_Indian_and_Alaska_Native_alone Asian_alone
## 1  ( 1 ) " "    " "                                     " "        
## 2  ( 1 ) " "    " "                                     " "        
## 3  ( 1 ) "*"    " "                                     "*"        
## 4  ( 1 ) "*"    " "                                     "*"        
## 5  ( 1 ) "*"    "*"                                     "*"        
## 6  ( 1 ) "*"    "*"                                     "*"        
## 7  ( 1 ) "*"    "*"                                     "*"        
##          Black_or_African_American_alone
## 1  ( 1 ) " "                            
## 2  ( 1 ) "*"                            
## 3  ( 1 ) " "                            
## 4  ( 1 ) "*"                            
## 5  ( 1 ) "*"                            
## 6  ( 1 ) "*"                            
## 7  ( 1 ) "*"                            
##          Native_Hawaiian_and_Other_Pacific_Islander_alone White_alone
## 1  ( 1 ) " "                                              "*"        
## 2  ( 1 ) " "                                              "*"        
## 3  ( 1 ) " "                                              "*"        
## 4  ( 1 ) " "                                              "*"        
## 5  ( 1 ) " "                                              "*"        
## 6  ( 1 ) " "                                              "*"        
## 7  ( 1 ) "*"                                              "*"        
##          HouseholdIncome
## 1  ( 1 ) " "            
## 2  ( 1 ) " "            
## 3  ( 1 ) " "            
## 4  ( 1 ) " "            
## 5  ( 1 ) " "            
## 6  ( 1 ) "*"            
## 7  ( 1 ) "*"

Select the variables and choose the optimal model

summary(reg_fitExhaustive)$adjr2

## [1] 0.8574573 0.8711181 0.8792228 0.8866016 0.8922248 0.8907517 0.8881510

summary(reg_fitExhaustive)$cp

## [1] 15.172193 10.157383  7.671876  5.623361  4.397402  6.000164  8.000000

summary(reg_fitExhaustive)$bic

## [1] -90.61260 -92.79050 -93.20125 -93.54017 -93.29479 -89.85345 -85.94162

which.max(summary(reg_fitExhaustive)$adjr2)

## [1] 5

which.min(summary(reg_fitExhaustive)$cp)

## [1] 5

which.min(summary(reg_fitExhaustive)$bic)

## [1] 4

Sequential Search

We can also choose the best model by means of a stepwise procedure, starting with one model and ending with another

Forward Method

Forward addition can be used to perform variable selection.

reg_fitForward <- regsubsets(confirmed ~ ., 
                             data = covidForRegressionSQRT[-c(5),], 
                             method = "forward")
summary(reg_fitForward)

## Subset selection object
## Call: regsubsets.formula(confirmed ~ ., data = covidForRegressionSQRT[-c(5), 
##     ], method = "forward")
## 7 Variables  (and intercept)
##                                                  Forced in Forced out
## deaths                                               FALSE      FALSE
## American_Indian_and_Alaska_Native_alone              FALSE      FALSE
## Asian_alone                                          FALSE      FALSE
## Black_or_African_American_alone                      FALSE      FALSE
## Native_Hawaiian_and_Other_Pacific_Islander_alone     FALSE      FALSE
## White_alone                                          FALSE      FALSE
## HouseholdIncome                                      FALSE      FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: forward
##          deaths American_Indian_and_Alaska_Native_alone Asian_alone
## 1  ( 1 ) " "    " "                                     " "        
## 2  ( 1 ) " "    " "                                     " "        
## 3  ( 1 ) " "    " "                                     " "        
## 4  ( 1 ) "*"    " "                                     " "        
## 5  ( 1 ) "*"    " "                                     "*"        
## 6  ( 1 ) "*"    "*"                                     "*"        
## 7  ( 1 ) "*"    "*"                                     "*"        
##          Black_or_African_American_alone
## 1  ( 1 ) " "                            
## 2  ( 1 ) "*"                            
## 3  ( 1 ) "*"                            
## 4  ( 1 ) "*"                            
## 5  ( 1 ) "*"                            
## 6  ( 1 ) "*"                            
## 7  ( 1 ) "*"                            
##          Native_Hawaiian_and_Other_Pacific_Islander_alone White_alone
## 1  ( 1 ) " "                                              "*"        
## 2  ( 1 ) " "                                              "*"        
## 3  ( 1 ) " "                                              "*"        
## 4  ( 1 ) " "                                              "*"        
## 5  ( 1 ) " "                                              "*"        
## 6  ( 1 ) " "                                              "*"        
## 7  ( 1 ) "*"                                              "*"        
##          HouseholdIncome
## 1  ( 1 ) " "            
## 2  ( 1 ) " "            
## 3  ( 1 ) "*"            
## 4  ( 1 ) "*"            
## 5  ( 1 ) "*"            
## 6  ( 1 ) "*"            
## 7  ( 1 ) "*"

Select the variables and choose the optimal model

summary(reg_fitForward)$adjr2

## [1] 0.8574573 0.8711181 0.8780890 0.8831546 0.8867013 0.8907517 0.8881510

summary(reg_fitForward)$cp

## [1] 15.172193 10.157383  8.138162  7.010188  6.570284  6.000164  8.000000

summary(reg_fitForward)$bic

## [1] -90.61260 -92.79050 -92.73407 -92.04295 -90.79578 -89.85345 -85.94162

which.max(summary(reg_fitForward)$adjr2)

## [1] 6

which.min(summary(reg_fitForward)$cp)

## [1] 6

which.min(summary(reg_fitForward)$bic)

## [1] 2

Backward Method

Backward elimination can be used to perform variable selection.

reg_fitBackward <- regsubsets(confirmed ~ ., 
                              data = covidForRegressionSQRT[-c(5),], method = "backward")
summary(reg_fitBackward)

## Subset selection object
## Call: regsubsets.formula(confirmed ~ ., data = covidForRegressionSQRT[-c(5), 
##     ], method = "backward")
## 7 Variables  (and intercept)
##                                                  Forced in Forced out
## deaths                                               FALSE      FALSE
## American_Indian_and_Alaska_Native_alone              FALSE      FALSE
## Asian_alone                                          FALSE      FALSE
## Black_or_African_American_alone                      FALSE      FALSE
## Native_Hawaiian_and_Other_Pacific_Islander_alone     FALSE      FALSE
## White_alone                                          FALSE      FALSE
## HouseholdIncome                                      FALSE      FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: backward
##          deaths American_Indian_and_Alaska_Native_alone Asian_alone
## 1  ( 1 ) " "    " "                                     " "        
## 2  ( 1 ) "*"    " "                                     " "        
## 3  ( 1 ) "*"    " "                                     "*"        
## 4  ( 1 ) "*"    " "                                     "*"        
## 5  ( 1 ) "*"    "*"                                     "*"        
## 6  ( 1 ) "*"    "*"                                     "*"        
## 7  ( 1 ) "*"    "*"                                     "*"        
##          Black_or_African_American_alone
## 1  ( 1 ) " "                            
## 2  ( 1 ) " "                            
## 3  ( 1 ) " "                            
## 4  ( 1 ) "*"                            
## 5  ( 1 ) "*"                            
## 6  ( 1 ) "*"                            
## 7  ( 1 ) "*"                            
##          Native_Hawaiian_and_Other_Pacific_Islander_alone White_alone
## 1  ( 1 ) " "                                              "*"        
## 2  ( 1 ) " "                                              "*"        
## 3  ( 1 ) " "                                              "*"        
## 4  ( 1 ) " "                                              "*"        
## 5  ( 1 ) " "                                              "*"        
## 6  ( 1 ) " "                                              "*"        
## 7  ( 1 ) "*"                                              "*"        
##          HouseholdIncome
## 1  ( 1 ) " "            
## 2  ( 1 ) " "            
## 3  ( 1 ) " "            
## 4  ( 1 ) " "            
## 5  ( 1 ) " "            
## 6  ( 1 ) "*"            
## 7  ( 1 ) "*"

Select the variables and choose the optimal model

summary(reg_fitBackward)$adjr2

## [1] 0.8574573 0.8651936 0.8792228 0.8866016 0.8922248 0.8907517 0.8881510

summary(reg_fitBackward)$cp

## [1] 15.172193 12.646898  7.671876  5.623361  4.397402  6.000164  8.000000

summary(reg_fitBackward)$bic

## [1] -90.61260 -90.54335 -93.20125 -93.54017 -93.29479 -89.85345 -85.94162

which.max(summary(reg_fitBackward)$adjr2)

## [1] 5

which.min(summary(reg_fitBackward)$cp)

## [1] 5

which.min(summary(reg_fitBackward)$bic)

## [1] 4

Stepwise Method

We use algorithm to considers either adding or removing variables at each step to final the best model. Lower AIC (Akaike information criterion) values indicate a better-fit model.

null = lm(confirmed ~ 1, data = covidForRegressionSQRT[-c(5),])
full = lm(confirmed ~ ., data = covidForRegressionSQRT[-c(5),]) 
step(null, scope = list(lower = null, upper = full), direction = "both")

## Start:  AIC=569.67
## confirmed ~ 1
## 
##                                                    Df Sum of Sq     RSS    AIC
## + White_alone                                       1   3667200  595170 473.23
## + deaths                                            1   3266316  996054 498.98
## + Black_or_African_American_alone                   1   2944490 1317880 512.98
## + Asian_alone                                       1   2115961 2146409 537.36
## + American_Indian_and_Alaska_Native_alone           1    265309 3997061 568.45
## <none>                                                          4262370 569.67
## + HouseholdIncome                                   1    121852 4140518 570.22
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1     19189 4243181 571.44
## 
## Step:  AIC=473.23
## confirmed ~ White_alone
## 
##                                                    Df Sum of Sq     RSS    AIC
## + Black_or_African_American_alone                   1     68250  526920 469.14
## + HouseholdIncome                                   1     46407  548764 471.17
## + deaths                                            1     44029  551142 471.39
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1     35741  559429 472.13
## <none>                                                           595170 473.23
## + American_Indian_and_Alaska_Native_alone           1     12741  582429 474.15
## + Asian_alone                                       1      6779  588392 474.66
## - White_alone                                       1   3667200 4262370 569.67
## 
## Step:  AIC=469.14
## confirmed ~ White_alone + Black_or_African_American_alone
## 
##                                                    Df Sum of Sq     RSS    AIC
## + HouseholdIncome                                   1     39105  487816 467.28
## + American_Indian_and_Alaska_Native_alone           1     28034  498887 468.41
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1     26719  500201 468.54
## <none>                                                           526920 469.14
## + deaths                                            1     17944  508976 469.41
## + Asian_alone                                       1     16544  510377 469.54
## - Black_or_African_American_alone                   1     68250  595170 473.23
## - White_alone                                       1    790959 1317880 512.98
## 
## Step:  AIC=467.28
## confirmed ~ White_alone + Black_or_African_American_alone + HouseholdIncome
## 
##                                                    Df Sum of Sq     RSS    AIC
## + deaths                                            1     30433  457382 466.06
## <none>                                                           487816 467.28
## + American_Indian_and_Alaska_Native_alone           1     17844  469972 467.42
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1     13595  474221 467.87
## - HouseholdIncome                                   1     39105  526920 469.14
## + Asian_alone                                       1       482  487334 469.23
## - Black_or_African_American_alone                   1     60948  548764 471.17
## - White_alone                                       1    796518 1284334 513.69
## 
## Step:  AIC=466.06
## confirmed ~ White_alone + Black_or_African_American_alone + HouseholdIncome + 
##     deaths
## 
##                                                    Df Sum of Sq    RSS    AIC
## + Asian_alone                                       1     23739 433643 465.40
## <none>                                                          457382 466.06
## + American_Indian_and_Alaska_Native_alone           1     17872 439510 466.07
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1      8133 449249 467.17
## - deaths                                            1     30433 487816 467.28
## - Black_or_African_American_alone                   1     30551 487933 467.30
## - HouseholdIncome                                   1     51594 508976 469.41
## - White_alone                                       1    274232 731614 487.55
## 
## Step:  AIC=465.4
## confirmed ~ White_alone + Black_or_African_American_alone + HouseholdIncome + 
##     deaths + Asian_alone
## 
##                                                    Df Sum of Sq    RSS    AIC
## + American_Indian_and_Alaska_Native_alone           1     25006 408638 464.43
## - HouseholdIncome                                   1     10246 443889 464.57
## <none>                                                          433643 465.40
## - Asian_alone                                       1     23739 457382 466.06
## - Black_or_African_American_alone                   1     33456 467099 467.11
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1        19 433625 467.40
## - deaths                                            1     53690 487334 469.23
## - White_alone                                       1    291149 724792 489.08
## 
## Step:  AIC=464.43
## confirmed ~ White_alone + Black_or_African_American_alone + HouseholdIncome + 
##     deaths + Asian_alone + American_Indian_and_Alaska_Native_alone
## 
##                                                    Df Sum of Sq    RSS    AIC
## - HouseholdIncome                                   1      3865 412502 462.90
## <none>                                                          408638 464.43
## - American_Indian_and_Alaska_Native_alone           1     25006 433643 465.40
## - Asian_alone                                       1     30873 439510 466.07
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1         2 408636 466.43
## - Black_or_African_American_alone                   1     44906 453543 467.64
## - deaths                                            1     59882 468520 469.27
## - White_alone                                       1    248959 657596 486.22
## 
## Step:  AIC=462.9
## confirmed ~ White_alone + Black_or_African_American_alone + deaths + 
##     Asian_alone + American_Indian_and_Alaska_Native_alone
## 
##                                                    Df Sum of Sq    RSS    AIC
## <none>                                                          412502 462.90
## + HouseholdIncome                                   1      3865 408638 464.43
## - American_Indian_and_Alaska_Native_alone           1     31387 443889 464.57
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1        50 412453 464.89
## - Black_or_African_American_alone                   1     51301 463803 466.76
## - Asian_alone                                       1     66809 479311 468.40
## - deaths                                            1     70052 482555 468.74
## - White_alone                                       1    260300 672802 485.36

## 
## Call:
## lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + 
##     deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, 
##     data = covidForRegressionSQRT[-c(5), ])
## 
## Coefficients:
##                             (Intercept)  
##                               2.954e+02  
##                             White_alone  
##                               4.353e-05  
##         Black_or_African_American_alone  
##                               6.143e-05  
##                                  deaths  
##                               1.254e-02  
##                             Asian_alone  
##                              -2.161e-04  
## American_Indian_and_Alaska_Native_alone  
##                               3.815e-04

Comparison

We will select the fewest variable for each set, compare their MSE, and finally select the one with the local minimum MSE.

From exhaustive search, the fewest predictors is 4 in smallest BIC (Bayesian Information Criterion).

regExhaustive <- lm(confirmed ~ deaths + Asian_alone + Black_or_African_American_alone + White_alone
                    , data = covidForRegressionSQRT[-c(5),])
summary(regExhaustive)

## 
## Call:
## lm(formula = confirmed ~ deaths + Asian_alone + Black_or_African_American_alone + 
##     White_alone, data = covidForRegressionSQRT[-c(5), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -232.97  -70.37   13.41   69.36  159.57 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.085e+02  2.043e+01  15.100  < 2e-16 ***
## deaths                           1.220e-02  4.701e-03   2.596   0.0127 *  
## Asian_alone                     -2.132e-04  8.301e-05  -2.569   0.0136 *  
## Black_or_African_American_alone  5.300e-05  2.652e-05   1.998   0.0517 .  
## White_alone                      4.698e-05  8.250e-06   5.695 8.85e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99.32 on 45 degrees of freedom
## Multiple R-squared:  0.8959, Adjusted R-squared:  0.8866 
## F-statistic: 96.78 on 4 and 45 DF,  p-value: < 2.2e-16

For forward method in sequential search, the fewest predictors is 2 in smallest BIC (Bayesian Information Criterion).

regForward <- lm(confirmed ~ Black_or_African_American_alone + White_alone,
                 data = covidForRegressionSQRT[-c(5),])
summary(regForward)

## 
## Call:
## lm(formula = confirmed ~ Black_or_African_American_alone + White_alone, 
##     data = covidForRegressionSQRT[-c(5), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -237.00  -63.71   10.56   80.07  204.06 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     3.087e+02  2.170e+01  14.228  < 2e-16 ***
## Black_or_African_American_alone 6.614e-05  2.681e-05   2.467   0.0173 *  
## White_alone                     5.366e-05  6.388e-06   8.400  6.5e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 105.9 on 47 degrees of freedom
## Multiple R-squared:  0.8764, Adjusted R-squared:  0.8711 
## F-statistic: 166.6 on 2 and 47 DF,  p-value: < 2.2e-16

For backward method in sequential search, the fewest predictors is 4 in smallest BIC (Bayesian Information Criterion).

regBackward <-  lm(confirmed ~ deaths + Asian_alone + Black_or_African_American_alone + White_alone
                    , data = covidForRegressionSQRT[-c(5),])
summary(regBackward)

## 
## Call:
## lm(formula = confirmed ~ deaths + Asian_alone + Black_or_African_American_alone + 
##     White_alone, data = covidForRegressionSQRT[-c(5), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -232.97  -70.37   13.41   69.36  159.57 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.085e+02  2.043e+01  15.100  < 2e-16 ***
## deaths                           1.220e-02  4.701e-03   2.596   0.0127 *  
## Asian_alone                     -2.132e-04  8.301e-05  -2.569   0.0136 *  
## Black_or_African_American_alone  5.300e-05  2.652e-05   1.998   0.0517 .  
## White_alone                      4.698e-05  8.250e-06   5.695 8.85e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99.32 on 45 degrees of freedom
## Multiple R-squared:  0.8959, Adjusted R-squared:  0.8866 
## F-statistic: 96.78 on 4 and 45 DF,  p-value: < 2.2e-16

For stepwise method, the lowest AIC value is 462.9, reporting the best model with 5 variables is: lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, data = covidForRegressionSQRT[-c(5), ]).

regStepwise <- lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + 
                    deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, 
                  data = covidForRegressionSQRT[-c(5), ])
summary(regStepwise)

## 
## Call:
## lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + 
##     deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, 
##     data = covidForRegressionSQRT[-c(5), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -219.073  -71.292    8.241   68.218  164.253 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              2.954e+02  2.116e+01  13.956  < 2e-16
## White_alone                              4.353e-05  8.261e-06   5.269 3.94e-06
## Black_or_African_American_alone          6.143e-05  2.626e-05   2.339  0.02393
## deaths                                   1.254e-02  4.586e-03   2.734  0.00899
## Asian_alone                             -2.161e-04  8.095e-05  -2.669  0.01060
## American_Indian_and_Alaska_Native_alone  3.815e-04  2.085e-04   1.830  0.07407
##                                            
## (Intercept)                             ***
## White_alone                             ***
## Black_or_African_American_alone         *  
## deaths                                  ** 
## Asian_alone                             *  
## American_Indian_and_Alaska_Native_alone .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 96.82 on 44 degrees of freedom
## Multiple R-squared:  0.9032, Adjusted R-squared:  0.8922 
## F-statistic: 82.13 on 5 and 44 DF,  p-value: < 2.2e-16

Let us start to find the minimum MSE

#calculate MSE
anova(regExhaustive) %>% tidy() # 9863.275

anova(regForward) %>% tidy() # 11211.07

anova(regBackward) %>% tidy() # 9863.275

anova(regStepwise) %>% tidy() # 9375.065

Final Selection

regStepwise has the smallest MSE value (9375.065) so we select it as the best model. Therefore, our final predictors of square root of COVID confirmed cases by US States is deaths, White_alone,Black_or_African_American_alone, Asian_alone and American_Indian_and_Alaska_Native_alone.

finalReg <- lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + 
    deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, 
    data = covidForRegressionSQRT[-c(5), ])
summary(finalReg)

## 
## Call:
## lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + 
##     deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, 
##     data = covidForRegressionSQRT[-c(5), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -219.073  -71.292    8.241   68.218  164.253 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              2.954e+02  2.116e+01  13.956  < 2e-16
## White_alone                              4.353e-05  8.261e-06   5.269 3.94e-06
## Black_or_African_American_alone          6.143e-05  2.626e-05   2.339  0.02393
## deaths                                   1.254e-02  4.586e-03   2.734  0.00899
## Asian_alone                             -2.161e-04  8.095e-05  -2.669  0.01060
## American_Indian_and_Alaska_Native_alone  3.815e-04  2.085e-04   1.830  0.07407
##                                            
## (Intercept)                             ***
## White_alone                             ***
## Black_or_African_American_alone         *  
## deaths                                  ** 
## Asian_alone                             *  
## American_Indian_and_Alaska_Native_alone .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 96.82 on 44 degrees of freedom
## Multiple R-squared:  0.9032, Adjusted R-squared:  0.8922 
## F-statistic: 82.13 on 5 and 44 DF,  p-value: < 2.2e-16

Model Equation

Interpretation

With the large p-value 0.07426 of \(b_5\), the prodictor American_Indian_and_Alaska_Native_alone is not in the significant level. In other words, we fail to reject the null (H0: \(\beta_5\) = 0 cannot be rejected) so we can conclude \(b_5\) is 0. Therefore, \(b_5\) can be dropped from the model.

Linear Equation

Thus, the expected value of square root of confirmed cases is: \[ \sqrt{\hat{Confirmed}} = 295.4 + 0.00004354White + 0.0000614Black + 0.01254Deaths - 0.0002163Asian \]

References

Sehra, S., Fundin, S., Lavery, C., & Baker, J. (2020). Differences in race and other state‐level characteristics and associations with mortality from COVID‐19 infection. Journal of Medical Virology, 92(11), 2406–2408. https://doi.org/10.1002/jmv.26095
Sa, Filipa G., Socioeconomic Determinants of Covid-19 Infections and Mortality: Evidence from England and Wales (May 2020). CEPR Discussion Paper No. DP14781, Available at SSRN: https://ssrn.com/abstract=3612850

Evaluating US Socioeconomic Factors of COVID Confirmed Cases

Keywords: Web Scrapping, Linear Regression, COVID-19

Yunting Chiu, Chiyun Liu, Ana Lim

2021-05-20

Install Packages

Web Scraping COVID-19 Data

Data Scraping

Data Mapping

Using Regex for mapping

Clean Data

Add Continents and fix NAs

Unnest the Data Frames

Get World Population Data

Tidy Data

Find out each US state using usmap

Obtain the number of confirmed cases for each state

Obtain the number of death cases for each state

Read 2019 American community survey estimate by race by state

Read 2021 Household income

Exploratory Data Analysis

Data Analysis and Visualization

Top 5 Confirmed Covid-19 Cases (Jan. 22, 2020 to Jan. 22, 2021)

Top 5 Death Cases (Jan. 22, 2020 to Jan. 22, 2021)

Total Population vs Top 5 Median Income

Descriptive Statistics

Linear Model Assumptions

Multicollinearity

Plot

Table

Remediation - Removing Highly Correlated Predictors

Independence

Linearity

Remediation - Removing Influential Outliers

Homoscedasticity

Remediation - Square Root of Y

Normality

Linear Model Selection

Exhaustive Search

Sequential Search

Forward Method

Backward Method

Stepwise Method

Comparison

Final Selection

Model Equation

Interpretation

Linear Equation

References