Install Packages

Install the required libraries

Web Scraping COVID-19 Data

url_in <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"

Data Scraping

+ time_series_covid19_confirmed_global.csv
+ time_series_covid19_deaths_global.csv
+ time_series_covid19_confirmed_US.csv
+ time_series_covid19_deaths_US.csv
df <- tibble(file_names = c("time_series_covid19_confirmed_global.csv",
                            "time_series_covid19_deaths_global.csv",
                            "time_series_covid19_confirmed_US.csv",
                            "time_series_covid19_deaths_US.csv")) -> df

Data Mapping

Using Regex for mapping

df %>%
  mutate(url = str_c(url_in, file_names, sep = "")) -> df
df %>%
  mutate(data = map(url, ~read_csv(., na = ""))) -> df
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   `Province/State` = col_character(),
##   `Country/Region` = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   iso2 = col_character(),
##   iso3 = col_character(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Combined_Key = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   iso2 = col_character(),
##   iso3 = col_character(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Combined_Key = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
df %>%
  mutate(case_types = as.factor(str_extract(file_names, "[:alpha:]*_[gU][:alpha:]*"))) -> 
  df
# alpha = Any letter, [A-Za-z]
# reference: https://www.petefreitag.com/cheatsheets/regex/character-classes/
df %>%
  dplyr::select(case_types, data) -> df

Clean Data

df %>%
  mutate(vars = map(df$data, names)) -> df
# map(df$vars, ~unlist(.)[1:15]) for checking

fix_names <- function(df, pattern, rePattern){
  stopifnot(is.data.frame(df), is.character(pattern), is.character(rePattern))
  names(df) <- str_replace_all(names(df), pattern, rePattern)
  return(df)
}

df %>%
  mutate(data = map(data, ~fix_names(., "([ey])/", "\\1_")),
         data = map(data, ~fix_names(., "Admin2", "County")),
         data = map(data, ~fix_names(., "Long_", "Long")),
         data = map_if(data, str_detect(df$case_types, "US"),
                   ~dplyr::select(., -c("UID", "iso2", "iso3", 
                                 "code3", "FIPS", "Combined_Key"))),
         data = map_if(data, str_detect(df$case_types, "global"),
                      ~mutate(., County = "NA")),
         data = map_if(data, !str_detect(df$case_types, "deaths_US"),
                      ~mutate(., Population = 0)),
         data = map(data, ~unite(., "Country_State", 
                                 c("Country_Region", "Province_State"),
                                 remove = FALSE, na.rm = TRUE,
                                 sep = "_"))
         ) -> df

df %>%
  mutate(vars = map(df$data, names)) -> df # synchronize the vars correspondingly
# map(df$vars, ~unlist(.)) # for checking 
df %>%
  mutate(data = map(data, ~pivot_longer(data = ., cols = contains("/"),
                                        names_to = "Date",
                                        values_to = "dailyValues",
                                        names_transform = list(Date = mdy)))
         ) -> df
# df$data <- map(df$data, names) # synchronize the vars correspondingly
# map(df$vars, ~unlist(.)) # for checking 

# crate a function to fix in type of Date
mdyDate <- function(df, varsDate){
  # stopifnot(is.data.frame(df), is.character(varsDate))
  df[[varsDate]] <- ymd(df[[varsDate]])
  return(df)
}

df %>%
  mutate(data = map(data, ~mdyDate(., "Date"))) -> df_long

# str(df_long) # check the data set

Add Continents and fix NAs

df_long %>%
  mutate(data = map(data, ~mutate(., Continent = countrycode(Country_Region,
                                               origin = "country.name",
                                               destination = "continent")))
         ) -> df_long
df_long %>%
  mutate(data = map(data, ~mutate(., Continent = case_when(
                                               Country_Region == "Diamond Princess" ~ "Asia",
                                               Country_Region == "Kosovo" ~ "Americas",
                                               Country_Region == "MS Zaandam" ~ "Europe",
                                               TRUE ~ Continent)
                                  ))) -> df_long

map(df_long$data, ~unique(.$Continent))
## [[1]]
## [1] "Asia"     "Europe"   "Africa"   "Americas" "Oceania"  NA        
## 
## [[2]]
## [1] "Asia"     "Europe"   "Africa"   "Americas" "Oceania"  NA        
## 
## [[3]]
## [1] "Americas"
## 
## [[4]]
## [1] "Americas"

Unnest the Data Frames

# 1
df_long %>%
  unnest(cols = data) %>%
  ungroup() -> df_all

# 2
remove(df, df_long)

# 3
df_all %>%
  dplyr::select(-vars) -> df_all

Get World Population Data

# 1
df_pop <- read_csv("../data/WPP2019_TotalPopulation.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   LocID = col_double(),
##   Location = col_character(),
##   PopTotal = col_double(),
##   PopDensity = col_double()
## )
# summarize(df_pop, across(everything(), ~sum(is.na(.)))) # check NAs

# 2 
semi_join(df_pop, df_all, by = c("Location" = "Country_Region")) -> df_pop

# 3
df_pop %>% 
  mutate(rank_p = rank(-PopTotal, na.last = TRUE),
         rank_d = rank(-PopDensity, na.last = TRUE),
         PopTotal = (PopTotal*1000)) -> df_pop
df_all %>%
  inner_join(df_pop, by = c("Country_Region" = "Location")) -> df_all

Tidy Data

  • Because COVID-19 data is a time series data, we only focus on 2020/01/22 - 2021/01/22 for our experiment.
#df_all %>% 
  #filter(case_types == "confirmed_US") %>%
  #select(Date, Province_State, County, dailyValues) %>%
  #tail()

# extract one year
df_all %>%
  filter(case_types == "confirmed_US" & as_date(Date) <= as_date("2021-01-22") | case_types == "deaths_US" & as_date(Date) <= as_date("2021-01-22")) -> covid

names(covid)
##  [1] "case_types"     "Country_State"  "Province_State" "Country_Region"
##  [5] "Lat"            "Long"           "County"         "Population"    
##  [9] "Date"           "dailyValues"    "Continent"      "LocID"         
## [13] "PopTotal"       "PopDensity"     "rank_p"         "rank_d"

Find out each US state using usmap

state_map <- us_map(regions = "states")
state_map %>%
  distinct(full) %>%
  rename("Province_State" = "full") -> USstates

Obtain the number of confirmed cases for each state

covid %>%
  filter(case_types == "confirmed_US" & as_date(Date) == as_date("2021-01-22")) %>%
  dplyr::select(Province_State, County, dailyValues) %>%
  group_by(Province_State) %>%
  tally(dailyValues) %>%
  right_join(USstates) %>%
  rename("confirmed" = "n") -> confirmed
## Joining, by = "Province_State"

Obtain the number of death cases for each state

covid %>%
  filter(case_types == "deaths_US" & as_date(Date) == as_date("2021-01-22")) %>%
   dplyr::select(Province_State, County, dailyValues) %>%
  group_by(Province_State) %>%
  tally(dailyValues) %>%
  right_join(USstates) %>%
  rename("deaths" = "n") -> deathes
## Joining, by = "Province_State"
full_join(confirmed, deathes) -> covid
## Joining, by = "Province_State"

Read 2019 American community survey estimate by race by state

race <- read_csv("../data/2019_state_community_by_race.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   State = col_character(),
##   `American Indian and Alaska Native alone` = col_double(),
##   `Asian alone` = col_double(),
##   `Black or African American alone` = col_double(),
##   `Native Hawaiian and Other Pacific Islander alone` = col_double(),
##   `Some other race alone` = col_double(),
##   `Total Population` = col_double(),
##   `Two or more races` = col_double(),
##   `White alone` = col_double()
## )
race %>% 
  rename(Province_State = State) -> race

covid %>% 
  left_join(race, by = "Province_State") %>%
  rename(American_Indian_and_Alaska_Native_alone = "American Indian and Alaska Native alone",
         Asian_alone = "Asian alone",
         Black_or_African_American_alone = "Black or African American alone",
         Native_Hawaiian_and_Other_Pacific_Islander_alone = "Native Hawaiian and Other Pacific Islander alone",
         Some_other_race_alone = "Some other race alone",
         Total_Population = "Total Population",
         Two_or_more_races = "Two or more races",
         White_alone = "White alone") -> covid

#race %>% 
  #anti_join(covidForRegression, by = "Province_State") #-->Check if there are some diff value of state

# The 2020 American Community Survey (ACS) 1-year estimates will be released on September 23, 2021.
# Since the 2020 survey does not yet release, we use the 2019 survey here 

Read 2021 Household income

householdIncome2021 <- read_csv("../data/MedianHouseholdIncome2021.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   State = col_character(),
##   HouseholdIncome = col_double()
## )
householdIncome2021 
householdIncome2021  %>% 
  rename(Province_State = State) -> householdIncome2021

covid %>% 
  left_join(householdIncome2021, by = "Province_State") -> covid

# Recheck
#covid %>%
  #dplyr::select(Province_State, HouseholdIncome) %>%
   #dplyr::arrange(desc(HouseholdIncome))

Exploratory Data Analysis

Data Analysis and Visualization

str(covid)
## tibble [51 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Province_State                                  : chr [1:51] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ confirmed                                       : num [1:51] 436087 52926 708041 281382 3147207 ...
##  $ deaths                                          : num [1:51] 6486 254 12001 4549 36615 ...
##  $ American_Indian_and_Alaska_Native_alone         : num [1:51] 23265 115544 332273 17216 321112 ...
##  $ Asian_alone                                     : num [1:51] 66129 43678 241721 46078 5865435 ...
##  $ Black_or_African_American_alone                 : num [1:51] 1319551 22551 343729 467468 2282144 ...
##  $ Native_Hawaiian_and_Other_Pacific_Islander_alone: num [1:51] 1892 9923 14168 12829 155871 ...
##  $ Some_other_race_alone                           : num [1:51] 74451 12602 364442 75590 5424558 ...
##  $ Total_Population                                : num [1:51] 4903185 731545 7278717 3017804 39512223 ...
##  $ Two_or_more_races                               : num [1:51] 91522 57476 280574 83603 1978145 ...
##  $ White_alone                                     : num [1:51] 3326375 469771 5701810 2315020 23484958 ...
##  $ HouseholdIncome                                 : num [1:51] 50536 77640 58945 47597 75235 ...
covid %>% head()

Top 5 Confirmed Covid-19 Cases (Jan. 22, 2020 to Jan. 22, 2021)

covid %>%
   dplyr::select(1, 2) %>%
  arrange(desc(confirmed)) %>%
  head(5)

Top 5 Death Cases (Jan. 22, 2020 to Jan. 22, 2021)

covid %>%
   dplyr::select(1, 3) %>%
  arrange(desc(deaths)) %>%
  head(5)

Total Population vs Top 5 Median Income

covid %>%
  dplyr::arrange(desc(HouseholdIncome)) %>%
  head(5) %>%
  ggplot(aes(x = Total_Population, y = HouseholdIncome, color = Province_State)) +
  geom_point() +
  theme_bw() +
  labs(x = "Total Population", y = "2021 Median Income", 
       title = "Total Population vs Top 5 Median Income")

In order to run the linear regression model smoothly, we deleted the Province Statevariable from our covid data and saved it as covidForRegression dataframe. The reason of removing these two is :

  • The Province State variable has 51 different categorical types, and each observation has its own type, making it difficult to run a regression model.
  • We refer to the existing paper1 & 2 and select some similar independent variables as predictors.
covid %>%
  dplyr::select(-Province_State) -> covidForRegression
names(covidForRegression)
##  [1] "confirmed"                                       
##  [2] "deaths"                                          
##  [3] "American_Indian_and_Alaska_Native_alone"         
##  [4] "Asian_alone"                                     
##  [5] "Black_or_African_American_alone"                 
##  [6] "Native_Hawaiian_and_Other_Pacific_Islander_alone"
##  [7] "Some_other_race_alone"                           
##  [8] "Total_Population"                                
##  [9] "Two_or_more_races"                               
## [10] "White_alone"                                     
## [11] "HouseholdIncome"

Descriptive Statistics

describe(covidForRegression) 

Linear Model Assumptions

Multicollinearity

Plot

correlation <- cor(covidForRegression) 
corrplot(correlation, method="color", addCoef.col = "black", number.cex = 0.5, type = "lower")

Table

Check the inversely related values of Tolerance and VIF. Tolerance has to be > 0.10 and VIF < 10. If these stipulated are not fulfilled, multicollinearity is at hand.

reg <- lm(confirmed ~ ., data = covidForRegression)
check_collinearity(reg)
## Warning: Model matrix is rank deficient. VIFs may not be sensible.

Remediation - Removing Highly Correlated Predictors

  • Remove highly correlated predictors from the model. If you have two or more factors with a high VIF, remove one from the model.
  • According to the result, we firstly remove the high correlation variables: Some_other_race_alone, Total_Population, and Two_or_more_races, and keep the majority of race variables.
  • After we removed, now the independent variables have very low correlation with each other.
covidForRegression %>%
  dplyr::select(-Total_Population, -Two_or_more_races, -Some_other_race_alone) -> covidForRegression2
names(covidForRegression2)
## [1] "confirmed"                                       
## [2] "deaths"                                          
## [3] "American_Indian_and_Alaska_Native_alone"         
## [4] "Asian_alone"                                     
## [5] "Black_or_African_American_alone"                 
## [6] "Native_Hawaiian_and_Other_Pacific_Islander_alone"
## [7] "White_alone"                                     
## [8] "HouseholdIncome"
reg <- lm(confirmed ~ ., data = covidForRegression2)
check_collinearity(reg)

Independence

We would need to know more from the data providers to really assess this. We will assume it holds.

Linearity

The lack of fit F test works only with simple linear regression so we see the residual plots. As for the residuals versus fitted plot below, there may be no pattern indicating non-linearity in the data, but we attempt to remove some potential outliers.

par(mfrow = c(2,2))
# summary(reg)
plot(reg)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Remediation - Removing Influential Outliers

  • Get other diagnostics measures

Take away the largest values of cooks and leverage. That is, observations 5 (California), is no longer present.

leverage <- hatvalues(reg)
student <- rstudent(reg)
dfs <- dffits(reg)
cooksd <- cooks.distance(reg)
data.frame(confirmed = covidForRegression2$confirmed, fitted = reg$fitted,
           residual = reg$residual, leverage, student, dffits = dfs, cooksd) -> diag

par(mfrow=c(2,2))
plot(leverage,type='h')
abline(h=0)
plot(student,type='h')
abline(h=0)
plot(dfs,type='h',ylab='dffit')
abline(h=0)
plot(cooksd,type='h')
abline(h=0)

diag %>%
  arrange(desc(leverage)) %>%
  head(3)
diag %>%
  arrange(desc(cooksd)) %>%
  head(3)
  • Look at the Residual vs Fitted plot; linearity has occurred.
reg1 <- lm(confirmed ~ ., data = covidForRegression2[-5,])
par(mfrow=c(2,2))
plot(reg1)

Homoscedasticity

With a small p-value, we have evidence that the variances are non-constant

# test homoscedasticity
ncvTest(reg1)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 18.35783, Df = 1, p = 1.8307e-05

Remediation - Square Root of Y

With a high p-value of 0.84229, there is no evidence of non-constant variance.

covidForRegression2 %>%
  mutate(confirmed = sqrt(confirmed)) -> covidForRegressionSQRT

reg2 <- lm(confirmed ~ ., data = covidForRegressionSQRT[-5,])
ncvTest(reg2)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.03886796, Df = 1, p = 0.84371

Normality

  • The p-value of the Shapiro-Wilk Test 0.1822 is greater than \(\alpha\) 0.05 so the data is follow a normal distribution.
# test normality
shapiro.test(rstudent(reg2))
## 
##  Shapiro-Wilk normality test
## 
## data:  rstudent(reg2)
## W = 0.96754, p-value = 0.1834

Linear Model Selection

Our final mission is to select the fewest predictors and the determine by the lowest mean squared error in the linear model.

Adding all variables as full model first. We can see that only three variables in the linear model are significant at the beginning, which is deaths, Black_or_African_American_alone, and White_alone.

summary(reg2)
## 
## Call:
## lm(formula = confirmed ~ ., data = covidForRegressionSQRT[-5, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -222.17  -76.67   10.27   68.87  164.56 
## 
## Coefficients:
##                                                    Estimate Std. Error t value
## (Intercept)                                       3.624e+02  1.103e+02   3.286
## deaths                                            1.191e-02  5.482e-03   2.173
## American_Indian_and_Alaska_Native_alone           3.500e-04  2.184e-04   1.603
## Asian_alone                                      -1.804e-04  1.218e-04  -1.482
## Black_or_African_American_alone                   5.843e-05  2.727e-05   2.143
## Native_Hawaiian_and_Other_Pacific_Islander_alone  1.018e-05  7.940e-04   0.013
## White_alone                                       4.287e-05  8.550e-06   5.014
## HouseholdIncome                                  -1.027e-03  1.640e-03  -0.626
##                                                  Pr(>|t|)    
## (Intercept)                                       0.00206 ** 
## deaths                                            0.03546 *  
## American_Indian_and_Alaska_Native_alone           0.11652    
## Asian_alone                                       0.14590    
## Black_or_African_American_alone                   0.03797 *  
## Native_Hawaiian_and_Other_Pacific_Islander_alone  0.98983    
## White_alone                                      1.02e-05 ***
## HouseholdIncome                                   0.53449    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 98.64 on 42 degrees of freedom
## Multiple R-squared:  0.9041, Adjusted R-squared:  0.8882 
## F-statistic: 56.58 on 7 and 42 DF,  p-value: < 2.2e-16

Stepwise Method

We use algorithm to considers either adding or removing variables at each step to final the best model. Lower AIC (Akaike information criterion) values indicate a better-fit model.

null = lm(confirmed ~ 1, data = covidForRegressionSQRT[-c(5),])
full = lm(confirmed ~ ., data = covidForRegressionSQRT[-c(5),]) 
step(null, scope = list(lower = null, upper = full), direction = "both")
## Start:  AIC=569.67
## confirmed ~ 1
## 
##                                                    Df Sum of Sq     RSS    AIC
## + White_alone                                       1   3667200  595170 473.23
## + deaths                                            1   3266316  996054 498.98
## + Black_or_African_American_alone                   1   2944490 1317880 512.98
## + Asian_alone                                       1   2115961 2146409 537.36
## + American_Indian_and_Alaska_Native_alone           1    265309 3997061 568.45
## <none>                                                          4262370 569.67
## + HouseholdIncome                                   1    121852 4140518 570.22
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1     19189 4243181 571.44
## 
## Step:  AIC=473.23
## confirmed ~ White_alone
## 
##                                                    Df Sum of Sq     RSS    AIC
## + Black_or_African_American_alone                   1     68250  526920 469.14
## + HouseholdIncome                                   1     46407  548764 471.17
## + deaths                                            1     44029  551142 471.39
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1     35741  559429 472.13
## <none>                                                           595170 473.23
## + American_Indian_and_Alaska_Native_alone           1     12741  582429 474.15
## + Asian_alone                                       1      6779  588392 474.66
## - White_alone                                       1   3667200 4262370 569.67
## 
## Step:  AIC=469.14
## confirmed ~ White_alone + Black_or_African_American_alone
## 
##                                                    Df Sum of Sq     RSS    AIC
## + HouseholdIncome                                   1     39105  487816 467.28
## + American_Indian_and_Alaska_Native_alone           1     28034  498887 468.41
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1     26719  500201 468.54
## <none>                                                           526920 469.14
## + deaths                                            1     17944  508976 469.41
## + Asian_alone                                       1     16544  510377 469.54
## - Black_or_African_American_alone                   1     68250  595170 473.23
## - White_alone                                       1    790959 1317880 512.98
## 
## Step:  AIC=467.28
## confirmed ~ White_alone + Black_or_African_American_alone + HouseholdIncome
## 
##                                                    Df Sum of Sq     RSS    AIC
## + deaths                                            1     30433  457382 466.06
## <none>                                                           487816 467.28
## + American_Indian_and_Alaska_Native_alone           1     17844  469972 467.42
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1     13595  474221 467.87
## - HouseholdIncome                                   1     39105  526920 469.14
## + Asian_alone                                       1       482  487334 469.23
## - Black_or_African_American_alone                   1     60948  548764 471.17
## - White_alone                                       1    796518 1284334 513.69
## 
## Step:  AIC=466.06
## confirmed ~ White_alone + Black_or_African_American_alone + HouseholdIncome + 
##     deaths
## 
##                                                    Df Sum of Sq    RSS    AIC
## + Asian_alone                                       1     23739 433643 465.40
## <none>                                                          457382 466.06
## + American_Indian_and_Alaska_Native_alone           1     17872 439510 466.07
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1      8133 449249 467.17
## - deaths                                            1     30433 487816 467.28
## - Black_or_African_American_alone                   1     30551 487933 467.30
## - HouseholdIncome                                   1     51594 508976 469.41
## - White_alone                                       1    274232 731614 487.55
## 
## Step:  AIC=465.4
## confirmed ~ White_alone + Black_or_African_American_alone + HouseholdIncome + 
##     deaths + Asian_alone
## 
##                                                    Df Sum of Sq    RSS    AIC
## + American_Indian_and_Alaska_Native_alone           1     25006 408638 464.43
## - HouseholdIncome                                   1     10246 443889 464.57
## <none>                                                          433643 465.40
## - Asian_alone                                       1     23739 457382 466.06
## - Black_or_African_American_alone                   1     33456 467099 467.11
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1        19 433625 467.40
## - deaths                                            1     53690 487334 469.23
## - White_alone                                       1    291149 724792 489.08
## 
## Step:  AIC=464.43
## confirmed ~ White_alone + Black_or_African_American_alone + HouseholdIncome + 
##     deaths + Asian_alone + American_Indian_and_Alaska_Native_alone
## 
##                                                    Df Sum of Sq    RSS    AIC
## - HouseholdIncome                                   1      3865 412502 462.90
## <none>                                                          408638 464.43
## - American_Indian_and_Alaska_Native_alone           1     25006 433643 465.40
## - Asian_alone                                       1     30873 439510 466.07
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1         2 408636 466.43
## - Black_or_African_American_alone                   1     44906 453543 467.64
## - deaths                                            1     59882 468520 469.27
## - White_alone                                       1    248959 657596 486.22
## 
## Step:  AIC=462.9
## confirmed ~ White_alone + Black_or_African_American_alone + deaths + 
##     Asian_alone + American_Indian_and_Alaska_Native_alone
## 
##                                                    Df Sum of Sq    RSS    AIC
## <none>                                                          412502 462.90
## + HouseholdIncome                                   1      3865 408638 464.43
## - American_Indian_and_Alaska_Native_alone           1     31387 443889 464.57
## + Native_Hawaiian_and_Other_Pacific_Islander_alone  1        50 412453 464.89
## - Black_or_African_American_alone                   1     51301 463803 466.76
## - Asian_alone                                       1     66809 479311 468.40
## - deaths                                            1     70052 482555 468.74
## - White_alone                                       1    260300 672802 485.36
## 
## Call:
## lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + 
##     deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, 
##     data = covidForRegressionSQRT[-c(5), ])
## 
## Coefficients:
##                             (Intercept)  
##                               2.954e+02  
##                             White_alone  
##                               4.353e-05  
##         Black_or_African_American_alone  
##                               6.143e-05  
##                                  deaths  
##                               1.254e-02  
##                             Asian_alone  
##                              -2.161e-04  
## American_Indian_and_Alaska_Native_alone  
##                               3.815e-04

Comparison

We will select the fewest variable for each set, compare their MSE, and finally select the one with the local minimum MSE.

  1. From exhaustive search, the fewest predictors is 4 in smallest BIC (Bayesian Information Criterion).
regExhaustive <- lm(confirmed ~ deaths + Asian_alone + Black_or_African_American_alone + White_alone
                    , data = covidForRegressionSQRT[-c(5),])
summary(regExhaustive)
## 
## Call:
## lm(formula = confirmed ~ deaths + Asian_alone + Black_or_African_American_alone + 
##     White_alone, data = covidForRegressionSQRT[-c(5), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -232.97  -70.37   13.41   69.36  159.57 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.085e+02  2.043e+01  15.100  < 2e-16 ***
## deaths                           1.220e-02  4.701e-03   2.596   0.0127 *  
## Asian_alone                     -2.132e-04  8.301e-05  -2.569   0.0136 *  
## Black_or_African_American_alone  5.300e-05  2.652e-05   1.998   0.0517 .  
## White_alone                      4.698e-05  8.250e-06   5.695 8.85e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99.32 on 45 degrees of freedom
## Multiple R-squared:  0.8959, Adjusted R-squared:  0.8866 
## F-statistic: 96.78 on 4 and 45 DF,  p-value: < 2.2e-16
  1. For forward method in sequential search, the fewest predictors is 2 in smallest BIC (Bayesian Information Criterion).
regForward <- lm(confirmed ~ Black_or_African_American_alone + White_alone,
                 data = covidForRegressionSQRT[-c(5),])
summary(regForward)
## 
## Call:
## lm(formula = confirmed ~ Black_or_African_American_alone + White_alone, 
##     data = covidForRegressionSQRT[-c(5), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -237.00  -63.71   10.56   80.07  204.06 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     3.087e+02  2.170e+01  14.228  < 2e-16 ***
## Black_or_African_American_alone 6.614e-05  2.681e-05   2.467   0.0173 *  
## White_alone                     5.366e-05  6.388e-06   8.400  6.5e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 105.9 on 47 degrees of freedom
## Multiple R-squared:  0.8764, Adjusted R-squared:  0.8711 
## F-statistic: 166.6 on 2 and 47 DF,  p-value: < 2.2e-16
  1. For backward method in sequential search, the fewest predictors is 4 in smallest BIC (Bayesian Information Criterion).
regBackward <-  lm(confirmed ~ deaths + Asian_alone + Black_or_African_American_alone + White_alone
                    , data = covidForRegressionSQRT[-c(5),])
summary(regBackward)
## 
## Call:
## lm(formula = confirmed ~ deaths + Asian_alone + Black_or_African_American_alone + 
##     White_alone, data = covidForRegressionSQRT[-c(5), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -232.97  -70.37   13.41   69.36  159.57 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.085e+02  2.043e+01  15.100  < 2e-16 ***
## deaths                           1.220e-02  4.701e-03   2.596   0.0127 *  
## Asian_alone                     -2.132e-04  8.301e-05  -2.569   0.0136 *  
## Black_or_African_American_alone  5.300e-05  2.652e-05   1.998   0.0517 .  
## White_alone                      4.698e-05  8.250e-06   5.695 8.85e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99.32 on 45 degrees of freedom
## Multiple R-squared:  0.8959, Adjusted R-squared:  0.8866 
## F-statistic: 96.78 on 4 and 45 DF,  p-value: < 2.2e-16
  1. For stepwise method, the lowest AIC value is 462.9, reporting the best model with 5 variables is: lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, data = covidForRegressionSQRT[-c(5), ]).
regStepwise <- lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + 
                    deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, 
                  data = covidForRegressionSQRT[-c(5), ])
summary(regStepwise)
## 
## Call:
## lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + 
##     deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, 
##     data = covidForRegressionSQRT[-c(5), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -219.073  -71.292    8.241   68.218  164.253 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              2.954e+02  2.116e+01  13.956  < 2e-16
## White_alone                              4.353e-05  8.261e-06   5.269 3.94e-06
## Black_or_African_American_alone          6.143e-05  2.626e-05   2.339  0.02393
## deaths                                   1.254e-02  4.586e-03   2.734  0.00899
## Asian_alone                             -2.161e-04  8.095e-05  -2.669  0.01060
## American_Indian_and_Alaska_Native_alone  3.815e-04  2.085e-04   1.830  0.07407
##                                            
## (Intercept)                             ***
## White_alone                             ***
## Black_or_African_American_alone         *  
## deaths                                  ** 
## Asian_alone                             *  
## American_Indian_and_Alaska_Native_alone .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 96.82 on 44 degrees of freedom
## Multiple R-squared:  0.9032, Adjusted R-squared:  0.8922 
## F-statistic: 82.13 on 5 and 44 DF,  p-value: < 2.2e-16
  1. Let us start to find the minimum MSE
#calculate MSE
anova(regExhaustive) %>% tidy() # 9863.275
anova(regForward) %>% tidy() # 11211.07 
anova(regBackward) %>% tidy() # 9863.275    
anova(regStepwise) %>% tidy() # 9375.065    

Final Selection

regStepwise has the smallest MSE value (9375.065) so we select it as the best model. Therefore, our final predictors of square root of COVID confirmed cases by US States is deaths, White_alone,Black_or_African_American_alone, Asian_alone and American_Indian_and_Alaska_Native_alone.

finalReg <- lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + 
    deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, 
    data = covidForRegressionSQRT[-c(5), ])
summary(finalReg) 
## 
## Call:
## lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + 
##     deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, 
##     data = covidForRegressionSQRT[-c(5), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -219.073  -71.292    8.241   68.218  164.253 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              2.954e+02  2.116e+01  13.956  < 2e-16
## White_alone                              4.353e-05  8.261e-06   5.269 3.94e-06
## Black_or_African_American_alone          6.143e-05  2.626e-05   2.339  0.02393
## deaths                                   1.254e-02  4.586e-03   2.734  0.00899
## Asian_alone                             -2.161e-04  8.095e-05  -2.669  0.01060
## American_Indian_and_Alaska_Native_alone  3.815e-04  2.085e-04   1.830  0.07407
##                                            
## (Intercept)                             ***
## White_alone                             ***
## Black_or_African_American_alone         *  
## deaths                                  ** 
## Asian_alone                             *  
## American_Indian_and_Alaska_Native_alone .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 96.82 on 44 degrees of freedom
## Multiple R-squared:  0.9032, Adjusted R-squared:  0.8922 
## F-statistic: 82.13 on 5 and 44 DF,  p-value: < 2.2e-16

Model Equation

Interpretation

With the large p-value 0.07426 of \(b_5\), the prodictor American_Indian_and_Alaska_Native_alone is not in the significant level. In other words, we fail to reject the null (H0: \(\beta_5\) = 0 cannot be rejected) so we can conclude \(b_5\) is 0. Therefore, \(b_5\) can be dropped from the model.

Linear Equation

Thus, the expected value of square root of confirmed cases is: \[ \sqrt{\hat{Confirmed}} = 295.4 + 0.00004354White + 0.0000614Black + 0.01254Deaths - 0.0002163Asian \]

References

  1. Sehra, S., Fundin, S., Lavery, C., & Baker, J. (2020). Differences in race and other state‐level characteristics and associations with mortality from COVID‐19 infection. Journal of Medical Virology, 92(11), 2406–2408. https://doi.org/10.1002/jmv.26095

  2. Sa, Filipa G., Socioeconomic Determinants of Covid-19 Infections and Mortality: Evidence from England and Wales (May 2020). CEPR Discussion Paper No. DP14781, Available at SSRN: https://ssrn.com/abstract=3612850