Install the required libraries
url_in <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"
+ time_series_covid19_confirmed_global.csv
+ time_series_covid19_deaths_global.csv
+ time_series_covid19_confirmed_US.csv
+ time_series_covid19_deaths_US.csv
df <- tibble(file_names = c("time_series_covid19_confirmed_global.csv",
"time_series_covid19_deaths_global.csv",
"time_series_covid19_confirmed_US.csv",
"time_series_covid19_deaths_US.csv")) -> df
df %>%
mutate(url = str_c(url_in, file_names, sep = "")) -> df
df %>%
mutate(data = map(url, ~read_csv(., na = ""))) -> df
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
##
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## iso2 = col_character(),
## iso3 = col_character(),
## Admin2 = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Combined_Key = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
##
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## iso2 = col_character(),
## iso3 = col_character(),
## Admin2 = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Combined_Key = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
df %>%
mutate(case_types = as.factor(str_extract(file_names, "[:alpha:]*_[gU][:alpha:]*"))) ->
df
# alpha = Any letter, [A-Za-z]
# reference: https://www.petefreitag.com/cheatsheets/regex/character-classes/
df %>%
dplyr::select(case_types, data) -> df
df %>%
mutate(vars = map(df$data, names)) -> df
# map(df$vars, ~unlist(.)[1:15]) for checking
fix_names <- function(df, pattern, rePattern){
stopifnot(is.data.frame(df), is.character(pattern), is.character(rePattern))
names(df) <- str_replace_all(names(df), pattern, rePattern)
return(df)
}
df %>%
mutate(data = map(data, ~fix_names(., "([ey])/", "\\1_")),
data = map(data, ~fix_names(., "Admin2", "County")),
data = map(data, ~fix_names(., "Long_", "Long")),
data = map_if(data, str_detect(df$case_types, "US"),
~dplyr::select(., -c("UID", "iso2", "iso3",
"code3", "FIPS", "Combined_Key"))),
data = map_if(data, str_detect(df$case_types, "global"),
~mutate(., County = "NA")),
data = map_if(data, !str_detect(df$case_types, "deaths_US"),
~mutate(., Population = 0)),
data = map(data, ~unite(., "Country_State",
c("Country_Region", "Province_State"),
remove = FALSE, na.rm = TRUE,
sep = "_"))
) -> df
df %>%
mutate(vars = map(df$data, names)) -> df # synchronize the vars correspondingly
# map(df$vars, ~unlist(.)) # for checking
df %>%
mutate(data = map(data, ~pivot_longer(data = ., cols = contains("/"),
names_to = "Date",
values_to = "dailyValues",
names_transform = list(Date = mdy)))
) -> df
# df$data <- map(df$data, names) # synchronize the vars correspondingly
# map(df$vars, ~unlist(.)) # for checking
# crate a function to fix in type of Date
mdyDate <- function(df, varsDate){
# stopifnot(is.data.frame(df), is.character(varsDate))
df[[varsDate]] <- ymd(df[[varsDate]])
return(df)
}
df %>%
mutate(data = map(data, ~mdyDate(., "Date"))) -> df_long
# str(df_long) # check the data set
df_long %>%
mutate(data = map(data, ~mutate(., Continent = countrycode(Country_Region,
origin = "country.name",
destination = "continent")))
) -> df_long
df_long %>%
mutate(data = map(data, ~mutate(., Continent = case_when(
Country_Region == "Diamond Princess" ~ "Asia",
Country_Region == "Kosovo" ~ "Americas",
Country_Region == "MS Zaandam" ~ "Europe",
TRUE ~ Continent)
))) -> df_long
map(df_long$data, ~unique(.$Continent))
## [[1]]
## [1] "Asia" "Europe" "Africa" "Americas" "Oceania" NA
##
## [[2]]
## [1] "Asia" "Europe" "Africa" "Americas" "Oceania" NA
##
## [[3]]
## [1] "Americas"
##
## [[4]]
## [1] "Americas"
# 1
df_long %>%
unnest(cols = data) %>%
ungroup() -> df_all
# 2
remove(df, df_long)
# 3
df_all %>%
dplyr::select(-vars) -> df_all
# 1
df_pop <- read_csv("../data/WPP2019_TotalPopulation.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## LocID = col_double(),
## Location = col_character(),
## PopTotal = col_double(),
## PopDensity = col_double()
## )
# summarize(df_pop, across(everything(), ~sum(is.na(.)))) # check NAs
# 2
semi_join(df_pop, df_all, by = c("Location" = "Country_Region")) -> df_pop
# 3
df_pop %>%
mutate(rank_p = rank(-PopTotal, na.last = TRUE),
rank_d = rank(-PopDensity, na.last = TRUE),
PopTotal = (PopTotal*1000)) -> df_pop
df_all %>%
inner_join(df_pop, by = c("Country_Region" = "Location")) -> df_all
#df_all %>%
#filter(case_types == "confirmed_US") %>%
#select(Date, Province_State, County, dailyValues) %>%
#tail()
# extract one year
df_all %>%
filter(case_types == "confirmed_US" & as_date(Date) <= as_date("2021-01-22") | case_types == "deaths_US" & as_date(Date) <= as_date("2021-01-22")) -> covid
names(covid)
## [1] "case_types" "Country_State" "Province_State" "Country_Region"
## [5] "Lat" "Long" "County" "Population"
## [9] "Date" "dailyValues" "Continent" "LocID"
## [13] "PopTotal" "PopDensity" "rank_p" "rank_d"
state_map <- us_map(regions = "states")
state_map %>%
distinct(full) %>%
rename("Province_State" = "full") -> USstates
covid %>%
filter(case_types == "confirmed_US" & as_date(Date) == as_date("2021-01-22")) %>%
dplyr::select(Province_State, County, dailyValues) %>%
group_by(Province_State) %>%
tally(dailyValues) %>%
right_join(USstates) %>%
rename("confirmed" = "n") -> confirmed
## Joining, by = "Province_State"
covid %>%
filter(case_types == "deaths_US" & as_date(Date) == as_date("2021-01-22")) %>%
dplyr::select(Province_State, County, dailyValues) %>%
group_by(Province_State) %>%
tally(dailyValues) %>%
right_join(USstates) %>%
rename("deaths" = "n") -> deathes
## Joining, by = "Province_State"
full_join(confirmed, deathes) -> covid
## Joining, by = "Province_State"
race <- read_csv("../data/2019_state_community_by_race.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## State = col_character(),
## `American Indian and Alaska Native alone` = col_double(),
## `Asian alone` = col_double(),
## `Black or African American alone` = col_double(),
## `Native Hawaiian and Other Pacific Islander alone` = col_double(),
## `Some other race alone` = col_double(),
## `Total Population` = col_double(),
## `Two or more races` = col_double(),
## `White alone` = col_double()
## )
race %>%
rename(Province_State = State) -> race
covid %>%
left_join(race, by = "Province_State") %>%
rename(American_Indian_and_Alaska_Native_alone = "American Indian and Alaska Native alone",
Asian_alone = "Asian alone",
Black_or_African_American_alone = "Black or African American alone",
Native_Hawaiian_and_Other_Pacific_Islander_alone = "Native Hawaiian and Other Pacific Islander alone",
Some_other_race_alone = "Some other race alone",
Total_Population = "Total Population",
Two_or_more_races = "Two or more races",
White_alone = "White alone") -> covid
#race %>%
#anti_join(covidForRegression, by = "Province_State") #-->Check if there are some diff value of state
# The 2020 American Community Survey (ACS) 1-year estimates will be released on September 23, 2021.
# Since the 2020 survey does not yet release, we use the 2019 survey here
householdIncome2021 <- read_csv("../data/MedianHouseholdIncome2021.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## State = col_character(),
## HouseholdIncome = col_double()
## )
householdIncome2021
householdIncome2021 %>%
rename(Province_State = State) -> householdIncome2021
covid %>%
left_join(householdIncome2021, by = "Province_State") -> covid
# Recheck
#covid %>%
#dplyr::select(Province_State, HouseholdIncome) %>%
#dplyr::arrange(desc(HouseholdIncome))
str(covid)
## tibble [51 × 12] (S3: tbl_df/tbl/data.frame)
## $ Province_State : chr [1:51] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ confirmed : num [1:51] 436087 52926 708041 281382 3147207 ...
## $ deaths : num [1:51] 6486 254 12001 4549 36615 ...
## $ American_Indian_and_Alaska_Native_alone : num [1:51] 23265 115544 332273 17216 321112 ...
## $ Asian_alone : num [1:51] 66129 43678 241721 46078 5865435 ...
## $ Black_or_African_American_alone : num [1:51] 1319551 22551 343729 467468 2282144 ...
## $ Native_Hawaiian_and_Other_Pacific_Islander_alone: num [1:51] 1892 9923 14168 12829 155871 ...
## $ Some_other_race_alone : num [1:51] 74451 12602 364442 75590 5424558 ...
## $ Total_Population : num [1:51] 4903185 731545 7278717 3017804 39512223 ...
## $ Two_or_more_races : num [1:51] 91522 57476 280574 83603 1978145 ...
## $ White_alone : num [1:51] 3326375 469771 5701810 2315020 23484958 ...
## $ HouseholdIncome : num [1:51] 50536 77640 58945 47597 75235 ...
covid %>% head()
covid %>%
dplyr::select(1, 2) %>%
arrange(desc(confirmed)) %>%
head(5)
covid %>%
dplyr::select(1, 3) %>%
arrange(desc(deaths)) %>%
head(5)
covid %>%
dplyr::arrange(desc(HouseholdIncome)) %>%
head(5) %>%
ggplot(aes(x = Total_Population, y = HouseholdIncome, color = Province_State)) +
geom_point() +
theme_bw() +
labs(x = "Total Population", y = "2021 Median Income",
title = "Total Population vs Top 5 Median Income")
In order to run the linear regression model smoothly, we deleted the Province State
variable from our covid data and saved it as covidForRegression
dataframe. The reason of removing these two is :
Province State
variable has 51 different categorical types, and each observation has its own type, making it difficult to run a regression model.covid %>%
dplyr::select(-Province_State) -> covidForRegression
names(covidForRegression)
## [1] "confirmed"
## [2] "deaths"
## [3] "American_Indian_and_Alaska_Native_alone"
## [4] "Asian_alone"
## [5] "Black_or_African_American_alone"
## [6] "Native_Hawaiian_and_Other_Pacific_Islander_alone"
## [7] "Some_other_race_alone"
## [8] "Total_Population"
## [9] "Two_or_more_races"
## [10] "White_alone"
## [11] "HouseholdIncome"
describe(covidForRegression)
correlation <- cor(covidForRegression)
corrplot(correlation, method="color", addCoef.col = "black", number.cex = 0.5, type = "lower")
Check the inversely related values of Tolerance and VIF. Tolerance has to be > 0.10 and VIF < 10. If these stipulated are not fulfilled, multicollinearity is at hand.
reg <- lm(confirmed ~ ., data = covidForRegression)
check_collinearity(reg)
## Warning: Model matrix is rank deficient. VIFs may not be sensible.
We would need to know more from the data providers to really assess this. We will assume it holds.
The lack of fit F test works only with simple linear regression so we see the residual plots. As for the residuals versus fitted plot below, there may be no pattern indicating non-linearity in the data, but we attempt to remove some potential outliers.
par(mfrow = c(2,2))
# summary(reg)
plot(reg)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Take away the largest values of cooks and leverage. That is, observations 5 (California), is no longer present.
leverage <- hatvalues(reg)
student <- rstudent(reg)
dfs <- dffits(reg)
cooksd <- cooks.distance(reg)
data.frame(confirmed = covidForRegression2$confirmed, fitted = reg$fitted,
residual = reg$residual, leverage, student, dffits = dfs, cooksd) -> diag
par(mfrow=c(2,2))
plot(leverage,type='h')
abline(h=0)
plot(student,type='h')
abline(h=0)
plot(dfs,type='h',ylab='dffit')
abline(h=0)
plot(cooksd,type='h')
abline(h=0)
diag %>%
arrange(desc(leverage)) %>%
head(3)
diag %>%
arrange(desc(cooksd)) %>%
head(3)
reg1 <- lm(confirmed ~ ., data = covidForRegression2[-5,])
par(mfrow=c(2,2))
plot(reg1)
With a small p-value, we have evidence that the variances are non-constant
# test homoscedasticity
ncvTest(reg1)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 18.35783, Df = 1, p = 1.8307e-05
With a high p-value of 0.84229, there is no evidence of non-constant variance.
covidForRegression2 %>%
mutate(confirmed = sqrt(confirmed)) -> covidForRegressionSQRT
reg2 <- lm(confirmed ~ ., data = covidForRegressionSQRT[-5,])
ncvTest(reg2)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.03886796, Df = 1, p = 0.84371
# test normality
shapiro.test(rstudent(reg2))
##
## Shapiro-Wilk normality test
##
## data: rstudent(reg2)
## W = 0.96754, p-value = 0.1834
Our final mission is to select the fewest predictors and the determine by the lowest mean squared error in the linear model.
Adding all variables as full model first. We can see that only three variables in the linear model are significant at the beginning, which is deaths
, Black_or_African_American_alone
, and White_alone
.
summary(reg2)
##
## Call:
## lm(formula = confirmed ~ ., data = covidForRegressionSQRT[-5,
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -222.17 -76.67 10.27 68.87 164.56
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 3.624e+02 1.103e+02 3.286
## deaths 1.191e-02 5.482e-03 2.173
## American_Indian_and_Alaska_Native_alone 3.500e-04 2.184e-04 1.603
## Asian_alone -1.804e-04 1.218e-04 -1.482
## Black_or_African_American_alone 5.843e-05 2.727e-05 2.143
## Native_Hawaiian_and_Other_Pacific_Islander_alone 1.018e-05 7.940e-04 0.013
## White_alone 4.287e-05 8.550e-06 5.014
## HouseholdIncome -1.027e-03 1.640e-03 -0.626
## Pr(>|t|)
## (Intercept) 0.00206 **
## deaths 0.03546 *
## American_Indian_and_Alaska_Native_alone 0.11652
## Asian_alone 0.14590
## Black_or_African_American_alone 0.03797 *
## Native_Hawaiian_and_Other_Pacific_Islander_alone 0.98983
## White_alone 1.02e-05 ***
## HouseholdIncome 0.53449
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 98.64 on 42 degrees of freedom
## Multiple R-squared: 0.9041, Adjusted R-squared: 0.8882
## F-statistic: 56.58 on 7 and 42 DF, p-value: < 2.2e-16
Using algorithm to select the best model exhaustively. Also, to use all X-variables available, change the nvmax option
. Because I am too lazy to count variables, I entered a much larger number, such as my favorite number 69
to do it.
reg_fitExhaustive <- regsubsets(confirmed ~ ., data = covidForRegressionSQRT[-5,], nvmax = 69)
summary(reg_fitExhaustive)
## Subset selection object
## Call: regsubsets.formula(confirmed ~ ., data = covidForRegressionSQRT[-5,
## ], nvmax = 69)
## 7 Variables (and intercept)
## Forced in Forced out
## deaths FALSE FALSE
## American_Indian_and_Alaska_Native_alone FALSE FALSE
## Asian_alone FALSE FALSE
## Black_or_African_American_alone FALSE FALSE
## Native_Hawaiian_and_Other_Pacific_Islander_alone FALSE FALSE
## White_alone FALSE FALSE
## HouseholdIncome FALSE FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: exhaustive
## deaths American_Indian_and_Alaska_Native_alone Asian_alone
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) "*" " " "*"
## 4 ( 1 ) "*" " " "*"
## 5 ( 1 ) "*" "*" "*"
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## Black_or_African_American_alone
## 1 ( 1 ) " "
## 2 ( 1 ) "*"
## 3 ( 1 ) " "
## 4 ( 1 ) "*"
## 5 ( 1 ) "*"
## 6 ( 1 ) "*"
## 7 ( 1 ) "*"
## Native_Hawaiian_and_Other_Pacific_Islander_alone White_alone
## 1 ( 1 ) " " "*"
## 2 ( 1 ) " " "*"
## 3 ( 1 ) " " "*"
## 4 ( 1 ) " " "*"
## 5 ( 1 ) " " "*"
## 6 ( 1 ) " " "*"
## 7 ( 1 ) "*" "*"
## HouseholdIncome
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) " "
## 6 ( 1 ) "*"
## 7 ( 1 ) "*"
Select the variables and choose the optimal model
summary(reg_fitExhaustive)$adjr2
## [1] 0.8574573 0.8711181 0.8792228 0.8866016 0.8922248 0.8907517 0.8881510
summary(reg_fitExhaustive)$cp
## [1] 15.172193 10.157383 7.671876 5.623361 4.397402 6.000164 8.000000
summary(reg_fitExhaustive)$bic
## [1] -90.61260 -92.79050 -93.20125 -93.54017 -93.29479 -89.85345 -85.94162
which.max(summary(reg_fitExhaustive)$adjr2)
## [1] 5
which.min(summary(reg_fitExhaustive)$cp)
## [1] 5
which.min(summary(reg_fitExhaustive)$bic)
## [1] 4
We can also choose the best model by means of a stepwise procedure, starting with one model and ending with another
Forward addition can be used to perform variable selection.
reg_fitForward <- regsubsets(confirmed ~ .,
data = covidForRegressionSQRT[-c(5),],
method = "forward")
summary(reg_fitForward)
## Subset selection object
## Call: regsubsets.formula(confirmed ~ ., data = covidForRegressionSQRT[-c(5),
## ], method = "forward")
## 7 Variables (and intercept)
## Forced in Forced out
## deaths FALSE FALSE
## American_Indian_and_Alaska_Native_alone FALSE FALSE
## Asian_alone FALSE FALSE
## Black_or_African_American_alone FALSE FALSE
## Native_Hawaiian_and_Other_Pacific_Islander_alone FALSE FALSE
## White_alone FALSE FALSE
## HouseholdIncome FALSE FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: forward
## deaths American_Indian_and_Alaska_Native_alone Asian_alone
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " " " " "
## 4 ( 1 ) "*" " " " "
## 5 ( 1 ) "*" " " "*"
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## Black_or_African_American_alone
## 1 ( 1 ) " "
## 2 ( 1 ) "*"
## 3 ( 1 ) "*"
## 4 ( 1 ) "*"
## 5 ( 1 ) "*"
## 6 ( 1 ) "*"
## 7 ( 1 ) "*"
## Native_Hawaiian_and_Other_Pacific_Islander_alone White_alone
## 1 ( 1 ) " " "*"
## 2 ( 1 ) " " "*"
## 3 ( 1 ) " " "*"
## 4 ( 1 ) " " "*"
## 5 ( 1 ) " " "*"
## 6 ( 1 ) " " "*"
## 7 ( 1 ) "*" "*"
## HouseholdIncome
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) "*"
## 4 ( 1 ) "*"
## 5 ( 1 ) "*"
## 6 ( 1 ) "*"
## 7 ( 1 ) "*"
Select the variables and choose the optimal model
summary(reg_fitForward)$adjr2
## [1] 0.8574573 0.8711181 0.8780890 0.8831546 0.8867013 0.8907517 0.8881510
summary(reg_fitForward)$cp
## [1] 15.172193 10.157383 8.138162 7.010188 6.570284 6.000164 8.000000
summary(reg_fitForward)$bic
## [1] -90.61260 -92.79050 -92.73407 -92.04295 -90.79578 -89.85345 -85.94162
which.max(summary(reg_fitForward)$adjr2)
## [1] 6
which.min(summary(reg_fitForward)$cp)
## [1] 6
which.min(summary(reg_fitForward)$bic)
## [1] 2
Backward elimination can be used to perform variable selection.
reg_fitBackward <- regsubsets(confirmed ~ .,
data = covidForRegressionSQRT[-c(5),], method = "backward")
summary(reg_fitBackward)
## Subset selection object
## Call: regsubsets.formula(confirmed ~ ., data = covidForRegressionSQRT[-c(5),
## ], method = "backward")
## 7 Variables (and intercept)
## Forced in Forced out
## deaths FALSE FALSE
## American_Indian_and_Alaska_Native_alone FALSE FALSE
## Asian_alone FALSE FALSE
## Black_or_African_American_alone FALSE FALSE
## Native_Hawaiian_and_Other_Pacific_Islander_alone FALSE FALSE
## White_alone FALSE FALSE
## HouseholdIncome FALSE FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: backward
## deaths American_Indian_and_Alaska_Native_alone Asian_alone
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) "*" " " " "
## 3 ( 1 ) "*" " " "*"
## 4 ( 1 ) "*" " " "*"
## 5 ( 1 ) "*" "*" "*"
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## Black_or_African_American_alone
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) "*"
## 5 ( 1 ) "*"
## 6 ( 1 ) "*"
## 7 ( 1 ) "*"
## Native_Hawaiian_and_Other_Pacific_Islander_alone White_alone
## 1 ( 1 ) " " "*"
## 2 ( 1 ) " " "*"
## 3 ( 1 ) " " "*"
## 4 ( 1 ) " " "*"
## 5 ( 1 ) " " "*"
## 6 ( 1 ) " " "*"
## 7 ( 1 ) "*" "*"
## HouseholdIncome
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) " "
## 6 ( 1 ) "*"
## 7 ( 1 ) "*"
Select the variables and choose the optimal model
summary(reg_fitBackward)$adjr2
## [1] 0.8574573 0.8651936 0.8792228 0.8866016 0.8922248 0.8907517 0.8881510
summary(reg_fitBackward)$cp
## [1] 15.172193 12.646898 7.671876 5.623361 4.397402 6.000164 8.000000
summary(reg_fitBackward)$bic
## [1] -90.61260 -90.54335 -93.20125 -93.54017 -93.29479 -89.85345 -85.94162
which.max(summary(reg_fitBackward)$adjr2)
## [1] 5
which.min(summary(reg_fitBackward)$cp)
## [1] 5
which.min(summary(reg_fitBackward)$bic)
## [1] 4
We use algorithm to considers either adding or removing variables at each step to final the best model. Lower AIC (Akaike information criterion) values indicate a better-fit model.
null = lm(confirmed ~ 1, data = covidForRegressionSQRT[-c(5),])
full = lm(confirmed ~ ., data = covidForRegressionSQRT[-c(5),])
step(null, scope = list(lower = null, upper = full), direction = "both")
## Start: AIC=569.67
## confirmed ~ 1
##
## Df Sum of Sq RSS AIC
## + White_alone 1 3667200 595170 473.23
## + deaths 1 3266316 996054 498.98
## + Black_or_African_American_alone 1 2944490 1317880 512.98
## + Asian_alone 1 2115961 2146409 537.36
## + American_Indian_and_Alaska_Native_alone 1 265309 3997061 568.45
## <none> 4262370 569.67
## + HouseholdIncome 1 121852 4140518 570.22
## + Native_Hawaiian_and_Other_Pacific_Islander_alone 1 19189 4243181 571.44
##
## Step: AIC=473.23
## confirmed ~ White_alone
##
## Df Sum of Sq RSS AIC
## + Black_or_African_American_alone 1 68250 526920 469.14
## + HouseholdIncome 1 46407 548764 471.17
## + deaths 1 44029 551142 471.39
## + Native_Hawaiian_and_Other_Pacific_Islander_alone 1 35741 559429 472.13
## <none> 595170 473.23
## + American_Indian_and_Alaska_Native_alone 1 12741 582429 474.15
## + Asian_alone 1 6779 588392 474.66
## - White_alone 1 3667200 4262370 569.67
##
## Step: AIC=469.14
## confirmed ~ White_alone + Black_or_African_American_alone
##
## Df Sum of Sq RSS AIC
## + HouseholdIncome 1 39105 487816 467.28
## + American_Indian_and_Alaska_Native_alone 1 28034 498887 468.41
## + Native_Hawaiian_and_Other_Pacific_Islander_alone 1 26719 500201 468.54
## <none> 526920 469.14
## + deaths 1 17944 508976 469.41
## + Asian_alone 1 16544 510377 469.54
## - Black_or_African_American_alone 1 68250 595170 473.23
## - White_alone 1 790959 1317880 512.98
##
## Step: AIC=467.28
## confirmed ~ White_alone + Black_or_African_American_alone + HouseholdIncome
##
## Df Sum of Sq RSS AIC
## + deaths 1 30433 457382 466.06
## <none> 487816 467.28
## + American_Indian_and_Alaska_Native_alone 1 17844 469972 467.42
## + Native_Hawaiian_and_Other_Pacific_Islander_alone 1 13595 474221 467.87
## - HouseholdIncome 1 39105 526920 469.14
## + Asian_alone 1 482 487334 469.23
## - Black_or_African_American_alone 1 60948 548764 471.17
## - White_alone 1 796518 1284334 513.69
##
## Step: AIC=466.06
## confirmed ~ White_alone + Black_or_African_American_alone + HouseholdIncome +
## deaths
##
## Df Sum of Sq RSS AIC
## + Asian_alone 1 23739 433643 465.40
## <none> 457382 466.06
## + American_Indian_and_Alaska_Native_alone 1 17872 439510 466.07
## + Native_Hawaiian_and_Other_Pacific_Islander_alone 1 8133 449249 467.17
## - deaths 1 30433 487816 467.28
## - Black_or_African_American_alone 1 30551 487933 467.30
## - HouseholdIncome 1 51594 508976 469.41
## - White_alone 1 274232 731614 487.55
##
## Step: AIC=465.4
## confirmed ~ White_alone + Black_or_African_American_alone + HouseholdIncome +
## deaths + Asian_alone
##
## Df Sum of Sq RSS AIC
## + American_Indian_and_Alaska_Native_alone 1 25006 408638 464.43
## - HouseholdIncome 1 10246 443889 464.57
## <none> 433643 465.40
## - Asian_alone 1 23739 457382 466.06
## - Black_or_African_American_alone 1 33456 467099 467.11
## + Native_Hawaiian_and_Other_Pacific_Islander_alone 1 19 433625 467.40
## - deaths 1 53690 487334 469.23
## - White_alone 1 291149 724792 489.08
##
## Step: AIC=464.43
## confirmed ~ White_alone + Black_or_African_American_alone + HouseholdIncome +
## deaths + Asian_alone + American_Indian_and_Alaska_Native_alone
##
## Df Sum of Sq RSS AIC
## - HouseholdIncome 1 3865 412502 462.90
## <none> 408638 464.43
## - American_Indian_and_Alaska_Native_alone 1 25006 433643 465.40
## - Asian_alone 1 30873 439510 466.07
## + Native_Hawaiian_and_Other_Pacific_Islander_alone 1 2 408636 466.43
## - Black_or_African_American_alone 1 44906 453543 467.64
## - deaths 1 59882 468520 469.27
## - White_alone 1 248959 657596 486.22
##
## Step: AIC=462.9
## confirmed ~ White_alone + Black_or_African_American_alone + deaths +
## Asian_alone + American_Indian_and_Alaska_Native_alone
##
## Df Sum of Sq RSS AIC
## <none> 412502 462.90
## + HouseholdIncome 1 3865 408638 464.43
## - American_Indian_and_Alaska_Native_alone 1 31387 443889 464.57
## + Native_Hawaiian_and_Other_Pacific_Islander_alone 1 50 412453 464.89
## - Black_or_African_American_alone 1 51301 463803 466.76
## - Asian_alone 1 66809 479311 468.40
## - deaths 1 70052 482555 468.74
## - White_alone 1 260300 672802 485.36
##
## Call:
## lm(formula = confirmed ~ White_alone + Black_or_African_American_alone +
## deaths + Asian_alone + American_Indian_and_Alaska_Native_alone,
## data = covidForRegressionSQRT[-c(5), ])
##
## Coefficients:
## (Intercept)
## 2.954e+02
## White_alone
## 4.353e-05
## Black_or_African_American_alone
## 6.143e-05
## deaths
## 1.254e-02
## Asian_alone
## -2.161e-04
## American_Indian_and_Alaska_Native_alone
## 3.815e-04
We will select the fewest variable for each set, compare their MSE, and finally select the one with the local minimum MSE.
regExhaustive <- lm(confirmed ~ deaths + Asian_alone + Black_or_African_American_alone + White_alone
, data = covidForRegressionSQRT[-c(5),])
summary(regExhaustive)
##
## Call:
## lm(formula = confirmed ~ deaths + Asian_alone + Black_or_African_American_alone +
## White_alone, data = covidForRegressionSQRT[-c(5), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -232.97 -70.37 13.41 69.36 159.57
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.085e+02 2.043e+01 15.100 < 2e-16 ***
## deaths 1.220e-02 4.701e-03 2.596 0.0127 *
## Asian_alone -2.132e-04 8.301e-05 -2.569 0.0136 *
## Black_or_African_American_alone 5.300e-05 2.652e-05 1.998 0.0517 .
## White_alone 4.698e-05 8.250e-06 5.695 8.85e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99.32 on 45 degrees of freedom
## Multiple R-squared: 0.8959, Adjusted R-squared: 0.8866
## F-statistic: 96.78 on 4 and 45 DF, p-value: < 2.2e-16
regForward <- lm(confirmed ~ Black_or_African_American_alone + White_alone,
data = covidForRegressionSQRT[-c(5),])
summary(regForward)
##
## Call:
## lm(formula = confirmed ~ Black_or_African_American_alone + White_alone,
## data = covidForRegressionSQRT[-c(5), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -237.00 -63.71 10.56 80.07 204.06
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.087e+02 2.170e+01 14.228 < 2e-16 ***
## Black_or_African_American_alone 6.614e-05 2.681e-05 2.467 0.0173 *
## White_alone 5.366e-05 6.388e-06 8.400 6.5e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 105.9 on 47 degrees of freedom
## Multiple R-squared: 0.8764, Adjusted R-squared: 0.8711
## F-statistic: 166.6 on 2 and 47 DF, p-value: < 2.2e-16
regBackward <- lm(confirmed ~ deaths + Asian_alone + Black_or_African_American_alone + White_alone
, data = covidForRegressionSQRT[-c(5),])
summary(regBackward)
##
## Call:
## lm(formula = confirmed ~ deaths + Asian_alone + Black_or_African_American_alone +
## White_alone, data = covidForRegressionSQRT[-c(5), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -232.97 -70.37 13.41 69.36 159.57
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.085e+02 2.043e+01 15.100 < 2e-16 ***
## deaths 1.220e-02 4.701e-03 2.596 0.0127 *
## Asian_alone -2.132e-04 8.301e-05 -2.569 0.0136 *
## Black_or_African_American_alone 5.300e-05 2.652e-05 1.998 0.0517 .
## White_alone 4.698e-05 8.250e-06 5.695 8.85e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99.32 on 45 degrees of freedom
## Multiple R-squared: 0.8959, Adjusted R-squared: 0.8866
## F-statistic: 96.78 on 4 and 45 DF, p-value: < 2.2e-16
lm(formula = confirmed ~ White_alone + Black_or_African_American_alone + deaths + Asian_alone + American_Indian_and_Alaska_Native_alone, data = covidForRegressionSQRT[-c(5), ])
.regStepwise <- lm(formula = confirmed ~ White_alone + Black_or_African_American_alone +
deaths + Asian_alone + American_Indian_and_Alaska_Native_alone,
data = covidForRegressionSQRT[-c(5), ])
summary(regStepwise)
##
## Call:
## lm(formula = confirmed ~ White_alone + Black_or_African_American_alone +
## deaths + Asian_alone + American_Indian_and_Alaska_Native_alone,
## data = covidForRegressionSQRT[-c(5), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -219.073 -71.292 8.241 68.218 164.253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.954e+02 2.116e+01 13.956 < 2e-16
## White_alone 4.353e-05 8.261e-06 5.269 3.94e-06
## Black_or_African_American_alone 6.143e-05 2.626e-05 2.339 0.02393
## deaths 1.254e-02 4.586e-03 2.734 0.00899
## Asian_alone -2.161e-04 8.095e-05 -2.669 0.01060
## American_Indian_and_Alaska_Native_alone 3.815e-04 2.085e-04 1.830 0.07407
##
## (Intercept) ***
## White_alone ***
## Black_or_African_American_alone *
## deaths **
## Asian_alone *
## American_Indian_and_Alaska_Native_alone .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 96.82 on 44 degrees of freedom
## Multiple R-squared: 0.9032, Adjusted R-squared: 0.8922
## F-statistic: 82.13 on 5 and 44 DF, p-value: < 2.2e-16
#calculate MSE
anova(regExhaustive) %>% tidy() # 9863.275
anova(regForward) %>% tidy() # 11211.07
anova(regBackward) %>% tidy() # 9863.275
anova(regStepwise) %>% tidy() # 9375.065
regStepwise
has the smallest MSE value (9375.065) so we select it as the best model. Therefore, our final predictors of square root of COVID confirmed cases by US States is deaths
, White_alone
,Black_or_African_American_alone
, Asian_alone
and American_Indian_and_Alaska_Native_alone
.
finalReg <- lm(formula = confirmed ~ White_alone + Black_or_African_American_alone +
deaths + Asian_alone + American_Indian_and_Alaska_Native_alone,
data = covidForRegressionSQRT[-c(5), ])
summary(finalReg)
##
## Call:
## lm(formula = confirmed ~ White_alone + Black_or_African_American_alone +
## deaths + Asian_alone + American_Indian_and_Alaska_Native_alone,
## data = covidForRegressionSQRT[-c(5), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -219.073 -71.292 8.241 68.218 164.253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.954e+02 2.116e+01 13.956 < 2e-16
## White_alone 4.353e-05 8.261e-06 5.269 3.94e-06
## Black_or_African_American_alone 6.143e-05 2.626e-05 2.339 0.02393
## deaths 1.254e-02 4.586e-03 2.734 0.00899
## Asian_alone -2.161e-04 8.095e-05 -2.669 0.01060
## American_Indian_and_Alaska_Native_alone 3.815e-04 2.085e-04 1.830 0.07407
##
## (Intercept) ***
## White_alone ***
## Black_or_African_American_alone *
## deaths **
## Asian_alone *
## American_Indian_and_Alaska_Native_alone .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 96.82 on 44 degrees of freedom
## Multiple R-squared: 0.9032, Adjusted R-squared: 0.8922
## F-statistic: 82.13 on 5 and 44 DF, p-value: < 2.2e-16
With the large p-value 0.07426 of \(b_5\), the prodictor American_Indian_and_Alaska_Native_alone
is not in the significant level. In other words, we fail to reject the null (H0: \(\beta_5\) = 0 cannot be rejected) so we can conclude \(b_5\) is 0. Therefore, \(b_5\) can be dropped from the model.
Thus, the expected value of square root of confirmed cases is: \[ \sqrt{\hat{Confirmed}} = 295.4 + 0.00004354White + 0.0000614Black + 0.01254Deaths - 0.0002163Asian \]
Sehra, S., Fundin, S., Lavery, C., & Baker, J. (2020). Differences in race and other state‐level characteristics and associations with mortality from COVID‐19 infection. Journal of Medical Virology, 92(11), 2406–2408. https://doi.org/10.1002/jmv.26095
Sa, Filipa G., Socioeconomic Determinants of Covid-19 Infections and Mortality: Evidence from England and Wales (May 2020). CEPR Discussion Paper No. DP14781, Available at SSRN: https://ssrn.com/abstract=3612850