library(tidyverse)
library(zoo)
library(GGally)
library(skimr)
library(magrittr)
library(moderndive)
library(corrplot)
library(DT)
# load("G:/My Drive/homework/Cooper P/MA_COVID19_21_03_09.RData")
load("G:/My Drive/homework/Cooper P/MA_county_variables_2020.RData")
counties <-
read_csv(
"https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## date = col_date(format = ""),
## county = col_character(),
## state = col_character(),
## fips = col_character(),
## cases = col_double(),
## deaths = col_double()
## )
mas_cov <-
counties %>%
filter(state == "Massachusetts") %>%
select(!c(state, fips)) %>%
# mutate(date = as.Date(date)) %>%
mutate(new_cases = cases - lag(cases, k = 1), .after = cases) %>%
mutate(new_deaths = deaths - lag(deaths, k = 1), .after = deaths) %>%
mutate(new_cases_07_day = rollmean(new_cases, k = 7, fill = NA),
.after = new_cases) %>%
mutate(new_deaths_07_day = rollmean(new_deaths, k = 7, fill = NA),
.after = new_deaths) %>%
full_join(ma_extra) %>%
mutate(cases_per_100000 = cases / X2019_pop_est * 100000,
.after = new_cases_07_day) %>%
mutate(new_cases_per_100000 = new_cases / X2019_pop_est * 100000,
.after = cases_per_100000) %>%
mutate(new_cases_per_100000_07_day = new_cases_07_day / X2019_pop_est * 100000,
.after = new_cases_per_100000) %>%
mutate(deaths_per_100000 = deaths / X2019_pop_est * 100000,
.after = new_deaths_07_day) %>%
mutate(new_deaths_per_100000 = new_deaths / X2019_pop_est * 100000,
.after = deaths_per_100000) %>%
mutate(new_deaths_per_100000_07_day = new_cases_07_day / X2019_pop_est * 100000,
.after = new_deaths_per_100000) %>%
# mutate(month = month(date, label = TRUE), .after = date) %>%
# mutate(year = year(date), .after = month) %>%
mutate(region = if_else(county == "Hampshire" |
county == "Franklin" |
county == "Hampden" |
county == "Berkshire" |
county == "Worcester",
"West", "East") %>% as_factor(),
.after = county) %>%
mutate(pct_over_85 = X2019_age_85_plus / X2019_pop_est * 100,
.after = X2019_age_85_plus) %>%
mutate(pct_NHWA_2019 = pct_NHWA_2019 * 100)
## Joining, by = "county"
sum(!complete.cases(mas_cov))
## [1] 386
mas_cov %<>% drop_na()
https://moderndive.com/6-multiple-regression.html#model4interactiontable
mas_cov %>%
select(deaths_per_100000,
people_per_Housing,
region) %>%
skim_without_charts()
Name | Piped data |
Number of rows | 5393 |
Number of columns | 3 |
_______________________ | |
Column type frequency: | |
factor | 1 |
numeric | 2 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
region | 0 | 1 | FALSE | 2 | Eas: 3481, Wes: 1912 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
---|---|---|---|---|---|---|---|---|---|
deaths_per_100000 | 0 | 1 | 106.28 | 77.67 | 0.0 | 32.01 | 112.54 | 157.53 | 309.41 |
people_per_Housing | 0 | 1 | 2.08 | 0.57 | 0.9 | 1.81 | 2.41 | 2.50 | 2.51 |
mas_cov %>%
select(deaths_per_100000,
people_per_Housing,
region) %>%
ggpairs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
model.21a <-
mas_cov %>%
lm(deaths_per_100000 ~ people_per_Housing * region, data = .)
model.21a %>% get_regression_table() %>% datatable()
model.21a %>% summary()
##
## Call:
## lm(formula = deaths_per_100000 ~ people_per_Housing * region,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.693 -30.570 -3.343 33.371 185.568
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -51.953 3.442 -15.095 <2e-16 ***
## people_per_Housing 78.001 1.635 47.707 <2e-16 ***
## regionWest -17.723 12.686 -1.397 0.162
## people_per_Housing:regionWest 2.453 5.648 0.434 0.664
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64.16 on 5389 degrees of freedom
## Multiple R-squared: 0.3179, Adjusted R-squared: 0.3175
## F-statistic: 837.2 on 3 and 5389 DF, p-value: < 2.2e-16
model.coef <- model.21a %>% coefficients()
mas_cov %>%
ggplot(aes(people_per_Housing, deaths_per_100000, color = region)) +
scale_color_manual(values = c("East" = "salmon", "West" = "cyan")) +
geom_point() +
# East
geom_abline(intercept = model.coef[1],
slope = model.coef[2],
color = "salmon",
size = 1.25,
linetype = "longdash") +
# West
geom_abline(intercept = model.coef[1] + model.coef[3],
slope = model.coef[2] + model.coef[4],
color = "cyan",
size = 1.25,
linetype = "longdash") +
# Linear Model
geom_smooth(method = "lm",
se = FALSE,
aes(people_per_Housing, deaths_per_100000),
inherit.aes = FALSE,
color = "grey25",
size = 1.25,
linetype = "longdash") +
ggtitle("Interaction & Linear Models")
## `geom_smooth()` using formula 'y ~ x'
https://moderndive.com/6-multiple-regression.html#model4table
mas_cov %>%
ggplot(aes(people_per_Housing, deaths_per_100000, color = region)) +
scale_color_manual(values = c("East" = "salmon", "West" = "cyan")) +
geom_point() +
geom_parallel_slopes(se = FALSE,
size = 1.25,
linetype = "longdash") +
ggtitle("Parallel Slopes Model")
model.21e <-
mas_cov %>%
lm(deaths_per_100000 ~ people_per_Housing + region, data = .)
model.21e %>% get_regression_table() %>% datatable()
model.21e %>% summary()
##
## Call:
## lm(formula = deaths_per_100000 ~ people_per_Housing + region,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.798 -30.477 -3.259 33.481 185.934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -52.364 3.309 -15.824 < 2e-16 ***
## people_per_Housing 78.206 1.565 49.976 < 2e-16 ***
## regionWest -12.274 1.866 -6.577 5.26e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64.16 on 5390 degrees of freedom
## Multiple R-squared: 0.3179, Adjusted R-squared: 0.3176
## F-statistic: 1256 on 2 and 5390 DF, p-value: < 2.2e-16
https://www.statology.org/sst-ssr-sse-in-r/
# Homework 5
model.23c <-
mas_cov %>%
lm(deaths_per_100000 ~ people_per_Housing, data = .)
# Homework 5
model.24c <-
mas_cov %>%
lm(deaths_per_100000 ~ region, data = .)
SSE.23c <- sum((fitted(model.23c) - mas_cov$deaths_per_100000)^2)
SSE.24c <- sum((fitted(model.24c) - mas_cov$deaths_per_100000)^2)
SSE.21a <- sum((fitted(model.21a) - mas_cov$deaths_per_100000)^2)
SSE.21e <- sum((fitted(model.21e) - mas_cov$deaths_per_100000)^2)
knitr::kable(c(SSE.23c, SSE.24c, SSE.21a, SSE.21e))
x |
---|
22365968 |
32469367 |
22187133 |
22187910 |
mas_cov %>%
select(deaths_per_100000,
pct_NHWA_2019,
people_per_Housing,
Median_Household_Income_2018,
X2019_median_age) %>%
skim_without_charts()
Name | Piped data |
Number of rows | 5393 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
---|---|---|---|---|---|---|---|---|---|
deaths_per_100000 | 0 | 1 | 106.28 | 77.67 | 0.00 | 32.01 | 112.54 | 157.53 | 309.41 |
pct_NHWA_2019 | 0 | 1 | 76.04 | 12.22 | 45.18 | 71.09 | 75.69 | 86.91 | 90.38 |
people_per_Housing | 0 | 1 | 2.08 | 0.57 | 0.90 | 1.81 | 2.41 | 2.50 | 2.51 |
Median_Household_Income_2018 | 0 | 1 | 75336.37 | 15176.56 | 52682.00 | 66005.00 | 70224.00 | 89678.00 | 100374.00 |
X2019_median_age | 0 | 1 | 42.18 | 5.26 | 33.30 | 39.10 | 40.80 | 47.20 | 54.30 |
mas_cov %>%
select(deaths_per_100000,
pct_NHWA_2019,
people_per_Housing,
Median_Household_Income_2018,
X2019_median_age) %>%
ggpairs()
mas_cov %>%
select(deaths_per_100000,
pct_NHWA_2019,
people_per_Housing,
Median_Household_Income_2018,
X2019_median_age) %>%
cor() %>%
corrplot.mixed(tl.pos = "lt")
mas_cov %>%
select(deaths_per_100000,
pct_NHWA_2019,
people_per_Housing,
Median_Household_Income_2018,
X2019_median_age) %>%
pivot_longer(cols = !deaths_per_100000) %>%
ggplot(aes(value, deaths_per_100000)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
facet_wrap(vars(name), scales = "free_x")
## `geom_smooth()` using formula 'y ~ x'
model.22d <-
mas_cov %>%
lm(deaths_per_100000 ~ X2019_median_age, data = .)
model.22d %>% get_regression_table() %>% datatable()
model.22d %>% summary()
##
## Call:
## lm(formula = deaths_per_100000 ~ X2019_median_age, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -138.983 -63.011 6.244 47.242 191.784
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 261.6040 8.2745 31.62 <2e-16 ***
## X2019_median_age -3.6823 0.1947 -18.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.22 on 5391 degrees of freedom
## Multiple R-squared: 0.06225, Adjusted R-squared: 0.06208
## F-statistic: 357.9 on 1 and 5391 DF, p-value: < 2.2e-16
model.22e <-
mas_cov %>%
lm(deaths_per_100000 ~ X2019_median_age + people_per_Housing, data = .)
model.22e %>% get_regression_table() %>% datatable()
model.22e %>% summary()
##
## Call:
## lm(formula = deaths_per_100000 ~ X2019_median_age + people_per_Housing,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -142.287 -32.934 -0.307 30.775 181.127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -150.1949 11.4642 -13.101 <2e-16 ***
## X2019_median_age 1.8284 0.2051 8.915 <2e-16 ***
## people_per_Housing 86.0533 1.8917 45.491 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63.95 on 5390 degrees of freedom
## Multiple R-squared: 0.3224, Adjusted R-squared: 0.3221
## F-statistic: 1282 on 2 and 5390 DF, p-value: < 2.2e-16
model.22g <-
mas_cov %>%
select(deaths_per_100000,
pct_NHWA_2019,
people_per_Housing,
X2019_median_age) %>%
lm(deaths_per_100000 ~ ., data = .)
model.22g %>% get_regression_table() %>% datatable()
model.22g %>% summary()
##
## Call:
## lm(formula = deaths_per_100000 ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -154.02 -23.89 0.02 26.05 173.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -195.9742 11.3246 -17.30 <2e-16 ***
## pct_NHWA_2019 -2.2274 0.1140 -19.54 <2e-16 ***
## people_per_Housing 95.3823 1.8895 50.48 <2e-16 ***
## X2019_median_age 6.4681 0.3093 20.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61.8 on 5389 degrees of freedom
## Multiple R-squared: 0.3672, Adjusted R-squared: 0.3669
## F-statistic: 1042 on 3 and 5389 DF, p-value: < 2.2e-16
mas_cov %>%
select(people_per_Housing,
deaths,
deaths_per_100000) %>%
cor() %>%
corrplot.mixed(tl.pos = "lt")