library(tidyverse)
library(zoo)
library(GGally)
library(skimr)
library(magrittr)
Recycle/Reduce/Reuse Homework 4 Code
load("G:/My Drive/homework/Cooper P/MA_county_variables_2020.RData")
# load("G:/My Drive/homework/Cooper P/MA_COVID19_21_03_02.RData")
counties <-
read_csv(
"https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## date = col_date(format = ""),
## county = col_character(),
## state = col_character(),
## fips = col_character(),
## cases = col_double(),
## deaths = col_double()
## )
mas_cov <-
counties %>%
filter(state == "Massachusetts") %>%
select(!c(state, fips)) %>%
mutate(date = as.Date(date))%>%
full_join(ma_extra) %>%
mutate(deaths_per_100000 = deaths / X2019_pop_est * 100000,
.after = deaths) %>%
drop_na()
## Joining, by = "county"
mas_cov %>%
select(deaths_per_100000,
pct_NHWA_2019,
people_per_Housing,
Median_Household_Income_2018,
X2019_median_age) %>%
ggpairs()
mas_cov %>%
select(deaths_per_100000,
pct_NHWA_2019,
people_per_Housing,
Median_Household_Income_2018,
X2019_median_age) %>%
glimpse()
## Rows: 5,371
## Columns: 5
## $ deaths_per_100000 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ pct_NHWA_2019 <dbl> 0.4518321, 0.4518321, 0.4518321, 0.451832~
## $ people_per_Housing <dbl> 2.327372, 2.327372, 2.327372, 2.327372, 2~
## $ Median_Household_Income_2018 <int> 68743, 68743, 68743, 68743, 68743, 68743,~
## $ X2019_median_age <dbl> 33.3, 33.3, 33.3, 33.3, 33.3, 33.3, 33.3,~
mas_cov %>%
select(deaths_per_100000,
pct_NHWA_2019,
people_per_Housing,
Median_Household_Income_2018,
X2019_median_age) %>%
skim_without_charts()
Name | Piped data |
Number of rows | 5371 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
---|---|---|---|---|---|---|---|---|---|
deaths_per_100000 | 0 | 1 | 105.74 | 77.31 | 0.00 | 31.15 | 111.92 | 157.04 | 308.77 |
pct_NHWA_2019 | 0 | 1 | 0.76 | 0.12 | 0.45 | 0.71 | 0.76 | 0.87 | 0.90 |
people_per_Housing | 0 | 1 | 2.08 | 0.57 | 0.90 | 1.81 | 2.39 | 2.50 | 2.51 |
Median_Household_Income_2018 | 0 | 1 | 75329.69 | 15169.53 | 52682.00 | 66005.00 | 70224.00 | 89678.00 | 100374.00 |
X2019_median_age | 0 | 1 | 42.17 | 5.27 | 33.30 | 39.10 | 40.80 | 47.20 | 54.30 |
mas_cov %>%
ggplot(aes(people_per_Housing, deaths_per_100000)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, col = "red")
## `geom_smooth()` using formula 'y ~ x'
model.fit <-
mas_cov %>%
lm(deaths_per_100000 ~ people_per_Housing, data = .)
model.fit %>% summary()
##
## Call:
## lm(formula = deaths_per_100000 ~ people_per_Housing, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -137.846 -31.860 -1.992 33.541 178.718
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -52.186 3.316 -15.74 <2e-16 ***
## people_per_Housing 75.762 1.534 49.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64.12 on 5369 degrees of freedom
## Multiple R-squared: 0.3123, Adjusted R-squared: 0.3122
## F-statistic: 2438 on 1 and 5369 DF, p-value: < 2.2e-16
mas_cov %>%
ggplot(aes(sample = deaths_per_100000)) +
geom_qq() +
geom_qq_line()
data.frame(Residuals = residuals(model.fit)) %>%
ggplot(aes(sample = Residuals)) +
geom_qq() +
geom_qq_line()
mas_cov %>%
select(deaths_per_100000, people_per_Housing) %>%
bind_cols(data.frame(Residuals = residuals(model.fit))) %>%
pivot_longer(cols = c(deaths_per_100000, people_per_Housing)) %>%
ggplot(aes(Residuals, value)) +
geom_point() +
facet_wrap(vars(name), scales = "free")
mas_cov %>%
bind_cols(data.frame(Residuals = residuals(model.fit))) %>%
select(county, Residuals) %>%
group_by(county) %>%
summarise(across(.cols = Residuals,
list(Min = min,
Mean = mean,
Median = median,
Max = max),
.names = "{.fn}"
)
)
## # A tibble: 14 x 5
## county Min Mean Median Max
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Barnstable -45.8 42.1 34.9 161.
## 2 Berkshire -84.6 -15.3 -45.4 136.
## 3 Bristol -129. 3.66 -7.55 159.
## 4 Dukes -20.2 -19.3 -20.2 -8.64
## 5 Essex -138. 20.3 22.2 151.
## 6 Franklin -103. -9.07 -7.78 49.2
## 7 Hampden -130. 37.7 35.5 179.
## 8 Hampshire -137. -48.1 -49.4 38.3
## 9 Middlesex -137. -11.0 -6.18 88.3
## 10 Nantucket -15.9 -6.42 -7.18 1.60
## 11 Norfolk -138. 1.54 8.38 105.
## 12 Plymouth -136. 9.61 9.56 127.
## 13 Suffolk -124. -3.24 13.4 97.7
## 14 Worcester -134. -3.06 -4.04 124.
mas_cov %>%
bind_cols(data.frame(Residuals = residuals(model.fit))) %>%
select(county, Residuals) %>%
ggplot(aes(Residuals)) +
geom_density() +
facet_wrap(vars(county))
mas_cov %>%
bind_cols(data.frame(Residuals = residuals(model.fit))) %>%
select(county, Residuals) %>%
ggplot(aes(Residuals)) +
geom_density() +
facet_wrap(vars(county), scales = "free_y")
mas_cov %<>%
mutate(region = if_else(county == "Hampshire" |
county == "Franklin" |
county == "Hampden" |
county == "Berkshire" |
county == "Worcester",
"West", "East") %>%
as_factor(),
.after = county)
mas_cov %>%
select(county, region) %>%
distinct(county, .keep_all = TRUE) %>%
arrange(region, county)
## # A tibble: 14 x 2
## county region
## <chr> <fct>
## 1 Barnstable East
## 2 Bristol East
## 3 Dukes East
## 4 Essex East
## 5 Middlesex East
## 6 Nantucket East
## 7 Norfolk East
## 8 Plymouth East
## 9 Suffolk East
## 10 Berkshire West
## 11 Franklin West
## 12 Hampden West
## 13 Hampshire West
## 14 Worcester West
mas_cov$region %>% levels()
## [1] "East" "West"
mas_cov %>%
select(deaths_per_100000, region) %>%
ggpairs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
model.fit <-
mas_cov %>%
lm(deaths_per_100000 ~ region, data = .)
model.fit %>% summary()
##
## Call:
## lm(formula = deaths_per_100000 ~ region, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -110.212 -77.422 5.702 51.489 198.555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 103.283 1.312 78.737 < 2e-16 ***
## regionWest 6.929 2.204 3.144 0.00168 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 77.25 on 5369 degrees of freedom
## Multiple R-squared: 0.001838, Adjusted R-squared: 0.001652
## F-statistic: 9.885 on 1 and 5369 DF, p-value: 0.001675
data.frame(Residuals = residuals(model.fit)) %>%
ggplot(aes(sample = Residuals)) +
geom_qq() +
geom_qq_line()
mas_cov %>%
ggplot(aes(residuals(model.fit), deaths_per_100000)) +
geom_point()
mas_cov %>%
ggplot(aes(residuals(model.fit), region)) +
geom_point()
mas_cov %>%
group_by(region) %>%
summarise(Mean_deaths_per_100000 = mean(deaths_per_100000))
## # A tibble: 2 x 2
## region Mean_deaths_per_100000
## <fct> <dbl>
## 1 East 103.
## 2 West 110.
mas_cov %>%
filter(region == "East") %>%
group_by(county) %>%
summarise(Mean_deaths_per_100000 = mean(deaths_per_100000))
## # A tibble: 9 x 2
## county Mean_deaths_per_100000
## <chr> <dbl>
## 1 Barnstable 87.9
## 2 Bristol 132.
## 3 Dukes 0.876
## 4 Essex 158.
## 5 Middlesex 126.
## 6 Nantucket 9.53
## 7 Norfolk 139.
## 8 Plymouth 146.
## 9 Suffolk 121.
mas_cov %>%
filter(region == "West") %>%
group_by(county) %>%
summarise(Mean_deaths_per_100000 = mean(deaths_per_100000))
## # A tibble: 5 x 2
## county Mean_deaths_per_100000
## <chr> <dbl>
## 1 Berkshire 69.3
## 2 Franklin 94.2
## 3 Hampden 168.
## 4 Hampshire 89.0
## 5 Worcester 131.