The Script Below:
Here explains the inspiration and the potential bias I bring to my data analysis.
#install.packages(c(“tidycensus”, “tidyverse”))
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.1.3 ✔ stringr 1.4.0
## ✔ readr 1.4.0 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
jh_covid_repo <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv"
us_cd_deaths <- read_csv(jh_covid_repo)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## iso2 = col_character(),
## iso3 = col_character(),
## Admin2 = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Combined_Key = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
mi_cd_deaths <- filter(us_cd_deaths, Province_State == "Michigan" )
# deaths are cumulative, pull the last column for the most current sum
mi_cd_deaths <- mi_cd_deaths %>%
select(Admin2, Province_State, Lat, Long_, Population, FIPS, length(names(mi_cd_deaths))) %>%
#update column names from dates to death, and change combined key to county for joins later
rename(
County = "Admin2",
state_name = "Province_State",
county_fips = "FIPS"
)
names(mi_cd_deaths)[7] <- "Total_Deaths"
mi_cd_deaths <- mutate(mi_cd_deaths, Deaths_Per_Pop_Thousand = (Total_Deaths / (Population / 1000 )))
mi_cd_deaths <- mutate_at(mi_cd_deaths, 8, round, 2)
#micdd_tidy is now ready to merge with census data
library(tidycensus)
#find the following variables by county and create a csv for reference
v20<- load_variables(2020, "acs5", cache = TRUE)
write.csv(v20,"~/STA 518/BrookemWalters-Portfolio/Stats 518 Final Project/data/variable_names.csv")
estimates_only_by_GEOID <- function(x){
x <- select(x, GEOID, ends_with("E"))
}
For steps 2.4 - 2.6 I’ll pull the ACS variables in two chunks, it’s easier to manage the calculations this way - set one economic factors - set two racial population estimates
ac_eccon_factors <- get_acs(
geography = "county",
state = "MI",
variables = c(population19 = "B01001_001",
households = "B09001_002",
median_age = "B01002_001",
median_income = "B19013_001",
unemployed_in_lf = "B23025_005",
labor_force_pop = "B23025_002",
hh_on_assistance = "B09010_002",
bach_degree_plus_a25= "DP02_0068P"),
output = "wide"
)
## Getting data from the 2016-2020 5-year ACS
## Fetching data by table type ("B/C", "S", "DP") and combining the result.
# use the function created above to select the relevant estimates
ac_eccon_factors <- estimates_only_by_GEOID(ac_eccon_factors) %>%
#create the unemployment rate
mutate(unemployment_rate = (unemployed_in_lfE / labor_force_popE)*100) %>%
mutate_at(11, round, 1) %>%
select(-labor_force_popE, -unemployed_in_lfE) %>%
# create the public assistance rate
mutate(public_assist_rate = (hh_on_assistanceE / householdsE)*100) %>%
mutate_at(10, round, 1) %>%
select(-hh_on_assistanceE)
glimpse(ac_eccon_factors)
## Rows: 83
## Columns: 9
## $ GEOID <chr> "26001", "26003", "26005", "26007", "26009", "26…
## $ NAME <chr> "Alcona County, Michigan", "Alger County, Michig…
## $ population19E <dbl> 10396, 9098, 117104, 28431, 23301, 15013, 8337, …
## $ householdsE <dbl> 1326, 1379, 28264, 5315, 4204, 2718, 1472, 13487…
## $ median_ageE <dbl> 58.9, 49.6, 40.2, 48.1, 51.6, 50.0, 45.8, 42.2, …
## $ median_incomeE <dbl> 43341, 45184, 65071, 42603, 57256, 45679, 46581,…
## $ bach_degree_plus_a25E <dbl> 18.4, 19.3, 23.6, 18.0, 29.6, 12.9, 15.2, 23.0, …
## $ unemployment_rate <dbl> 6.1, 4.7, 3.4, 7.5, 4.2, 6.8, 4.1, 5.3, 5.9, 4.9…
## $ public_assist_rate <dbl> 35.1, 22.8, 12.1, 30.2, 19.8, 30.5, 33.0, 15.3, …
ac_race_factors <- get_acs(
geography = "county",
state = "MI",
variables = c(population19 = "B01001_001",
asian = "B03002_006",
black = "B03002_004",
native = "B03002_005",
pacific_islander = "B03002_007",
white = "B03002_003",
hispanic = "B03002_012"),
output = "wide"
)
## Getting data from the 2016-2020 5-year ACS
#select estimates
ac_race_factors <- estimates_only_by_GEOID(ac_race_factors) %>%
#create percentages
mutate(percent_asian = (asianE/ population19E)*100,
percent_black = (blackE/ population19E)*100,
percent_native = (nativeE/ population19E)*100,
percent_pacific_islander = (pacific_islanderE/ population19E)*100,
percent_white = (whiteE/ population19E)*100,
percent_hispanic = (hispanicE/ population19E)*100) %>%
#round
mutate_at(vars(starts_with("percent")), funs(round(., 1)))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
#update variable to only reflect percentages
ac_race_factors <- ac_race_factors %>%
select(GEOID, (starts_with("percent",ignore.case = TRUE)))
glimpse(ac_race_factors)
## Rows: 83
## Columns: 7
## $ GEOID <chr> "26001", "26003", "26005", "26007", "26009", …
## $ percent_asian <dbl> 0.3, 0.8, 0.7, 0.5, 0.4, 0.2, 0.5, 0.6, 0.6, …
## $ percent_black <dbl> 0.4, 6.9, 1.2, 0.6, 0.4, 0.4, 8.7, 0.4, 1.5, …
## $ percent_native <dbl> 0.6, 3.9, 0.3, 0.3, 0.6, 1.3, 10.2, 0.2, 0.2,…
## $ percent_pacific_islander <dbl> 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.1, …
## $ percent_white <dbl> 95.5, 83.5, 87.4, 95.2, 94.2, 94.5, 72.7, 93.…
## $ percent_hispanic <dbl> 1.6, 1.7, 7.5, 1.4, 2.3, 2.0, 1.7, 3.1, 5.5, …
this is where things get exciting! time to merge the 2 census sets with the covid data by county!
ac_eccon_factors <- left_join(ac_eccon_factors, ac_race_factors, by = "GEOID")
glimpse(ac_eccon_factors)
## Rows: 83
## Columns: 15
## $ GEOID <chr> "26001", "26003", "26005", "26007", "26009", …
## $ NAME <chr> "Alcona County, Michigan", "Alger County, Mic…
## $ population19E <dbl> 10396, 9098, 117104, 28431, 23301, 15013, 833…
## $ householdsE <dbl> 1326, 1379, 28264, 5315, 4204, 2718, 1472, 13…
## $ median_ageE <dbl> 58.9, 49.6, 40.2, 48.1, 51.6, 50.0, 45.8, 42.…
## $ median_incomeE <dbl> 43341, 45184, 65071, 42603, 57256, 45679, 465…
## $ bach_degree_plus_a25E <dbl> 18.4, 19.3, 23.6, 18.0, 29.6, 12.9, 15.2, 23.…
## $ unemployment_rate <dbl> 6.1, 4.7, 3.4, 7.5, 4.2, 6.8, 4.1, 5.3, 5.9, …
## $ public_assist_rate <dbl> 35.1, 22.8, 12.1, 30.2, 19.8, 30.5, 33.0, 15.…
## $ percent_asian <dbl> 0.3, 0.8, 0.7, 0.5, 0.4, 0.2, 0.5, 0.6, 0.6, …
## $ percent_black <dbl> 0.4, 6.9, 1.2, 0.6, 0.4, 0.4, 8.7, 0.4, 1.5, …
## $ percent_native <dbl> 0.6, 3.9, 0.3, 0.3, 0.6, 1.3, 10.2, 0.2, 0.2,…
## $ percent_pacific_islander <dbl> 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.1, …
## $ percent_white <dbl> 95.5, 83.5, 87.4, 95.2, 94.2, 94.5, 72.7, 93.…
## $ percent_hispanic <dbl> 1.6, 1.7, 7.5, 1.4, 2.3, 2.0, 1.7, 3.1, 5.5, …
ac_eccon_factors <- ac_eccon_factors %>%
mutate(County = str_remove_all(NAME, " County, Michigan"))
glimpse(ac_eccon_factors)
## Rows: 83
## Columns: 16
## $ GEOID <chr> "26001", "26003", "26005", "26007", "26009", …
## $ NAME <chr> "Alcona County, Michigan", "Alger County, Mic…
## $ population19E <dbl> 10396, 9098, 117104, 28431, 23301, 15013, 833…
## $ householdsE <dbl> 1326, 1379, 28264, 5315, 4204, 2718, 1472, 13…
## $ median_ageE <dbl> 58.9, 49.6, 40.2, 48.1, 51.6, 50.0, 45.8, 42.…
## $ median_incomeE <dbl> 43341, 45184, 65071, 42603, 57256, 45679, 465…
## $ bach_degree_plus_a25E <dbl> 18.4, 19.3, 23.6, 18.0, 29.6, 12.9, 15.2, 23.…
## $ unemployment_rate <dbl> 6.1, 4.7, 3.4, 7.5, 4.2, 6.8, 4.1, 5.3, 5.9, …
## $ public_assist_rate <dbl> 35.1, 22.8, 12.1, 30.2, 19.8, 30.5, 33.0, 15.…
## $ percent_asian <dbl> 0.3, 0.8, 0.7, 0.5, 0.4, 0.2, 0.5, 0.6, 0.6, …
## $ percent_black <dbl> 0.4, 6.9, 1.2, 0.6, 0.4, 0.4, 8.7, 0.4, 1.5, …
## $ percent_native <dbl> 0.6, 3.9, 0.3, 0.3, 0.6, 1.3, 10.2, 0.2, 0.2,…
## $ percent_pacific_islander <dbl> 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.1, …
## $ percent_white <dbl> 95.5, 83.5, 87.4, 95.2, 94.2, 94.5, 72.7, 93.…
## $ percent_hispanic <dbl> 1.6, 1.7, 7.5, 1.4, 2.3, 2.0, 1.7, 3.1, 5.5, …
## $ County <chr> "Alcona", "Alger", "Allegan", "Alpena", "Antr…
The grand finale…. Merge everything together!
covid_census <- left_join(ac_eccon_factors, mi_cd_deaths, by = "County")
glimpse(covid_census)
## Rows: 83
## Columns: 23
## $ GEOID <chr> "26001", "26003", "26005", "26007", "26009", …
## $ NAME <chr> "Alcona County, Michigan", "Alger County, Mic…
## $ population19E <dbl> 10396, 9098, 117104, 28431, 23301, 15013, 833…
## $ householdsE <dbl> 1326, 1379, 28264, 5315, 4204, 2718, 1472, 13…
## $ median_ageE <dbl> 58.9, 49.6, 40.2, 48.1, 51.6, 50.0, 45.8, 42.…
## $ median_incomeE <dbl> 43341, 45184, 65071, 42603, 57256, 45679, 465…
## $ bach_degree_plus_a25E <dbl> 18.4, 19.3, 23.6, 18.0, 29.6, 12.9, 15.2, 23.…
## $ unemployment_rate <dbl> 6.1, 4.7, 3.4, 7.5, 4.2, 6.8, 4.1, 5.3, 5.9, …
## $ public_assist_rate <dbl> 35.1, 22.8, 12.1, 30.2, 19.8, 30.5, 33.0, 15.…
## $ percent_asian <dbl> 0.3, 0.8, 0.7, 0.5, 0.4, 0.2, 0.5, 0.6, 0.6, …
## $ percent_black <dbl> 0.4, 6.9, 1.2, 0.6, 0.4, 0.4, 8.7, 0.4, 1.5, …
## $ percent_native <dbl> 0.6, 3.9, 0.3, 0.3, 0.6, 1.3, 10.2, 0.2, 0.2,…
## $ percent_pacific_islander <dbl> 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.1, …
## $ percent_white <dbl> 95.5, 83.5, 87.4, 95.2, 94.2, 94.5, 72.7, 93.…
## $ percent_hispanic <dbl> 1.6, 1.7, 7.5, 1.4, 2.3, 2.0, 1.7, 3.1, 5.5, …
## $ County <chr> "Alcona", "Alger", "Allegan", "Alpena", "Antr…
## $ state_name <chr> "Michigan", "Michigan", "Michigan", "Michigan…
## $ Lat <dbl> 44.68469, 46.41293, 42.59147, 45.03478, 44.99…
## $ Long_ <dbl> -83.59508, -86.60260, -85.89103, -83.62212, -…
## $ Population <dbl> 10405, 9108, 118081, 28405, 23324, 14883, 820…
## $ county_fips <dbl> 26001, 26003, 26005, 26007, 26009, 26011, 260…
## $ Total_Deaths <dbl> 72, 15, 368, 143, 70, 72, 55, 177, 607, 73, 5…
## $ Deaths_Per_Pop_Thousand <dbl> 6.92, 1.65, 3.12, 5.03, 3.00, 4.84, 6.70, 2.8…
# write csv as a backup -> update file path if needed
write.csv(covid_census,"~/STA 518/BrookemWalters-Portfolio/Stats 518 Final Project/data/covid_census.csv")