Welcome to my project!

The Script Below:

Here explains the inspiration and the potential bias I bring to my data analysis.

Step 0 : connect the census via API

(full API directions here)

Part One Retrieving the Lastest Covid Deaths by County,

Step 1:

  • install packages in console if needed, load the ::tidyverse::

#install.packages(c(“tidycensus”, “tidyverse”))

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.1.3     ✔ stringr 1.4.0
## ✔ readr   1.4.0     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Step 1.2:

jh_covid_repo <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv"
us_cd_deaths <- read_csv(jh_covid_repo)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   iso2 = col_character(),
##   iso3 = col_character(),
##   Admin2 = col_character(),
##   Province_State = col_character(),
##   Country_Region = col_character(),
##   Combined_Key = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

Step 1.3:

  • filter the data by Michigan, and create a new value
mi_cd_deaths <- filter(us_cd_deaths, Province_State == "Michigan" )

Step 1.4:

  • trim the unnecessary variables
  • update variable names for common join fields, and future plots
# deaths are cumulative, pull the last column for the most current sum
mi_cd_deaths <- mi_cd_deaths %>% 
select(Admin2, Province_State, Lat, Long_, Population, FIPS, length(names(mi_cd_deaths))) %>% 

#update column names from dates to death, and change combined key to county for joins later
  rename(  
  County = "Admin2",
  state_name = "Province_State",
  county_fips = "FIPS"
  )

names(mi_cd_deaths)[7] <- "Total_Deaths" 

Step 1.5:

  • create a covid death rate per thousand (mortality rate), round to the nearest hundredths place
mi_cd_deaths <- mutate(mi_cd_deaths, Deaths_Per_Pop_Thousand =  (Total_Deaths / (Population / 1000 )))
mi_cd_deaths <- mutate_at(mi_cd_deaths, 8, round, 2)
#micdd_tidy is now ready to merge with census data

Part Two

Step 2.1: Load Tidy Census

library(tidycensus)

Step 2.2:

  • pull data from ACS:
  • assign data value “v20”
  • capture all of the variable names in CSV format for reference
#find the following variables by county and create a csv for reference
v20<- load_variables(2020, "acs5", cache = TRUE)
write.csv(v20,"~/STA 518/BrookemWalters-Portfolio/Stats 518 Final Project/data/variable_names.csv")

Step 2.3:

  • create function to select relevant estimates to automate 1 step of trimming out unused variables
estimates_only_by_GEOID <- function(x){
  x <- select(x, GEOID, ends_with("E")) 
}

For steps 2.4 - 2.6 I’ll pull the ACS variables in two chunks, it’s easier to manage the calculations this way - set one economic factors - set two racial population estimates

Step 2.4

  • pull factors: population, and estimates: household, median age, median income, unemployed population, labor force population, households on public assistant, and adults over 25 with a bachelor’s degree or higher
ac_eccon_factors <- get_acs(
  geography = "county",
  state = "MI",
  variables = c(population19 = "B01001_001",
                households = "B09001_002",
                median_age = "B01002_001",
                median_income = "B19013_001",
                unemployed_in_lf = "B23025_005",
                labor_force_pop = "B23025_002",
                hh_on_assistance = "B09010_002",
                bach_degree_plus_a25= "DP02_0068P"),
  output = "wide"
) 
## Getting data from the 2016-2020 5-year ACS
## Fetching data by table type ("B/C", "S", "DP") and combining the result.

Step 2.5

  • create unemployment and public assistance rates,
  • remove the counts
  • round for aesthetics
# use the function created above to select the relevant estimates 
ac_eccon_factors <- estimates_only_by_GEOID(ac_eccon_factors) %>% 
  #create the unemployment rate
  mutate(unemployment_rate = (unemployed_in_lfE / labor_force_popE)*100) %>% 
  mutate_at(11, round, 1) %>% 
  select(-labor_force_popE, -unemployed_in_lfE) %>% 

  # create the public assistance rate
  mutate(public_assist_rate = (hh_on_assistanceE / householdsE)*100) %>% 
  mutate_at(10, round, 1) %>% 
  select(-hh_on_assistanceE)

glimpse(ac_eccon_factors)
## Rows: 83
## Columns: 9
## $ GEOID                 <chr> "26001", "26003", "26005", "26007", "26009", "26…
## $ NAME                  <chr> "Alcona County, Michigan", "Alger County, Michig…
## $ population19E         <dbl> 10396, 9098, 117104, 28431, 23301, 15013, 8337, …
## $ householdsE           <dbl> 1326, 1379, 28264, 5315, 4204, 2718, 1472, 13487…
## $ median_ageE           <dbl> 58.9, 49.6, 40.2, 48.1, 51.6, 50.0, 45.8, 42.2, …
## $ median_incomeE        <dbl> 43341, 45184, 65071, 42603, 57256, 45679, 46581,…
## $ bach_degree_plus_a25E <dbl> 18.4, 19.3, 23.6, 18.0, 29.6, 12.9, 15.2, 23.0, …
## $ unemployment_rate     <dbl> 6.1, 4.7, 3.4, 7.5, 4.2, 6.8, 4.1, 5.3, 5.9, 4.9…
## $ public_assist_rate    <dbl> 35.1, 22.8, 12.1, 30.2, 19.8, 30.5, 33.0, 15.3, …

Step 2.6

  • pull county racial population variables
ac_race_factors <- get_acs(
  geography = "county",
  state = "MI",
  variables = c(population19 = "B01001_001",
                asian = "B03002_006",
                black = "B03002_004",
                native = "B03002_005",
                pacific_islander = "B03002_007",
                white = "B03002_003",
                hispanic = "B03002_012"),
  output = "wide"
)
## Getting data from the 2016-2020 5-year ACS

Step 2.7

  • create population rates by racial composition
  • round the values for consistency
  • Note GEOID will be used to join the two sets of ACS variables
#select estimates
ac_race_factors <- estimates_only_by_GEOID(ac_race_factors) %>% 
#create percentages  
  mutate(percent_asian = (asianE/ population19E)*100,
         percent_black = (blackE/ population19E)*100,
         percent_native = (nativeE/ population19E)*100,
         percent_pacific_islander = (pacific_islanderE/ population19E)*100,
         percent_white = (whiteE/ population19E)*100,
         percent_hispanic = (hispanicE/ population19E)*100) %>% 
  #round
  mutate_at(vars(starts_with("percent")), funs(round(., 1)))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
#update variable to only reflect percentages
ac_race_factors <- ac_race_factors %>% 
  select(GEOID, (starts_with("percent",ignore.case = TRUE)))
glimpse(ac_race_factors)
## Rows: 83
## Columns: 7
## $ GEOID                    <chr> "26001", "26003", "26005", "26007", "26009", …
## $ percent_asian            <dbl> 0.3, 0.8, 0.7, 0.5, 0.4, 0.2, 0.5, 0.6, 0.6, …
## $ percent_black            <dbl> 0.4, 6.9, 1.2, 0.6, 0.4, 0.4, 8.7, 0.4, 1.5, …
## $ percent_native           <dbl> 0.6, 3.9, 0.3, 0.3, 0.6, 1.3, 10.2, 0.2, 0.2,…
## $ percent_pacific_islander <dbl> 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.1, …
## $ percent_white            <dbl> 95.5, 83.5, 87.4, 95.2, 94.2, 94.5, 72.7, 93.…
## $ percent_hispanic         <dbl> 1.6, 1.7, 7.5, 1.4, 2.3, 2.0, 1.7, 3.1, 5.5, …

this is where things get exciting! time to merge the 2 census sets with the covid data by county!

Step 2.8

ac_eccon_factors <-  left_join(ac_eccon_factors, ac_race_factors, by = "GEOID")
glimpse(ac_eccon_factors)
## Rows: 83
## Columns: 15
## $ GEOID                    <chr> "26001", "26003", "26005", "26007", "26009", …
## $ NAME                     <chr> "Alcona County, Michigan", "Alger County, Mic…
## $ population19E            <dbl> 10396, 9098, 117104, 28431, 23301, 15013, 833…
## $ householdsE              <dbl> 1326, 1379, 28264, 5315, 4204, 2718, 1472, 13…
## $ median_ageE              <dbl> 58.9, 49.6, 40.2, 48.1, 51.6, 50.0, 45.8, 42.…
## $ median_incomeE           <dbl> 43341, 45184, 65071, 42603, 57256, 45679, 465…
## $ bach_degree_plus_a25E    <dbl> 18.4, 19.3, 23.6, 18.0, 29.6, 12.9, 15.2, 23.…
## $ unemployment_rate        <dbl> 6.1, 4.7, 3.4, 7.5, 4.2, 6.8, 4.1, 5.3, 5.9, …
## $ public_assist_rate       <dbl> 35.1, 22.8, 12.1, 30.2, 19.8, 30.5, 33.0, 15.…
## $ percent_asian            <dbl> 0.3, 0.8, 0.7, 0.5, 0.4, 0.2, 0.5, 0.6, 0.6, …
## $ percent_black            <dbl> 0.4, 6.9, 1.2, 0.6, 0.4, 0.4, 8.7, 0.4, 1.5, …
## $ percent_native           <dbl> 0.6, 3.9, 0.3, 0.3, 0.6, 1.3, 10.2, 0.2, 0.2,…
## $ percent_pacific_islander <dbl> 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.1, …
## $ percent_white            <dbl> 95.5, 83.5, 87.4, 95.2, 94.2, 94.5, 72.7, 93.…
## $ percent_hispanic         <dbl> 1.6, 1.7, 7.5, 1.4, 2.3, 2.0, 1.7, 3.1, 5.5, …

Step 2.9

ac_eccon_factors <- ac_eccon_factors %>% 
  mutate(County = str_remove_all(NAME, " County, Michigan"))
glimpse(ac_eccon_factors)
## Rows: 83
## Columns: 16
## $ GEOID                    <chr> "26001", "26003", "26005", "26007", "26009", …
## $ NAME                     <chr> "Alcona County, Michigan", "Alger County, Mic…
## $ population19E            <dbl> 10396, 9098, 117104, 28431, 23301, 15013, 833…
## $ householdsE              <dbl> 1326, 1379, 28264, 5315, 4204, 2718, 1472, 13…
## $ median_ageE              <dbl> 58.9, 49.6, 40.2, 48.1, 51.6, 50.0, 45.8, 42.…
## $ median_incomeE           <dbl> 43341, 45184, 65071, 42603, 57256, 45679, 465…
## $ bach_degree_plus_a25E    <dbl> 18.4, 19.3, 23.6, 18.0, 29.6, 12.9, 15.2, 23.…
## $ unemployment_rate        <dbl> 6.1, 4.7, 3.4, 7.5, 4.2, 6.8, 4.1, 5.3, 5.9, …
## $ public_assist_rate       <dbl> 35.1, 22.8, 12.1, 30.2, 19.8, 30.5, 33.0, 15.…
## $ percent_asian            <dbl> 0.3, 0.8, 0.7, 0.5, 0.4, 0.2, 0.5, 0.6, 0.6, …
## $ percent_black            <dbl> 0.4, 6.9, 1.2, 0.6, 0.4, 0.4, 8.7, 0.4, 1.5, …
## $ percent_native           <dbl> 0.6, 3.9, 0.3, 0.3, 0.6, 1.3, 10.2, 0.2, 0.2,…
## $ percent_pacific_islander <dbl> 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.1, …
## $ percent_white            <dbl> 95.5, 83.5, 87.4, 95.2, 94.2, 94.5, 72.7, 93.…
## $ percent_hispanic         <dbl> 1.6, 1.7, 7.5, 1.4, 2.3, 2.0, 1.7, 3.1, 5.5, …
## $ County                   <chr> "Alcona", "Alger", "Allegan", "Alpena", "Antr…

The grand finale…. Merge everything together!

covid_census <-  left_join(ac_eccon_factors, mi_cd_deaths, by = "County")
glimpse(covid_census)
## Rows: 83
## Columns: 23
## $ GEOID                    <chr> "26001", "26003", "26005", "26007", "26009", …
## $ NAME                     <chr> "Alcona County, Michigan", "Alger County, Mic…
## $ population19E            <dbl> 10396, 9098, 117104, 28431, 23301, 15013, 833…
## $ householdsE              <dbl> 1326, 1379, 28264, 5315, 4204, 2718, 1472, 13…
## $ median_ageE              <dbl> 58.9, 49.6, 40.2, 48.1, 51.6, 50.0, 45.8, 42.…
## $ median_incomeE           <dbl> 43341, 45184, 65071, 42603, 57256, 45679, 465…
## $ bach_degree_plus_a25E    <dbl> 18.4, 19.3, 23.6, 18.0, 29.6, 12.9, 15.2, 23.…
## $ unemployment_rate        <dbl> 6.1, 4.7, 3.4, 7.5, 4.2, 6.8, 4.1, 5.3, 5.9, …
## $ public_assist_rate       <dbl> 35.1, 22.8, 12.1, 30.2, 19.8, 30.5, 33.0, 15.…
## $ percent_asian            <dbl> 0.3, 0.8, 0.7, 0.5, 0.4, 0.2, 0.5, 0.6, 0.6, …
## $ percent_black            <dbl> 0.4, 6.9, 1.2, 0.6, 0.4, 0.4, 8.7, 0.4, 1.5, …
## $ percent_native           <dbl> 0.6, 3.9, 0.3, 0.3, 0.6, 1.3, 10.2, 0.2, 0.2,…
## $ percent_pacific_islander <dbl> 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.1, …
## $ percent_white            <dbl> 95.5, 83.5, 87.4, 95.2, 94.2, 94.5, 72.7, 93.…
## $ percent_hispanic         <dbl> 1.6, 1.7, 7.5, 1.4, 2.3, 2.0, 1.7, 3.1, 5.5, …
## $ County                   <chr> "Alcona", "Alger", "Allegan", "Alpena", "Antr…
## $ state_name               <chr> "Michigan", "Michigan", "Michigan", "Michigan…
## $ Lat                      <dbl> 44.68469, 46.41293, 42.59147, 45.03478, 44.99…
## $ Long_                    <dbl> -83.59508, -86.60260, -85.89103, -83.62212, -…
## $ Population               <dbl> 10405, 9108, 118081, 28405, 23324, 14883, 820…
## $ county_fips              <dbl> 26001, 26003, 26005, 26007, 26009, 26011, 260…
## $ Total_Deaths             <dbl> 72, 15, 368, 143, 70, 72, 55, 177, 607, 73, 5…
## $ Deaths_Per_Pop_Thousand  <dbl> 6.92, 1.65, 3.12, 5.03, 3.00, 4.84, 6.70, 2.8…
# write csv as a backup -> update file path if needed
write.csv(covid_census,"~/STA 518/BrookemWalters-Portfolio/Stats 518 Final Project/data/covid_census.csv")

Success!!! Data cleaning complete!