This markdown will be used to filter the dataset for the project

We start by reading the original CSV and checking the first few rows:

# load libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# read original CSV

dat <- read.csv("NCHS_-_Leading_Causes_of_Death__United_States.csv",
stringsAsFactors = FALSE)

head(dat)
##   Year                                      X113.Cause.Name
## 1 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 2 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 3 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 4 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 5 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 6 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
##               Cause.Name         State Deaths Age.adjusted.Death.Rate
## 1 Unintentional injuries United States 169936                    49.4
## 2 Unintentional injuries       Alabama   2703                    53.8
## 3 Unintentional injuries        Alaska    436                    63.7
## 4 Unintentional injuries       Arizona   4184                    56.2
## 5 Unintentional injuries      Arkansas   1625                    51.8
## 6 Unintentional injuries    California  13840                    33.2
names(dat)
## [1] "Year"                    "X113.Cause.Name"        
## [3] "Cause.Name"              "State"                  
## [5] "Deaths"                  "Age.adjusted.Death.Rate"

We will now remove the ICD-coded cause name to prevent confusion with the standard cause name column. We will then rename “cause.name” to “cause” for simplicity:

# Remove 113, ICD-coded cause name

dat$X113.Cause.Name <- NULL

# Rename Cause.Name to Cause for simplicity

names(dat)[names(dat) == "Cause.Name"] <- "Cause"

names(dat)
## [1] "Year"                    "Cause"                  
## [3] "State"                   "Deaths"                 
## [5] "Age.adjusted.Death.Rate"

We will now classify states between “northern” and “southern”, and remove false cases of states (e.g., District of Columbia):

# States considered "Southern"

south_states <- c(
"Delaware","Maryland","Virginia","West Virginia",
"North Carolina","South Carolina","Georgia","Florida",
"Kentucky","Tennessee","Alabama","Mississippi",
"Arkansas","Louisiana","Oklahoma","Texas"
)

## Remove United States and DC
dat <- dat |>
  filter(!(State %in% c("United States", "District of Columbia")))

# Add Region classification
dat <- dat |>
  mutate(Region = ifelse(State %in% south_states, "South", "North"))

head(dat)
##   Year                  Cause      State Deaths Age.adjusted.Death.Rate Region
## 1 2017 Unintentional injuries    Alabama   2703                    53.8  South
## 2 2017 Unintentional injuries     Alaska    436                    63.7  North
## 3 2017 Unintentional injuries    Arizona   4184                    56.2  North
## 4 2017 Unintentional injuries   Arkansas   1625                    51.8  South
## 5 2017 Unintentional injuries California  13840                    33.2  North
## 6 2017 Unintentional injuries   Colorado   3037                    53.6  North

Now, we will summarize total deaths by state:

total_deaths <- dat |>
  group_by(State) |>
  summarize(TotalDeaths = sum(Deaths, na.rm = TRUE))

head(total_deaths)
## # A tibble: 6 × 2
##   State      TotalDeaths
##   <chr>            <int>
## 1 Alabama        1603535
## 2 Alaska          117890
## 3 Arizona        1572443
## 4 Arkansas        987455
## 5 California     8169513
## 6 Colorado       1039435

We must now summarize total heart disease deaths by state:

heart_deaths <- dat |>
  filter(Cause == "Heart disease") |>
  group_by(State) |>
  summarize(HeartDeaths = sum(Deaths, na.rm = TRUE))

head(heart_deaths)
## # A tibble: 6 × 2
##   State      HeartDeaths
##   <chr>            <int>
## 1 Alabama         240543
## 2 Alaska           12921
## 3 Arizona         205461
## 4 Arkansas        146642
## 5 California     1204573
## 6 Colorado        121876

We will now create an inner join to compare the heart disease deaths per state with the total deaths per state to find the proportion of heart disease deaths / total deaths. We will also classify states by Northern or Southern region in this chunk:

merged <- heart_deaths |>
  inner_join(total_deaths, by = c("State")) |>
  mutate(HeartProp = HeartDeaths / TotalDeaths)

# Adds Region column to classify states by Northern or Southern region
merged <- merged |>
  mutate(Region = ifelse(State %in% south_states, "South", "North"))

merged
## # A tibble: 50 × 5
##    State       HeartDeaths TotalDeaths HeartProp Region
##    <chr>             <int>       <int>     <dbl> <chr> 
##  1 Alabama          240543     1603535     0.150 South 
##  2 Alaska            12921      117890     0.110 North 
##  3 Arizona          205461     1572443     0.131 North 
##  4 Arkansas         146642      987455     0.149 South 
##  5 California      1204573     8169513     0.147 North 
##  6 Colorado         121876     1039435     0.117 North 
##  7 Connecticut      145780      981027     0.149 North 
##  8 Delaware          36432      255015     0.143 South 
##  9 Florida          862602     5868231     0.147 South 
## 10 Georgia          320627     2326265     0.138 South 
## # ℹ 40 more rows

Finally, we will select our “state”, “region”, and “proportion of heart disease deaths” columns and download the new csv:

final_statkey <- merged |>
  select(State, Region, HeartProp)

# Save file to upload to StatKey

write.csv(final_statkey,
"HeartProp_byRegion_StatKey.csv",
row.names = FALSE)

(final_statkey)
## # A tibble: 50 × 3
##    State       Region HeartProp
##    <chr>       <chr>      <dbl>
##  1 Alabama     South      0.150
##  2 Alaska      North      0.110
##  3 Arizona     North      0.131
##  4 Arkansas    South      0.149
##  5 California  North      0.147
##  6 Colorado    North      0.117
##  7 Connecticut North      0.149
##  8 Delaware    South      0.143
##  9 Florida     South      0.147
## 10 Georgia     South      0.138
## # ℹ 40 more rows