We start by reading the original CSV and checking the first few rows:
# load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# read original CSV
dat <- read.csv("NCHS_-_Leading_Causes_of_Death__United_States.csv",
stringsAsFactors = FALSE)
head(dat)
## Year X113.Cause.Name
## 1 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 2 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 3 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 4 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 5 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## 6 2017 Accidents (unintentional injuries) (V01-X59,Y85-Y86)
## Cause.Name State Deaths Age.adjusted.Death.Rate
## 1 Unintentional injuries United States 169936 49.4
## 2 Unintentional injuries Alabama 2703 53.8
## 3 Unintentional injuries Alaska 436 63.7
## 4 Unintentional injuries Arizona 4184 56.2
## 5 Unintentional injuries Arkansas 1625 51.8
## 6 Unintentional injuries California 13840 33.2
names(dat)
## [1] "Year" "X113.Cause.Name"
## [3] "Cause.Name" "State"
## [5] "Deaths" "Age.adjusted.Death.Rate"
We will now remove the ICD-coded cause name to prevent confusion with the standard cause name column. We will then rename “cause.name” to “cause” for simplicity:
# Remove 113, ICD-coded cause name
dat$X113.Cause.Name <- NULL
# Rename Cause.Name to Cause for simplicity
names(dat)[names(dat) == "Cause.Name"] <- "Cause"
names(dat)
## [1] "Year" "Cause"
## [3] "State" "Deaths"
## [5] "Age.adjusted.Death.Rate"
We will now classify states between “northern” and “southern”, and remove false cases of states (e.g., District of Columbia):
# States considered "Southern"
south_states <- c(
"Delaware","Maryland","Virginia","West Virginia",
"North Carolina","South Carolina","Georgia","Florida",
"Kentucky","Tennessee","Alabama","Mississippi",
"Arkansas","Louisiana","Oklahoma","Texas"
)
## Remove United States and DC
dat <- dat |>
filter(!(State %in% c("United States", "District of Columbia")))
# Add Region classification
dat <- dat |>
mutate(Region = ifelse(State %in% south_states, "South", "North"))
head(dat)
## Year Cause State Deaths Age.adjusted.Death.Rate Region
## 1 2017 Unintentional injuries Alabama 2703 53.8 South
## 2 2017 Unintentional injuries Alaska 436 63.7 North
## 3 2017 Unintentional injuries Arizona 4184 56.2 North
## 4 2017 Unintentional injuries Arkansas 1625 51.8 South
## 5 2017 Unintentional injuries California 13840 33.2 North
## 6 2017 Unintentional injuries Colorado 3037 53.6 North
Now, we will summarize total deaths by state:
total_deaths <- dat |>
group_by(State) |>
summarize(TotalDeaths = sum(Deaths, na.rm = TRUE))
head(total_deaths)
## # A tibble: 6 × 2
## State TotalDeaths
## <chr> <int>
## 1 Alabama 1603535
## 2 Alaska 117890
## 3 Arizona 1572443
## 4 Arkansas 987455
## 5 California 8169513
## 6 Colorado 1039435
We must now summarize total heart disease deaths by state:
heart_deaths <- dat |>
filter(Cause == "Heart disease") |>
group_by(State) |>
summarize(HeartDeaths = sum(Deaths, na.rm = TRUE))
head(heart_deaths)
## # A tibble: 6 × 2
## State HeartDeaths
## <chr> <int>
## 1 Alabama 240543
## 2 Alaska 12921
## 3 Arizona 205461
## 4 Arkansas 146642
## 5 California 1204573
## 6 Colorado 121876
We will now create an inner join to compare the heart disease deaths per state with the total deaths per state to find the proportion of heart disease deaths / total deaths. We will also classify states by Northern or Southern region in this chunk:
merged <- heart_deaths |>
inner_join(total_deaths, by = c("State")) |>
mutate(HeartProp = HeartDeaths / TotalDeaths)
# Adds Region column to classify states by Northern or Southern region
merged <- merged |>
mutate(Region = ifelse(State %in% south_states, "South", "North"))
merged
## # A tibble: 50 × 5
## State HeartDeaths TotalDeaths HeartProp Region
## <chr> <int> <int> <dbl> <chr>
## 1 Alabama 240543 1603535 0.150 South
## 2 Alaska 12921 117890 0.110 North
## 3 Arizona 205461 1572443 0.131 North
## 4 Arkansas 146642 987455 0.149 South
## 5 California 1204573 8169513 0.147 North
## 6 Colorado 121876 1039435 0.117 North
## 7 Connecticut 145780 981027 0.149 North
## 8 Delaware 36432 255015 0.143 South
## 9 Florida 862602 5868231 0.147 South
## 10 Georgia 320627 2326265 0.138 South
## # ℹ 40 more rows
Finally, we will select our “state”, “region”, and “proportion of heart disease deaths” columns and download the new csv:
final_statkey <- merged |>
select(State, Region, HeartProp)
# Save file to upload to StatKey
write.csv(final_statkey,
"HeartProp_byRegion_StatKey.csv",
row.names = FALSE)
(final_statkey)
## # A tibble: 50 × 3
## State Region HeartProp
## <chr> <chr> <dbl>
## 1 Alabama South 0.150
## 2 Alaska North 0.110
## 3 Arizona North 0.131
## 4 Arkansas South 0.149
## 5 California North 0.147
## 6 Colorado North 0.117
## 7 Connecticut North 0.149
## 8 Delaware South 0.143
## 9 Florida South 0.147
## 10 Georgia South 0.138
## # ℹ 40 more rows