Loading the dataset and the required libraries
# Attach dplyr for the pipe operator and the data verbs used in this analysis.
library(dplyr)
# Read the NCHS CSV and store it directly as a tibble; no intermediate
# data frame is kept around.
Death_df <- "NCHS.csv" %>%
  read.csv() %>%
  tibble::as_tibble()
# Report the number of rows that were loaded.
nrow(Death_df)
[1] 15028
# Preview the tibble: auto-printing shows the column types and the first rows.
Death_df
# A tibble: 15,028 x 6
Year X113.Cause.Name Cause.Name State Deaths Age.adjusted.Dea~
<int> <chr> <chr> <chr> <int> <dbl>
1 1999 Accidents (unintention~ Unintentiona~ Alabama 2313 52.2
2 1999 Accidents (unintention~ Unintentiona~ Alaska 294 55.9
3 1999 Accidents (unintention~ Unintentiona~ Arizona 2214 44.8
4 1999 Accidents (unintention~ Unintentiona~ Arkansas 1287 47.6
5 1999 Accidents (unintention~ Unintentiona~ Califor~ 9198 28.7
6 1999 Accidents (unintention~ Unintentiona~ Colorado 1519 39
7 1999 Accidents (unintention~ Unintentiona~ Connect~ 1034 29.3
8 1999 Accidents (unintention~ Unintentiona~ Delaware 267 35.3
9 1999 Accidents (unintention~ Unintentiona~ Distric~ 161 28.4
10 1999 Accidents (unintention~ Unintentiona~ Florida 5961 35.7
# ... with 15,018 more rows
# List every distinct cause of death recorded in the data, leaving out the
# "All Causes" aggregate category.
# distinct() with a column name already returns only that column, so no
# separate select() is needed; filter() keeps the pipeline in dplyr verbs.
Death_df %>%
  distinct(Cause.Name) %>%
  filter(Cause.Name != "All Causes") %>%
  print()
# A tibble: 16 x 1
Cause.Name
<chr>
1 Unintentional Injuries
2 Alzheimer's disease
3 Homicide
4 Stroke
5 Chronic liver disease and cirrhosis
6 CLRD
7 Diabetes
8 Diseases of Heart
9 Essential hypertension and hypertensive renal disease
10 Influenza and pneumonia
11 Cancer
12 Suicide
13 Kidney Disease
14 Parkinson's disease
15 Pneumonitis due to solids and liquids
16 Septicemia
# Total number of deaths from all causes, summed over the individual states
# for the years 1999-2015.
# Rows with State == "United States" are excluded (presumably national
# aggregates that would otherwise be counted on top of the state rows).
# Year is referenced as a bare column so the condition is evaluated on the rows
# flowing through the pipe, not on the full external Death_df object.
Death_df %>%
  filter(
    Year >= 1999, Year <= 2015,
    Cause.Name == "All Causes",
    State != "United States"
  ) %>%
  # na.rm = TRUE skips rows whose death count is missing.
  summarize(Total_Deaths = sum(Deaths, na.rm = TRUE))
# A tibble: 1 x 1
Total_Deaths
<int>
1 42170818
# Deaths from all causes per year (1999-2015), summed over the individual
# states; the "United States" rows are excluded from the sum.
# Year is referenced as a bare column so the condition is evaluated on the rows
# flowing through the pipe, not on the full external Death_df object.
Death_df %>%
  filter(
    Year >= 1999, Year <= 2015,
    Cause.Name == "All Causes",
    State != "United States"
  ) %>%
  group_by(Year) %>%
  # Left unnamed on purpose: the result column is called `sum(Deaths)`.
  summarize(sum(Deaths))
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 17 x 2
Year `sum(Deaths)`
<int> <int>
1 1999 2391399
2 2000 2403351
3 2001 2416425
4 2002 2443387
5 2003 2448288
6 2004 2397615
7 2005 2448017
8 2006 2426264
9 2007 2423712
10 2008 2471984
11 2009 2437163
12 2010 2468435
13 2011 2515458
14 2012 2543279
15 2013 2596993
16 2014 2626418
17 2015 2712630
# Ten states with the highest all-cause death totals over every year in the
# data. The "United States" rows are left out so only individual states rank.
# top_n() ranks by the last column (the summed deaths) and does not sort the
# rows it returns.
Death_df %>%
  filter(Cause.Name == "All Causes" & State != "United States") %>%
  group_by(State) %>%
  summarise(sum(Deaths)) %>%
  top_n(n = 10)
`summarise()` ungrouping output (override with `.groups` argument)
Selecting by sum(Deaths)
# A tibble: 10 x 2
State `sum(Deaths)`
<chr> <int>
1 California 4044823
2 Florida 2933810
3 Illinois 1765173
4 Michigan 1503723
5 New Jersey 1218421
6 New York 2587220
7 North Carolina 1308548
8 Ohio 1867737
9 Pennsylvania 2180843
10 Texas 2777261
# Ten specific causes of death with the highest totals, taken from the
# "United States" rows and excluding the "All Causes" aggregate.
# top_n() ranks by the last column (the summed deaths) and does not sort the
# rows it returns.
Death_df %>%
  filter(Cause.Name != "All Causes" & State == "United States") %>%
  group_by(Cause.Name) %>%
  summarise(sum(Deaths)) %>%
  top_n(10)
`summarise()` ungrouping output (override with `.groups` argument)
Selecting by sum(Deaths)
# A tibble: 10 x 2
Cause.Name `sum(Deaths)`
<chr> <int>
1 Alzheimer's disease 1257309
2 Cancer 9646498
3 CLRD 2280130
4 Diabetes 1236321
5 Diseases of Heart 10939923
6 Influenza and pneumonia 987432
7 Kidney Disease 757934
8 Stroke 2437998
9 Suicide 604878
10 Unintentional Injuries 2016510