##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Question 1: What is the total number of records in the dataset?
# Number of records: the first element of dim() is the row count.
dim(Death_df)[1L]
## [1] 15028
Question 2: What were the causes of death in this dataset?
# List each distinct value of Cause.Name once, leaving out the "All Causes"
# entry. distinct(Cause.Name) keeps only that column, so no select() is needed.
Death_df %>%
  filter(Cause.Name != "All Causes") %>%
  distinct(Cause.Name) %>%
  print()
## # A tibble: 16 x 1
## Cause.Name
## <fct>
## 1 Unintentional Injuries
## 2 Alzheimer's disease
## 3 Homicide
## 4 Stroke
## 5 Chronic liver disease and cirrhosis
## 6 CLRD
## 7 Diabetes
## 8 Diseases of Heart
## 9 Essential hypertension and hypertensive renal disease
## 10 Influenza and pneumonia
## 11 Cancer
## 12 Suicide
## 13 Kidney Disease
## 14 Parkinson's disease
## 15 Pneumonitis due to solids and liquids
## 16 Septicemia
Question 3: What was the total number of deaths in the United States from 1999 to 2015?
# Total deaths for 1999-2015, summed over the per-state "All Causes" rows.
# Rows with State == "United States" are excluded; presumably they hold the
# nationwide totals and would be double-counted (confirm against the data).
Death_df %>%
  filter(
    # Refer to Year through the data mask, not Death_df$Year: the `$` form
    # reads the full original column instead of the rows flowing through the
    # pipe, and breaks as soon as an earlier step filters or groups the data.
    Year >= 1999, Year <= 2015,
    Cause.Name == "All Causes",
    State != "United States"
  ) %>%
  # na.rm = TRUE skips missing counts, replacing the separate is.na() filter.
  summarize(Total_Deaths = sum(Deaths, na.rm = TRUE))
## # A tibble: 1 x 1
## Total_Deaths
## <int>
## 1 42170818
Question 4: What is the number of deaths in each year from 1999 to 2015?
# Deaths per year for 1999-2015, summed over the per-state "All Causes" rows.
# Rows with State == "United States" are excluded; presumably they hold the
# nationwide totals and would be double-counted (confirm against the data).
Death_df %>%
  filter(
    # Refer to Year through the data mask, not Death_df$Year: the `$` form
    # reads the full original column instead of the rows flowing through the
    # pipe, and breaks as soon as an earlier step filters or groups the data.
    Year >= 1999, Year <= 2015,
    Cause.Name == "All Causes",
    State != "United States",
    # Drop missing counts so a single NA cannot turn a whole year's sum to NA.
    !is.na(Deaths)
  ) %>%
  group_by(Year) %>%
  summarize(sum(Deaths))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 17 x 2
## Year `sum(Deaths)`
## <int> <int>
## 1 1999 2391399
## 2 2000 2403351
## 3 2001 2416425
## 4 2002 2443387
## 5 2003 2448288
## 6 2004 2397615
## 7 2005 2448017
## 8 2006 2426264
## 9 2007 2423712
## 10 2008 2471984
## 11 2009 2437163
## 12 2010 2468435
## 13 2011 2515458
## 14 2012 2543279
## 15 2013 2596993
## 16 2014 2626418
## 17 2015 2712630
Question 5: Which ten states had the highest number of deaths overall?
# Ten states with the largest "All Causes" death totals across all years in
# the data. Rows with State == "United States" are excluded so that entry
# cannot appear among the states.
Death_df %>%
  filter(Cause.Name == "All Causes", State != "United States") %>%
  group_by(State) %>%
  # Name the total so the ranking step can refer to it explicitly.
  summarize(Total_Deaths = sum(Deaths, na.rm = TRUE)) %>%
  # slice_max() supersedes top_n(): the ranking column is stated rather than
  # guessed from the last column, and rows come back ordered largest first.
  slice_max(Total_Deaths, n = 10)
## `summarise()` ungrouping output (override with `.groups` argument)
## Selecting by sum(Deaths)
## # A tibble: 10 x 2
## State `sum(Deaths)`
## <fct> <int>
## 1 California 4044823
## 2 Florida 2933810
## 3 Illinois 1765173
## 4 Michigan 1503723
## 5 New Jersey 1218421
## 6 New York 2587220
## 7 North Carolina 1308548
## 8 Ohio 1867737
## 9 Pennsylvania 2180843
## 10 Texas 2777261
Question 6: What were the top causes of death in the U.S. during the period?
# Ten causes with the largest death totals, using only the rows where
# State == "United States" and skipping the "All Causes" entry.
Death_df %>%
  filter(Cause.Name != "All Causes", State == "United States") %>%
  group_by(Cause.Name) %>%
  # Name the total so the ranking step can refer to it explicitly.
  summarize(Total_Deaths = sum(Deaths, na.rm = TRUE)) %>%
  # slice_max() supersedes top_n(): the ranking column is stated rather than
  # guessed from the last column, and rows come back ordered largest first.
  slice_max(Total_Deaths, n = 10)
## `summarise()` ungrouping output (override with `.groups` argument)
## Selecting by sum(Deaths)
## # A tibble: 10 x 2
## Cause.Name `sum(Deaths)`
## <fct> <int>
## 1 Alzheimer's disease 1257309
## 2 Cancer 9646498
## 3 CLRD 2280130
## 4 Diabetes 1236321
## 5 Diseases of Heart 10939923
## 6 Influenza and pneumonia 987432
## 7 Kidney Disease 757934
## 8 Stroke 2437998
## 9 Suicide 604878
## 10 Unintentional Injuries 2016510