Loading the required libraries and the dataset

library("dplyr")
# Read the NCHS CSV from the working directory and keep it as a tibble;
# no intermediate data frame is left behind in the workspace.
Death_df <- "NCHS.csv" %>%
  read.csv() %>%
  tibble::as_tibble()

Question 1: What is the total number of records in the dataset?

# Number of records = first element of the table's dimensions (rows).
dim(Death_df)[1]
[1] 15028
# Show the tibble preview (first rows plus column types).
print(Death_df)
# A tibble: 15,028 x 6
    Year X113.Cause.Name         Cause.Name    State    Deaths Age.adjusted.Dea~
   <int> <chr>                   <chr>         <chr>     <int>             <dbl>
 1  1999 Accidents (unintention~ Unintentiona~ Alabama    2313              52.2
 2  1999 Accidents (unintention~ Unintentiona~ Alaska      294              55.9
 3  1999 Accidents (unintention~ Unintentiona~ Arizona    2214              44.8
 4  1999 Accidents (unintention~ Unintentiona~ Arkansas   1287              47.6
 5  1999 Accidents (unintention~ Unintentiona~ Califor~   9198              28.7
 6  1999 Accidents (unintention~ Unintentiona~ Colorado   1519              39  
 7  1999 Accidents (unintention~ Unintentiona~ Connect~   1034              29.3
 8  1999 Accidents (unintention~ Unintentiona~ Delaware    267              35.3
 9  1999 Accidents (unintention~ Unintentiona~ Distric~    161              28.4
10  1999 Accidents (unintention~ Unintentiona~ Florida    5961              35.7
# ... with 15,018 more rows

Question 2: What were the causes of death in this dataset?

# List each distinct cause name once, leaving out the "All Causes" rows.
# distinct() already returns only the named column, so no select() is needed.
Death_df %>%
  filter(Cause.Name != "All Causes") %>%
  distinct(Cause.Name) %>%
  print()
# A tibble: 16 x 1
   Cause.Name                                           
   <chr>                                                
 1 Unintentional Injuries                               
 2 Alzheimer's disease                                  
 3 Homicide                                             
 4 Stroke                                               
 5 Chronic liver disease and cirrhosis                  
 6 CLRD                                                 
 7 Diabetes                                             
 8 Diseases of Heart                                    
 9 Essential hypertension and hypertensive renal disease
10 Influenza and pneumonia                              
11 Cancer                                               
12 Suicide                                              
13 Kidney Disease                                       
14 Parkinson's disease                                  
15 Pneumonitis due to solids and liquids                
16 Septicemia                                           

Question 3: What was the total number of deaths in the United States from 1999 to 2015?

# Total deaths for 1999-2015, summed over the individual state rows.
# - Only "All Causes" rows are used so deaths are not counted once per cause.
# - The "United States" row is excluded (presumably the national aggregate;
#   keeping it would double the total -- confirm against the data dictionary).
# - The year bounds use the data-masked `Year` column rather than
#   `Death_df$Year`, so the condition always refers to the rows currently in
#   the pipeline, even if a filter or group_by() is added upstream.
# - Missing death counts are skipped via na.rm = TRUE.
Death_df %>%
  filter(
    Year >= 1999, Year <= 2015,
    Cause.Name == "All Causes",
    State != "United States"
  ) %>%
  summarize(Total_Deaths = sum(Deaths, na.rm = TRUE))
# A tibble: 1 x 1
  Total_Deaths
         <int>
1     42170818

Question 4: What was the number of deaths in each year from 1999 to 2015?

# Deaths per year for 1999-2015, summed over the individual state rows.
# - Only "All Causes" rows are used so deaths are not counted once per cause.
# - The "United States" row is excluded (presumably the national aggregate --
#   confirm against the data dictionary).
# - The year bounds use the data-masked `Year` column rather than
#   `Death_df$Year`, so the condition always refers to the rows currently in
#   the pipeline, even if a filter or group_by() is added upstream.
# - The summary column is intentionally left unnamed so the printed header
#   stays `sum(Deaths)`.
Death_df %>%
  filter(
    Year >= 1999, Year <= 2015,
    Cause.Name == "All Causes",
    State != "United States"
  ) %>%
  group_by(Year) %>%
  summarize(sum(Deaths))
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 17 x 2
    Year `sum(Deaths)`
   <int>         <int>
 1  1999       2391399
 2  2000       2403351
 3  2001       2416425
 4  2002       2443387
 5  2003       2448288
 6  2004       2397615
 7  2005       2448017
 8  2006       2426264
 9  2007       2423712
10  2008       2471984
11  2009       2437163
12  2010       2468435
13  2011       2515458
14  2012       2543279
15  2013       2596993
16  2014       2626418
17  2015       2712630

Question 5: Which ten states had the highest number of deaths overall?

# Sum the "All Causes" deaths per state (the "United States" row is left out)
# and keep the ten largest totals.
# NOTE: top_n() selects by the last column and keeps the rows in their
# existing order; it does not rank them from highest to lowest.
Death_df %>%
  filter(State != "United States" & Cause.Name == "All Causes") %>%
  group_by(State) %>%
  summarise(sum(Deaths)) %>%
  top_n(n = 10)
`summarise()` ungrouping output (override with `.groups` argument)
Selecting by sum(Deaths)
# A tibble: 10 x 2
   State          `sum(Deaths)`
   <chr>                  <int>
 1 California           4044823
 2 Florida              2933810
 3 Illinois             1765173
 4 Michigan             1503723
 5 New Jersey           1218421
 6 New York             2587220
 7 North Carolina       1308548
 8 Ohio                 1867737
 9 Pennsylvania         2180843
10 Texas                2777261

Question 6: What were the top causes of death in the U.S. during the period?

# Using only the "United States" rows, sum deaths per individual cause
# (the "All Causes" rows are dropped) and keep the ten largest totals.
# NOTE: top_n() selects by the last column and keeps the rows in their
# existing order; it does not rank them from highest to lowest.
Death_df %>%
  filter(State == "United States" & Cause.Name != "All Causes") %>%
  group_by(Cause.Name) %>%
  summarise(sum(Deaths)) %>%
  top_n(10)
`summarise()` ungrouping output (override with `.groups` argument)
Selecting by sum(Deaths)
# A tibble: 10 x 2
   Cause.Name              `sum(Deaths)`
   <chr>                           <int>
 1 Alzheimer's disease           1257309
 2 Cancer                        9646498
 3 CLRD                          2280130
 4 Diabetes                      1236321
 5 Diseases of Heart            10939923
 6 Influenza and pneumonia        987432
 7 Kidney Disease                 757934
 8 Stroke                        2437998
 9 Suicide                        604878
10 Unintentional Injuries        2016510