New York City Leading Causes of Death from https://data.cityofnewyork.us/Health/New-York-City-Leading-Causes-of-Death/jb7j-dtam down load csv file to local computer for data analysis
set environment
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
library(DT)
options(dplyr.width = Inf)
read database
setwd("C:/Users/tbao/Desktop/CUNY MSDS notes/607/week 6/project 2/New York City Leading cause of Death")
getwd()
## [1] "C:/Users/tbao/Desktop/CUNY MSDS notes/607/week 6/project 2/New York City Leading cause of Death"
df <- read.csv("C:/Users/tbao/Desktop/CUNY MSDS notes/607/week 6/project 2/New York City Leading cause of Death/New_York_City_Leading_Causes_of_Death.csv", header=TRUE, sep=",")
datatable(df)
leading causes of death each year for men and women.
df_death <- df %>% subset(select = c(Sex,Year,Cause.of.Death,Count)) %>%
group_by(Sex,Year) %>%
filter(Count == max(Count)) %>% distinct(Sex,Year,Cause.of.Death,Count)
head(df_death)
## # A tibble: 6 x 4
## # Groups: Sex, Year [6]
## Sex Year Cause.of.Death Count
## <fct> <int> <fct> <int>
## 1 FEMALE 2010 DISEASES OF HEART 5351
## 2 MALE 2010 DISEASES OF HEART 4495
## 3 FEMALE 2011 DISEASES OF HEART 5016
## 4 MALE 2011 DISEASES OF HEART 4220
## 5 FEMALE 2007 DISEASES OF HEART 7050
## 6 MALE 2007 DISEASES OF HEART 5632
Display the leading causes of death each year for each ethnic group.
df_death_eth <- df %>% subset(select = c(Ethnicity,Year,Cause.of.Death,Count)) %>%
group_by(Ethnicity,Year) %>%
filter(Count == max(Count)) %>% distinct(Ethnicity,Year,Cause.of.Death,Count)
head(df_death_eth)
## # A tibble: 6 x 4
## # Groups: Ethnicity, Year [6]
## Ethnicity Year Cause.of.Death Count
## <fct> <int> <fct> <int>
## 1 NON-HISPANIC WHITE 2010 DISEASES OF HEART 5351
## 2 ASIAN & PACIFIC ISLANDER 2011 MALIGNANT NEOPLASMS 550
## 3 HISPANIC 2011 DISEASES OF HEART 1348
## 4 NON-HISPANIC BLACK 2011 DISEASES OF HEART 2243
## 5 NON-HISPANIC WHITE 2011 DISEASES OF HEART 5016
## 6 ASIAN & PACIFIC ISLANDER 2007 MALIGNANT NEOPLASMS 528
Calculate which cause of death has reduced the most and which has increased the most in the years given
df_death_trends <- df %>% subset(select = c(Year,Cause.of.Death,Count)) %>%
group_by(Year, Cause.of.Death, the_min = min(Count), the_max = max(Count), the_diff = (the_max - the_min)) %>% count(Year, Cause.of.Death, max(the_diff))
datatable(df_death_trends)
Calculate which cause of death has remained stable over the years given.
df_death_stable <- df %>% subset(select = c(Year,Cause.of.Death,Count)) %>%
group_by(Year, Cause.of.Death, the_min = min(Count), the_max = max(Count), the_diff = (the_max - the_min)) %>% count(Year, Cause.of.Death, min(the_diff))
datatable(df_death_stable)