Data can be obtained from this link
The data was downloaded beforehand and is read from the local hard disk.
We start by loading the needed libraries and reading the data.
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
# Read the NCHS data from the CSV file in the working directory.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 reads text columns
# as character by default; the factor-based output shown below (str(),
# summary(), unique() with levels) relies on factor columns.
NHCS_data <- read.csv("NCHS.csv", stringsAsFactors = TRUE)
# Number of rows and columns in the loaded data
dim(NHCS_data)
## [1] 15028 6
The NCHS data file has 6 columns and 15028 rows.
Next, we check how our data is structured.
# Show each column's type together with its first few values
str(NHCS_data)
## 'data.frame': 15028 obs. of 6 variables:
## $ Year : int 1999 1999 1999 1999 1999 1999 1999 1999 1999 1999 ...
## $ X113.Cause.Name : Factor w/ 17 levels "Accidents (unintentional injuries) (V01-X59,Y85-Y86)",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Cause.Name : Factor w/ 17 levels "All Causes","Alzheimer's disease",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ State : Factor w/ 52 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Deaths : int 2313 294 2214 1287 9198 1519 1034 267 161 5961 ...
## $ Age.adjusted.Death.Rate: num 52.2 55.9 44.8 47.6 28.7 39 29.3 35.3 28.4 35.7 ...
# Per-column summary: quantiles for numeric columns, level counts for factors,
# and the number of NA values where present
summary(NHCS_data)
## Year
## Min. :1999
## 1st Qu.:2003
## Median :2007
## Mean :2007
## 3rd Qu.:2011
## Max. :2015
##
## X113.Cause.Name
## Accidents (unintentional injuries) (V01-X59,Y85-Y86): 884
## All Causes : 884
## Alzheimer's disease (G30) : 884
## Assault (homicide) (*U01-*U02,X85-Y09,Y87.1) : 884
## Cerebrovascular diseases (I60-I69) : 884
## Chronic liver disease and cirrhosis (K70,K73-K74) : 884
## (Other) :9724
## Cause.Name State
## All Causes : 884 Alabama : 289
## Alzheimer's disease : 884 Alaska : 289
## Cancer : 884 Arizona : 289
## Chronic liver disease and cirrhosis: 884 Arkansas : 289
## CLRD : 884 California: 289
## Diabetes : 884 Colorado : 289
## (Other) :9724 (Other) :13294
## Deaths Age.adjusted.Death.Rate
## Min. : 10 Min. : 1.30
## 1st Qu.: 294 1st Qu.: 8.30
## Median : 838 Median : 18.90
## Mean : 10233 Mean : 86.53
## 3rd Qu.: 2737 3rd Qu.: 46.30
## Max. :2712630 Max. :1087.30
## NA's :15 NA's :111
After checking the structure of our data, we will use the NCHS dataset to answer the few questions we have in hand.
# List the distinct causes of death present in the data
unique(NHCS_data[["Cause.Name"]])
## [1] Unintentional Injuries
## [2] All Causes
## [3] Alzheimer's disease
## [4] Homicide
## [5] Stroke
## [6] Chronic liver disease and cirrhosis
## [7] CLRD
## [8] Diabetes
## [9] Diseases of Heart
## [10] Essential hypertension and hypertensive renal disease
## [11] Influenza and pneumonia
## [12] Cancer
## [13] Suicide
## [14] Kidney Disease
## [15] Parkinson's disease
## [16] Pneumonitis due to solids and liquids
## [17] Septicemia
## 17 Levels: All Causes Alzheimer's disease ... Unintentional Injuries
# The cause list contains an aggregate entry named 'All Causes', so total
# deaths are counted in two ways:
#   1. from the 'All Causes' rows only,
#   2. from every row except 'All Causes'.
NHCS_data %>%
  filter(Cause.Name == "All Causes") %>%
  with(sum(Deaths, na.rm = TRUE))
## [1] 84341636
NHCS_data %>%
  filter(Cause.Name != "All Causes") %>%
  with(sum(Deaths, na.rm = TRUE))
## [1] 69280514
# Check whether the State column contains all states
unique(NHCS_data[["State"]])
## [1] Alabama Alaska Arizona
## [4] Arkansas California Colorado
## [7] Connecticut Delaware District of Columbia
## [10] Florida Georgia Hawaii
## [13] Idaho Illinois Indiana
## [16] Iowa Kansas Kentucky
## [19] Louisiana Maine Maryland
## [22] Massachusetts Michigan Minnesota
## [25] Mississippi Missouri Montana
## [28] Nebraska Nevada New Hampshire
## [31] New Jersey New Mexico New York
## [34] North Carolina North Dakota Ohio
## [37] Oklahoma Oregon Pennsylvania
## [40] Rhode Island South Carolina South Dakota
## [43] Tennessee Texas United States
## [46] Utah Vermont Virginia
## [49] Washington West Virginia Wisconsin
## [52] Wyoming
## 52 Levels: Alabama Alaska Arizona Arkansas California ... Wyoming
# 'United States' is listed among the states, so deaths are totalled for all
# four combinations:
#   - 'All Causes' rows for 'United States',
#   - 'All Causes' rows for the individual states,
#   - individual-cause rows for 'United States',
#   - individual-cause rows for the individual states.
death_count <- with(NHCS_data, {
  # Row masks are built once and combined below; they stay local to with()
  all_causes <- Cause.Name == "All Causes"
  national <- State == "United States"
  c(
    "All Causes & United States" = sum(Deaths[all_causes & national], na.rm = TRUE),
    "All Causes & States" = sum(Deaths[all_causes & !national], na.rm = TRUE),
    "Causes & United States" = sum(Deaths[!all_causes & national], na.rm = TRUE),
    "Causes & States" = sum(Deaths[!all_causes & !national], na.rm = TRUE)
  )
})
death_count
## All Causes & United States All Causes & States
## 42170818 42170818
## Causes & United States Causes & States
## 34640315 34640199
# Sum deaths per year from 1999 to 2015, using only the national
# ('United States') 'All Causes' rows
death_per_year <- filter(NHCS_data, Cause.Name == "All Causes", State == "United States")
# na.rm = TRUE is forwarded to sum(). The former na.action = NULL was dropped:
# the data-frame method of aggregate() has no such argument, so it was only
# being passed on to sum() as an extra (empty) value.
death_per_year <- aggregate(
  death_per_year["Deaths"],
  by = death_per_year["Year"],
  FUN = sum,
  na.rm = TRUE
)
# Display sum of deaths per year from 1999 to 2015 as a line chart
ggplot(death_per_year, aes(Year, Deaths)) +
  geom_line() +
  labs(title = "Sum of Deaths per Year in the USA from 1999 to 2015")
# Sum deaths per state from 1999 to 2015, using the 'All Causes' rows and
# excluding the 'United States' aggregate
death_per_states <- filter(NHCS_data, Cause.Name == "All Causes", State != "United States")
# na.rm = TRUE keeps a single missing Deaths value from turning a state's
# whole total into NA (which would silently push it out of the top 10)
death_per_states <- aggregate(
  death_per_states["Deaths"],
  by = death_per_states["State"],
  FUN = sum,
  na.rm = TRUE
)
# Display top 10 states with highest deaths.
# head() takes at most 10 rows without padding NA rows; reorder() draws the
# bars from highest to lowest instead of alphabetically.
ggplot(
  head(death_per_states[order(-death_per_states$Deaths), ], 10),
  aes(reorder(State, -Deaths), Deaths)
) +
  geom_bar(stat = "identity", fill = "blue", alpha = 0.5) +
  labs(title = "Top 10 Deaths per State in the USA from 1999 to 2015", x = "State") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0))
# Sum deaths per cause from 1999 to 2015, using the national
# ('United States') rows and excluding the 'All Causes' aggregate
death_per_cause <- filter(NHCS_data, Cause.Name != "All Causes", State == "United States")
# na.rm = TRUE is forwarded to sum(). The former na.action = NULL was dropped:
# the data-frame method of aggregate() has no such argument.
death_per_cause <- aggregate(
  death_per_cause["Deaths"],
  by = death_per_cause["Cause.Name"],
  FUN = sum,
  na.rm = TRUE
)
# Keep the 10 causes with the most deaths (head() does not pad NA rows)
death_per_cause <- head(death_per_cause[order(-death_per_cause$Deaths), ], 10)
# Display top 10 causes with highest deaths; reorder() draws the bars from
# highest to lowest instead of alphabetically
ggplot(death_per_cause, aes(reorder(Cause.Name, -Deaths), Deaths)) +
  geom_bar(stat = "identity", fill = "blue", alpha = 0.5) +
  labs(title = "Top 10 Deaths per Cause in the USA from 1999 to 2015", x = "Cause.Name") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0))