The objective of the analysis is to study the storm data collected by the National Oceanic and Atmospheric Administration (NOAA) to answer the following two questions. First, which type of events are the most harmful with respect to population health? Second, which types of events have the greatest economic consequences? To answer the first question I will concentrate my analysis in the total number of fatalities and injuries per event. The second question I will analyze the property damage and crop damage across the United States. I will explore the storm data database from NOAA to conduct my analysis. The full data documentation is found here - Storm Data Documentation
First we download and unzip the storm dataset from the NOAA website. I only imported the columns necessary to complete my analysis. This is will limit the object memory allocation and load the data faster.
library(R.utils) # need this to unzip the bzip2 file
warn.conflicts = FALSE
location=getwd() #use the current working directory
url<-"https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
filename<-"repdataStormData.csv"
data_location<-file.path(location,"noaadata") # creates a repository to keep the raw data
data_file<-file.path(data_location,filename) #concat path and file name
#if the directory data does not existing download fresh data
if(!file.exists(data_file)){
rawdata<-tempfile()
rawdata<-paste(rawdata,filename,".bz2",sep="")
download.file(url,rawdata) #download data from the URL
bunzip2(rawdata,data_file)
}
library(data.table)
data<-fread(data_file,sep=",",stringsAsFactors = FALSE,select = c("BGN_DATE","TIME_ZONE","EVTYPE","FATALITIES","INJURIES","STATE","COUNTYNAME","PROPDMG","PROPDMGEXP","CROPDMG","CROPDMGEXP","COUNTY","LATITUDE","LONGITUDE"),nrows=-1)
Some necessary libraries we will be using for this analysis
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(grid)
library(gtable)
library(dplyr)
library(colorplaner)
## Warning: package 'colorplaner' was built under R version 3.4.4
library(fiftystater)
## Warning: package 'fiftystater' was built under R version 3.4.4
library(stringdist)
## Warning: package 'stringdist' was built under R version 3.4.4
data$EVTYPE=toupper(trimws(data$EVTYPE))
data$FATALITIES=as.numeric(data$FATALITIES)
data$INJURIES=as.numeric(data$INJURIES)
data$BGN_DATE=as.Date(data$BGN_DATE,format="%m/%d/%Y") #need to convert to date
### formating the crop damage and property damage to real dollar amount
data<-mutate(data,PROPDMGEXP=ifelse(PROPDMGEXP=='B',1E9,
ifelse(PROPDMGEXP=='M',1E6,
ifelse(PROPDMGEXP=='K',1E3,0))))
data<-mutate(data,CROPDMGEXP=ifelse(CROPDMGEXP=='B',1E9,
ifelse(CROPDMGEXP=='M',1E6,
ifelse(CROPDMGEXP=='K',1E3,0))))
data<-mutate(data,PROPDMG_CASH=PROPDMGEXP*PROPDMG,CROPDMG_CASH=CROPDMGEXP*CROPDMG)
During the analysis of the raw data I realized the EVTYPE column did not followed the naming convention in the National Weather Service Storm Data Documentation . I found lots variations especially with misspellings and abbreviations. First, I used regex to clean some of most obvious problems then I used clustering to try and group as many similar events together. I used the stringdist package to cluster the EVTYPEs. After many trial and errors I figured that cutting the tree at 0.3 gave me the best results.
#first some regex to fix Thunderstorms Wind
data$EVTYPE<-gsub('^TSTM WIND','THUDERSTORM WIND',data$EVTYPE)
data$EVTYPE<-gsub('^THUDERSTORM WINDs','THUDERSTORM WIND',data$EVTYPE)
#Excessive Heat is also an easy one to determined
data$EVTYPE<-gsub('^HEAT','EXCESSIVE HEAT',data$EVTYPE)
data$EVTYPE<-gsub('^EXCESSIVE','EXCESSIVE HEAT',data$EVTYPE)
set.seed(1973)
EVTYPES <- unique(data$EVTYPE) #get the unique values only
cluster_event<-stringdistmatrix(EVTYPES,EVTYPES,method = "jw")
hc <- hclust(as.dist(cluster_event))
event_cluster_df <- data.frame(EVTYPES,cutree(hc,h=0.3))
colnames(event_cluster_df)<-c("Event","Cluster")
rownames(event_cluster_df)<-event_cluster_df$Event
event_cluster_df<-event_cluster_df[order(event_cluster_df$Cluster),]
data<-merge(data,event_cluster_df,by.x="EVTYPE",by.y="Event",all.x=TRUE,all.y=TRUE)
test_cluster<-group_by(data,EVTYPE,Cluster)
test_cluster<-summarize(test_cluster,count=n())
test_cluster<-ungroup(test_cluster)
Quick demostration that the clustering works
subset(test_cluster,Cluster==1)
## # A tibble: 10 x 3
## EVTYPE Cluster count
## <chr> <int> <int>
## 1 TORNADO 1 60652
## 2 TORNADO DEBRIS 1 1
## 3 TORNADO F0 1 19
## 4 TORNADO F1 1 4
## 5 TORNADO F2 1 3
## 6 TORNADO F3 1 2
## 7 TORNADOES 1 2
## 8 TORNADOES, TSTM WIND, HAIL 1 1
## 9 TORNADOS 1 1
## 10 TORNDAO 1 1
subset(test_cluster,Cluster==44)
## # A tibble: 3 x 3
## EVTYPE Cluster count
## <chr> <int> <int>
## 1 HIGH TEMPERATURE RECORD 44 3
## 2 LOW TEMPERATURE RECORD 44 1
## 3 TEMPERATURE RECORD 44 43
After using clustering to group the events now we summarize the data by cluster.
data_by_event<-group_by(data,Cluster)
data_by_event<-summarize(data_by_event,
count=n(),
EVTYPE=first(EVTYPE),
FATALITIES=sum(FATALITIES,na.rm=TRUE),
INJURIES=sum(INJURIES,na.rm=TRUE),
PROPDMG=sum(PROPDMG_CASH,na.rm=TRUE),
CROPDMG=sum(CROPDMG_CASH,na.rm=TRUE)
)
data_by_event<-ungroup(data_by_event)
data_by_event <-arrange(data_by_event,desc(FATALITIES))
data_by_event <-arrange(data_by_event,desc(INJURIES))
I also thought it will be necessary to show the impact of the weather events across the United States. I am grouping the total fatalities, injuries, property damage and crop damage by state columns (FATALITIES,INJURIES,PROPDMG_CASH,CROPDMG_CASH).
data_fatalities<-data_by_event[order(-data_by_event$FATALITIES),]
data_injuries<-data_by_event[order(-data_by_event$INJURIES),]
data_by_state<-group_by(data,STATE)
data_by_state<-summarize(data_by_state,
count=n(),
EVTYPE=first(EVTYPE),
count=n(),
FATALITIES=sum(FATALITIES,na.rm=TRUE),
INJURIES=sum(INJURIES,na.rm=TRUE),
PROPDMG=sum(PROPDMG_CASH,na.rm=TRUE),
CROPDMG=sum(CROPDMG_CASH,na.rm=TRUE))
data_by_state<-ungroup(data_by_state)
And, finally I am adding the states names to each grouping so we can use ggplot2 with geom_map to plot a map of the United States.
getStateName<-function(x){
if(length(state.name[state.abb==x])>0)
return(tolower(state.name[state.abb==x]))
else
return("")
}
data_by_state<-mutate(data_by_state,state_name=sapply(data_by_state$STATE,getStateName))
This table shows the top ten events based on total of fatalities. According my analysis and the empirical data in the plot tornados have the biggest impact on population health.
data_by_event <-arrange(data_by_event,desc(FATALITIES))
data_by_event <-arrange(data_by_event,desc(INJURIES))
head(data_by_event[c("EVTYPE","FATALITIES","INJURIES")],10)
## # A tibble: 10 x 3
## EVTYPE FATALITIES INJURIES
## <chr> <dbl> <dbl>
## 1 TORNADO 5658 91364
## 2 THUDERSTORM WIND 711 9507
## 3 EXCESSIVE HEAT 3023 9042
## 4 FLOOD 476 6791
## 5 LIGHTING 817 5230
## 6 ICE STORM 89.0 1977
## 7 FLASH FLOOD 1018 1785
## 8 WILD FIRES 90.0 1606
## 9 HIGH WINDS 286 1449
## 10 WINTER MIX 218 1430
This plot shows the top events base on the number of injuries and fatalities
data_injuries<-transform(data_injuries,EVTYPE=reorder(EVTYPE,INJURIES))
data_injuries<-arrange(data_injuries,desc(INJURIES))
plot1<-ggplot(data_injuries[1:10,],aes(EVTYPE,fill=INJURIES))+geom_bar(aes(y=INJURIES),stat="identity")+
ggtitle("Top 10 Events Injuries")+ylab("Total Injuries")+xlab("Type of Event")+
scale_fill_gradient(low="yellow",high="red")+coord_flip()
data_fatalities<-transform(data_fatalities,EVTYPE=reorder(EVTYPE,FATALITIES))
data_fatalities<-arrange(data_fatalities,desc(INJURIES))
plot2<-ggplot(data_fatalities[1:10,],aes(EVTYPE,fill=FATALITIES))+geom_bar(aes(y=FATALITIES),stat="identity") +
ggtitle("Top 10 Events Fatalities")+ylab("Total Fatalities")+xlab("Type of Event")+
scale_fill_gradient(low="yellow",high="red")+coord_flip()
#display the two images in on page
g1<-ggplotGrob(plot1)
g2<-ggplotGrob(plot2)
g<-rbind(g1,g2,size="first")
grid.newpage()
grid.draw(g)
This table shows the top ten events with the highest economic cost
data_by_event <-arrange(data_by_event,desc(PROPDMG))
data_by_event <-arrange(data_by_event,desc(CROPDMG))
head(data_by_event[c("EVTYPE","PROPDMG","CROPDMG")],10)
## # A tibble: 10 x 3
## EVTYPE PROPDMG CROPDMG
## <chr> <dbl> <dbl>
## 1 DROUGHT 1046106000 13972566000
## 2 FLOOD 144771964800 5783673950
## 3 BEACH FLOOD 5235239500 5057484000
## 4 ICE STORM 3944927810 5022113500
## 5 HAIL 15727918720 3025627450
## 6 HURRICANE 15330340010 2887420000
## 7 HURRICANE OPAL/HIGH WINDS 69405840000 2617872800
## 8 FLASH FLOOD 16732553610 1437153150
## 9 EXTREME COLD 77190400 1330023000
## 10 THUDERSTORM WIND 9758818130 1225345900
The following plot shows the top ten events based on property damage and crop damage
data_propdmg<-transform(data_by_event,EVTYPE=reorder(EVTYPE,PROPDMG))
data_propdmg<-arrange(data_propdmg,desc(PROPDMG))
plot1<-ggplot(data_propdmg[1:10,],aes(EVTYPE,fill=PROPDMG))+geom_bar(aes(y=PROPDMG),stat="identity") +
ggtitle("Top 10 Events Property Damage")+ylab("Total Cost")+xlab("Type of Event")+
scale_fill_gradient(low="yellow",high="red")+coord_flip()
data_cropdmg<-transform(data_by_event,EVTYPE=reorder(EVTYPE,CROPDMG))
data_cropdmg<-arrange(data_cropdmg,desc(CROPDMG))
plot2<-ggplot(data_cropdmg[1:10,],aes(EVTYPE,fill=CROPDMG))+geom_bar(aes(y=CROPDMG),stat="identity") +
ggtitle("Top 10 Events Crop Damage")+ylab("Total Cost")+xlab("Type of Event")+
scale_fill_gradient(low="yellow",high="red")+coord_flip()
#display the two images in on page
g1<-ggplotGrob(plot1)
g2<-ggplotGrob(plot2)
g<-rbind(g1,g2,size="first")
grid.newpage()
grid.draw(g)
This map of the United States illustrate the impact of each event by state. We can see that Texas and California are greatly impacted by natural disasters.
plot1 <- ggplot(data_by_state, aes(fill = FATALITIES,fill2=INJURIES))
plot1<-plot1 + geom_map(aes(map_id = state_name),map=fifty_states) + expand_limits(x = fifty_states$long, y = fifty_states$lat)+ scale_x_continuous(breaks = NULL) + scale_y_continuous(breaks = NULL) + labs(x = "", y = "") +
fifty_states_inset_boxes() + scale_fill_colorplane()
## Weather events impact on financial damage caused by property and crop damage
cost_by_state<-arrange(data_by_state,desc(CROPDMG))
cost_by_state<-arrange(cost_by_state,desc(PROPDMG))
plot2 <- ggplot(data_by_state, aes(fill = CROPDMG,fill2=PROPDMG))
plot2<-plot2 + geom_map(aes(map_id = state_name),map=fifty_states) +expand_limits(x = fifty_states$long,
y = fifty_states$lat)+ scale_x_continuous(breaks = NULL) + scale_y_continuous(breaks = NULL) + labs(x = "", y = "") +
fifty_states_inset_boxes() +scale_fill_colorplane()
#display the two images in on page
g1<-ggplotGrob(plot1)
g2<-ggplotGrob(plot2)
g<-rbind(g1,g2,size="first")
grid.newpage()
grid.draw(g)