SYNOPSIS

The objective of the analysis is to study the storm data collected by the National Oceanic and Atmospheric Administration (NOAA) to answer the following two questions. First, which type of events are the most harmful with respect to population health? Second, which types of events have the greatest economic consequences? To answer the first question I will concentrate my analysis in the total number of fatalities and injuries per event. The second question I will analyze the property damage and crop damage across the United States. I will explore the storm data database from NOAA to conduct my analysis. The full data documentation is found here - Storm Data Documentation

DATA PROCESSING

First we download and unzip the storm dataset from the NOAA website. I only imported the columns necessary to complete my analysis. This is will limit the object memory allocation and load the data faster.

library(R.utils) # need this to unzip the bzip2 file

warn.conflicts = FALSE

location=getwd() #use the current working directory

url<-"https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
filename<-"repdataStormData.csv"

data_location<-file.path(location,"noaadata") # creates a repository to keep the raw data

data_file<-file.path(data_location,filename) #concat path and file name

#if the directory data does not existing download fresh data
if(!file.exists(data_file)){
  rawdata<-tempfile()
  rawdata<-paste(rawdata,filename,".bz2",sep="")
  download.file(url,rawdata) #download data from the URL
  bunzip2(rawdata,data_file)
}

library(data.table)
data<-fread(data_file,sep=",",stringsAsFactors = FALSE,select = c("BGN_DATE","TIME_ZONE","EVTYPE","FATALITIES","INJURIES","STATE","COUNTYNAME","PROPDMG","PROPDMGEXP","CROPDMG","CROPDMGEXP","COUNTY","LATITUDE","LONGITUDE"),nrows=-1)

Some necessary libraries we will be using for this analysis

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(grid)
library(gtable)
library(dplyr)
library(colorplaner)
## Warning: package 'colorplaner' was built under R version 3.4.4
library(fiftystater)
## Warning: package 'fiftystater' was built under R version 3.4.4
library(stringdist)
## Warning: package 'stringdist' was built under R version 3.4.4

Performing some transformation to the original dataset

  • Trim all the event types (EVTYPE) and upper cased the names
  • Convert the FATALITIES and INJURIES columns to numeric
  • Use the as.Date function to convert the BGN_DATE to date
  • Use the columns PROPDMGEXP and CROPDMGEXP to create the correct multiplier
  • Create two new columns PROPDMG_CASH and CROPDMG_CASH to use for one the studies
  • Add the new columns to the dataset
data$EVTYPE=toupper(trimws(data$EVTYPE))
data$FATALITIES=as.numeric(data$FATALITIES)
data$INJURIES=as.numeric(data$INJURIES)
data$BGN_DATE=as.Date(data$BGN_DATE,format="%m/%d/%Y")  #need to convert to date

### formating the crop damage and property damage to real dollar amount
data<-mutate(data,PROPDMGEXP=ifelse(PROPDMGEXP=='B',1E9,
                         ifelse(PROPDMGEXP=='M',1E6,
                                ifelse(PROPDMGEXP=='K',1E3,0))))

data<-mutate(data,CROPDMGEXP=ifelse(CROPDMGEXP=='B',1E9,
                                     ifelse(CROPDMGEXP=='M',1E6,
                                            ifelse(CROPDMGEXP=='K',1E3,0))))

data<-mutate(data,PROPDMG_CASH=PROPDMGEXP*PROPDMG,CROPDMG_CASH=CROPDMGEXP*CROPDMG)

Using clustering to group the weather events

During the analysis of the raw data I realized the EVTYPE column did not followed the naming convention in the National Weather Service Storm Data Documentation . I found lots variations especially with misspellings and abbreviations. First, I used regex to clean some of most obvious problems then I used clustering to try and group as many similar events together. I used the stringdist package to cluster the EVTYPEs. After many trial and errors I figured that cutting the tree at 0.3 gave me the best results.

#first some regex to fix Thunderstorms Wind
data$EVTYPE<-gsub('^TSTM WIND','THUDERSTORM WIND',data$EVTYPE)
data$EVTYPE<-gsub('^THUDERSTORM WINDs','THUDERSTORM WIND',data$EVTYPE)

#Excessive Heat is also an easy one to determined 
data$EVTYPE<-gsub('^HEAT','EXCESSIVE HEAT',data$EVTYPE)
data$EVTYPE<-gsub('^EXCESSIVE','EXCESSIVE HEAT',data$EVTYPE)

set.seed(1973)

EVTYPES <- unique(data$EVTYPE) #get the unique values only

cluster_event<-stringdistmatrix(EVTYPES,EVTYPES,method = "jw")

hc <- hclust(as.dist(cluster_event))
event_cluster_df <- data.frame(EVTYPES,cutree(hc,h=0.3))

colnames(event_cluster_df)<-c("Event","Cluster")

rownames(event_cluster_df)<-event_cluster_df$Event
event_cluster_df<-event_cluster_df[order(event_cluster_df$Cluster),]
data<-merge(data,event_cluster_df,by.x="EVTYPE",by.y="Event",all.x=TRUE,all.y=TRUE)

test_cluster<-group_by(data,EVTYPE,Cluster)
test_cluster<-summarize(test_cluster,count=n())
test_cluster<-ungroup(test_cluster)

Quick demostration that the clustering works

subset(test_cluster,Cluster==1)
## # A tibble: 10 x 3
##    EVTYPE                     Cluster count
##    <chr>                        <int> <int>
##  1 TORNADO                          1 60652
##  2 TORNADO DEBRIS                   1     1
##  3 TORNADO F0                       1    19
##  4 TORNADO F1                       1     4
##  5 TORNADO F2                       1     3
##  6 TORNADO F3                       1     2
##  7 TORNADOES                        1     2
##  8 TORNADOES, TSTM WIND, HAIL       1     1
##  9 TORNADOS                         1     1
## 10 TORNDAO                          1     1
subset(test_cluster,Cluster==44)
## # A tibble: 3 x 3
##   EVTYPE                  Cluster count
##   <chr>                     <int> <int>
## 1 HIGH TEMPERATURE RECORD      44     3
## 2 LOW TEMPERATURE RECORD       44     1
## 3 TEMPERATURE RECORD           44    43

After using clustering to group the events now we summarize the data by cluster.

data_by_event<-group_by(data,Cluster)

data_by_event<-summarize(data_by_event,
                     count=n(),
                     EVTYPE=first(EVTYPE),
                     FATALITIES=sum(FATALITIES,na.rm=TRUE),
                     INJURIES=sum(INJURIES,na.rm=TRUE),
                     PROPDMG=sum(PROPDMG_CASH,na.rm=TRUE),
                     CROPDMG=sum(CROPDMG_CASH,na.rm=TRUE)
                     )

data_by_event<-ungroup(data_by_event)
data_by_event <-arrange(data_by_event,desc(FATALITIES))
data_by_event <-arrange(data_by_event,desc(INJURIES))

I also thought it will be necessary to show the impact of the weather events across the United States. I am grouping the total fatalities, injuries, property damage and crop damage by state columns (FATALITIES,INJURIES,PROPDMG_CASH,CROPDMG_CASH).

data_fatalities<-data_by_event[order(-data_by_event$FATALITIES),]
data_injuries<-data_by_event[order(-data_by_event$INJURIES),]

data_by_state<-group_by(data,STATE)

data_by_state<-summarize(data_by_state,
                         count=n(),
                         EVTYPE=first(EVTYPE),
                         count=n(),
                         FATALITIES=sum(FATALITIES,na.rm=TRUE),
                         INJURIES=sum(INJURIES,na.rm=TRUE),
                         PROPDMG=sum(PROPDMG_CASH,na.rm=TRUE),
                         CROPDMG=sum(CROPDMG_CASH,na.rm=TRUE))

data_by_state<-ungroup(data_by_state)

And, finally I am adding the states names to each grouping so we can use ggplot2 with geom_map to plot a map of the United States.

getStateName<-function(x){
  if(length(state.name[state.abb==x])>0) 
    return(tolower(state.name[state.abb==x]))
  else
    return("")
}

data_by_state<-mutate(data_by_state,state_name=sapply(data_by_state$STATE,getStateName))

RESULTS

Weather events most harmful in respect to population health

This table shows the top ten events based on total of fatalities. According my analysis and the empirical data in the plot tornados have the biggest impact on population health.

data_by_event <-arrange(data_by_event,desc(FATALITIES))
data_by_event <-arrange(data_by_event,desc(INJURIES))
head(data_by_event[c("EVTYPE","FATALITIES","INJURIES")],10)
## # A tibble: 10 x 3
##    EVTYPE           FATALITIES INJURIES
##    <chr>                 <dbl>    <dbl>
##  1 TORNADO              5658      91364
##  2 THUDERSTORM WIND      711       9507
##  3 EXCESSIVE HEAT       3023       9042
##  4 FLOOD                 476       6791
##  5 LIGHTING              817       5230
##  6 ICE STORM              89.0     1977
##  7 FLASH FLOOD          1018       1785
##  8 WILD FIRES             90.0     1606
##  9 HIGH  WINDS           286       1449
## 10 WINTER MIX            218       1430

This plot shows the top events base on the number of injuries and fatalities

data_injuries<-transform(data_injuries,EVTYPE=reorder(EVTYPE,INJURIES))
data_injuries<-arrange(data_injuries,desc(INJURIES))

plot1<-ggplot(data_injuries[1:10,],aes(EVTYPE,fill=INJURIES))+geom_bar(aes(y=INJURIES),stat="identity")+
  ggtitle("Top 10 Events Injuries")+ylab("Total Injuries")+xlab("Type of Event")+
  scale_fill_gradient(low="yellow",high="red")+coord_flip()

data_fatalities<-transform(data_fatalities,EVTYPE=reorder(EVTYPE,FATALITIES))
data_fatalities<-arrange(data_fatalities,desc(INJURIES))

plot2<-ggplot(data_fatalities[1:10,],aes(EVTYPE,fill=FATALITIES))+geom_bar(aes(y=FATALITIES),stat="identity") +
  ggtitle("Top 10 Events Fatalities")+ylab("Total Fatalities")+xlab("Type of Event")+
  scale_fill_gradient(low="yellow",high="red")+coord_flip()

#display the two images in on page
g1<-ggplotGrob(plot1)
g2<-ggplotGrob(plot2)
g<-rbind(g1,g2,size="first")
grid.newpage()
grid.draw(g)

Weather events with the greatest economic consequences

This table shows the top ten events with the highest economic cost

data_by_event <-arrange(data_by_event,desc(PROPDMG))
data_by_event <-arrange(data_by_event,desc(CROPDMG))
head(data_by_event[c("EVTYPE","PROPDMG","CROPDMG")],10)
## # A tibble: 10 x 3
##    EVTYPE                         PROPDMG     CROPDMG
##    <chr>                            <dbl>       <dbl>
##  1 DROUGHT                     1046106000 13972566000
##  2 FLOOD                     144771964800  5783673950
##  3 BEACH FLOOD                 5235239500  5057484000
##  4 ICE STORM                   3944927810  5022113500
##  5 HAIL                       15727918720  3025627450
##  6 HURRICANE                  15330340010  2887420000
##  7 HURRICANE OPAL/HIGH WINDS  69405840000  2617872800
##  8 FLASH FLOOD                16732553610  1437153150
##  9 EXTREME COLD                  77190400  1330023000
## 10 THUDERSTORM WIND            9758818130  1225345900

The following plot shows the top ten events based on property damage and crop damage

data_propdmg<-transform(data_by_event,EVTYPE=reorder(EVTYPE,PROPDMG))
data_propdmg<-arrange(data_propdmg,desc(PROPDMG))
plot1<-ggplot(data_propdmg[1:10,],aes(EVTYPE,fill=PROPDMG))+geom_bar(aes(y=PROPDMG),stat="identity") +
  ggtitle("Top 10 Events Property Damage")+ylab("Total Cost")+xlab("Type of Event")+
  scale_fill_gradient(low="yellow",high="red")+coord_flip()

data_cropdmg<-transform(data_by_event,EVTYPE=reorder(EVTYPE,CROPDMG))
data_cropdmg<-arrange(data_cropdmg,desc(CROPDMG))
plot2<-ggplot(data_cropdmg[1:10,],aes(EVTYPE,fill=CROPDMG))+geom_bar(aes(y=CROPDMG),stat="identity") +
  ggtitle("Top 10 Events Crop Damage")+ylab("Total Cost")+xlab("Type of Event")+
  scale_fill_gradient(low="yellow",high="red")+coord_flip()

#display the two images in on page
g1<-ggplotGrob(plot1)
g2<-ggplotGrob(plot2)
g<-rbind(g1,g2,size="first")
grid.newpage()
grid.draw(g)

This map of the United States illustrate the impact of each event by state. We can see that Texas and California are greatly impacted by natural disasters.

plot1 <- ggplot(data_by_state, aes(fill = FATALITIES,fill2=INJURIES))
plot1<-plot1 + geom_map(aes(map_id = state_name),map=fifty_states) +  expand_limits(x = fifty_states$long, y = fifty_states$lat)+ scale_x_continuous(breaks = NULL) + scale_y_continuous(breaks = NULL) + labs(x = "", y = "") +
  fifty_states_inset_boxes() +  scale_fill_colorplane()

## Weather events impact on financial damage caused by property and crop damage
cost_by_state<-arrange(data_by_state,desc(CROPDMG))
cost_by_state<-arrange(cost_by_state,desc(PROPDMG))

plot2 <- ggplot(data_by_state, aes(fill = CROPDMG,fill2=PROPDMG))
plot2<-plot2 + geom_map(aes(map_id = state_name),map=fifty_states) +expand_limits(x = fifty_states$long, 
y = fifty_states$lat)+  scale_x_continuous(breaks = NULL) +  scale_y_continuous(breaks = NULL) + labs(x = "", y = "") +
  fifty_states_inset_boxes() +scale_fill_colorplane()

#display the two images in on page
g1<-ggplotGrob(plot1)
g2<-ggplotGrob(plot2)
g<-rbind(g1,g2,size="first")
grid.newpage()
grid.draw(g)