Background

Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.

This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

Synopsis

Our analysis found that floods caused the greatest amount of property damage, while tornadoes caused the greatest loss of human life and the most injuries.

Loading and Processing the Raw Data

The NOAA data was downloaded from the Coursera course website. https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2

# Download the NOAA storm data archive (once) and load it into Data_Set.
#
# The directory and the archive are checked independently: testing only for
# the directory would skip the download whenever ./Project already exists
# without the data file in it, and the read below would then fail.
fileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
data_file <- "./Project/Storm_Data.csv.bz2"

if (!dir.exists("./Project")) {
  dir.create("./Project")
}

if (!file.exists(data_file)) {
  # mode = "wb" keeps the compressed archive byte-exact on every platform.
  download.file(fileURL, destfile = data_file, mode = "wb")
}

# read.csv reads the bzip2-compressed file directly; no manual unpacking.
Data_Set <- read.csv(data_file)

As the focus of this analysis was to identify those weather events which had the greatest significant impact, the data set was then reduced to just the weather events that caused some level of property damage, crop damage, injury or fatality. (This also has the benefit of making the data set easier to work with.) If the assignment sought to focus on the mean/median/distribution of each weather event, these additional data points could have been left in.

library(dplyr, quietly = TRUE)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the events that had some measurable impact: at least one
# fatality or injury, or a non-zero property or crop damage figure.
# which() discards rows where the condition is NA, as subset() does.
Weather_subset <- Data_Set[
  with(Data_Set, which(FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0)),
]
dim(Data_Set)
## [1] 902297     37
dim(Weather_subset)
## [1] 254633     37

Next, the PROPDMGEXP and CROPDMGEXP fields were cleaned up to remove invalid values. All 0’s, blanks, and symbols were translated to a value of 1. Any other numeric value was used as is. The letter codes were mapped as follows: h = hundreds (10^2), k = thousands (10^3), m = millions (10^6), and b = billions (10^9).

# Convert damage-exponent codes into the power of ten they stand for.
#
# code: vector of raw PROPDMGEXP / CROPDMGEXP values (character or factor).
# Returns a numeric vector of the same length:
#   * h/H -> 2, k/K -> 3, m/M -> 6, b/B -> 9
#   * digits are used as the exponent as-is
#   * blanks, NA and any other symbol (e.g. "+", "-", "?") -> 0, so that the
#     resulting multiplier 10^0 is 1 and the recorded amount is left unchanged
#
# Mapping blanks/"0"/symbols to an exponent of 1 would inflate those
# amounts ten-fold (10^1), and leaving a symbol unmapped would turn the damage
# figure into NA, hence the explicit default of 0.
exp_to_power <- function(code) {
  code <- toupper(trimws(as.character(code)))
  power <- rep(0, length(code))

  letter_powers <- c(H = 2, K = 3, M = 6, B = 9)
  is_letter <- code %in% names(letter_powers)
  power[is_letter] <- unname(letter_powers[code[is_letter]])

  is_digit <- grepl("^[0-9]+$", code)
  power[is_digit] <- as.numeric(code[is_digit])

  power
}

# PROPDMGEXP transformation
Weather_subset$PROPDMGEXP <- exp_to_power(Weather_subset$PROPDMGEXP)

# CROPDMGEXP transformation
Weather_subset$CROPDMGEXP <- exp_to_power(Weather_subset$CROPDMGEXP)

# Add additional columns that multiply the damage by the exponent, then
# columns that add CROP+PROP and INJURIES+FATALITIES
Weather_subset <- Weather_subset %>%
  mutate(
    PROP_DAMAGE_FULL = PROPDMG * 10^PROPDMGEXP,
    CROP_DAMAGE_FULL = CROPDMG * 10^CROPDMGEXP,
    TOTAL_DAMAGE_FULL = PROP_DAMAGE_FULL + CROP_DAMAGE_FULL,
    TOTAL_INJ_FATALITY = INJURIES + FATALITIES
  )

Clean up the event types

All of the 448 event types were converted to upper case. For the sake of expediency and clarity, a mapping file was created that merged similar event types into logical categories, e.g., “TSTM WIND G45” was categorized as “TROPICAL STORM”. The category file used was “Event Type CATEGORY FILE.csv”.

library(stringr)
# Upper-case the event-type labels before joining on them.
Weather_subset[["EVTYPE"]] <- str_to_upper(Weather_subset[["EVTYPE"]])
Weather_subset[["EVTYPE"]] <- droplevels(Weather_subset)[["EVTYPE"]]

# Join the mapping file onto the events by EVTYPE. merge() keeps only rows
# whose EVTYPE appears in both tables, so unmapped event types are dropped.
# NOTE(review): the CATEGORY column used later presumably comes from this
# file; confirm against the CSV.
ev_categories <- read.csv(file = "./Event Type CATEGORY FILE.csv")
Weather_subset <- merge(x = Weather_subset, y = ev_categories, by = "EVTYPE")

Results

Individual event damage is aggregated at the category level. The top 10 categories are displayed in rank order of total damage (crop + property).

library(ggplot2)
# Per-category totals: summed damage and number of contributing events.
total_dam_by_event_type <- aggregate(
  TOTAL_DAMAGE_FULL ~ CATEGORY,
  data = Weather_subset,
  FUN = sum
)
count_events_by_type <- aggregate(
  TOTAL_DAMAGE_FULL ~ CATEGORY,
  data = Weather_subset,
  FUN = length
)

# One row per category with both figures, most damaging category first.
display_damage_and_count_by_category <- merge(
  total_dam_by_event_type,
  count_events_by_type,
  by = "CATEGORY"
)
names(display_damage_and_count_by_category) <- c("Category", "Damage", "Count")
display_damage_and_count_by_category <- display_damage_and_count_by_category %>%
  arrange(desc(Damage))

# Horizontal bar chart of the ten most damaging categories, in billions.
dam_plot <- ggplot(
  data = head(display_damage_and_count_by_category, n = 10),
  aes(x = reorder(Category, Damage), y = Damage / 1000000000)
) +
  geom_bar(stat = "identity") +
  coord_flip()

dam_plot <- dam_plot +
  labs(x = "Storm Category", y = "Damage in Billions USD")
dam_plot

Similarly, the top 10 storm categories are ranked by total number of injuries and fatalities caused.

# Total casualties (injuries + fatalities) per storm category, joined with
# the per-category event counts.
total_INJ_FATALITY_by_event_type <- aggregate(TOTAL_INJ_FATALITY ~ CATEGORY, data = Weather_subset, sum)
display_Inj_Fatality_by_category <- merge(total_INJ_FATALITY_by_event_type, count_events_by_type, by = "CATEGORY")
names(display_Inj_Fatality_by_category) <- c("Category", "Inj_Fatality", "Count")

# Rank by casualties, not by event count: head() below must keep the ten
# categories with the most injuries and fatalities, which is what the plot
# title promises. Sorting by Count would select the ten most frequent
# categories instead.
display_Inj_Fatality_by_category <- arrange(display_Inj_Fatality_by_category, desc(Inj_Fatality))

# Horizontal bar chart of the ten categories with the most casualties.
Casualty_Plot <- ggplot(
  data = head(display_Inj_Fatality_by_category, n = 10),
  aes(x = reorder(Category, Inj_Fatality), y = Inj_Fatality)
) +
  geom_bar(stat = "identity") +
  coord_flip()
Casualty_Plot <- Casualty_Plot +
  labs(x = "Storm Category", y = "Combined Injuries and Fatalities") +
  ggtitle("Top Storm Categories by Number of Fatalities and Injuries")
Casualty_Plot

Conclusion:

Based on this analysis, I’d much rather deal with a Flood than a Tornado. Crops and property can be replaced, but human lives cannot.

- The type of event that is most harmful with respect to population health is the Tornado.
- The type of event that has the greatest economic consequences is the Flood.