Synopsis/Summary

Severe weather events captured in the NOAA Storm database from 1950 to 2011 were analyzed to identify the event types with the largest impact on US population health and the economy. Tornado events were found to be responsible for the greatest number of both fatalities and injuries. Floods were found to be responsible for the greatest economic cost.

Introduction

Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.

This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

Data processing

load required libraries

library(dplyr) # load dplyr for data manipulation
library(ggplot2) # ggplot for data visualization
library(gridExtra) # to plot graphs side by side
library(reshape2) # to melt the dataframe

Loading the data

# Raw data source, kept for reference:
# url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Read the CSV file from the working directory; the first row holds the column
# names and text columns stay character vectors instead of becoming factors.
data <- read.csv(
  file = "repdata_data_StormData.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Print the structure of the loaded data frame.
str(data)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

The data contains more than 900,000 observations of 37 variables. We need a YEAR column derived from the date-time column BGN_DATE. First, BGN_DATE is converted into date-time format, and then a new YEAR column is created from it.

# Parse BGN_DATE (text such as "4/18/1950 0:00:00") into POSIXct.
# The time zone is pinned to UTC so the result does not depend on the local
# zone of the machine running the report: without an explicit tz the strings
# are interpreted as local time, and a local daylight-saving gap at the parsed
# clock time can make the conversion unreliable (e.g. NA). Only the calendar
# year is extracted here, so a fixed zone is sufficient.
data$BGN_DATE <- as.POSIXct(
  data$BGN_DATE,
  format = "%m/%d/%Y %H:%M:%S",
  tz = "UTC"
)
# Create YEAR column (four-digit year as a character string)
data$YEAR <- format(data$BGN_DATE, format = "%Y")

The objective of this assignment is to collect data on human and economic cost of various natural events. We will retain relevant columns for this analysis and discard any redundant information.

# Columns needed for the health and economic impact analysis; everything else
# is dropped from the data frame.
important <- c(
  "YEAR", "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)
data <- data[important]
# Print the structure of the reduced data frame.
str(data)
## 'data.frame':    902297 obs. of  8 variables:
##  $ YEAR      : chr  "1950" "1950" "1951" "1951" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...

For the analysis, YEAR and EVTYPE columns are converted into factor variables.

# Turn the YEAR and EVTYPE columns into factors in a single step
data[c("YEAR", "EVTYPE")] <- lapply(data[c("YEAR", "EVTYPE")], as.factor)

Property damage (PROPDMG) and crop damage (CROPDMG) are reported in different units, given by the PROPDMGEXP and CROPDMGEXP columns. To get a uniform cost estimate, we convert all values to a common unit.

# Translate the damage unit codes into numeric multipliers.
# One case-insensitive lookup is applied to both the property and the crop
# column, so "h"/"H", "k"/"K", "m"/"M" and "b"/"B" are handled identically in
# each. Any other code (including the empty string) is not found in the lookup
# and keeps a multiplier of 1.
unit_multiplier <- c(H = 10^2, K = 10^3, M = 10^6, B = 10^9)

# convert units to calculate property damage
data$PDMULTIPLIER <- unname(unit_multiplier[toupper(data$PROPDMGEXP)])
data$PDMULTIPLIER[is.na(data$PDMULTIPLIER)] <- 10^0

# convert units to calculate crop damage
data$CDMULTIPLIER <- unname(unit_multiplier[toupper(data$CROPDMGEXP)])
data$CDMULTIPLIER[is.na(data$CDMULTIPLIER)] <- 10^0

# Print the structure, now including the two multiplier columns.
str(data)
## 'data.frame':    902297 obs. of  10 variables:
##  $ YEAR        : Factor w/ 62 levels "1950","1951",..: 1 1 2 2 2 2 2 3 3 3 ...
##  $ EVTYPE      : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ FATALITIES  : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES    : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG     : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP  : chr  "K" "K" "K" "K" ...
##  $ CROPDMG     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP  : chr  "" "" "" "" ...
##  $ PDMULTIPLIER: num  1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
##  $ CDMULTIPLIER: num  1 1 1 1 1 1 1 1 1 1 ...

To calculate the total cost of each event, we multiply the reported numbers by their respective multipliers.

# Scale the reported damage figures by their unit multipliers, then add the
# property and crop amounts to get the combined cost per record.
data$PROPTOTAL <- with(data, PROPDMG * PDMULTIPLIER)
data$CROPTOTAL <- with(data, CROPDMG * CDMULTIPLIER)
data$TOTALCOST <- with(data, PROPTOTAL + CROPTOTAL)

Results

Human impact of events

# Fatalities: sum per event type, keep the ten largest totals
fatalities <- data %>%
  group_by(EVTYPE) %>%
  summarise(totalFatalities = sum(FATALITIES)) %>%
  arrange(desc(totalFatalities)) %>%
  head(n = 10)

# Bar chart of the fatality totals, drawn with flipped coordinates so the
# event names sit on the vertical axis
plot1 <- ggplot(
  fatalities,
  aes(x = reorder(EVTYPE, -totalFatalities), y = totalFatalities)
) +
  geom_bar(stat = "identity", fill = "red") +
  labs(
    title = "Graph: Fatalities by type of catastrophe",
    x = "",
    y = "Fatalities"
  ) +
  theme(text = element_text(size = 10)) +
  coord_flip()

# Injuries: sum per event type, keep the ten largest totals
injuries <- data %>%
  group_by(EVTYPE) %>%
  summarise(totalInjuries = sum(INJURIES)) %>%
  arrange(desc(totalInjuries)) %>%
  head(n = 10)

# Same chart layout for the injury totals
plot2 <- ggplot(
  injuries,
  aes(x = reorder(EVTYPE, -totalInjuries), y = totalInjuries)
) +
  geom_bar(stat = "identity", fill = "green") +
  labs(
    title = "Graph: Injuries by type of catastrophe",
    x = "",
    y = "Injuries"
  ) +
  theme(text = element_text(size = 10)) +
  coord_flip()

# Show both charts next to each other
grid.arrange(plot1, plot2, ncol = 2)

Economic impact of events

# Per event type: property, crop and combined damage, each divided by 1e9
# (billions); keep the ten event types with the highest combined damage.
economic <- data %>%
  select(EVTYPE, PROPTOTAL, CROPTOTAL, TOTALCOST) %>%
  group_by(EVTYPE) %>%
  summarise(
    Property = sum(PROPTOTAL) / 1e9,
    Crop = sum(CROPTOTAL) / 1e9,
    Total = sum(TOTALCOST) / 1e9
  ) %>%
  arrange(desc(Total)) %>%
  head(n = 10)

# Long format: one row per event type and damage category (Property, Crop);
# the Total column is only used for the ranking above.
economic <- melt(
  data = economic,
  id.vars = "EVTYPE",
  measure.vars = c("Property", "Crop")
)

# Rebuild EVTYPE with its levels in order of first appearance, so the x axis
# follows the ranking instead of the original factor level order.
economic$EVTYPE <- factor(
  as.character(economic$EVTYPE),
  levels = unique(as.character(economic$EVTYPE))
)

# Bar chart of cost per event type, filled by damage category
plot3 <- ggplot(economic, aes(x = EVTYPE, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Economic Cost of Environmental Events",
    x = "Event type",
    y = "Cost in billions"
  ) +
  scale_fill_discrete(labels = c(" Property damage", " Crop damage")) +
  theme(
    legend.title = element_blank(),
    legend.position = c(0.85, 0.85),
    axis.text.x = element_text(angle = 20, hjust = 1)
  )

plot3