Synopsis

Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.

The U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The analysis and results below show that tornadoes cause the most injuries and fatalities while floods cause the most monetary damages.

Basic settings

#Load required libraries
library(ggplot2)
library(cowplot)
library(plyr)

# Use the C locale for date/time formatting and parsing so results do not
# depend on the regional settings of the machine
Sys.setlocale(category = "LC_TIME", locale = "C")
# Large scipen penalty: print big numbers in fixed rather than scientific notation
options(scipen = 100)

# Shared chart theme: small x-axis tick labels rotated 45 degrees, no x-axis title
customTheme <- theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 8, angle = 45, hjust = 1)
)

Loading and Processing the Raw Data

# Download the compressed storm data only when it is not already present in
# the working directory. mode = "wb" keeps the bzip2 archive byte-exact on
# every platform.
if (!file.exists("repdata-data-StormData.csv.bz2")) {
    download.file("http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
                  destfile = "repdata-data-StormData.csv.bz2",
                  mode = "wb")
}

# Text columns are read as factors on purpose: the later levels() calls on
# PROPDMGEXP / CROPDMGEXP and the per-level counts printed by summary() need
# factor columns, and R >= 4.0.0 no longer creates them by default.
stormData <- read.csv("repdata-data-StormData.csv.bz2", sep = ",",
                      stringsAsFactors = TRUE)

# BGN_DATE looks like "5/25/2011 0:00:00"; only the leading month/day/year
# part is parsed. The calendar year is kept as a numeric column for counting
# events per year.
stormData$date <- as.Date(stormData$BGN_DATE, format = "%m/%d/%Y")
stormData$year <- as.numeric(format(stormData$date, "%Y"))

# Rows and columns, including the two derived columns added above
dim(stormData)
## [1] 902297     39

This is a large dataset with 902297 rows and 39 columns in total (the 37 original columns plus the derived date and year columns). The main variables used in this analysis are summarised below.

# Summarise the variables used in the analysis, selected by column name
summary(stormData[, c("BGN_DATE", "STATE", "EVTYPE", "FATALITIES",
                      "INJURIES", "PROPDMG", "CROPDMG")])
##               BGN_DATE          STATE                      EVTYPE      
##  5/25/2011 0:00:00:  1202   TX     : 83728   HAIL             :288661  
##  4/27/2011 0:00:00:  1193   KS     : 53440   TSTM WIND        :219940  
##  6/9/2011 0:00:00 :  1030   OK     : 46802   THUNDERSTORM WIND: 82563  
##  5/30/2004 0:00:00:  1016   MO     : 35648   TORNADO          : 60652  
##  4/4/2011 0:00:00 :  1009   IA     : 31069   FLASH FLOOD      : 54277  
##  4/2/2006 0:00:00 :   981   NE     : 30271   FLOOD            : 25326  
##  (Other)          :895866   (Other):621339   (Other)          :170878  
##    FATALITIES          INJURIES            PROPDMG       
##  Min.   :  0.0000   Min.   :   0.0000   Min.   :   0.00  
##  1st Qu.:  0.0000   1st Qu.:   0.0000   1st Qu.:   0.00  
##  Median :  0.0000   Median :   0.0000   Median :   0.00  
##  Mean   :  0.0168   Mean   :   0.1557   Mean   :  12.06  
##  3rd Qu.:  0.0000   3rd Qu.:   0.0000   3rd Qu.:   0.50  
##  Max.   :583.0000   Max.   :1700.0000   Max.   :5000.00  
##                                                          
##     CROPDMG       
##  Min.   :  0.000  
##  1st Qu.:  0.000  
##  Median :  0.000  
##  Mean   :  1.527  
##  3rd Qu.:  0.000  
##  Max.   :990.000  
## 

The chart below shows the distribution of storm events by year, with a sharp increase in the number of recorded events after 1995.

# Events per year (count() returns the columns `year` and `freq`), drawn as
# outlined bars with a loess trend line on top
ggplot(count(stormData, "year"), aes(x = year, y = freq)) +
    geom_bar(stat = "identity", colour = "black", fill = "white") +
    geom_smooth(method = "loess") +
    labs(title = "Total count of storms during 1950 - 2011 years",
         y = "Number of Storms")

Determining the Property and Crop Damage values

Each type of damage is stored in two columns: one holds the base amount (PROPDMG and CROPDMG - see above), and the other holds an exponent code (PROPDMGEXP and CROPDMGEXP) that determines the multiplier to apply to that amount in order to obtain the damage in dollars.

levels(stormData$PROPDMGEXP)
##  [1] ""  "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K"
## [18] "m" "M"
# Lookup table from PROPDMGEXP code to numeric multiplier: letters stand for
# hundreds (H), thousands (K), millions (M) and billions (B); a digit d means
# 10^d; blank, "0" and the symbols "+", "-", "?" leave the amount unscaled.
propExpCodes <- c("B", "8", "7", "M", "m", "6", "5", "4", "K", "k", "3",
                  "H", "h", "2", "1", "", "0", "+", "-", "?")
propExpValues <- c(1e+09, 1e+08, 1e+07, 1e+06, 1e+06, 1e+06, 1e+05, 10000,
                   1000, 1000, 1000, 100, 100, 100, 10, 1, 1, 1, 1, 1)

# match() is used instead of name-based indexing because the empty-string code
# cannot be looked up by name; codes missing from the table yield NA.
stormData$prop_exp <- propExpValues[match(as.character(stormData$PROPDMGEXP),
                                          propExpCodes)]

# Total property damage in dollars = base amount * multiplier
stormData$propertydamage <- stormData$PROPDMG * stormData$prop_exp

levels(stormData$CROPDMGEXP)
## [1] ""  "?" "0" "2" "B" "k" "K" "m" "M"
# Lookup table from CROPDMGEXP code to numeric multiplier: letters stand for
# hundreds (H), thousands (K), millions (M) and billions (B); a digit d means
# 10^d; blank, "0" and the symbols "+", "-", "?" leave the amount unscaled.
cropExpCodes <- c("B", "M", "m", "6", "K", "k", "3", "H", "h", "2",
                  "", "0", "+", "-", "?", "1")
cropExpValues <- c(1e+09, 1e+06, 1e+06, 1e+06, 1000, 1000, 1000, 100, 100, 100,
                   1, 1, 1, 1, 1, 10)

# match() is used instead of name-based indexing because the empty-string code
# cannot be looked up by name; codes missing from the table yield NA.
stormData$crop_exp <- cropExpValues[match(as.character(stormData$CROPDMGEXP),
                                          cropExpCodes)]

# Total crop damage in dollars = base amount * multiplier
stormData$cropdamage <- stormData$CROPDMG * stormData$crop_exp

Results

Across the United States, which types of events are most harmful with respect to population health?

We are going to use the social damages: fatalities and injuries.

# aggregate and sum the data by fatalities
stormData$FATALITIES <- as.integer(stormData$FATALITIES)

aggFatalities <- aggregate(FATALITIES ~ EVTYPE, stormData, sum)
aggFatalities <- aggFatalities [order(aggFatalities$FATALITIES, decreasing=TRUE), ]

topFatalities <- aggFatalities[1:15,]
topFatalities$EVTYPE <- factor(topFatalities$EVTYPE, levels=unique(as.character(topFatalities$EVTYPE)))

# aggregate and sum the data by injuries
stormData$INJURIES <- as.integer(stormData$INJURIES)

aggInjuries <- aggregate(INJURIES ~ EVTYPE, stormData, sum)
aggInjuries <- aggInjuries [order(aggInjuries$INJURIES, decreasing=TRUE), ]

topInjuries <- aggInjuries[1:15,]
topInjuries$EVTYPE <- factor(topInjuries$EVTYPE, levels=unique(as.character(topInjuries$EVTYPE)))

fatPlot <- ggplot(topFatalities, aes(x=EVTYPE, y=FATALITIES)) +
    ggtitle("Fatalities caused by storm events") +
    ylab("Fatalities qty.") +
    scale_y_continuous(breaks = seq(0, 100000, by = 500)) +
    geom_bar(fill="grey", stat="identity") +
    theme_bw() + customTheme
injPlot <- ggplot(topInjuries, aes(x=EVTYPE, y=INJURIES)) +
    ggtitle("Injuries caused by storm events") +
    ylab("Injuries qty.") +
    scale_y_continuous(breaks = seq(0, 1000000, by = 10000)) +
    geom_bar(fill="grey", stat="identity") +
    theme_bw() + customTheme

plot_grid(fatPlot, injPlot, ncol = 2, align = 'h')

Figure 1: Total social damages provoked by storm events in US (states, territories and adjacent oceanic areas). In order to make graphic display more feasible only the 15 most damaging events are shown.

As we can see, tornadoes are by far the most dangerous to human health, both in terms of fatalities and injuries.

Across the United States, which types of events have the greatest economic consequences?

Earlier we calculated property damage and crop damage in absolute values. Now we will use them to calculate the total amounts by storm type.

# aggregate and sum the data by property damage
# --- Property damage: total dollars per event type, largest first ---
aggPropDamage <- aggregate(propertydamage ~ EVTYPE, stormData, sum)
propRank <- order(aggPropDamage$propertydamage, decreasing = TRUE)
aggPropDamage <- aggPropDamage[propRank, ]

# Keep the 15 costliest event types; re-level EVTYPE in ranked order so the
# bars are drawn from highest to lowest instead of alphabetically
topPropDamage <- aggPropDamage[seq_len(15), ]
propLevels <- unique(as.character(topPropDamage$EVTYPE))
topPropDamage$EVTYPE <- factor(topPropDamage$EVTYPE, levels = propLevels)

# --- Crop damage: total dollars per event type, largest first ---
aggCropDamage <- aggregate(cropdamage ~ EVTYPE, stormData, sum)
cropRank <- order(aggCropDamage$cropdamage, decreasing = TRUE)
aggCropDamage <- aggCropDamage[cropRank, ]

# Same treatment for the 15 event types with the highest crop losses
topCropDamage <- aggCropDamage[seq_len(15), ]
cropLevels <- unique(as.character(topCropDamage$EVTYPE))
topCropDamage$EVTYPE <- factor(topCropDamage$EVTYPE, levels = cropLevels)

# Bar charts with damages rescaled from dollars to billions of dollars;
# customTheme is added after theme_bw() so its axis settings win
propDamagePlot <- ggplot(topPropDamage, aes(x = EVTYPE, y = propertydamage / 10^9)) +
    geom_bar(stat = "identity", fill = "grey") +
    scale_y_continuous(breaks = seq(0, 10000, by = 10)) +
    labs(title = "Properties damages caused by storm events",
         y = "Damages (billions - US dollars)") +
    theme_bw() +
    customTheme

cropDamagePlot <- ggplot(topCropDamage, aes(x = EVTYPE, y = cropdamage / 10^9)) +
    geom_bar(stat = "identity", fill = "grey") +
    scale_y_continuous(breaks = seq(0, 10000, by = 1)) +
    labs(title = "Crop damages caused by storm events",
         y = "Damages (billions - US dollars)") +
    theme_bw() +
    customTheme

# Show both charts side by side
plot_grid(propDamagePlot, cropDamagePlot, ncol = 2)

Based on the above graphs, floods and hurricanes caused the most property damage, while droughts and floods caused the most crop damage in the United States between 1950 and 2011.