Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.
The U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The analysis and results below show that tornadoes cause the most injuries and fatalities while floods cause the most monetary damages.
#Load required libraries
library(ggplot2)
library(cowplot)
library(plyr)
# Use the "C" locale for date/time handling
Sys.setlocale(category = "LC_TIME", locale = "C")
# Large scipen penalty: print numbers in fixed rather than scientific notation
options(scipen = 100)
# Shared chart theme: small, right-aligned x-axis labels rotated 45 degrees,
# and no x-axis title
customTheme <- theme(
  axis.title.x = element_blank(),
  axis.text.x = element_text(size = 8, hjust = 1, angle = 45)
)
# Fetch the compressed NOAA storm data once; later runs reuse the local copy
stormFile <- "repdata-data-StormData.csv.bz2"
if (!file.exists(stormFile)) {
  download.file(
    "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    destfile = stormFile,
    mode = "wb"  # the archive is binary; never write it in text mode
  )
}
# read.csv() reads the .bz2 file directly. Text columns are requested as
# factors explicitly: since R 4.0 the default is stringsAsFactors = FALSE, and
# the levels() calls and the per-value counts in summary() further down only
# work on factors.
stormData <- read.csv(stormFile, stringsAsFactors = TRUE)
# BGN_DATE holds values such as "5/25/2011 0:00:00"; only the date part is
# parsed, then the calendar year is extracted as a number
stormData$date <- as.Date(stormData$BGN_DATE, format = "%m/%d/%Y")
stormData$year <- as.numeric(format(stormData$date, "%Y"))
dim(stormData)
## [1] 902297 39
This is a large dataset: 902297 rows and 37 original columns (39 after adding the `date` and `year` columns above). The main variables used in this analysis are summarised below.
# Summarise the variables this analysis relies on, selected by name rather
# than by column position so the call does not depend on the column order
summary(stormData[, c("BGN_DATE", "STATE", "EVTYPE", "FATALITIES",
                      "INJURIES", "PROPDMG", "CROPDMG")])
## BGN_DATE STATE EVTYPE
## 5/25/2011 0:00:00: 1202 TX : 83728 HAIL :288661
## 4/27/2011 0:00:00: 1193 KS : 53440 TSTM WIND :219940
## 6/9/2011 0:00:00 : 1030 OK : 46802 THUNDERSTORM WIND: 82563
## 5/30/2004 0:00:00: 1016 MO : 35648 TORNADO : 60652
## 4/4/2011 0:00:00 : 1009 IA : 31069 FLASH FLOOD : 54277
## 4/2/2006 0:00:00 : 981 NE : 30271 FLOOD : 25326
## (Other) :895866 (Other):621339 (Other) :170878
## FATALITIES INJURIES PROPDMG
## Min. : 0.0000 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 0.0000 Median : 0.0000 Median : 0.00
## Mean : 0.0168 Mean : 0.1557 Mean : 12.06
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.50
## Max. :583.0000 Max. :1700.0000 Max. :5000.00
##
## CROPDMG
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 1.527
## 3rd Qu.: 0.000
## Max. :990.000
##
The chart below shows the number of recorded events per year; note the sharp increase in the number of events after 1995.
# Bar chart of the number of events per year (count() yields the `year` and
# `freq` columns) with a loess trend line drawn on top
ggplot(data = count(stormData, "year"), mapping = aes(x = year, y = freq)) +
  geom_bar(stat = "identity", fill = "white", colour = "black") +
  geom_smooth(method = "loess") +
  labs(
    title = "Total count of storms during 1950 - 2011 years",
    y = "Number of Storms"
  )
Each type of damage is stored in two columns: one holds the base amount in dollars (PROPDMG and CROPDMG - see above), and the other holds a magnitude code that determines the multiplier to apply to that amount (PROPDMGEXP and CROPDMGEXP).
# List the distinct multiplier codes used for property damage
levels(stormData[["PROPDMGEXP"]])
## [1] "" "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K"
## [18] "m" "M"
# Translate each PROPDMGEXP code into a numeric multiplier with a single table
# lookup; a code that is not in the table yields NA
stormData$prop_exp <- local({
  codes <- c(
    "B", "8", "7",
    "M", "m", "6",
    "5", "4",
    "K", "k", "3",
    "H", "h", "2",
    "1",
    "", "0", "+", "-", "?"
  )
  multipliers <- c(
    1e+09, 1e+08, 1e+07,
    1e+06, 1e+06, 1e+06,
    1e+05, 10000,
    1000, 1000, 1000,
    100, 100, 100,
    10,
    1, 1, 1, 1, 1
  )
  multipliers[match(stormData$PROPDMGEXP, codes)]
})
# Total property damage = reported amount times its multiplier
stormData$propertydamage <- stormData$PROPDMG * stormData$prop_exp
# List the distinct multiplier codes used for crop damage
levels(stormData[["CROPDMGEXP"]])
## [1] "" "?" "0" "2" "B" "k" "K" "m" "M"
# Translate each CROPDMGEXP code into a numeric multiplier with a single table
# lookup. The table covers the same codes as the property damage mapping
# (including the digit codes 4, 5, 7 and 8), so both damage types are
# converted consistently and a digit code never silently turns into NA.
# A code that is not in the table still yields NA.
stormData$crop_exp <- local({
  codes <- c(
    "B", "8", "7",
    "M", "m", "6",
    "5", "4",
    "K", "k", "3",
    "H", "h", "2",
    "1",
    "", "0", "+", "-", "?"
  )
  multipliers <- c(
    1e+09, 1e+08, 1e+07,
    1e+06, 1e+06, 1e+06,
    1e+05, 10000,
    1000, 1000, 1000,
    100, 100, 100,
    10,
    1, 1, 1, 1, 1
  )
  multipliers[match(stormData$CROPDMGEXP, codes)]
})
# Total crop damage = reported amount times its multiplier
stormData$cropdamage <- stormData$CROPDMG * stormData$crop_exp
To assess the harm to population health, we use two variables: fatalities and injuries.
# Fatalities: total per event type, ranked from highest to lowest, top 15 kept
stormData$FATALITIES <- as.integer(stormData$FATALITIES)
aggFatalities <- aggregate(FATALITIES ~ EVTYPE, data = stormData, FUN = sum)
aggFatalities <- aggFatalities[
  order(aggFatalities[["FATALITIES"]], decreasing = TRUE),
]
topFatalities <- aggFatalities[seq_len(15), ]
# Rebuild the factor with levels in ranked order so the chart follows the
# ranking
topFatalities$EVTYPE <- factor(
  topFatalities$EVTYPE,
  levels = unique(as.character(topFatalities$EVTYPE))
)
# Injuries: same steps as for fatalities
stormData$INJURIES <- as.integer(stormData$INJURIES)
aggInjuries <- aggregate(INJURIES ~ EVTYPE, data = stormData, FUN = sum)
aggInjuries <- aggInjuries[
  order(aggInjuries[["INJURIES"]], decreasing = TRUE),
]
topInjuries <- aggInjuries[seq_len(15), ]
topInjuries$EVTYPE <- factor(
  topInjuries$EVTYPE,
  levels = unique(as.character(topInjuries$EVTYPE))
)
# Left panel: total fatalities for the 15 highest-ranked event types
fatPlot <- ggplot(
  data = topFatalities,
  mapping = aes(x = EVTYPE, y = FATALITIES)
) +
  geom_bar(stat = "identity", fill = "grey") +
  scale_y_continuous(breaks = seq(from = 0, to = 100000, by = 500)) +
  labs(title = "Fatalities caused by storm events", y = "Fatalities qty.") +
  theme_bw() +
  customTheme
# Right panel: total injuries for the 15 highest-ranked event types
injPlot <- ggplot(
  data = topInjuries,
  mapping = aes(x = EVTYPE, y = INJURIES)
) +
  geom_bar(stat = "identity", fill = "grey") +
  scale_y_continuous(breaks = seq(from = 0, to = 1000000, by = 10000)) +
  labs(title = "Injuries caused by storm events", y = "Injuries qty.") +
  theme_bw() +
  customTheme
# Show both panels side by side, aligned horizontally
plot_grid(fatPlot, injPlot, align = "h", ncol = 2)
Figure 1: Total social damages caused by storm events in the US (states, territories and adjacent oceanic areas). To keep the charts readable, only the 15 most damaging event types are shown.
As we can see, tornadoes are by far the most dangerous to human health, both in terms of fatalities and injuries.
Earlier we calculated property damage and crop damage in absolute dollar values. We now use these values to calculate the total damage by storm type.
# Property damage: total per event type, ranked from highest to lowest,
# top 15 kept
aggPropDamage <- aggregate(propertydamage ~ EVTYPE, stormData, sum)
aggPropDamage <- aggPropDamage[
  order(aggPropDamage[["propertydamage"]], decreasing = TRUE),
]
topPropDamage <- aggPropDamage[seq_len(15), ]
# Rebuild the factor with levels in ranked order so the chart follows the
# ranking
topPropDamage$EVTYPE <- factor(
  topPropDamage$EVTYPE,
  levels = unique(as.character(topPropDamage$EVTYPE))
)
# Crop damage: same steps as for property damage
aggCropDamage <- aggregate(cropdamage ~ EVTYPE, stormData, sum)
aggCropDamage <- aggCropDamage[
  order(aggCropDamage[["cropdamage"]], decreasing = TRUE),
]
topCropDamage <- aggCropDamage[seq_len(15), ]
topCropDamage$EVTYPE <- factor(
  topCropDamage$EVTYPE,
  levels = unique(as.character(topCropDamage$EVTYPE))
)
# Left panel: property damage in billions of dollars for the 15 highest-ranked
# event types
propDamagePlot <- ggplot(
  data = topPropDamage,
  mapping = aes(x = EVTYPE, y = propertydamage / 1e9)
) +
  geom_bar(stat = "identity", fill = "grey") +
  scale_y_continuous(breaks = seq(from = 0, to = 10000, by = 10)) +
  labs(
    title = "Properties damages caused by storm events",
    y = "Damages (billions - US dollars)"
  ) +
  theme_bw() +
  customTheme
# Right panel: crop damage in billions of dollars for the 15 highest-ranked
# event types
cropDamagePlot <- ggplot(
  data = topCropDamage,
  mapping = aes(x = EVTYPE, y = cropdamage / 1e9)
) +
  geom_bar(stat = "identity", fill = "grey") +
  scale_y_continuous(breaks = seq(from = 0, to = 10000, by = 1)) +
  labs(
    title = "Crop damages caused by storm events",
    y = "Damages (billions - US dollars)"
  ) +
  theme_bw() +
  customTheme
# Show both panels side by side
plot_grid(propDamagePlot, cropDamagePlot, ncol = 2)
Based on the above graphs, floods and hurricanes caused the most property damage, while droughts and floods caused the most crop damage in the United States between 1950 and 2011.