U.S. Storm Events - Economic Costs and Casualties

# load all libraries necessary for analysis
library(knitr)
library(psych)
library(stringr)
library(ggplot2)
library(gridExtra)
library(xtable)

# set global options for code chunks
opts_chunk$set(tidy = TRUE, fig.width = 12)

# set width so describe() output does not wrap
options(width = 120)

# seed value for random number generators
my_seed <- 666

Synopsis

This analysis of U.S. storm event data shows:

total and mean damages by event type;
total and mean casualties by event type; and
a brief exploration of the distribution of total damages.

Data Processing

Load and summarize the data

The data for this assignment came in the form of a CSV file compressed via the bzip2 algorithm. The file was downloaded from the course web site. Documentation of the database is available here:

National Weather Service Storm Data Documentation
National Climatic Data Center Storm Events FAQ

According to the course web site, events in the database begin in 1950 and end in 2011. Earlier years have fewer events recorded. Recent years are more complete.

So, let’s read the CSV file.

storms <- read.csv("repdata-data-StormData.csv", header = TRUE)

Data hygiene

We need to clean the data before we can analyze it. I am interested in using the EVTYPE column in my analysis. There is a lot of cleanup and standardization that could be performed on this variable that I will not be doing for lack of time. There are, for instance, many variants of “FLOOD”" or “FLOODING” that should probably be coded the same way. This variation will lead to strange, questionable results when I calculate the mean damages and casualties by event type.

However, for purposes of this report, I will do the following:

Remove all non-alphanumeric characters from the column;
Trim and upper-case the data;
Convert EVTYPE to a factor; and
Extract just the five columns required for the analysis.

# clean up EVTYPE column remove non-alphanumeric characters
storms$EVTYPE <- gsub("[^A-Za-z0-9]", " ", storms$EVTYPE)
# trim and uppercase
storms$EVTYPE <- str_trim(toupper(storms$EVTYPE))
# replace multiple spaces with a single space
storms$EVTYPE <- gsub("  ", " ", storms$EVTYPE)

# convert EVTYPE to a factor
storms$EVTYPE <- factor(storms$EVTYPE)

# extract just the columns we'll need for our analysis
storms <- storms[, c(8, 23, 24, 25, 27)]

Summary statistics

Now that the data has been cleaned and transformed the way I want it, let’s look at some summary statistics. Note that the distributions of FATALITIES, INJURIES, PROPDMG, and CROPDMG are highly skewed to the right.

# describe from the psych package
describe(storms)

##            vars      n   mean     sd median trimmed    mad min  max range   skew  kurtosis   se
## EVTYPE*       1 902297 445.39 262.72    392  445.38 381.03   1  843   842   0.07     -1.83 0.28
## FATALITIES    2 902297   0.02   0.77      0    0.00   0.00   0  583   583 520.22 377440.21 0.00
## INJURIES      3 902297   0.16   5.43      0    0.00   0.00   0 1700  1700 141.80  30496.31 0.01
## PROPDMG       4 902297  12.06  59.48      0    0.95   0.00   0 5000  5000  11.75    410.90 0.06
## CROPDMG       5 902297   1.53  22.17      0    0.00   0.00   0  990   990  22.22    580.73 0.02

# structure
str(storms)

## 'data.frame':    902297 obs. of  5 variables:
##  $ EVTYPE    : Factor w/ 843 levels "","ABNORMAL WARMTH",..: 714 714 714 714 714 714 714 714 714 714 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...

Results

Damage to Crops and Property from Storm Events

Table

Following are the top event types by total damages. Damage numbers are in 1000’s of dollars.

# aggregate crop and property damage for economic costs of events
economics <- aggregate(list(CropDamage = storms$CROPDMG, PropertyDamage = storms$PROPDMG, TotalDamage = storms$CROPDMG + 
    storms$PROPDMG), by = list(EventType = storms$EVTYPE), FUN = sum, na.rm = TRUE)
# subset events with total damages greater than 0
economics <- subset(economics, economics$TotalDamage > 0)
# find the top ten event types in terms of total damages (crop damage + property damage)
economics_top_ten <- head(economics[order(-economics$TotalDamage), ], 10)

# display a table with the results
economics_top_ten_table <- xtable(economics_top_ten, caption = "Top 10 Storm Event Types - Total Damages")
print(economics_top_ten_table, type = "html", include.rownames = FALSE)

Top 10 Storm Event Types - Total Damages
EventType	CropDamage	PropertyDamage	TotalDamage
TORNADO	100018.52	3212258.16	3312276.68
FLASH FLOOD	179200.46	1420674.59	1599875.05
TSTM WIND	109202.60	1336103.61	1445306.21
HAIL	579596.28	688693.38	1268289.66
FLOOD	168037.88	899938.48	1067976.36
THUNDERSTORM WIND	66794.45	876844.17	943638.62
LIGHTNING	3580.61	603351.78	606932.39
THUNDERSTORM WINDS	18684.93	446323.18	465008.11
HIGH WIND	17283.21	324731.56	342014.77
WINTER STORM	1978.99	132720.59	134699.58

Following are the top event types by mean total damages. Variation in the EVTYPE leads to some questionable results.

# average damage per event
economics_mean <- aggregate(list(CropDamage = storms$CROPDMG, PropertyDamage = storms$PROPDMG, TotalDamage = storms$CROPDMG + 
    storms$PROPDMG), by = list(EventType = storms$EVTYPE), FUN = mean, na.rm = TRUE)
# subset events with total damages greater than 0
economics_mean <- subset(economics_mean, economics_mean$TotalDamage > 0)
# find the top ten event types in terms of total damages (crop damage + property damage)
economics_mean_top_ten <- head(economics_mean[order(-economics_mean$TotalDamage), ], 10)

# display a table with the results
economics_mean_top_ten_table <- xtable(economics_mean_top_ten, caption = "Top 10 Storm Events - Average Total Damages")
print(economics_mean_top_ten_table, type = "html", include.rownames = FALSE)

Top 10 Storm Events - Average Total Damages
EventType	CropDamage	PropertyDamage	TotalDamage
TROPICAL STORM GORDON	500.00	500.00	1000.00
COASTAL EROSION	0.00	766.00	766.00
HEAVY RAIN AND FLOOD	0.00	600.00	600.00
RIVER AND STREAM FLOOD	0.00	600.00	600.00
DUST STORM HIGH WINDS	500.00	50.00	550.00
HIGH WINDS COLD	401.00	122.00	523.00
FOREST FIRES	500.00	5.00	505.00
BLIZZARD WINTER STORM	0.00	500.00	500.00
FLASH FLOODING THUNDERSTORM WI	0.00	500.00	500.00
FLOOD RIVER FLOOD	0.00	500.00	500.00

Visualization - Distribution of Damages

The distribution of total damages for events which had damages is highly skewed.

storms_with_damage <- subset(storms, CROPDMG > 0 | PROPDMG > 0)
storms_with_damage$total_damage <- storms_with_damage$CROPDMG + storms_with_damage$PROPDMG
plot1 <- ggplot(storms_with_damage, aes(x = total_damage)) + xlab("Total Damages") + ylab("Events") + geom_histogram(colour = "black", 
    fill = "cadetblue1", binwidth = 50) + theme_bw()
plot1

A log10 transformation of total damages results in a nearly normal distribution, with very little skew or kurtosis. Summary statistics and a Q-Q normal plot confirm this.

storms_with_damage$log_total_damage <- log10(storms_with_damage$total_damage)
plot2 <- ggplot(storms_with_damage, aes(x = log_total_damage)) + xlab("Log10 of Total Damages") + ylab("Events") + geom_histogram(colour = "black", 
    fill = "mediumpurple1", binwidth = 0.25) + theme_bw()
plot2

describe(storms_with_damage$log_total_damage)

##   vars      n mean   sd median trimmed  mad min max range  skew kurtosis se
## 1    1 245031 0.98 0.84      1    0.97 0.79  -2 3.7   5.7 -0.12     0.38  0

# looks normal; draw a Q-Q plot
qqnorm(storms_with_damage$log_total_damage, col = "darkorange", main = "Normal Q-Q Plot\nLog10 of Total Damages")
qqline(storms_with_damage$log_total_damage, col = "blue", lwd = 1)

Fatalities and Injuries Due to Storm Events

Following are the top event types by total casualties.

# aggregate fatalities and injuries for events
health <- aggregate(list(Fatalities = storms$FATALITIES, Injuries = storms$INJURIES, TotalCasualties = storms$FATALITIES + 
    storms$INJURIES), by = list(EventType = storms$EVTYPE), FUN = sum, na.rm = TRUE)
# subset events with total casualties > 0
health <- subset(health, health$TotalCasualties > 0)
# find the top ten event types in terms of total damages (crop damage + property damage)
health_top_ten <- head(health[order(-health$TotalCasualties), ], 10)

# display a table with the results
health_top_ten_table <- xtable(health_top_ten, caption = "Top 10 Storm Event Types - Total Casualties")
print(health_top_ten_table, type = "html", include.rownames = FALSE)

Top 10 Storm Event Types - Total Casualties
EventType	Fatalities	Injuries	TotalCasualties
TORNADO	5633.00	91346.00	96979.00
EXCESSIVE HEAT	1903.00	6525.00	8428.00
TSTM WIND	504.00	6957.00	7461.00
FLOOD	470.00	6789.00	7259.00
LIGHTNING	817.00	5230.00	6047.00
HEAT	937.00	2100.00	3037.00
FLASH FLOOD	978.00	1777.00	2755.00
ICE STORM	89.00	1975.00	2064.00
THUNDERSTORM WIND	133.00	1488.00	1621.00
WINTER STORM	206.00	1321.00	1527.00

Following are the top event types by average number of casualties. Again, variation in the EVTYPE column leads to questionable results.

# mean number of casualties
health_mean <- aggregate(list(Fatalities = storms$FATALITIES, Injuries = storms$INJURIES, TotalCasualties = storms$FATALITIES + 
    storms$INJURIES), by = list(EventType = storms$EVTYPE), FUN = mean, na.rm = TRUE)
# subset events with total casualties > 0
health_mean <- subset(health_mean, health_mean$TotalCasualties > 0)
# find the top ten event types in terms of total damages (crop damage + property damage)
health_mean_top_ten <- head(health_mean[order(-health_mean$TotalCasualties), ], 10)

# display a table with the results
health_mean_top_ten_table <- xtable(health_mean_top_ten, caption = "Top 10 Storm Events - Mean Casualties")
print(health_mean_top_ten_table, type = "html", include.rownames = FALSE)

Top 10 Storm Events - Mean Casualties
EventType	Fatalities	Injuries	TotalCasualties
TROPICAL STORM GORDON	8.00	43.00	51.00
WILD FIRES	0.75	37.50	38.25
THUNDERSTORMW	0.00	27.00	27.00
TORNADOES TSTM WIND HAIL	25.00	0.00	25.00
HIGH WIND AND SEAS	3.00	20.00	23.00
HEAT WAVE DROUGHT	4.00	15.00	19.00
SNOW HIGH WINDS	0.00	18.00	18.00
HURRICANE TYPHOON	0.73	14.49	15.22
GLAZE ICE STORM	0.00	15.00	15.00
COLD AND SNOW	14.00	0.00	14.00

Appendix A - Programming Environment

sessionInfo()

## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] xtable_1.7-4    gridExtra_0.9.1 ggplot2_1.0.0   stringr_0.6.2   psych_1.4.8.11  knitr_1.7      
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.2-4 digest_0.6.4     evaluate_0.5.5   formatR_1.0      gtable_0.1.2     htmltools_0.2.6 
##  [7] labeling_0.3     MASS_7.3-35      munsell_0.4.2    plyr_1.8.1       proto_0.3-10     Rcpp_0.11.3     
## [13] reshape2_1.4     rmarkdown_0.3.3  scales_0.2.4     tools_3.1.2      yaml_2.1.13