# load all libraries necessary for analysis
library(knitr)
library(psych)
library(stringr)
library(ggplot2)
library(gridExtra)
library(xtable)

# set global options for code chunks
opts_chunk$set(tidy = TRUE, fig.width = 12)

# set width so describe() output does not wrap
options(width = 120)

# seed value for random number generators
my_seed <- 666

Synopsis

This analysis of U.S. storm event data shows:

  1. total and mean damages by event type;
  2. total and mean casualties by event type; and
  3. a brief exploration of the distribution of total damages.

Data Processing

Load and summarize the data

The data for this assignment came in the form of a CSV file compressed via the bzip2 algorithm. The file was downloaded from the course web site. Documentation of the database is available here:

National Weather Service Storm Data Documentation
National Climatic Data Center Storm Events FAQ

According to the course web site, events in the database begin in 1950 and end in 2011. Earlier years have fewer events recorded. Recent years are more complete.

So, let’s read the CSV file.

storms <- read.csv("repdata-data-StormData.csv", header = TRUE)

Data hygiene

We need to clean the data before we can analyze it. I am interested in using the EVTYPE column in my analysis. There is a lot of cleanup and standardization that could be performed on this variable that I will not be doing for lack of time. There are, for instance, many variants of “FLOOD”" or “FLOODING” that should probably be coded the same way. This variation will lead to strange, questionable results when I calculate the mean damages and casualties by event type.

However, for purposes of this report, I will do the following:

  1. Remove all non-alphanumeric characters from the column;
  2. Trim and upper-case the data;
  3. Convert EVTYPE to a factor; and
  4. Extract just the five columns required for the analysis.
# clean up EVTYPE column remove non-alphanumeric characters
storms$EVTYPE <- gsub("[^A-Za-z0-9]", " ", storms$EVTYPE)
# trim and uppercase
storms$EVTYPE <- str_trim(toupper(storms$EVTYPE))
# replace multiple spaces with a single space
storms$EVTYPE <- gsub("  ", " ", storms$EVTYPE)

# convert EVTYPE to a factor
storms$EVTYPE <- factor(storms$EVTYPE)

# extract just the columns we'll need for our analysis
storms <- storms[, c(8, 23, 24, 25, 27)]

Summary statistics

Now that the data has been cleaned and transformed the way I want it, let’s look at some summary statistics. Note that the distributions of FATALITIES, INJURIES, PROPDMG, and CROPDMG are highly skewed to the right.

# describe from the psych package
describe(storms)
##            vars      n   mean     sd median trimmed    mad min  max range   skew  kurtosis   se
## EVTYPE*       1 902297 445.39 262.72    392  445.38 381.03   1  843   842   0.07     -1.83 0.28
## FATALITIES    2 902297   0.02   0.77      0    0.00   0.00   0  583   583 520.22 377440.21 0.00
## INJURIES      3 902297   0.16   5.43      0    0.00   0.00   0 1700  1700 141.80  30496.31 0.01
## PROPDMG       4 902297  12.06  59.48      0    0.95   0.00   0 5000  5000  11.75    410.90 0.06
## CROPDMG       5 902297   1.53  22.17      0    0.00   0.00   0  990   990  22.22    580.73 0.02
# structure
str(storms)
## 'data.frame':    902297 obs. of  5 variables:
##  $ EVTYPE    : Factor w/ 843 levels "","ABNORMAL WARMTH",..: 714 714 714 714 714 714 714 714 714 714 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...

Results

Damage to Crops and Property from Storm Events

Table

Following are the top event types by total damages. Damage numbers are in 1000’s of dollars.

# aggregate crop and property damage for economic costs of events
economics <- aggregate(list(CropDamage = storms$CROPDMG, PropertyDamage = storms$PROPDMG, TotalDamage = storms$CROPDMG + 
    storms$PROPDMG), by = list(EventType = storms$EVTYPE), FUN = sum, na.rm = TRUE)
# subset events with total damages greater than 0
economics <- subset(economics, economics$TotalDamage > 0)
# find the top ten event types in terms of total damages (crop damage + property damage)
economics_top_ten <- head(economics[order(-economics$TotalDamage), ], 10)

# display a table with the results
economics_top_ten_table <- xtable(economics_top_ten, caption = "Top 10 Storm Event Types - Total Damages")
print(economics_top_ten_table, type = "html", include.rownames = FALSE)
Top 10 Storm Event Types - Total Damages
EventType CropDamage PropertyDamage TotalDamage
TORNADO 100018.52 3212258.16 3312276.68
FLASH FLOOD 179200.46 1420674.59 1599875.05
TSTM WIND 109202.60 1336103.61 1445306.21
HAIL 579596.28 688693.38 1268289.66
FLOOD 168037.88 899938.48 1067976.36
THUNDERSTORM WIND 66794.45 876844.17 943638.62
LIGHTNING 3580.61 603351.78 606932.39
THUNDERSTORM WINDS 18684.93 446323.18 465008.11
HIGH WIND 17283.21 324731.56 342014.77
WINTER STORM 1978.99 132720.59 134699.58

Following are the top event types by mean total damages. Variation in the EVTYPE leads to some questionable results.

# average damage per event
economics_mean <- aggregate(list(CropDamage = storms$CROPDMG, PropertyDamage = storms$PROPDMG, TotalDamage = storms$CROPDMG + 
    storms$PROPDMG), by = list(EventType = storms$EVTYPE), FUN = mean, na.rm = TRUE)
# subset events with total damages greater than 0
economics_mean <- subset(economics_mean, economics_mean$TotalDamage > 0)
# find the top ten event types in terms of total damages (crop damage + property damage)
economics_mean_top_ten <- head(economics_mean[order(-economics_mean$TotalDamage), ], 10)

# display a table with the results
economics_mean_top_ten_table <- xtable(economics_mean_top_ten, caption = "Top 10 Storm Events - Average Total Damages")
print(economics_mean_top_ten_table, type = "html", include.rownames = FALSE)
Top 10 Storm Events - Average Total Damages
EventType CropDamage PropertyDamage TotalDamage
TROPICAL STORM GORDON 500.00 500.00 1000.00
COASTAL EROSION 0.00 766.00 766.00
HEAVY RAIN AND FLOOD 0.00 600.00 600.00
RIVER AND STREAM FLOOD 0.00 600.00 600.00
DUST STORM HIGH WINDS 500.00 50.00 550.00
HIGH WINDS COLD 401.00 122.00 523.00
FOREST FIRES 500.00 5.00 505.00
BLIZZARD WINTER STORM 0.00 500.00 500.00
FLASH FLOODING THUNDERSTORM WI 0.00 500.00 500.00
FLOOD RIVER FLOOD 0.00 500.00 500.00

Visualization - Distribution of Damages

The distribution of total damages for events which had damages is highly skewed.

storms_with_damage <- subset(storms, CROPDMG > 0 | PROPDMG > 0)
storms_with_damage$total_damage <- storms_with_damage$CROPDMG + storms_with_damage$PROPDMG
plot1 <- ggplot(storms_with_damage, aes(x = total_damage)) + xlab("Total Damages") + ylab("Events") + geom_histogram(colour = "black", 
    fill = "cadetblue1", binwidth = 50) + theme_bw()
plot1

A log10 transformation of total damages results in a nearly normal distribution, with very little skew or kurtosis. Summary statistics and a Q-Q normal plot confirm this.

storms_with_damage$log_total_damage <- log10(storms_with_damage$total_damage)
plot2 <- ggplot(storms_with_damage, aes(x = log_total_damage)) + xlab("Log10 of Total Damages") + ylab("Events") + geom_histogram(colour = "black", 
    fill = "mediumpurple1", binwidth = 0.25) + theme_bw()
plot2

describe(storms_with_damage$log_total_damage)
##   vars      n mean   sd median trimmed  mad min max range  skew kurtosis se
## 1    1 245031 0.98 0.84      1    0.97 0.79  -2 3.7   5.7 -0.12     0.38  0
# looks normal; draw a Q-Q plot
qqnorm(storms_with_damage$log_total_damage, col = "darkorange", main = "Normal Q-Q Plot\nLog10 of Total Damages")
qqline(storms_with_damage$log_total_damage, col = "blue", lwd = 1)

Fatalities and Injuries Due to Storm Events

Following are the top event types by total casualties.

# aggregate fatalities and injuries for events
health <- aggregate(list(Fatalities = storms$FATALITIES, Injuries = storms$INJURIES, TotalCasualties = storms$FATALITIES + 
    storms$INJURIES), by = list(EventType = storms$EVTYPE), FUN = sum, na.rm = TRUE)
# subset events with total casualties > 0
health <- subset(health, health$TotalCasualties > 0)
# find the top ten event types in terms of total damages (crop damage + property damage)
health_top_ten <- head(health[order(-health$TotalCasualties), ], 10)

# display a table with the results
health_top_ten_table <- xtable(health_top_ten, caption = "Top 10 Storm Event Types - Total Casualties")
print(health_top_ten_table, type = "html", include.rownames = FALSE)
Top 10 Storm Event Types - Total Casualties
EventType Fatalities Injuries TotalCasualties
TORNADO 5633.00 91346.00 96979.00
EXCESSIVE HEAT 1903.00 6525.00 8428.00
TSTM WIND 504.00 6957.00 7461.00
FLOOD 470.00 6789.00 7259.00
LIGHTNING 817.00 5230.00 6047.00
HEAT 937.00 2100.00 3037.00
FLASH FLOOD 978.00 1777.00 2755.00
ICE STORM 89.00 1975.00 2064.00
THUNDERSTORM WIND 133.00 1488.00 1621.00
WINTER STORM 206.00 1321.00 1527.00

Following are the top event types by average number of casualties. Again, variation in the EVTYPE column leads to questionable results.

# mean number of casualties
health_mean <- aggregate(list(Fatalities = storms$FATALITIES, Injuries = storms$INJURIES, TotalCasualties = storms$FATALITIES + 
    storms$INJURIES), by = list(EventType = storms$EVTYPE), FUN = mean, na.rm = TRUE)
# subset events with total casualties > 0
health_mean <- subset(health_mean, health_mean$TotalCasualties > 0)
# find the top ten event types in terms of total damages (crop damage + property damage)
health_mean_top_ten <- head(health_mean[order(-health_mean$TotalCasualties), ], 10)

# display a table with the results
health_mean_top_ten_table <- xtable(health_mean_top_ten, caption = "Top 10 Storm Events - Mean Casualties")
print(health_mean_top_ten_table, type = "html", include.rownames = FALSE)
Top 10 Storm Events - Mean Casualties
EventType Fatalities Injuries TotalCasualties
TROPICAL STORM GORDON 8.00 43.00 51.00
WILD FIRES 0.75 37.50 38.25
THUNDERSTORMW 0.00 27.00 27.00
TORNADOES TSTM WIND HAIL 25.00 0.00 25.00
HIGH WIND AND SEAS 3.00 20.00 23.00
HEAT WAVE DROUGHT 4.00 15.00 19.00
SNOW HIGH WINDS 0.00 18.00 18.00
HURRICANE TYPHOON 0.73 14.49 15.22
GLAZE ICE STORM 0.00 15.00 15.00
COLD AND SNOW 14.00 0.00 14.00

Appendix A - Programming Environment

sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] xtable_1.7-4    gridExtra_0.9.1 ggplot2_1.0.0   stringr_0.6.2   psych_1.4.8.11  knitr_1.7      
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.2-4 digest_0.6.4     evaluate_0.5.5   formatR_1.0      gtable_0.1.2     htmltools_0.2.6 
##  [7] labeling_0.3     MASS_7.3-35      munsell_0.4.2    plyr_1.8.1       proto_0.3-10     Rcpp_0.11.3     
## [13] reshape2_1.4     rmarkdown_0.3.3  scales_0.2.4     tools_3.1.2      yaml_2.1.13