# load all libraries necessary for analysis
library(knitr)
library(psych)
library(stringr)
library(ggplot2)
library(gridExtra)
library(xtable)
# set global options for code chunks
opts_chunk$set(tidy = TRUE, fig.width = 12)
# set width so describe() output does not wrap
options(width = 120)
# seed value for random number generators
my_seed <- 666
This analysis of U.S. storm event data shows:
The data for this assignment came in the form of a CSV file compressed via the bzip2 algorithm. The file was downloaded from the course web site. Documentation of the database is available here:
National Weather Service Storm Data Documentation
National Climatic Data Center Storm Events FAQ
According to the course web site, events in the database begin in 1950 and end in 2011. Earlier years have fewer events recorded. Recent years are more complete.
So, let’s read the CSV file.
storms <- read.csv("repdata-data-StormData.csv", header = TRUE)
We need to clean the data before we can analyze it. I am interested in using the EVTYPE column in my analysis. There is a lot of cleanup and standardization that could be performed on this variable that I will not be doing for lack of time. There are, for instance, many variants of “FLOOD”" or “FLOODING” that should probably be coded the same way. This variation will lead to strange, questionable results when I calculate the mean damages and casualties by event type.
However, for purposes of this report, I will do the following:
# clean up EVTYPE column remove non-alphanumeric characters
storms$EVTYPE <- gsub("[^A-Za-z0-9]", " ", storms$EVTYPE)
# trim and uppercase
storms$EVTYPE <- str_trim(toupper(storms$EVTYPE))
# replace multiple spaces with a single space
storms$EVTYPE <- gsub(" ", " ", storms$EVTYPE)
# convert EVTYPE to a factor
storms$EVTYPE <- factor(storms$EVTYPE)
# extract just the columns we'll need for our analysis
storms <- storms[, c(8, 23, 24, 25, 27)]
Now that the data has been cleaned and transformed the way I want it, let’s look at some summary statistics. Note that the distributions of FATALITIES, INJURIES, PROPDMG, and CROPDMG are highly skewed to the right.
# describe from the psych package
describe(storms)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## EVTYPE* 1 902297 445.39 262.72 392 445.38 381.03 1 843 842 0.07 -1.83 0.28
## FATALITIES 2 902297 0.02 0.77 0 0.00 0.00 0 583 583 520.22 377440.21 0.00
## INJURIES 3 902297 0.16 5.43 0 0.00 0.00 0 1700 1700 141.80 30496.31 0.01
## PROPDMG 4 902297 12.06 59.48 0 0.95 0.00 0 5000 5000 11.75 410.90 0.06
## CROPDMG 5 902297 1.53 22.17 0 0.00 0.00 0 990 990 22.22 580.73 0.02
# structure
str(storms)
## 'data.frame': 902297 obs. of 5 variables:
## $ EVTYPE : Factor w/ 843 levels "","ABNORMAL WARMTH",..: 714 714 714 714 714 714 714 714 714 714 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
Following are the top event types by total damages. Damage numbers are in 1000’s of dollars.
# aggregate crop and property damage for economic costs of events
economics <- aggregate(list(CropDamage = storms$CROPDMG, PropertyDamage = storms$PROPDMG, TotalDamage = storms$CROPDMG +
storms$PROPDMG), by = list(EventType = storms$EVTYPE), FUN = sum, na.rm = TRUE)
# subset events with total damages greater than 0
economics <- subset(economics, economics$TotalDamage > 0)
# find the top ten event types in terms of total damages (crop damage + property damage)
economics_top_ten <- head(economics[order(-economics$TotalDamage), ], 10)
# display a table with the results
economics_top_ten_table <- xtable(economics_top_ten, caption = "Top 10 Storm Event Types - Total Damages")
print(economics_top_ten_table, type = "html", include.rownames = FALSE)
| EventType | CropDamage | PropertyDamage | TotalDamage |
|---|---|---|---|
| TORNADO | 100018.52 | 3212258.16 | 3312276.68 |
| FLASH FLOOD | 179200.46 | 1420674.59 | 1599875.05 |
| TSTM WIND | 109202.60 | 1336103.61 | 1445306.21 |
| HAIL | 579596.28 | 688693.38 | 1268289.66 |
| FLOOD | 168037.88 | 899938.48 | 1067976.36 |
| THUNDERSTORM WIND | 66794.45 | 876844.17 | 943638.62 |
| LIGHTNING | 3580.61 | 603351.78 | 606932.39 |
| THUNDERSTORM WINDS | 18684.93 | 446323.18 | 465008.11 |
| HIGH WIND | 17283.21 | 324731.56 | 342014.77 |
| WINTER STORM | 1978.99 | 132720.59 | 134699.58 |
Following are the top event types by mean total damages. Variation in the EVTYPE leads to some questionable results.
# average damage per event
economics_mean <- aggregate(list(CropDamage = storms$CROPDMG, PropertyDamage = storms$PROPDMG, TotalDamage = storms$CROPDMG +
storms$PROPDMG), by = list(EventType = storms$EVTYPE), FUN = mean, na.rm = TRUE)
# subset events with total damages greater than 0
economics_mean <- subset(economics_mean, economics_mean$TotalDamage > 0)
# find the top ten event types in terms of total damages (crop damage + property damage)
economics_mean_top_ten <- head(economics_mean[order(-economics_mean$TotalDamage), ], 10)
# display a table with the results
economics_mean_top_ten_table <- xtable(economics_mean_top_ten, caption = "Top 10 Storm Events - Average Total Damages")
print(economics_mean_top_ten_table, type = "html", include.rownames = FALSE)
| EventType | CropDamage | PropertyDamage | TotalDamage |
|---|---|---|---|
| TROPICAL STORM GORDON | 500.00 | 500.00 | 1000.00 |
| COASTAL EROSION | 0.00 | 766.00 | 766.00 |
| HEAVY RAIN AND FLOOD | 0.00 | 600.00 | 600.00 |
| RIVER AND STREAM FLOOD | 0.00 | 600.00 | 600.00 |
| DUST STORM HIGH WINDS | 500.00 | 50.00 | 550.00 |
| HIGH WINDS COLD | 401.00 | 122.00 | 523.00 |
| FOREST FIRES | 500.00 | 5.00 | 505.00 |
| BLIZZARD WINTER STORM | 0.00 | 500.00 | 500.00 |
| FLASH FLOODING THUNDERSTORM WI | 0.00 | 500.00 | 500.00 |
| FLOOD RIVER FLOOD | 0.00 | 500.00 | 500.00 |
The distribution of total damages for events which had damages is highly skewed.
storms_with_damage <- subset(storms, CROPDMG > 0 | PROPDMG > 0)
storms_with_damage$total_damage <- storms_with_damage$CROPDMG + storms_with_damage$PROPDMG
plot1 <- ggplot(storms_with_damage, aes(x = total_damage)) + xlab("Total Damages") + ylab("Events") + geom_histogram(colour = "black",
fill = "cadetblue1", binwidth = 50) + theme_bw()
plot1
A log10 transformation of total damages results in a nearly normal distribution, with very little skew or kurtosis. Summary statistics and a Q-Q normal plot confirm this.
storms_with_damage$log_total_damage <- log10(storms_with_damage$total_damage)
plot2 <- ggplot(storms_with_damage, aes(x = log_total_damage)) + xlab("Log10 of Total Damages") + ylab("Events") + geom_histogram(colour = "black",
fill = "mediumpurple1", binwidth = 0.25) + theme_bw()
plot2
describe(storms_with_damage$log_total_damage)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 245031 0.98 0.84 1 0.97 0.79 -2 3.7 5.7 -0.12 0.38 0
# looks normal; draw a Q-Q plot
qqnorm(storms_with_damage$log_total_damage, col = "darkorange", main = "Normal Q-Q Plot\nLog10 of Total Damages")
qqline(storms_with_damage$log_total_damage, col = "blue", lwd = 1)
Following are the top event types by total casualties.
# aggregate fatalities and injuries for events
health <- aggregate(list(Fatalities = storms$FATALITIES, Injuries = storms$INJURIES, TotalCasualties = storms$FATALITIES +
storms$INJURIES), by = list(EventType = storms$EVTYPE), FUN = sum, na.rm = TRUE)
# subset events with total casualties > 0
health <- subset(health, health$TotalCasualties > 0)
# find the top ten event types in terms of total damages (crop damage + property damage)
health_top_ten <- head(health[order(-health$TotalCasualties), ], 10)
# display a table with the results
health_top_ten_table <- xtable(health_top_ten, caption = "Top 10 Storm Event Types - Total Casualties")
print(health_top_ten_table, type = "html", include.rownames = FALSE)
| EventType | Fatalities | Injuries | TotalCasualties |
|---|---|---|---|
| TORNADO | 5633.00 | 91346.00 | 96979.00 |
| EXCESSIVE HEAT | 1903.00 | 6525.00 | 8428.00 |
| TSTM WIND | 504.00 | 6957.00 | 7461.00 |
| FLOOD | 470.00 | 6789.00 | 7259.00 |
| LIGHTNING | 817.00 | 5230.00 | 6047.00 |
| HEAT | 937.00 | 2100.00 | 3037.00 |
| FLASH FLOOD | 978.00 | 1777.00 | 2755.00 |
| ICE STORM | 89.00 | 1975.00 | 2064.00 |
| THUNDERSTORM WIND | 133.00 | 1488.00 | 1621.00 |
| WINTER STORM | 206.00 | 1321.00 | 1527.00 |
Following are the top event types by average number of casualties. Again, variation in the EVTYPE column leads to questionable results.
# mean number of casualties
health_mean <- aggregate(list(Fatalities = storms$FATALITIES, Injuries = storms$INJURIES, TotalCasualties = storms$FATALITIES +
storms$INJURIES), by = list(EventType = storms$EVTYPE), FUN = mean, na.rm = TRUE)
# subset events with total casualties > 0
health_mean <- subset(health_mean, health_mean$TotalCasualties > 0)
# find the top ten event types in terms of total damages (crop damage + property damage)
health_mean_top_ten <- head(health_mean[order(-health_mean$TotalCasualties), ], 10)
# display a table with the results
health_mean_top_ten_table <- xtable(health_mean_top_ten, caption = "Top 10 Storm Events - Mean Casualties")
print(health_mean_top_ten_table, type = "html", include.rownames = FALSE)
| EventType | Fatalities | Injuries | TotalCasualties |
|---|---|---|---|
| TROPICAL STORM GORDON | 8.00 | 43.00 | 51.00 |
| WILD FIRES | 0.75 | 37.50 | 38.25 |
| THUNDERSTORMW | 0.00 | 27.00 | 27.00 |
| TORNADOES TSTM WIND HAIL | 25.00 | 0.00 | 25.00 |
| HIGH WIND AND SEAS | 3.00 | 20.00 | 23.00 |
| HEAT WAVE DROUGHT | 4.00 | 15.00 | 19.00 |
| SNOW HIGH WINDS | 0.00 | 18.00 | 18.00 |
| HURRICANE TYPHOON | 0.73 | 14.49 | 15.22 |
| GLAZE ICE STORM | 0.00 | 15.00 | 15.00 |
| COLD AND SNOW | 14.00 | 0.00 | 14.00 |
sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] xtable_1.7-4 gridExtra_0.9.1 ggplot2_1.0.0 stringr_0.6.2 psych_1.4.8.11 knitr_1.7
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.2-4 digest_0.6.4 evaluate_0.5.5 formatR_1.0 gtable_0.1.2 htmltools_0.2.6
## [7] labeling_0.3 MASS_7.3-35 munsell_0.4.2 plyr_1.8.1 proto_0.3-10 Rcpp_0.11.3
## [13] reshape2_1.4 rmarkdown_0.3.3 scales_0.2.4 tools_3.1.2 yaml_2.1.13