Synopsis.

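This analysis uses the US National Oceanic and Atmospheric Administration’s (NOAA) storm database, which tracks characteristics of major storms and weather events in the US. The focus of this analysis is to answer two questions: which types of events are most harmful to population health (fatalities and injuries), and which types of events have the greatest economic consequences (property and crop damage)?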

Based on preliminary exploratory analysis of the data, the event type data was very noisy. In my opinion, tidying it would probably not have improved or altered the findings significantly. My approach was therefore to focus on answering the questions directly, which I believe the Results section below does.

Data Processing.

Population Health.

The data were loaded as follows, and two data frames, top10_fatalities and top10_injuries, were created containing the ten event types that caused the most fatalities and the most injuries, respectively. These are plotted in the Results section below.

# Read the raw storm data; the path is relative, so the CSV must be in the
# current working directory.
df1 <- read.csv("repdata-data-StormData.csv", header = TRUE)
# Attach plyr (ddply) and ggplot2 (plots). library() stops with an error when a
# package is missing, whereas require() only returns FALSE and lets the script
# fail later with a less obvious message.
library(plyr)
library(ggplot2)

# Fatalities: keep only the records that report at least one death
df1.fatalities <- subset(df1, FATALITIES > 0)
# total deaths per event type
summ1 <- ddply(df1.fatalities, .(EVTYPE), summarise, tot = sum(FATALITIES))
# rank the event types from most to fewest deaths
summ1_sorted <- summ1[with(summ1, order(-tot)), ]
# the ten event types with the highest death totals
top10_fatalities <- head(summ1_sorted, n = 10)

# Injuries: same steps, applied to the INJURIES column
df1.injuries <- subset(df1, INJURIES > 0)
summ2 <- ddply(df1.injuries, .(EVTYPE), summarise, tot = sum(INJURIES))
# rank the event types from most to fewest injuries
summ2_sorted <- summ2[with(summ2, order(-tot)), ]
top10_injuries <- head(summ2_sorted, n = 10)

Economic Consequences

The following code creates a data frame of the events causing the most costly property and crop damage. These are plotted in the Results section below.

It creates a subset of rows where property damage exists and drops rows where the property damage exponent is not B, H, K or M, signifying billions, hundreds, thousands and millions, respectively.

# Attach the packages used below. library() raises an error if a package is
# missing; require() would only return FALSE and let later calls fail.
library(plyr)
library(ggplot2)

# Multiplier applied to PROPDMG for each exponent code kept in the analysis
# (B = billions, M = millions, K = thousands, H = hundreds).
propdmg_multipliers <- c(B = 1e9, M = 1e6, K = 1e3, H = 1e2)

# keep only the rows that report some property damage
df1.propdmg <- subset(df1, PROPDMG > 0)
# convert the exponent codes to upper case so "k" and "K" are treated alike
df1.propdmg$PROPDMGEXP <- toupper(df1.propdmg$PROPDMGEXP)
# drop every row whose exponent is not one of B, H, K or M
df1.propdmg <- subset(df1.propdmg, PROPDMGEXP %in% names(propdmg_multipliers))
# look up the multiplier for each remaining row by its exponent code
df1.propdmg$multiplier <- unname(propdmg_multipliers[df1.propdmg$PROPDMGEXP])
# computed cost of damage = reported amount x multiplier
df1.propdmg$computed_cost <- df1.propdmg$PROPDMG * df1.propdmg$multiplier
# total computed cost per event type
summ1 <- ddply(df1.propdmg, .(EVTYPE), summarise, tot = sum(computed_cost))
# tag these totals so they can be told apart once combined with crop damage
summ1$type_of_damage <- "Property"

# Crop damage: same processing as for property damage, on the CROPDMG columns

# rows that report some crop damage, with the exponent codes upper-cased
df1.cropdmg <- subset(df1, CROPDMG > 0)
df1.cropdmg$CROPDMGEXP <- toupper(df1.cropdmg$CROPDMGEXP)
# keep only the exponent codes B, H, K and M
df1.cropdmg <- subset(df1.cropdmg, CROPDMGEXP %in% c("B", "H", "K", "M"))
# translate each exponent code into its multiplier
df1.cropdmg$multiplier <- unname(
  c(B = 1000000000, M = 1000000, K = 1000, H = 100)[df1.cropdmg$CROPDMGEXP]
)
# computed cost of damage = reported amount x multiplier
df1.cropdmg$computed_cost <- with(df1.cropdmg, CROPDMG * multiplier)
# total computed cost per event type, tagged as crop damage
summ2 <- ddply(df1.cropdmg, .(EVTYPE), summarise, tot = sum(computed_cost))
summ2$type_of_damage <- "Crop"

# stack the property (summ1) and crop (summ2) totals into one data frame
summ1and2 <- rbind(summ1, summ2)
# order by total cost, most costly first, and keep the first 20 rows
summ1and2 <- summ1and2[with(summ1and2, order(-tot)), ]
top20 <- head(summ1and2, n = 20)

Results.

Plot the top 10 fatality events

# Dot plot of the event types in top10_fatalities, ordered by total fatalities.
plot1 <- ggplot(top10_fatalities, aes(x = tot, y = reorder(EVTYPE, tot))) +
  geom_point(size = 3) +
  # The axis title says "thousands", so print the tick labels in thousands
  # (0-6) instead of raw counts (0-6000); the plotted data stay unscaled.
  scale_x_continuous(breaks = seq(0, 6000, 1000), limits = c(0, 6000),
                     labels = function(x) x / 1000,
                     expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0.5)) +
  labs(x = "Fatalities (thousands)", y = NULL) +
  ggtitle("Fatalities by Event Type.") +
  theme_bw() +
  # horizontal guide lines only, one per event type
  theme(panel.grid.major.y = element_line(colour = "grey50"),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(size = rel(1.5), face = "bold", vjust = 1.5),
        axis.title.x = element_text(size = 13),
        axis.ticks.y = element_blank())
plot1

Plot the top 10 injury events

# Dot plot of the event types in top10_injuries, ordered by total injuries.
plot2 <- ggplot(top10_injuries, aes(x = tot, y = reorder(EVTYPE, tot))) +
  geom_point(size = 3) +
  # The axis title says "thousands", so print the tick labels in thousands
  # (0-100) instead of raw counts (0-100000); the plotted data stay unscaled.
  scale_x_continuous(breaks = seq(0, 100000, 10000), limits = c(0, 100000),
                     labels = function(x) x / 1000,
                     expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0.5)) +
  labs(x = "Injuries (thousands)", y = NULL) +
  ggtitle("Injuries by Event Type.") +
  theme_bw() +
  # horizontal guide lines only, one per event type
  theme(panel.grid.major.y = element_line(colour = "grey50"),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(size = rel(1.5), face = "bold", vjust = 1.5),
        axis.title.x = element_text(size = 13),
        axis.ticks.y = element_blank())

plot2

Plot the Crop and Property Damage Top 20.

# Scatter plot of the 20 largest damage totals in top20, one point per
# (event type, damage type) row, coloured by damage type. Built with ggplot()
# because qplot() is deprecated in current ggplot2 releases.
ggplot(top20, aes(x = tot, y = EVTYPE, colour = factor(type_of_damage))) +
  geom_point() +
  labs(x = "Cost of Damage", y = "Event Types (20 most costly.)",
       title = "Top 20 Events Causing Most Costly Damage.",
       colour = "Type of Damage.")

Conclusion.

While the analysis would be improved by a more time-consuming tidy up of the data, the chosen approach provides simple answers to the questions posed and allows the reader to quickly form a view, using the reproducible code included, which was the objective of the assignment.