Summary

The purpose of this report is to determine the impact of weather events in the United States from 1950 to November 2011 on:

  1. Population health (injuries + fatalities)
  2. Economic damage (property + crop damages)

This analysis is based on the NOAA Storm Database, available for download on the Coursera website https://www.coursera.org/learn/reproducible-research/peer/OMZ37/course-project-2. A guide to the dataset is available at https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf.

The most harmful events in terms of population health are TORNADOES, while the most economically devastating events are FLOODS. However, it is important to note that the total economic toll of floods over the period is skewed by a 2006 flood in Napa that caused $115B in damages. Without this one-time event, HURRICANES would cause the most economic damage.

Data Processing

Load the data

# Print large numbers in full rather than in scientific notation
options(scipen = 999)
# Read the bzip2-compressed storm data CSV from the working directory
storm_archive <- bzfile("repdata%2Fdata%2FStormData.csv.bz2")
df <- read.csv(storm_archive)

Keep only the variables of interest to the analysis.

names(df)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
# Keep the event type plus the health and economic-damage columns.
# Columns are selected by name (these are positions 8 and 23:28 in the
# listing above) so the subset does not silently pick the wrong columns if
# the column order of the input ever differs.
keep_cols <- c("EVTYPE", "FATALITIES", "INJURIES",
               "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
df <- df[, keep_cols]

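Convert the character exponent codes of the economic damage variables (PROPDMGEXP and CROPDMGEXP) into numeric multipliers, and use them to create new numeric damage variables (prop and crop).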

# Inspect the distinct exponent codes present in each column
unique(df$PROPDMGEXP)
##  [1] K M   B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels:  - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
unique(df$CROPDMGEXP)
## [1]   M K m B ? 0 k 2
## Levels:  ? 0 2 B k K m M
# Normalize the exponent codes to upper case
df$PROPDMGEXP <- toupper(df$PROPDMGEXP)
df$CROPDMGEXP <- toupper(df$CROPDMGEXP)

# Scale a damage amount by its exponent code:
#   H = x100, K = x1,000, M = x1,000,000, B = x1,000,000,000.
# Any other code (blank, digits, "+", "-", "?") leaves the amount unscaled.
scale_damage <- function(amount, code) {
  multipliers <- c(H = 100, K = 1000, M = 1000000, B = 1000000000)
  scaled <- amount
  for (letter in names(multipliers)) {
    scaled <- ifelse(code == letter, amount * multipliers[[letter]], scaled)
  }
  scaled
}

df$prop <- scale_damage(df$PROPDMG, df$PROPDMGEXP)
df$crop <- scale_damage(df$CROPDMG, df$CROPDMGEXP)

Add the crop and property damage variables to create a total damage variable.

# Total economic damage per record = property damage + crop damage (USD)
df$TOTALDMG <- with(df, prop + crop)

Add fatalities and injuries to create a total harm variable.

# Total human harm per record = fatalities + injuries
df$TOTALHRM <- with(df, FATALITIES + INJURIES)

Clean up the EVTYPE names and consolidate them into broader event categories, to better determine which types of events have the greatest impact.

# Upper-case the event names so the patterns below match regardless of case
df$EVTYPE <- toupper(df$EVTYPE)

# Drop rows whose EVTYPE contains "SUMMARY"
df <- df[!grepl("SUMMARY", df$EVTYPE), ]

# Consolidation rules, written as: category label = regular expression that
# is matched against EVTYPE. The rules are applied in the order listed and a
# later match overwrites an earlier one, so the order is significant.
# NOTE(review): the "STORM" alternative in the THUNDERSTORM rule would also
# match names such as "WINTER STORM" or "STORM SURGE", overriding the earlier
# SNOW / SURF assignment -- confirm this is intended.
event_rules <- c(
  "SURF" = "SURF|WAVE|TIDE|SWELL|SEAS|SURGE|CURRENT|MARINE",
  "HEAT" = "HEAT|WARM|DRY|DROUGHT|HOT|RECORD TEMPERATURE|DRIEST|HIGH TEMP|RECORD HIGH",
  "RAIN" = "RAIN|PRECIP|WET|SHOWER",
  "SNOW" = "SNOW|WINTER|WINTRY|SLEET",
  "COLD" = "COLD|COOL|LOW TEMP|RECORD LOW|HYPOTHERM",
  "ICE" = "ICE|FREEZ|FROST|ICY",
  "WIND" = "WIND|WND|GUST|TURBULENCE",
  "VOLCANIC ACTIVITY" = "VOLCAN",
  "DUST" = "DUST",
  "FIRE" = "FIRE|SMOKE",
  "HURRICANE" = "HURRICANE|FLOYD",
  "LANDSLIDE_AND_EROSION" = "SLIDE|EROSION",
  "AVALANCHE" = "AVALANC",
  "WALL CLOUD" = "CLOUD",
  "THUNDERSTORM" = "TSTM|THUNDER|LIGHTNING|STORM",
  "FLOOD" = "FLOOD|FLD|HIGH WATER|RISING WATER|SMALL STREAM|FLOOOD",
  "TORNADO" = "TORNADO|TORNDAO",
  "BLIZZARD" = "BLIZZARD",
  "HAIL" = "HAIL",
  "WATERSPOUT" = "SPOUT"
)

# Start from the raw event name; names matching no rule are kept unchanged
df$event <- df$EVTYPE
for (label in names(event_rules)) {
  df$event[grepl(event_rules[[label]], df$EVTYPE)] <- label
}

Results

Which events cause the most population harm?

# Compute total human harm per event type
library(plyr)
x <- ddply(df, .(event), summarize,
           HARM = sum(TOTALHRM),
           FATALITIES = sum(FATALITIES),
           INJURIES = sum(INJURIES))

# Trim the list of results to include only events with significant total harm
# (at least 1,000 fatalities + injuries).
x <- x[x$HARM >= 1000, ]

# Reshape to long form: one row per (event, type of harm), so fatalities and
# injuries can be drawn as stacked segments of the same bar.
library(reshape2)
y <- melt(x, id = c("event"), measure.vars = c("FATALITIES", "INJURIES"))
y <- rename(y, c("variable" = "type_of_harm"))

# Order the event factor from most to least harmful by TOTAL harm, which is
# the height of each stacked bar. Ordering by the individual melted values
# would rank each event by the larger of its two components instead.
y$event <- factor(y$event, levels = x$event[order(-x$HARM)])

# Plot the results
library(ggplot2)
ggplot(data = y, aes(event, value, fill = type_of_harm)) +
    geom_bar(stat = "identity") +
    xlab("Weather Event") +
    ylab("Injuries + Fatalities") +
    ggtitle("Top Weather Events That Cause Human Harm") +
    theme(axis.text.x = element_text(angle = 45, size = 8, hjust = 1)) +
    scale_y_continuous(labels = scales::comma)

TORNADOES cause the most fatalities and injuries.

Which events cause the most economic damage?

# Total economic damage (TOTALDMG) summed for each consolidated event type
z <- ddply(df, .(event), summarize, DAMAGE = sum(TOTALDMG))

# Retain only event types whose total damage reaches one billion USD
z <- z[z$DAMAGE >= 1e9, ]

# Order the event factor from most to least damaging so the bars are sorted
damage_rank <- order(-z$DAMAGE)
z$event <- factor(z$event, levels = z$event[damage_rank])

# Bar chart of total damage by event type
ggplot(data = z, aes(x = event, y = DAMAGE)) +
    geom_bar(stat = "identity") +
    labs(x = "Weather Event",
         y = "Damage (in USD)",
         title = "Top Weather Events That Cause Economic Damage") +
    scale_y_continuous(labels = scales::comma) +
    theme(axis.text.x = element_text(angle = 45, size = 8, hjust = 1))

FLOODS cause the most economic damage.

Are there certain discrete events that may be skewing the results?

# Restrict to individual records with at least one million USD in damage
xx <- df[df$TOTALDMG >= 1e6, ]
# Scatter of per-record damage by event type, to expose single outlier events
ggplot(data = xx, aes(x = event, y = TOTALDMG, color = event)) +
    geom_point() +
    xlab("Weather Event") +
    ylab("Damage (in USD)") +
    ggtitle("Distribution of Damages by Weather Event") +
    theme(axis.text.x = element_text(angle = 45, size = 8, hjust = 1)) +
    scale_y_continuous(labels = scales::comma)

Yes, one flood in Napa in 2006 caused $115B in damages. If we remove that event, HURRICANES cause the most economic damage.