Synopsis

The goal of this report is to explore the NOAA Storm Database and answer some basic questions about severe weather events. The questions we are going to answer are: which weather events are the most damaging to the economy, and which events cause the most injuries and the most fatalities.

Data processing

In this section we download the bz2 archive with the data from the Internet if it has not been downloaded yet. To save disk space, we decompress the archive on the fly while reading the data. As this process is time-consuming, we cache the result.

# Fetch the compressed storm data only when no local copy exists yet.
archive_file <- "repdata-data-StormData.csv.bz2"
archive_url <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!file.exists(archive_file)) {
  download.file(archive_url, archive_file)
}

# Read the CSV straight through a bzfile() connection, so the archive is
# decompressed while reading and no extracted copy is written to disk.
data <- read.csv(bzfile(archive_file))

Once the data is loaded, we have to tidy and clean it. Below is the code that performs these operations.

# Keep only the columns used by this analysis.
wanted_columns <- c(
  "BGN_DATE", "COUNTYNAME", "STATE", "EVTYPE", "MAG", "FATALITIES",
  "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
is_wanted <- colnames(data) %in% wanted_columns
data_tiny <- data[, is_wanted]

# Parse the event start date; the format includes the literal " 0:00:00"
# suffix, so values without that exact suffix become NA.
data_tiny$Date <- as.Date(data_tiny$BGN_DATE, format = "%m/%d/%Y 0:00:00")

# Cleaning data - consolidating/grouping similar event types, fixing typos, etc.
# Work on plain character labels: if EVTYPE was read in as a factor, assigning
# a label that is not already one of its levels would silently produce NA.
data_tiny$EVTYPE <- as.character(data_tiny$EVTYPE)

# Each entry maps a canonical event type (the name) to the raw EVTYPE labels
# that are folded into it. Matching is exact and case-sensitive, which is why
# some mixed-case variants are listed explicitly. Entries are applied in the
# order listed.
evtype_groups <- list(
  "EXCESSIVE HEAT" = c(
    "HEAT", "HEAT WAVE", "EXTREME HEAT", "UNSEASONABLY WARM AND DRY",
    "UNSEASONABLY WARM", "HEAT WAVES", "DROUGHT/EXCESSIVE HEAT",
    "RECORD/EXCESSIVE HEAT", "RECORD HEAT", "HEAT WAVE DROUGHT", "Heat Wave"
  ),
  "TORNADO" = c("TORNADOES, TSTM WIND, HAIL", "WATERSPOUT/TORNADO"),
  "RIP CURRENTS" = c("RIP CURRENT"),
  "STRONG WIND" = c(
    "TSTM WIND", "HIGH WIND", "THUNDERSTORM WINDS", "HIGH WINDS", "WIND",
    "THUNDERSTORM WIND", "STRONG WINDS", "TSTM WIND/HAIL", "GUSTY WINDS",
    "HIGH WIND/SEAS", "HIGH WIND AND SEAS", "HIGH WINDS/SNOW",
    "HURRICANE OPAL/HIGH WINDS", "WINDS", "GUSTY WIND", "Strong Winds",
    "Gusty winds"
  ),
  "HURRICANE/TYPHOON" = c("HURRICANE"),
  "EXTREME COLD" = c(
    "EXTREME COLD/WIND CHILL", "COLD", "COLD AND SNOW", "LOW TEMPERATURE",
    "COLD WAVE", "Cold", "COLD/WIND CHILL", "Cold Temperature",
    "Extreme Cold", "COLD WEATHER", "WINTER WEATHER", "RECORD COLD",
    "SNOW/ BITTER COLD"
  ),
  "FLOOD" = c(
    "FLOOD/FLASH FLOOD", "FLASH FLOOD/FLOOD", "FLOODING",
    "FLOOD & HEAVY RAIN", "FLOOD/RIVER FLOOD", "MINOR FLOODING",
    "FLASH FLOOD"
  ),
  "HIGH SURF" = c("HEAVY SURF/HIGH SURF"),
  "WILD/FOREST FIRE" = c("WILDFIRE", "WILD FIRES")
)

for (canonical in names(evtype_groups)) {
  is_variant <- data_tiny$EVTYPE %in% evtype_groups[[canonical]]
  data_tiny$EVTYPE[is_variant] <- canonical
}

# Convert a damage figure and its K/M/B exponent code into a plain number.
#
# Args:
#   num:        damage value(s); coerced with as.numeric().
#   multiplier: exponent code(s), matched case-insensitively: "K" scales by
#               a thousand, "M" by a million, "B" by a billion. Any other
#               code (including "" and NA) leaves `num` unscaled.
#
# Returns:
#   Unnamed numeric vector: `num` multiplied by the scale selected by
#   `multiplier`. Vectorised over both arguments, so whole columns can be
#   passed in one call; scalar calls give the same result as before.
normalize <- function(num, multiplier) {
  num <- as.numeric(num)
  codes <- c("K", "M", "B")
  scales <- c(1e3, 1e6, 1e9)
  # match() yields NA for unrecognised or missing codes; those get scale 1.
  scale <- scales[match(toupper(multiplier), codes)]
  scale[is.na(scale)] <- 1
  num * scale
}

# Converting "K", "M", "B" damage notation into numbers.
# The columns are walked pairwise with mapply() rather than apply(): apply()
# would first coerce the data frame to a character matrix, so the numeric
# PROPDMG values would be round-tripped through text before normalize()
# parsed them again.
# NOTE(review): only property damage is converted here; CROPDMG/CROPDMGEXP are
# kept in data_tiny but not added to Damage - confirm whether crop losses
# should be included.
data_tiny$Damage <- mapply(
  normalize,
  data_tiny$PROPDMG,
  as.character(data_tiny$PROPDMGEXP),
  USE.NAMES = FALSE
)

Results

Damage from the weather events

library("ggplot2")

# Calculating & preparing economic damage data for plotting:
# sum the Damage column per event type (EVTYPE), largest total first.
economic_damage <- aggregate(Damage ~ EVTYPE, data = data_tiny, FUN = sum)
economic_damage <- economic_damage[order(-economic_damage$Damage), ]
# Turn EVTYPE into a factor whose levels follow the damage ordering, so the
# plot keeps that order instead of sorting event types alphabetically.
economic_damage$EVTYPE <- factor(
  economic_damage$EVTYPE,
  levels = as.character(economic_damage$EVTYPE)
)

# Bar chart of the 15 event types with the highest totals. The fill legend is
# hidden with guides(fill = "none"); the logical form guides(fill = FALSE) is
# deprecated in ggplot2.
ggplot(data = head(economic_damage, 15), aes(x = EVTYPE, y = Damage)) +
  geom_bar(aes(fill = Damage), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  guides(fill = "none") +
  coord_flip() # + scale_y_continuous(labels = comma)

Injuries from the weather events

# Calculating & preparing injuries data for plotting:
# sum INJURIES per event type (EVTYPE), largest total first.
injuries <- aggregate(INJURIES ~ EVTYPE, data = data_tiny, FUN = sum)
injuries <- injuries[order(-injuries$INJURIES), ]
# Factor levels follow the injury ordering so the plot keeps that order
# instead of sorting event types alphabetically.
injuries$EVTYPE <- factor(
  injuries$EVTYPE,
  levels = as.character(injuries$EVTYPE)
)

# Bar chart of the 15 event types with the most injuries. The fill legend is
# hidden with guides(fill = "none"); the logical form guides(fill = FALSE) is
# deprecated in ggplot2.
ggplot(data = head(injuries, 15), aes(x = EVTYPE, y = INJURIES)) +
  geom_bar(aes(fill = INJURIES), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  guides(fill = "none") +
  coord_flip()

Fatalities from the weather events

# Calculating & preparing fatalities data for plotting:
# sum FATALITIES per event type (EVTYPE), largest total first.
fatalities <- aggregate(FATALITIES ~ EVTYPE, data = data_tiny, FUN = sum)
fatalities <- fatalities[order(-fatalities$FATALITIES), ]
# Factor levels follow the fatality ordering so the plot keeps that order
# instead of sorting event types alphabetically.
fatalities$EVTYPE <- factor(
  fatalities$EVTYPE,
  levels = as.character(fatalities$EVTYPE)
)

# Bar chart of the 15 event types with the most fatalities. The fill legend is
# hidden with guides(fill = "none"); the logical form guides(fill = FALSE) is
# deprecated in ggplot2.
ggplot(data = head(fatalities, 15), aes(x = EVTYPE, y = FATALITIES)) +
  geom_bar(aes(fill = FATALITIES), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  guides(fill = "none") +
  coord_flip()

.