Synopsis

In this report, we will be using the NOAA Storm database to analyze the most harmful natural weather events from 1950 to 2011.

Our goal is to answer two questions:

  1. Across the United States, which types of events are most harmful with respect to population health?
  2. Across the United States, which types of events have the greatest economic consequences?

Answers:

  1. Tornados are most harmful to population health

  2. Floods and Drought have the greatest economic consequences

Data Processing

## Import required libraries
library ("dplyr")
library(ggplot2)

## Check if the file exists before downloading another copy
if(!file.exists("StormData.csv.bz2")){
    download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
                  destfile = "StormData.csv.bz2")
}

## Read in the data from the bz2 archive
data <- read.csv("StormData.csv.bz2", stringsAsFactors = FALSE,
                 na.strings = "", header = TRUE)

## Character to date conversion on BGN_DATE
data$date <- as.Date(data$BGN_DATE, format="%m/%d/%Y")

## Subset the dataset, keeping only the columns needed for the analysis
keep <- c("BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES", "PROPDMG",
          "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
data <- data[keep]

## Remove events where no damages was recorded
data <- subset(data, FATALITIES>0 | INJURIES>0 | PROPDMG>0 | CROPDMG>0)

## Remove misc categories that don't fit the model
data$PROPDMGEXP[data$PROPDMGEXP == "1"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "2"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "3"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "4"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "5"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "6"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "7"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "8"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "+"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "0"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "?"] <- ""
data$PROPDMGEXP[data$PROPDMGEXP == "-"] <- ""

data$CROPDMGEXP[data$CROPDMGEXP == "2"] <- ""
data$CROPDMGEXP[data$CROPDMGEXP == "?"] <- ""
data$CROPDMGEXP[data$CROPDMGEXP == "0"] <- ""

## Fix lower/uppercase issues
data$PROPDMGEXP <- gsub("m", "M", data$PROPDMGEXP)
data$PROPDMGEXP <- gsub("h", "H", data$PROPDMGEXP)

data$CROPDMGEXP <- gsub("m", "M", data$CROPDMGEXP)
data$CROPDMGEXP <- gsub("k", "K", data$CROPDMGEXP)

## Convert character to numeric values to assist damage calculations
data$PROPDMGEXP[data$PROPDMGEXP =="H"] <- as.numeric(100)
data$PROPDMGEXP[data$PROPDMGEXP =="K"] <- as.numeric(1000)
data$PROPDMGEXP[data$PROPDMGEXP =="M"] <- as.numeric(1000000)
data$PROPDMGEXP[data$PROPDMGEXP =="B"] <- as.numeric(1000000000)

data$CROPDMGEXP[data$CROPDMGEXP =="K"] <- as.numeric(1000)
data$CROPDMGEXP[data$CROPDMGEXP =="M"] <- as.numeric(1000000)
data$CROPDMGEXP[data$CROPDMGEXP =="B"] <- as.numeric(1000000000)

## Make the PROD/CROP DMGEXP fields numeric
data$PROPDMGEXP <- as.numeric(data$PROPDMGEXP)
data$CROPDMGEXP <- as.numeric(data$CROPDMGEXP)

Results

Q: Which types of events are most harmful with respect to population health?

## Crate a new dataset by summing fatalities + injuries and returning top 10
health <- data%>%group_by (EVTYPE)%>%summarise (EVNTS = sum (FATALITIES + INJURIES))%>%top_n (10)
## Selecting by EVNTS
## Order by event count
health <- health[order(-health$EVNTS),]

## Plot the results
ggplot (health, aes (EVTYPE, EVNTS)) +  
    geom_bar (stat="identity",fill="red") +  
    theme (axis.text.x = element_text(angle = 25, hjust = 1)) +
    ggtitle ("Top 10 Most Dangerous Events") + 
    xlab ("Event Type") + ylab ("Casualties (Injuries + Fatalities)")

A: Tornados pose the greatest threat to population health


Q: Which types of events have the greatest economic consequences?

## Create two new datasets to distinguish crop damage and property damage
property <- subset(data, select = c("EVTYPE", "PROPDMG", "PROPDMGEXP"))
crops <- subset(data, select = c("EVTYPE", "CROPDMG", "CROPDMGEXP"))

## Calculate total damage by multiplying damage with multiplier
property$TOTALDMG <- property$PROPDMG * property$PROPDMGEXP
crops$TOTALDMG <- crops$CROPDMG * crops$CROPDMGEXP

## Remove all NAs from both datasets
property <- na.omit(property)
crops <- na.omit(crops)

## Create aggregate datasets for each type to graph
property_agg <- aggregate(property$TOTALDMG ~ property$EVTYPE, data = property, sum)
crops_agg <- aggregate(crops$TOTALDMG ~ crops$EVTYPE, data = crops, sum)

## Order by TOTALDMG
property_agg <- property_agg[order(-property_agg$`property$TOTALDMG`),]
crops_agg <- crops_agg[order(-crops_agg$`crops$TOTALDMG`),]

## Take only top 10
property_agg <- subset(property_agg[1:10,])
crops_agg <- subset(crops_agg[1:10,])

## Rename columns since ggplot gets confused
colnames(property_agg) <- c("EVENT", "DMG")
colnames(crops_agg) <- c("EVENT", "DMG")

## Plot property data
ggplot(data = property_agg, 
       aes(x=EVENT, y=DMG, fill=EVENT)) + 
       geom_bar(stat = "identity") + 
       labs(title="Ten 10 Causes of Property Damage") + 
       labs(x = "Type of event", y = "Property Damage") +
       theme(axis.text.x  = element_text(size=0)) + 
       scale_fill_discrete(name = "Event Type")

## Plot crops data
ggplot(data = crops_agg, 
       aes(x=EVENT, y=DMG, fill=EVENT)) + 
       geom_bar(stat = "identity") + 
       labs(title="Top 10 Causes of Crop Damage") + 
       labs(x = "Event Type", y = "Crop Damage") +
       theme(axis.text.x  = element_text(size=0)) + 
       scale_fill_discrete(name = "Event Type")

A: Floods have the greatest economic impact towards property and Drought has the greatest economic impact towards crops.