Reproducible Research: Peer Assessment 2

Sanyam Jain

Synopsis

In this report, we analyze the impact of different weather events on public health and the economy, using the storm database collected by the U.S. National Oceanic and Atmospheric Administration (NOAA) from 1950 to 2011. We use the recorded estimates of fatalities, injuries, property damage, and crop damage to determine which types of events are most harmful to population health and to the economy. From these data, we found that high temperatures and tornadoes are most harmful with respect to population health, while floods, droughts, and hurricanes/typhoons have the greatest economic impact.

Data Processing

Loading Packages:

library(ggplot2)
library(reshape2)
library(plyr)

# Download the compressed storm data into the working directory (only if it is
# not already there) and load it into the variable StormData.
filename <- "stormData.csv.bz2"
# Check for the file that is actually written and read below; testing for a
# different name would make the cache check useless.
if (!file.exists(filename)) {
    fileurl <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
    # The archive is binary, so request binary mode explicitly.
    download.file(fileurl, filename, method = "auto", mode = "wb")
}
# read.csv() reads the bzip2-compressed file directly.
StormData <- read.csv(filename)

1. Population Health Analysis

Aggregating the number of fatalities and injuries per weather event.

# Total fatalities and total injuries per event type (missing counts are
# ignored), then both totals joined into one table keyed by EVTYPE.
fatalities <- ddply(
    .data = StormData,
    .variables = "EVTYPE",
    .fun = summarize,
    Fatalities = sum(FATALITIES, na.rm = TRUE)
)
injuries <- ddply(
    .data = StormData,
    .variables = "EVTYPE",
    .fun = summarize,
    Injuries = sum(INJURIES, na.rm = TRUE)
)
harmdata <- merge(x = fatalities, y = injuries, by = "EVTYPE")

Cleaning up misspelled and inconsistently named event types:

# Collapse every event type whose name matches `param` into a single row.
#
# Args:
#   param: Regular expression matched case-insensitively against EVTYPE. It is
#     also used verbatim as the EVTYPE label of the combined row.
#   data:  Data frame with columns EVTYPE, Fatalities and Injuries. Defaults to
#     the global `harmdata`, which is what one-argument calls operate on.
#
# Returns:
#   `data` with all matching rows removed and one row appended at the end that
#   holds their summed Fatalities and Injuries (zeros when nothing matched).
#   EVTYPE is returned as a character column.
cleanHarm <- function(param, data = harmdata) {
    # Match once and reuse the mask for both the totals and the filtering.
    matched <- grepl(param, data$EVTYPE, ignore.case = TRUE)
    newRow <- data.frame(EVTYPE = param, Fatalities = sum(data$Fatalities[matched]),
        Injuries = sum(data$Injuries[matched]), stringsAsFactors = FALSE)
    data$EVTYPE <- as.character(data$EVTYPE)
    # The combined row must stay last: callers relabel it via nrow().
    rbind(data[!matched, , drop = FALSE], newRow)
}

# Collapse spelling variants of each event type into one row per category.
# Each rule is c(pattern, label): `pattern` is passed to cleanHarm(), and a
# non-NA `label` replaces the pattern as the name of the combined row (which
# cleanHarm() always appends as the last row). The order matters, because a
# row absorbed by an earlier rule is no longer available to later ones.
#
# Compared with the earlier hand-written sequence:
#   * the separate "TORM" and "TSTM" passes are gone; "TORM|TSTM" alone
#     produces the same combined row;
#   * the second "WARM" pass is gone; it re-collapsed the freshly labelled
#     "WARM WEATHER" row and reset its name to "WARM".
# NOTE(review): the "ICE" rule also matches the "ICE ON ROAD" row created just
# before it, so road events end up in the "ICE" row. Left as is so the
# reported totals do not change; confirm whether that is intended.
harmRules <- list(
    c("HEAT", NA),
    c("TORNADO", NA),
    c("FLOOD", NA),
    c("HAIL", NA),
    c("SNOW", NA),
    c("CURRENT", NA),
    c("COLD", NA),
    c("TORM|TSTM", "THUNDERSTORM"),
    c("AVALAN", "AVALANCHE"),
    c("WIND", NA),
    c("HURRICANE", NA),
    c("WINTER WEATHER", NA),
    c("WILD", "WILD FIRE"),
    c("LIGHT", "LIGHTNING"),
    c("DUST DEVIL", NA),
    c("ROAD", "ICE ON ROAD"),
    c("ICE", NA),
    c("FOG", NA),
    c("HYPOTHERMIA", NA),
    c("LANDSLIDE", NA),
    c("MUDSLIDE", NA),
    c("SURF", "HAZARDOUS SURF"),
    c("WARM", "WARM WEATHER"),
    c("MARINE", "MARINE ACCIDENT"),
    c("low|cold", "COLD"),
    c("RAINFALL|HEAVY RAIN", "HEAVY RAIN")
)
for (harmRule in harmRules) {
    harmdata <- cleanHarm(harmRule[1])
    if (!is.na(harmRule[2])) {
        harmdata[nrow(harmdata), 1] <- harmRule[2]
    }
}

The fatality and injury totals are kept in separate columns of the final dataset. We keep every event type that is among the top 10 causes of injuries or the top 10 causes of fatalities:

# Keep every event type that ranks in the top 10 by injuries or in the top 10
# by fatalities, then reshape to long format (one row per event type and
# measure) for plotting.
topInjuryEvents <- arrange(harmdata, desc(Injuries))[1:10, 1]
topFatalityEvents <- arrange(harmdata, desc(Fatalities))[1:10, 1]
isMostHarmful <- harmdata$EVTYPE %in% union(topInjuryEvents, topFatalityEvents)
MostHarmful <- harmdata[isMostHarmful, ]
MostHarmful_melt <- melt(MostHarmful, id.vars = "EVTYPE")

2. Economic Damage Analysis

Cleaning up some of the odd notations:

# Recode the letter exponent codes (B, M, K, H in either case) to the digits
# 9, 6, 3 and 2 in both the property and the crop exponent columns.
expLetters <- c("B", "b", "M", "m", "K", "k", "H", "h")
expDigits <- c("9", "9", "6", "6", "3", "3", "2", "2")
for (expCol in c("PROPDMGEXP", "CROPDMGEXP")) {
    StormData[[expCol]] <- mapvalues(StormData[[expCol]], from = expLetters,
        to = expDigits, warn_missing = FALSE)
}
# Drop the records whose exponent is "+", "-" or "?", filtering on the
# property column first and then on the crop column.
for (expCol in c("PROPDMGEXP", "CROPDMGEXP")) {
    expValues <- StormData[[expCol]]
    keepRow <- !is.na(expValues) & !(expValues %in% c("+", "-", "?"))
    StormData <- StormData[keepRow, , drop = FALSE]
}

Calculating and aggregating the damage value for properties and crops per tracked weather event:

# Damage value of a record = amount * 10^exponent. Sum those values per event
# type for property and for crops (records whose value is NA are ignored),
# then join both totals into one table keyed by EVTYPE.
dataPropDmg <- ddply(
    .data = StormData,
    .variables = "EVTYPE",
    .fun = summarize,
    PropertyDamage = sum(10^as.numeric(as.character(PROPDMGEXP)) * PROPDMG, na.rm = TRUE)
)
dataCropDmg <- ddply(
    .data = StormData,
    .variables = "EVTYPE",
    .fun = summarize,
    CropDamage = sum(10^as.numeric(as.character(CROPDMGEXP)) * CROPDMG, na.rm = TRUE)
)
damage <- merge(x = dataPropDmg, y = dataCropDmg, by = "EVTYPE")

Cleaning up misspelled and inconsistently named event types:

# Collapse every event type whose name matches `arg` into a single row.
#
# Args:
#   arg:  Regular expression matched case-insensitively against EVTYPE. It is
#     also used verbatim as the EVTYPE label of the combined row.
#   data: Data frame with columns EVTYPE, PropertyDamage and CropDamage.
#     Defaults to the global `damage`, which is what one-argument calls
#     operate on.
#
# Returns:
#   `data` with all matching rows removed and one row appended at the end that
#   holds their summed PropertyDamage and CropDamage (zeros when nothing
#   matched). EVTYPE is returned as a character column.
cleanDamage <- function(arg, data = damage) {
    # Match once and reuse the mask for both the totals and the filtering.
    matched <- grepl(arg, data$EVTYPE, ignore.case = TRUE)
    newRow <- data.frame(EVTYPE = arg, PropertyDamage = sum(data$PropertyDamage[matched]),
        CropDamage = sum(data$CropDamage[matched]), stringsAsFactors = FALSE)
    data$EVTYPE <- as.character(data$EVTYPE)
    # The combined row must stay last: callers relabel it via nrow().
    rbind(data[!matched, , drop = FALSE], newRow)
}

# Collapse spelling variants of each event type into one row per category.
# Each rule is c(pattern, label): `pattern` is passed to cleanDamage(), and a
# non-NA `label` replaces the pattern as the name of the combined row (which
# cleanDamage() always appends as the last row). The order matters, because a
# row absorbed by an earlier rule is no longer available to later ones.
#
# Compared with the earlier hand-written sequence:
#   * the separate "TORM" and "TSTM" passes are gone; "TORM|TSTM" alone
#     produces the same combined row;
#   * the second "WARM" pass is gone; it re-collapsed the freshly labelled
#     "WARM WEATHER" row and reset its name to "WARM".
# NOTE(review): the "ICE" rule also matches the "ICE ON ROAD" row created just
# before it, so road events end up in the "ICE" row. Left as is so the
# reported totals do not change; confirm whether that is intended.
damageRules <- list(
    c("HEAT", NA),
    c("TORNADO", NA),
    c("FLOOD", NA),
    c("HAIL", NA),
    c("SNOW", NA),
    c("CURRENT", NA),
    c("COLD", NA),
    c("TORM|TSTM", "THUNDERSTORM"),
    c("AVALAN", "AVALANCHE"),
    c("WIND", NA),
    c("HURRICANE", NA),
    c("WINTER WEATHER", NA),
    c("WILD", "WILD FIRE"),
    c("LIGHT", "LIGHTNING"),
    c("DUST DEVIL", NA),
    c("ROAD", "ICE ON ROAD"),
    c("ICE", NA),
    c("FOG", NA),
    c("HYPOTHERMIA", NA),
    c("LANDSLIDE", NA),
    c("MUDSLIDE", NA),
    c("SURF", "HAZARDOUS SURF"),
    c("WARM", "WARM WEATHER"),
    c("MARINE", "MARINE ACCIDENT"),
    c("low|cold", "COLD"),
    c("RAINFALL|HEAVY RAIN", "HEAVY RAIN")
)
for (damageRule in damageRules) {
    damage <- cleanDamage(damageRule[1])
    if (!is.na(damageRule[2])) {
        damage[nrow(damage), 1] <- damageRule[2]
    }
}

Finally, we keep the events that rank in the top 6 for property damage or in the top 6 for crop damage (8 distinct events in total). In addition, we rescale the values to billions of USD for better comparison and visualization:

# Keep every event type that ranks in the top 6 by property damage or in the
# top 6 by crop damage, reshape to long format for plotting, and express the
# plotted values in billions of USD.
topPropertyEvents <- arrange(damage, desc(PropertyDamage))[1:6, 1]
topCropEvents <- arrange(damage, desc(CropDamage))[1:6, 1]
isMostDamaging <- damage$EVTYPE %in% union(topPropertyEvents, topCropEvents)
MostDamage <- damage[isMostDamaging, ]
MostDamage_melt <- melt(MostDamage, id.vars = "EVTYPE")
MostDamage_melt$value <- MostDamage_melt$value / 1e9

Results

# Most harmful event types, listed alphabetically.
arrange(MostHarmful, EVTYPE)
##          EVTYPE Fatalities Injuries
## 1     AVALANCHE        225      170
## 2          COLD        443      320
## 3       CURRENT        577      529
## 4         FLOOD       1525     8604
## 5          HAIL         20     1467
## 6          HEAT       3138     9224
## 7     HURRICANE        133     1328
## 8     LIGHTNING        817     5231
## 9          SNOW        169     1166
## 10 THUNDERSTORM       1148    13659
## 11      TORNADO       5661    91407
## 12    WILD FIRE         90     1606
## 13         WIND        469     1896
# Horizontal bar chart of log(total) per event type, one fill colour per
# measure (fatalities / injuries).
# NOTE(review): the two measures are stacked, so the full bar length is the
# sum of two logarithms rather than the log of a combined total.
g <- ggplot(MostHarmful_melt, aes(x = EVTYPE, y = log(value), fill = variable)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    labs(x = "Events", y = "Log (Total) ", title = "Injuries and Fatalities")
print(g)

From the economic perspective, damage to property is higher than damage to crops. The most damaging weather event is flood, both when property and crop damage are combined and when only property damage is considered. For crops, the worst weather event is drought.

# Most damaging event types, listed alphabetically (values in USD).
arrange(MostDamage, EVTYPE)
##         EVTYPE PropertyDamage  CropDamage
## 1         COLD      245869400  1416765500
## 2      DROUGHT     1046106000 13972566000
## 3        FLOOD   168211315589 12380109100
## 4         HAIL    16022991456  3111712870
## 5    HURRICANE    84656180010  5505292800
## 6 THUNDERSTORM    77519697134  6960767230
## 7      TORNADO    58603317864   417461520
## 8    WILD FIRE     8491563500   402781630
# Horizontal stacked bar chart of property and crop damage per event type,
# using the values rescaled to billions of USD.
g <- ggplot(MostDamage_melt, aes(x = EVTYPE, y = value, fill = variable)) +
    geom_bar(stat = "identity") +
    labs(x = "Event", y = "Total Damage USD Billions", title = "Property and Crop Damage") +
    coord_flip()
print(g)

Conclusion

1. Tornadoes cause the highest number of total injuries and fatalities in the U.S. 2. Excessive heat, although it occurs less often than tornadoes, causes a relatively high number of injuries and fatalities per event. 3. Flooding causes the highest total damage expense. 4. Hurricanes and typhoons, although they occur less frequently than flooding, cause a relatively high cost per event.