Reproducible Research: Peer Assessment 2 - NOAA Storm Events Data Analysis

Synopsis

We use the NOAA Storm Events data to explore which types of severe weather events are most harmful to population health and which have the greatest economic consequences. This analysis can help United States government officials or municipal managers who are responsible for preparing for severe weather events. We found that tornadoes cause the most fatalities and injuries, and that floods have the greatest economic impact.

Data Processing

The data can be downloaded from https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2. The code below expects the data as a file named StormData.csv in the working directory.

# Prerequisite: download the file and use setwd() so that StormData.csv is in
# the working directory.
# Only columns 8 and 23-28 are needed. A colClasses entry of "NULL" makes
# read.csv skip that column; an NA entry keeps the column and lets read.csv
# pick its type.
data <- read.csv(
  "StormData.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  colClasses = replace(rep("NULL", 37), c(8, 23:28), NA)
)
names(data)
## [1] "EVTYPE"     "FATALITIES" "INJURIES"   "PROPDMG"    "PROPDMGEXP"
## [6] "CROPDMG"    "CROPDMGEXP"

Here are brief definitions of the variables:

-EVTYPE: Event type

-FATALITIES: Headcount of deaths

-INJURIES: Headcount of people who got injured

-PROPDMG: Property damage

-PROPDMGEXP: Property damage multiplier (eg. H = 100, K = 1000)

-CROPDMG: Crop damage

-CROPDMGEXP: Crop damage multiplier

Note that to get the dollar amount of property damage, the value in PROPDMG must be multiplied by the multiplier given by PROPDMGEXP. For instance, if PROPDMG is 6 and PROPDMGEXP is K, then the property damage was estimated to be $6,000. Crop damage is computed the same way from CROPDMG and CROPDMGEXP.

We exclude rows with unidentifiable information or missing values from the analysis.

The data in raw form is not yet ready for analysis, so we perform the following data cleansing.

# Inspect the raw PROPDMGEXP codes before recoding
table(data$PROPDMGEXP)
## 
##             -      ?      +      0      1      2      3      4      5 
## 465934      1      8      5    216     25     13      4      4     28 
##      6      7      8      B      h      H      K      m      M 
##      4      5      1     40      1      6 424665      7  11330
# Codes treated as unusable ("+", "-", "?" and the digits 1-8) become NA
data$PROPDMGEXP[data$PROPDMGEXP %in% c("+", "-", "?", as.character(1:8))] <- NA

# Letter codes become their multiplier; "0" and blank entries are left as they are
data$PROPDMGEXP[data$PROPDMGEXP %in% c("h", "H")] <- 100
data$PROPDMGEXP[data$PROPDMGEXP %in% c("k", "K")] <- 1000
data$PROPDMGEXP[data$PROPDMGEXP %in% c("m", "M")] <- 1e6
data$PROPDMGEXP[data$PROPDMGEXP %in% "B"] <- 1e9

table(data$PROPDMGEXP) # confirm that only blank, 0 and multiplier values remain
## 
##             0    100   1000  1e+06  1e+09 
## 465934    216      7 424665  11337     40
# The column is still character at this point; convert it to numeric.
# Blank entries turn into NA in this conversion.
data$PROPDMGEXP <- as.numeric(data$PROPDMGEXP)

We do the same for the CROPDMGEXP variable.

# Inspect the raw CROPDMGEXP codes before recoding
table(data$CROPDMGEXP)
## 
##             ?      0      2      B      k      K      m      M 
## 618413      7     19      1      9     21 281832      1   1994
# Codes treated as unusable ("+", "-", "?" and the digits 1-8) become NA
data$CROPDMGEXP[data$CROPDMGEXP %in% c("+", "-", "?", as.character(1:8))] <- NA

# Letter codes become their multiplier; "0" and blank entries are left as they are
data$CROPDMGEXP[data$CROPDMGEXP %in% c("h", "H")] <- 100
data$CROPDMGEXP[data$CROPDMGEXP %in% c("k", "K")] <- 1000
data$CROPDMGEXP[data$CROPDMGEXP %in% c("m", "M")] <- 1e6
data$CROPDMGEXP[data$CROPDMGEXP %in% "B"] <- 1e9

table(data$CROPDMGEXP) # confirm that only blank, 0 and multiplier values remain
## 
##             0   1000  1e+06  1e+09 
## 618413     19 281853   1995      9
# The column is still character at this point; convert it to numeric.
# Blank entries turn into NA in this conversion.
data$CROPDMGEXP <- as.numeric(data$CROPDMGEXP)

Now, we want to find total economic damage, which we define as property plus crop damage.

# Total damage per row = property damage + crop damage, each scaled by its
# multiplier column.
# NOTE(review): if either multiplier is NA the whole row total is NA, even when
# the other component is valid, and such rows are then ignored by the later
# sum(na.rm = TRUE) -- confirm this exclusion is intended.
data$TOTDMG <- with(data, PROPDMG * PROPDMGEXP + CROPDMG * CROPDMGEXP)

Now, we can aggregate the fatalities, injuries and damages per event type.

# Rank event types by the total of one measure.
#
# Sums `varType` within each event type, ignoring NA values, and returns the
# event types with the largest totals. The event types are read from the
# global `data$EVTYPE`, so `varType` must line up row-by-row with `data`.
#
# Args:
#   varType:    vector of values to total, e.g. data$FATALITIES.
#   highestTot: number of top event types to return.
#
# Returns:
#   A data frame sorted by decreasing total with two columns: `event_type` and
#   the total, the latter named after the expression passed as `varType`.
#   It has at most `highestTot` rows (fewer when there are fewer event types,
#   none when `highestTot` is 0).
highestTotal <- function(varType, highestTot){
  # deparse() may split a long expression into several strings, so collapse
  # them into a single column label.
  valueName <- paste(deparse(substitute(varType)), collapse = " ")

  aggTable <- aggregate(varType, list(event_type = data$EVTYPE),
                        FUN = sum, na.rm = TRUE)
  names(aggTable)[names(aggTable) == "x"] <- valueName

  ranked <- aggTable[order(aggTable[[valueName]], decreasing = TRUE), ,
                     drop = FALSE]

  # head() never indexes past the last row, unlike ranked[1:highestTot, ],
  # which pads with NA rows and returns one row for highestTot = 0.
  head(ranked, highestTot)
}

# Five event types with the most fatalities. The totals column comes back
# named after the expression passed in, so give it a readable name.
highestFatalities <- highestTotal(data$FATALITIES, 5)
names(highestFatalities)[2L] <- "fatalities"
highestFatalities
##         event_type fatalities
## 834        TORNADO       5633
## 130 EXCESSIVE HEAT       1903
## 153    FLASH FLOOD        978
## 275           HEAT        937
## 464      LIGHTNING        816
# Five event types with the most injuries. The totals column comes back
# named after the expression passed in, so give it a readable name.
highestInjuries <- highestTotal(data$INJURIES, 5)
names(highestInjuries)[2L] <- "injuries"
highestInjuries
##         event_type injuries
## 834        TORNADO    91346
## 856      TSTM WIND     6957
## 170          FLOOD     6789
## 130 EXCESSIVE HEAT     6525
## 464      LIGHTNING     5230
# Five event types with the largest total (property + crop) damage. The totals
# column comes back named after the expression passed in, so rename it.
highestDamage <- highestTotal(data$TOTDMG, 5)
names(highestDamage)[2L] <- "damage"
highestDamage
##            event_type    damage
## 170             FLOOD 1.380e+11
## 411 HURRICANE/TYPHOON 2.935e+10
## 834           TORNADO 1.657e+10
## 402         HURRICANE 1.241e+10
## 590       RIVER FLOOD 1.011e+10

Results

We can plot the highest totals in three different categories: fatalities, injuries, and damage.

library(ggplot2)
# Bar chart: one bar per event type, bar height = total fatalities
ggplot(highestFatalities, aes(event_type, fatalities)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 event types with highest fatalities")

plot of chunk unnamed-chunk-6

# Bar chart: one bar per event type, bar height = total injuries
ggplot(highestInjuries, aes(event_type, injuries)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 event types with highest injuries")

plot of chunk unnamed-chunk-6

Based on the figures, tornadoes are the most harmful to population health.

# Make R strongly prefer fixed over scientific notation when formatting
# numbers (this is a session-wide option).
options(scipen = 999)

# Bar chart: one bar per event type, bar height = total damage.
# The x-axis labels are rotated 90 degrees.
ggplot(highestDamage, aes(event_type, damage)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 event types with highest economic impact") +
  theme(axis.text.x = element_text(angle = 90))

plot of chunk unnamed-chunk-7

The figure above shows that floods, hurricanes, and hurricanes/typhoons have the greatest economic consequences.