Synopsis

This report will explore the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database contains characteristics of major storms and weather events in the United States during 1950-2011. This report will find answer for two questions.

  1. Which types of events are most harmful with respect to population health?
  2. Which types of events have the greatest economic consequences?

This report is part of a course project for Reproducible Research on Coursera.

Data Processing

This dataset can be downloaded from Storm Data

Reading raw data

#workingDirectory <- "C:\\Users\\jack\\Desktop\\DataScience Specialization\\5 Reproducible Research\\week4"

#if(!file.exists(workingDirectory)) {
#        dir.create(workingDirectory)
#}

#setwd(workingDirectory)

#url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
#download.file(url, destfile = "StormData.csv.bz2", mode = "wb")

data.raw <- read.table("StormData.csv.bz2", comment.char = "#", header = TRUE, sep = ",", na.strings = "")

Prepare data for question 1

Data are filtered to have only necessary columns so that it is faster when doing a process. We select columns “FATALITIES” and “INJURIES” to address “population health”.

# data for question 1
data.q1 <- data.raw[data.raw$FATALITIES > 0 | data.raw$INJURIES > 0, c("EVTYPE","FATALITIES","INJURIES")]

We found that data on column “EVTYPE” (Event Type) does not consistent. For example, for event type “Huricane”, some row is “HURRICANE ERIN” and some row is “HURRICANE EDOUARD”. So we need to map these unofficial event type to be official event type. The official event type comes from Storm Data Documentation page 6. There are 48 event types officially.

# official event type
official_ev <- c("Astronomical Low Tide", "Avalanche", "Blizzard", "Coastal Flood", "Cold/Wind Chill", 
                 "Debris Flow", "Dense Fog", "Dense Smoke", "Drought", "Dust Devil", "Dust Storm", 
                 "Excessive Heat", "Extreme Cold/Wind Chill", "Flash Flood", "Flood", "Freezing Fog", 
                 "Frost/Freeze", "Funnel Cloud", "Hail", "Heat", "Heavy Rain", "Heavy Snow", "High Surf", 
                 "High Wind", "Hurricane/Typhoon", "Ice Storm", "Lakeshore Flood", "Lake-Effect Snow", 
                 "Lightning", "Marine Hail", "Marine High Wind", "Marine Strong Wind", "Marine Thunderstorm Wind", 
                 "Rip Current", "Seiche", "Sleet", "Storm Tide", "Strong Wind", "Thunderstorm Wind", 
                 "Tornado", "Tropical Depression", "Tropical Storm", "Tsunami", "Volcanic Ash", "Waterspout", 
                 "Wildfire", "Winter Storm", "Winter Weather")

Then we map EVTYPE to be official_ev. We use function ‘amatch’ to do mapping.

library(stringdist)
## Warning: package 'stringdist' was built under R version 3.3.2
unofficial_ev_q1 <- unique(as.character(toupper(data.q1$EVTYPE)))
compare_ev_q1 <- cbind(unofficial_ev_q1, official_ev[amatch(unofficial_ev_q1, toupper(official_ev), maxDist = 2)])
compare_ev_q1 <- as.data.frame(compare_ev_q1)
names(compare_ev_q1) <- c("unoffice_ev", "official_ev")

data.q1$official_ev <- NULL
for (i in unique(data.q1$EVTYPE)) {
        data.q1[data.q1$EVTYPE == i,c("official_ev")] = as.character(compare_ev_q1[compare_ev_q1$unoffice_ev == as.character(toupper(i)),2])
}

Now we have official_ev for each EVTYPE but not all of EVTYPE can be mapped. So We need to map them manually. We will aggregate FATALITIES and INJURIES based on each unmapped EVTYPE and then sorting them to find which unmapped EVTYPE should be mapped manually.

data.q1[is.na(data.q1$official_ev), c("official_ev")] = "unmapped"
data.q1.unmapped <- data.q1[data.q1$official_ev == "unmapped",]

tmp <- aggregate(data.q1.unmapped$FATALITIES ~ data.q1.unmapped$EVTYPE, FUN = sum )
tail(tmp[order(tmp[,2]),], n=10)
##       data.q1.unmapped$EVTYPE data.q1.unmapped$FATALITIES
## 153 UNSEASONABLY WARM AND DRY                          29
## 12                       COLD                          35
## 93                  LANDSLIDE                          38
## 64       HEAVY SURF/HIGH SURF                          42
## 76                  HURRICANE                          61
## 37                        FOG                          62
## 27               EXTREME HEAT                          96
## 26               EXTREME COLD                         160
## 53                  HEAT WAVE                         172
## 145                 TSTM WIND                         504
tmp <- aggregate(data.q1.unmapped$INJURIES ~ data.q1.unmapped$EVTYPE, FUN = sum )
tail(tmp[order(tmp[,2]),], n=10)
##     data.q1.unmapped$EVTYPE data.q1.unmapped$INJURIES
## 161                    WIND                        86
## 149          TSTM WIND/HAIL                        95
## 88                      ICE                       137
## 27             EXTREME HEAT                       155
## 45                    GLAZE                       216
## 26             EXTREME COLD                       231
## 53                HEAT WAVE                       309
## 160        WILD/FOREST FIRE                       545
## 37                      FOG                       734
## 145               TSTM WIND                      6957

So we will map these EVTYPE manually based on information from Storm Data Documentation (We are not sure that we map it properly).

data.q1[data.q1$EVTYPE == "TSTM WIND", c("official_ev")] = "Marine Thunderstorm Wind"
data.q1[data.q1$EVTYPE == "HEAT WAVE", c("official_ev")] = "Excessive Heat"
data.q1[data.q1$EVTYPE == "EXTREME COLD", c("official_ev")] = "Extreme Cold/Wind Chill"
data.q1[data.q1$EVTYPE == "EXTREME HEAT", c("official_ev")] = "Excessive Heat"
data.q1[data.q1$EVTYPE == "FOG", c("official_ev")] = "Dense Fog"
data.q1[data.q1$EVTYPE == "HURRICANE", c("official_ev")] = "Hurricane/Typhoon"
data.q1[data.q1$EVTYPE == "HEAVY SURF/HIGH SURF", c("official_ev")] = "High Surf"
data.q1[data.q1$EVTYPE == "COLD", c("official_ev")] = "Cold/Wind Chill"
data.q1[data.q1$EVTYPE == "UNSEASONABLY WARM AND DRY", c("official_ev")] = "Heat"

data.q1[data.q1$EVTYPE == "WILD/FOREST FIRE", c("official_ev")] = "Wildfire"
data.q1[data.q1$EVTYPE == "GLAZE", c("official_ev")] = "Freezing Fog"
data.q1[data.q1$EVTYPE == "ICE", c("official_ev")] = "Ice Storm"
data.q1[data.q1$EVTYPE == "TSTM WIND/HAIL", c("official_ev")] = "Marine Thunderstorm Wind"
data.q1[data.q1$EVTYPE == "WIND", c("official_ev")] = "Strong Wind"

Prepare data for question 2

Data are filtered to have only necessary columns for faster processing. We select columns “PROPDMG” (property damage) and “CROPDMG” (crop damage) to address “economic consequences”.

data.q2 <- data.raw[data.raw$PROPDMG > 0 | data.raw$CROPDMG > 0, c("EVTYPE","PROPDMG","PROPDMGEXP","CROPDMG","CROPDMGEXP")]

PROPDMGEXP is exponent value for PROPDMG and CROPDMGEXP is the same for CROPDMG. We will find TOTALDMG (value of total Damage in $) which is sum of PROPDMG and CROPDMG.

data.q2$TOTALPROPDMG <- 0
data.q2[is.na(data.q2$PROPDMGEXP),c("PROPDMGEXP")] = "?"

data.q2[data.q2$PROPDMGEXP == "B",c("TOTALPROPDMG")] = data.q2[data.q2$PROPDMGEXP == "B",c("PROPDMG")] * 1000000000
data.q2[data.q2$PROPDMGEXP == "h" | data.q2$PROPDMGEXP == "H",c("TOTALPROPDMG")] = data.q2[data.q2$PROPDMGEXP == "h" | data.q2$PROPDMGEXP == "H",c("PROPDMG")] * 100
data.q2[data.q2$PROPDMGEXP == "K",c("TOTALPROPDMG")] = data.q2[data.q2$PROPDMGEXP == "K",c("PROPDMG")] * 1000
data.q2[data.q2$PROPDMGEXP == "m" | data.q2$PROPDMGEXP == "M",c("TOTALPROPDMG")] = data.q2[data.q2$PROPDMGEXP == "m" | data.q2$PROPDMGEXP == "M",c("PROPDMG")] * 1000000

data.q2$TOTALCROPDMG <- 0
data.q2[is.na(data.q2$CROPDMGEXP),c("CROPDMGEXP")] = "?"
data.q2[data.q2$CROPDMGEXP == "B",c("TOTALCROPDMG")] = data.q2[data.q2$CROPDMGEXP == "B",c("CROPDMG")] * 1000000000
data.q2[data.q2$CROPDMGEXP == "k" | data.q2$CROPDMGEXP == "K", c("TOTALCROPDMG")] = data.q2[data.q2$CROPDMGEXP == "k" | data.q2$CROPDMGEXP == "K",c("CROPDMG")] * 1000
data.q2[data.q2$CROPDMGEXP == "m" | data.q2$CROPDMGEXP == "M", c("TOTALCROPDMG")] = data.q2[data.q2$CROPDMGEXP == "m" | data.q2$CROPDMGEXP == "M",c("CROPDMG")] * 1000000

data.q2$TOTALDMG <- data.q2$TOTALPROPDMG + data.q2$TOTALCROPDMG

Same as question 1, We need to map unofficial EVTYPE to be official_ev.

library(stringdist)
unofficial_ev_q2 <- unique(as.character(toupper(data.q2$EVTYPE)))
compare_ev_q2 <- cbind(unofficial_ev_q2, official_ev[amatch(unofficial_ev_q2, toupper(official_ev), maxDist = 2)])
compare_ev_q2 <- as.data.frame(compare_ev_q2)
names(compare_ev_q2) <- c("unoffice_ev", "official_ev")

data.q2$official_ev <- NULL
for (i in unique(data.q2$EVTYPE)) {
        data.q2[data.q2$EVTYPE == i,c("official_ev")] = as.character(compare_ev_q2[compare_ev_q2$unoffice_ev == as.character(toupper(i)),2])
}

Since not all of EVTYPE can be mapped to official_ev. We will aggregate TOTALDMG based on each unmapped EVTYPE and then sorting them to find which unmapped EVTYPE should be mapped manually.

data.q2[is.na(data.q2$official_ev), c("official_ev")] = "unmapped"
data.q2.unmapped <- data.q2[data.q2$official_ev == "unmapped",]

tmp <- aggregate(data.q2.unmapped$TOTALDMG ~ data.q2.unmapped$EVTYPE, FUN = sum )
tail(tmp[order(tmp[,2]),], n=15)
##        data.q2.unmapped$EVTYPE data.q2.unmapped$TOTALDMG
## 171                  LANDSLIDE                 344613000
## 156             HURRICANE ERIN                 394110000
## 67                      FREEZE                 446430000
## 307                    TYPHOON                 601055000
## 215        SEVERE THUNDERSTORM                1205560000
## 40                EXTREME COLD                1360710400
## 284 TORNADOES, TSTM WIND, HAIL                1602500000
## 114  HEAVY RAIN/SEVERE WEATHER                2500000000
## 327           WILD/FOREST FIRE                3108626330
## 159             HURRICANE OPAL                3191846000
## 246           STORM SURGE/TIDE                4642038000
## 290                  TSTM WIND                5038935790
## 209                RIVER FLOOD               10148404500
## 153                  HURRICANE               14610229010
## 245                STORM SURGE               43323541000

So we will map these EVTYPE manually based on information from Storm Data Documentation (We are not sure that we map it properly).

data.q2[data.q2$EVTYPE == "TYPHOON", c("official_ev")] = "Hurricane/Typhoon"
data.q2[data.q2$EVTYPE == "SEVERE THUNDERSTORM", c("official_ev")] = "Thunderstorm Wind"
data.q2[data.q2$EVTYPE == "EXTREME COLD", c("official_ev")] = "Extreme Cold/Wind Chill"
data.q2[data.q2$EVTYPE == "TORNADOES, TSTM WIND, HAIL", c("official_ev")] = "Tornado"
data.q2[data.q2$EVTYPE == "HEAVY RAIN/SEVERE WEATHER", c("official_ev")] = "Heavy Rain"
data.q2[data.q2$EVTYPE == "WILD/FOREST FIRE", c("official_ev")] = "Wildfire"
data.q2[data.q2$EVTYPE == "HURRICANE OPAL", c("official_ev")] = "Hurricane/Typhoon"
data.q2[data.q2$EVTYPE == "STORM SURGE/TIDE", c("official_ev")] = "Storm Tide"
data.q2[data.q2$EVTYPE == "TSTM WIND", c("official_ev")] = "Marine Thunderstorm Wind"
data.q2[data.q2$EVTYPE == "RIVER FLOOD", c("official_ev")] = "Flash Flood"
data.q2[data.q2$EVTYPE == "HURRICANE", c("official_ev")] = "Hurricane/Typhoon"
data.q2[data.q2$EVTYPE == "STORM SURGE", c("official_ev")] = "Storm Tide"

Results

Question 1 Which types of events are most harmful with respect to population health?

We separate meaning of “population health”" into two terms “FATALITIES” and “INJURIES”.

Here are most FATALITIES events.

data.q1.plot1 <- aggregate(data.q1$FATALITIES ~ data.q1$official_ev, FUN = sum )
data.q1.plot1 <- data.q1.plot1[order(data.q1.plot1[,2]),]
names(data.q1.plot1) <- c("official_ev", "FATALITIES")
tail(data.q1.plot1, n=10)
##                 official_ev FATALITIES
## 10  Extreme Cold/Wind Chill        285
## 12                    Flood        470
## 34                 unmapped        491
## 26 Marine Thunderstorm Wind        519
## 27              Rip Current        572
## 23                Lightning        817
## 16                     Heat        966
## 11              Flash Flood        980
## 9            Excessive Heat       2171
## 31                  Tornado       5633
slices <- tail(data.q1.plot1,n=6)[,c("FATALITIES")]
lbls <- tail(data.q1.plot1,n=6)[,c("official_ev")]
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels 
lbls <- paste(lbls,"%",sep="") # add % to labels 
lbls <- paste(lbls, "(", slices, ")") # add fatalities value 
pie(slices,labels = lbls, col=rainbow(length(lbls)), main="Most Fatalities Events") 

Here are most INJURIES events.

data.q1.plot2 <- aggregate(data.q1$INJURIES ~ data.q1$official_ev, FUN = sum )
data.q1.plot2 <- data.q1.plot2[order(data.q1.plot2[,2]),]
names(data.q1.plot2) <- c("official_ev", "INJURIES")
tail(data.q1.plot2, n=10)
##                 official_ev INJURIES
## 36                 Wildfire     1606
## 11              Flash Flood     1777
## 16                     Heat     2100
## 22                Ice Storm     2112
## 30        Thunderstorm Wind     2411
## 23                Lightning     5230
## 12                    Flood     6789
## 9            Excessive Heat     6989
## 26 Marine Thunderstorm Wind     7078
## 31                  Tornado    91346
slices <- tail(data.q1.plot2,n=6)[,c("INJURIES")]
lbls <- tail(data.q1.plot2,n=6)[,c("official_ev")]
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels 
lbls <- paste(lbls,"%",sep="") # add % to labels 
lbls <- paste(lbls, "(", slices, ")") # add injuries value
pie(slices,labels = lbls, col=rainbow(length(lbls)), main="Most Injuries Events", cex = 0.8) 

Question 2 which types of events have the greatest economic consequences?

We use TOTALDMG (total damage in $) which is sum of PROPDMG (property damage) and CROPDMG (crop damage) to address economic consequences.

Here are greatest economic consequences events.

data.q2.plot <- aggregate(data.q2$TOTALDMG ~ data.q2$official_ev, FUN = sum )
data.q2.plot <- data.q2.plot[order(data.q2.plot[,2]),]
names(data.q2.plot) <- c("official_ev", "TOTALDMG")
tail(data.q2.plot, n=10)
##          official_ev     TOTALDMG
## 40    Tropical Storm   8382236550
## 45          Wildfire   8894313130
## 25         Ice Storm   8967041310
## 8            Drought  15018672000
## 18              Hail  18764271720
## 13       Flash Flood  27719811610
## 35        Storm Tide  47965579000
## 38           Tornado  58954615090
## 24 Hurricane/Typhoon  90316842810
## 14             Flood 150325728250
slices <- tail(data.q2.plot,n=8)[,c("TOTALDMG")]
lbls <- tail(data.q2.plot,n=8)[,c("official_ev")]
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels 
lbls <- paste(lbls,"%",sep="") # add % to labels 
lbls <- paste(lbls, "($", round(slices/1000000000,digits = 2) , "billion)") # add total damage in $billion 
pie(slices,labels = lbls, col=rainbow(length(lbls)), main="Greatest Economic Consequences Events", cex=0.8)