Title: An Analysis of the NOAA Storm Database for High Impact Events

Synopsis: The basic goal of this assignment is to explore the NOAA Storm Database and answer the following basic questions about severe weather events.

  1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

  2. Across the United States, which types of events have the greatest economic consequences?

Data Processing: After downloading, the zip file is read by R command and stored in a data table named “storm_data”.

library(data.table)
library(ggplot2)

storm_data <- read.csv("repdata-data-StormData.csv.bz2", stringsAsFactors = FALSE)
storm_data <- as.data.table(storm_data)
setkey(storm_data, REFNUM)

dim(storm_data)
## [1] 902297     37
head(storm_data, 2)
##    STATE__          BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1:       1 4/18/1950 0:00:00     0130       CST     97     MOBILE    AL
## 2:       1 4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL
##     EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1: TORNADO         0                                               0
## 2: TORNADO         0                                               0
##    COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1:         NA         0                        14   100 3   0          0
## 2:         NA         0                         2   150 2   0          0
##    INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1:       15    25.0          K       0                                    
## 2:        0     2.5          K       0                                    
##    LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1:     3040      8812       3051       8806              1
## 2:     3042      8755          0          0              2

Some of the data in the data table is not relevant to the requirements of the assignment. As the assignment deals with the total consequences of weather events with respect to human health and economic impact, information pertaining to the following are selected:

  1. origin, type and date of weather events

  2. fatalities and injuries

  3. property damage

  4. the NOAA reference number (for convenience)

relevant_data <- c("STATE__", "BGN_DATE", "STATE", "EVTYPE", "FATALITIES", 
                   "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", 
                   "CROPDMGEXP", "REFNUM")
storm_data <- subset(storm_data, select = relevant_data)
rm(relevant_data)

storm_data <- storm_data[, BGN_DATE := as.Date(BGN_DATE, "%m/%d/%Y")] # reclass begin_date
modern <- as.POSIXlt("1995/12/31") # set a threshold date
modern_index <- storm_data[, which(as.POSIXlt(BGN_DATE) > modern)] # identify events after threshold
storm_data <- storm_data[modern_index] # subset for events after threshold

rm(modern, modern_index)

event_types <- storm_data[, unique(EVTYPE)]
event_types <- sort(event_types)

Since we are only interested in the human health and economic impacts resulting from storm events, we could eliminate any events that have none of these consequences or more succinctly keep only those events that have an impact of any kind.

Differentiate between zero impact and non-zero impact events with a new factor variable

storm_data <- storm_data[, zero_impact := as.factor(ifelse(FATALITIES == 0 &
                                                             INJURIES == 0 &
                                                              PROPDMG == 0 &
                                                    CROPDMG == 0, "T", "F"))]
# create an index of non-zero impact events
impact_index <- storm_data[, which(zero_impact == "F")]
# subset on the index
storm_data <- storm_data[impact_index]
# remove the zero_impact column since it is no longer needed
storm_data <- storm_data[, zero_impact := NULL]
storm_data_check <- nrow(storm_data)
rm(impact_index)

# have another look at the unique storm events in the data table
event_types <- storm_data[, unique(EVTYPE)]
event_types <- sort(event_types)

After subsettting for storm events with impacts, there are 222 unique event types in the “storm_data” data table.

To further subset the data we examine the frequency of the types of storm events that have health and economic impacts.

Creating a new data table with occurrences by event type

event_type_freq <- storm_data[, as.data.table(table(EVTYPE))] 
setnames(event_type_freq, c("event_type", "occurrences"))

# add a column of frequencies representing the % occurrence of all event types
event_type_freq <- event_type_freq[, frequency := 
                                     round((occurrences/nrow(storm_data)*100),1)] 
setnames(event_type_freq, c("event_type", "occurrences", "frequency"))
setkey(event_type_freq, occurrences)

# discard any event with a frequecy < 0.1%
cutoff <- event_type_freq[, match(0.1, frequency)] 
event_type_freq <- event_type_freq[cutoff:nrow(event_type_freq)]
head(event_type_freq[order(-occurrences)], nrow(event_type_freq))
##                  event_type occurrences frequency
##  1:               TSTM WIND       61775      30.7
##  2:       THUNDERSTORM WIND       43097      21.4
##  3:                    HAIL       22679      11.3
##  4:             FLASH FLOOD       19011       9.4
##  5:                 TORNADO       12366       6.1
##  6:               LIGHTNING       11152       5.5
##  7:                   FLOOD        9513       4.7
##  8:               HIGH WIND        5402       2.7
##  9:             STRONG WIND        3367       1.7
## 10:            WINTER STORM        1460       0.7
## 11:              HEAVY RAIN        1047       0.5
## 12:              HEAVY SNOW        1029       0.5
## 13:                WILDFIRE         847       0.4
## 14:    URBAN/SML STREAM FLD         702       0.3
## 15:          EXCESSIVE HEAT         685       0.3
## 16:               ICE STORM         631       0.3
## 17:          TSTM WIND/HAIL         441       0.2
## 18:          TROPICAL STORM         410       0.2
## 19:          WINTER WEATHER         405       0.2
## 20:        WILD/FOREST FIRE         381       0.2
## 21:             RIP CURRENT         364       0.2
## 22:               AVALANCHE         264       0.1
## 23:                 DROUGHT         258       0.1
## 24:            RIP CURRENTS         239       0.1
## 25:                BLIZZARD         228       0.1
## 26:        LAKE-EFFECT SNOW         194       0.1
## 27:               LANDSLIDE         190       0.1
## 28:             STORM SURGE         169       0.1
## 29:            EXTREME COLD         164       0.1
## 30:                    HEAT         164       0.1
## 31:           COASTAL FLOOD         149       0.1
## 32:      WINTER WEATHER/MIX         139       0.1
## 33:               HURRICANE         126       0.1
## 34:               HIGH SURF         122       0.1
## 35:              LIGHT SNOW         119       0.1
## 36:            FROST/FREEZE         116       0.1
## 37: EXTREME COLD/WIND CHILL         111       0.1
## 38:        MARINE TSTM WIND         109       0.1
## 39:                     FOG         101       0.1
##                  event_type occurrences frequency

Therefore, we found 39 most frequent event types (where frequency of occurence >= 0.1%) represent 99.3% of the total number of storm events. This is a manageable number of events.

rm(cutoff)

# calculate property and crop loss
storm_data <- storm_data[, property_loss :=
              ifelse(PROPDMGEXP == "K", 1000 * PROPDMG,
              ifelse(PROPDMGEXP == "M", 1000000 * PROPDMG, 
              ifelse(PROPDMGEXP == "B", 1000000000 * PROPDMG, 0)))]

storm_data <- storm_data[, crop_loss :=
              ifelse(CROPDMGEXP == "K", 1000 * CROPDMG,
              ifelse(CROPDMGEXP == "M", 1000000 * CROPDMG, 
              ifelse(CROPDMGEXP == "B", 1000000000 * CROPDMG, 0)))]

# key the data set on the NOAA reference number

str(storm_data)
## Classes 'data.table' and 'data.frame':   201319 obs. of  13 variables:
##  $ STATE__      : num  6 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE     : Date, format: "1995-12-31" "1996-01-06" ...
##  $ STATE        : chr  "CA" "AL" "AL" "AL" ...
##  $ EVTYPE       : chr  "HIGH WINDS" "WINTER STORM" "TORNADO" "TSTM WIND" ...
##  $ FATALITIES   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ INJURIES     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PROPDMG      : num  5.5 380 100 3 5 2 400 12 8 12 ...
##  $ PROPDMGEXP   : chr  "M" "K" "K" "K" ...
##  $ CROPDMG      : num  7 38 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP   : chr  "M" "K" "" "" ...
##  $ REFNUM       : num  192312 248768 248769 248770 248771 ...
##  $ property_loss: num  5500000 380000 100000 3000 5000 2000 400000 12000 8000 12000 ...
##  $ crop_loss    : num  7000000 38000 0 0 0 0 0 0 0 0 ...
##  - attr(*, "sorted")= chr "REFNUM"
##  - attr(*, ".internal.selfref")=<externalptr>
# weather events with maximum property loss
pl_sort <- order(storm_data$property_loss, decreasing = TRUE)
storm_data[head(pl_sort), list(BGN_DATE, STATE, EVTYPE, property_loss)]
##      BGN_DATE STATE            EVTYPE property_loss
## 1: 2006-01-01    CA             FLOOD     1.150e+11
## 2: 2005-08-29    LA       STORM SURGE     3.130e+10
## 3: 2005-08-28    LA HURRICANE/TYPHOON     1.693e+10
## 4: 2005-08-29    MS       STORM SURGE     1.126e+10
## 5: 2005-10-24    FL HURRICANE/TYPHOON     1.000e+10
## 6: 2005-08-28    MS HURRICANE/TYPHOON     7.350e+09
# weather events with maximum crop loss
cl_sort <- order(storm_data$crop_loss, decreasing = TRUE)
storm_data[head(cl_sort), list(BGN_DATE, STATE, EVTYPE, crop_loss)]
##      BGN_DATE STATE            EVTYPE crop_loss
## 1: 2005-08-29    MS HURRICANE/TYPHOON 1.510e+09
## 2: 2006-01-01    TX           DROUGHT 1.000e+09
## 3: 1998-12-20    CA      EXTREME COLD 5.960e+08
## 4: 2001-08-01    IA           DROUGHT 5.788e+08
## 5: 2000-11-01    TX           DROUGHT 5.150e+08
## 6: 1998-07-06    OK           DROUGHT 5.000e+08

The highest property loss of $1.15e+11 is recorded for a flood in California in 2006. And the highest crop loss of $1.51e+09 is recorded for a Hurricane / Typhoon in MS in 2005.

rm(pl_sort, cl_sort)

Our goal is to review the following questions:-

  1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

  2. Across the United States, which types of events have the greatest economic consequences?

We address these questions with bubble plots. To draw the plots we need to prepare a summary data table from my “storm_data” data table that totals the human health impacts and totals the economic impacts by weather event. We use simple sums of fatalties and injuires to represent human health impacts. Similarly, we use simple sums of property loss and crop loss to represent economic impacts. Lastly, we add a column that totals the types of weather events.

bubble_plot <- storm_data[, list(sum(FATALITIES), sum(INJURIES),
                                                  sum(property_loss),
                                                  sum(crop_loss)), 
                                                  by = EVTYPE]
setnames(bubble_plot, c("EVTYPE", "FATALITIES", "INJURIES", "property_loss", 
                        "crop_loss"))
setkey(bubble_plot, EVTYPE)

event_count <- storm_data[, c(table(EVTYPE))]
bubble_plot <- bubble_plot[, events := event_count]

bubble_plot <- bubble_plot[, human_health := (FATALITIES + INJURIES)]
bubble_plot <- bubble_plot[, economic_impact := (property_loss + crop_loss)]

Since the questions are comparative and the totals for human health and economic impacts between particular event types can differ wildly, we decided to normalize the data. We set the events with maximum human health and economic impacts equal to 100. All other events, have human health and economic impacts at some proportion of these maximums. We added columns to the dataset accrodingly.

bubble_plot <- bubble_plot[, human_health_nml :=
                               round(human_health/max(human_health)*100,1)]
bubble_plot <- bubble_plot[, economic_impact_nml :=
                            round(economic_impact/max(economic_impact)*100,1)]

# re-order the columns using column numbers
setcolorder(bubble_plot, c(1, 6, 2, 3, 4, 5, 7, 8, 9, 10))

#Here is what structure of the "bubble_plot" data table looks like.
str(bubble_plot)
## Classes 'data.table' and 'data.frame':   222 obs. of  10 variables:
##  $ EVTYPE             : chr  "   HIGH SURF ADVISORY" " FLASH FLOOD" " TSTM WIND" " TSTM WIND (G45)" ...
##  $ events             : int  1 1 2 1 3 8 2 264 1 1 ...
##  $ FATALITIES         : num  0 0 0 0 0 0 0 223 1 70 ...
##  $ INJURIES           : num  0 0 0 0 0 0 0 156 24 385 ...
##  $ property_loss      : num  200000 50000 8100000 8000 0 ...
##  $ crop_loss          : num  0 0 0 0 28820000 ...
##  $ human_health       : num  0 0 0 0 0 0 0 379 25 455 ...
##  $ economic_impact    : num  200000 50000 8100000 8000 28820000 ...
##  $ human_health_nml   : num  0 0 0 0 0 0 0 1.7 0.1 2.1 ...
##  $ economic_impact_nml: num  0 0 0 0 0 0 0 0 0 0.4 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "EVTYPE"

We used quantiles to provide a data based rationalization of which weather events to classify as low, medium or high impact events. We chose 3 probabilities representing high, medium and low impacts.

Quantiles give a data based rationalization for low, medium and high impact events.

hh_quant <- quantile(bubble_plot$human_health_nml, c(0.33, 0.66, 1))
ei_quant <- quantile(bubble_plot$economic_impact_nml, c(0.33, 0.66, 1))

# look at the data
hh_quant
##  33%  66% 100% 
##    0    0  100
ei_quant
##  33%  66% 100% 
##    0    0  100

Results

Ploting data for weather events with relatively high human health impacts

p1 <- ggplot(bubble_plot[human_health_nml >= hh_quant[2]], 
             aes(human_health_nml, economic_impact_nml, size = events, 
             label = EVTYPE))
p1 <- p1 + geom_jitter(colour="wheat1", fill = "wheat3", shape = 21, 
                       alpha = 0.3) + scale_size_area(max_size = 25) + 
                       geom_text(size=3)
p1 <- p1 + xlim(c(0, 100)) + ylim(c(0, 100))
p1 <- p1 + labs(title = "Relative Comparison of the Cumulative Human Health 
                         Impacts of Severe Weather Events, \nUSA 1996-2011")
p1 <- p1 + xlab("Human Health Impact \n(Normalized: Tornado = 100 )") + 
           ylab("Economic Impact \n(Normalized: Storm Surge = 100 )")
g <- guide_legend("Number of Events")
p1 <- p1 + guides(size = g)
p1 <- p1 + theme(legend.position = "bottom")
p1 <- p1 + theme_bw()
print(p1)
## Warning: Removed 136 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-10

Ploting data for weather events with relatively high economic impacts

p2 <- ggplot(bubble_plot[economic_impact_nml >= ei_quant[2]], 
             aes(human_health_nml, economic_impact_nml, size = events, 
             label = EVTYPE))
p2 <- p2 + geom_jitter(colour="blue1", fill = "blue3", shape = 21, 
                       alpha = 0.3) + scale_size_area(max_size = 25) + 
                       geom_text(size = 3)
p1 <- p2 + xlim(c(0, 100)) + ylim(c(0, 100))
p2 <- p2 + labs(title= "Relative Comparison of the Cumulative Economic 
                        Impacts of Severe Weather Events, \nUSA 1996-2011")
p2 <- p2 + xlab("Human Health Impact \n(Normalized: Tornado = 100 )") + 
           ylab("Economic Impact \n(Normalized: Storm Surge = 100 )")
g <- guide_legend("Number of Events")
p2 <- p2 + guides(size = g)
p2 <- p2 + theme(legend.position = "bottom")
p2 <- p2 + theme_bw()
print(p2)

plot of chunk unnamed-chunk-11