Storm Data Analysis: Event type identification

The U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database tracks characteristics of major storms and weather events. Storm data documentation can be found here. The events in the database start in the year 1950 and end in November 2011.

The goal of the analysis is to explore the NOAA storm database and to identify the event types that are most harmful with respect to population health, as well as those that have the greatest economic consequences. Only the 50 U.S. states are taken into account.

Data Processing

Loading raw data:

# Read the raw storm data, keeping text columns as character vectors.
# The argument name is spelled out in full (stringsAsFactors) rather than
# relying on partial argument matching.
data <- read.csv("repdata_data_StormData.csv", stringsAsFactors = FALSE)
# Number of rows and columns that were read.
dim(data)
## [1] 902297     37

Data structure:

# Show each column's type together with its first few values.
str(data)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Subsetting data to use only those variables that are needed for the analysis:

# Keep only the columns used below: location, event type, casualty counts
# and the damage figures. PROPDMGEXP / CROPDMGEXP are not carried over.
d <- data[c(
    "COUNTYNAME", "STATE", "EVTYPE",
    "FATALITIES", "INJURIES",
    "PROPDMG", "CROPDMG"
)]

To restrict the data to U.S. states only, we use the state.abb vector from R's default datasets package:

# Print the two-letter state abbreviations used for filtering.
state.abb
##  [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "FL" "GA" "HI" "ID" "IL" "IN"
## [15] "IA" "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV"
## [29] "NH" "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN"
## [43] "TX" "UT" "VT" "VA" "WA" "WV" "WI" "WY"

Subsetting the data to U.S. states only (for this purpose we use the state.abb vector from R's default datasets package):

# Drop every record whose STATE code is not one of the state.abb entries.
d <- d[is.element(d$STATE, state.abb), ]
# Rows and columns remaining after the filter.
dim(d)
## [1] 883186      7

First, we look at a scatterplot of fatalities and injuries without any aggregation:

library(ggplot2)

# One point per storm record: fatalities on the x axis, injuries on the y axis.
ggplot(data = d, mapping = aes(x = FATALITIES, y = INJURIES)) +
    geom_point() +
    ggtitle("Fatalities and Injuries Scatterplot")

plot of chunk plot

Results

Most harmful events with respect to population health

Aggregating harmful events by event type:

# Total fatalities and injuries for each event type.
evHarm <- aggregate(
    d[c("FATALITIES", "INJURIES")],
    by = list(d$EVTYPE),
    FUN = sum
)
# Give the grouping column a readable name for the printed tables.
names(evHarm)[1] <- "Event Type"

Fatal events:

# Event types ranked by total fatalities, highest first.
# TRUE is written out because T is an ordinary variable that can be reassigned.
fatal <- evHarm[
    order(evHarm$FATALITIES, decreasing = TRUE),
    c("Event Type", "FATALITIES")
]
head(fatal)
##         Event Type FATALITIES
## 834        TORNADO       5633
## 130 EXCESSIVE HEAT       1903
## 153    FLASH FLOOD        978
## 275           HEAT        937
## 464      LIGHTNING        816
## 856      TSTM WIND        504

Injury events:

# Event types ranked by total injuries, highest first.
# TRUE is written out because T is an ordinary variable that can be reassigned.
injur <- evHarm[
    order(evHarm$INJURIES, decreasing = TRUE),
    c("Event Type", "INJURIES")
]
head(injur)
##         Event Type INJURIES
## 834        TORNADO    91346
## 856      TSTM WIND     6957
## 170          FLOOD     6789
## 130 EXCESSIVE HEAT     6525
## 464      LIGHTNING     5230
## 275           HEAT     2100

Total counts of fatalities and injuries aggregated by event type:

# top 5 fatal and injury
top5harm <- union(fatal[1:5, 1], injur[1:5, 1])

# find idx
top5harm_idx = which(evHarm[, 1] %in% top5harm)

# prepare labels
top5harm_labels <- rep("", nrow(evHarm))
top5harm_labels[top5harm_idx] <- evHarm[top5harm_idx, 1]

# prepare color
top5harm_col <- rep(1, nrow(evHarm))
top5harm_col[top5harm_idx] <- 0

library(ggplot2)
ggplot(evHarm, aes(FATALITIES, INJURIES)) + geom_point(aes(colour = factor(top5harm_col))) + 
    geom_text(label = top5harm_labels, size = 3, vjust = -0.25) + theme(legend.position = "none") + 
    ggtitle("Fatalities and Injuries Ammount by Event Type Scatterplot")

plot of chunk unnamed-chunk-3

Labeled events in detail:

# Print the totals for the event types that are labelled in the plot.
evHarm[is.element(evHarm[[1]], top5harm), ]
##         Event Type FATALITIES INJURIES
## 130 EXCESSIVE HEAT       1903     6525
## 153    FLASH FLOOD        978     1777
## 170          FLOOD        470     6789
## 275           HEAT        937     2100
## 464      LIGHTNING        816     5230
## 834        TORNADO       5633    91346
## 856      TSTM WIND        504     6957

Types of events that have the greatest economic consequences

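Property and crop damage data aggregated by event type (the PROPDMG and CROPDMG values are summed as recorded; the PROPDMGEXP and CROPDMGEXP columns are not used):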

# Sum the recorded property and crop damage figures within each event type.
# NOTE(review): PROPDMGEXP / CROPDMGEXP (e.g. "K") are not part of `d`, so
# these sums presumably mix values recorded in different units - confirm
# against the Storm Data documentation before comparing totals.
dmg <- aggregate(
    d[c("PROPDMG", "CROPDMG")],
    by = list(d[["EVTYPE"]]),
    FUN = sum
)

Property damage amount (top 5 are shown):

# Event types ranked by summed PROPDMG, highest first.
# TRUE is written out because T is an ordinary variable that can be reassigned.
dmgProp <- dmg[order(dmg$PROPDMG, decreasing = TRUE), c(1, 2)]
# Column headings used when the table is printed.
colnames(dmgProp) <- c("Event Type", "Total Property Damage Ammount")
dmgProp[1:5, ]
##            Event Type Total Property Damage Ammount
## 811           TORNADO                       3211786
## 152       FLASH FLOOD                       1402128
## 833         TSTM WIND                       1333810
## 167             FLOOD                        895925
## 737 THUNDERSTORM WIND                        875481

Crop damage amount (top 5 are shown):

# Event types ranked by summed CROPDMG, highest first.
# TRUE is written out because T is an ordinary variable that can be reassigned.
dmgCrop <- dmg[order(dmg$CROPDMG, decreasing = TRUE), c(1, 3)]
# Column headings used when the table is printed.
colnames(dmgCrop) <- c("Event Type", "Total Crop Damage Amount")
dmgCrop[1:5, ]
##      Event Type Total Crop Damage Amount
## 241        HAIL                   579596
## 152 FLASH FLOOD                   176776
## 167       FLOOD                   167990
## 833   TSTM WIND                   109203
## 811     TORNADO                   100018

Scatterplot of property and crop damage aggregated by event type:

# Event types that rank in the top five by property or by crop damage.
# head() is used instead of 1:5 so that fewer than five rows cannot add NA.
top5 <- union(head(dmgProp[, 1], 5), head(dmgCrop[, 1], 5))

# Rows of dmg that belong to those event types.
top5_idx <- which(dmg[, 1] %in% top5)

# Text labels: the event name for highlighted rows, empty for all others.
top5_labels <- rep("", nrow(dmg))
top5_labels[top5_idx] <- dmg[top5_idx, 1]

# Colour group: 0 for highlighted rows, 1 for every other event type.
top5_col <- rep(1, nrow(dmg))
top5_col[top5_idx] <- 0

library(ggplot2)
# One point per event type; only the highlighted event types get a label.
ggplot(dmg, aes(PROPDMG, CROPDMG)) +
    geom_point(aes(colour = factor(top5_col))) +
    geom_text(label = top5_labels, size = 3.5, vjust = -0.4) +
    theme(legend.position = "none") +
    ggtitle("Prop and Crop Damage Aggregated by Event Type Scatterplot")

plot of chunk unnamed-chunk-8