NOAA Storm Data Analysis

Synopsis

This is an analysis of data from the NOAA Storm Database.

After some necessary data cleansing, the analysis focuses on the last fifteen years of data (1997-2011) to provide results. First we find the top 5 event types by average annual Injuries, Fatalities, Crop Damage, and Property Damage. Then, we show the same top 5 event types for each category but with the annual effects plotted as a time series over the fifteen year period.

The purpose of the analysis is to answer the questions:

  1. Across the United States, which types of events are most harmful with respect to population health?
  2. Across the United States, which types of events have the greatest economic consequences?

Data Processing

Download, unzip, read in data

# Download the data:
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2","Storm.data.csv.bz2")

# Unzip and read the file:
stormdata<- read.csv(bzfile("Storm.data.csv.bz2"), stringsAsFactors = FALSE)

str(stormdata)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Clean up the EVTYPE column

library(dplyr)
library(lubridate)
library(stringdist)

# Make a copy of stormdata to start cleaning it up
stormdata_clean <- stormdata

# If there is no damage and no health effect, remove it from the analysis
#   Set all uppercase.  Trim whitespace.
stormdata_clean$PROPDMGEXP <- toupper(trimws(stormdata_clean$PROPDMGEXP))
stormdata_clean$PROPDMGEXPnew <- case_when(stormdata_clean$PROPDMGEXP == "B" ~ 1e9,
                                        stormdata_clean$PROPDMGEXP == "M" ~ 1e6,
                                        stormdata_clean$PROPDMGEXP == "K" ~ 1e3,
                                        stormdata_clean$PROPDMGEXP == "H" ~ 1e2,
                                        stormdata_clean$PROPDMGEXP == "1" ~ 1e1,
                                        stormdata_clean$PROPDMGEXP == "2" ~ 1e2,
                                        stormdata_clean$PROPDMGEXP == "3" ~ 1e3,
                                        stormdata_clean$PROPDMGEXP == "4" ~ 1e4,
                                        stormdata_clean$PROPDMGEXP == "5" ~ 1e5,
                                        stormdata_clean$PROPDMGEXP == "6" ~ 1e6,
                                        stormdata_clean$PROPDMGEXP == "7" ~ 1e7,
                                        stormdata_clean$PROPDMGEXP == "8" ~ 1e8,
                                        stormdata_clean$PROPDMGEXP == "9" ~ 1e9,
                                        TRUE ~ 1)
stormdata_clean$PROPDMGTOT <- stormdata_clean$PROPDMG * stormdata_clean$PROPDMGEXPnew
stormdata_clean$CROPDMGEXP <- toupper(trimws(stormdata_clean$CROPDMGEXP))
stormdata_clean$CROPDMGEXPnew <- case_when(stormdata_clean$CROPDMGEXP == "B" ~ 1e9,
                                        stormdata_clean$CROPDMGEXP == "M" ~ 1e6,
                                        stormdata_clean$CROPDMGEXP == "K" ~ 1e3,
                                        stormdata_clean$CROPDMGEXP == "H" ~ 1e2,
                                        stormdata_clean$CROPDMGEXP == "1" ~ 1e1,
                                        stormdata_clean$CROPDMGEXP == "2" ~ 1e2,
                                        stormdata_clean$CROPDMGEXP == "3" ~ 1e3,
                                        stormdata_clean$CROPDMGEXP == "4" ~ 1e4,
                                        stormdata_clean$CROPDMGEXP == "5" ~ 1e5,
                                        stormdata_clean$CROPDMGEXP == "6" ~ 1e6,
                                        stormdata_clean$CROPDMGEXP == "7" ~ 1e7,
                                        stormdata_clean$CROPDMGEXP == "8" ~ 1e8,
                                        stormdata_clean$CROPDMGEXP == "9" ~ 1e9,
                                        TRUE ~ 1)
stormdata_clean$CROPDMGTOT <- stormdata_clean$CROPDMG * stormdata_clean$CROPDMGEXPnew

stormdata_clean$TOTDMG <- stormdata_clean$PROPDMGTOT + stormdata_clean$CROPDMGTOT
stormdata_clean$HEALTHEFFECTS <- stormdata_clean$INJURIES + stormdata_clean$FATALITIES
stormdata_clean$ALLEFFECTS <- stormdata_clean$TOTDMG + stormdata_clean$HEALTHEFFECTS
stormdata_clean <- filter(stormdata_clean, ALLEFFECTS > 0)

# Get the year as a standalone column
stormdata_clean <- mutate(stormdata_clean, year = year(mdy_hms(gsub("/","-",BGN_DATE,fixed = TRUE))))
# Not many different EVTYPE were recorded before about 1995, so remove data before that.
stormdata_clean <- filter(stormdata_clean, year>1994)

# Get definitive list of evtypes (extracted from NWSI 10-1605 AUGUST 17, 2007 Table of Contents) 
evtypes_def_list <- toupper(trimws(read.table("evtypes_full_list.txt",sep = ",",colClasses = "character")$V1))

# Begin cleanup on EVTYPE column
#   Set all uppercase.  Trim whitespace.
stormdata_clean$EVTYPE <- toupper(trimws(stormdata_clean$EVTYPE))


# Build a map of old EVTYPE to the definitive list
evtypes_map <- tibble(orig = unique (stormdata_clean$EVTYPE), map_col = unique (stormdata_clean$EVTYPE))
evtypes_map$new <- ""

#Define pattern and replacement strings to use with sub(pattern, replacement, ... fixed = TRUE)
sub_strings_fixed <- matrix(byrow = TRUE, ncol = 2, 
                            data = c("SEVERE ", "",
                                     "DRY ","",
                                     "RIVER ","",
                                     "BREAKUP ","",
                                     "SHOWER","RAIN",
                                     "SWELLS","SURF",
                                     "TSTM", "THUNDERSTORM",
                                     "THUDERSTORM", "THUNDERSTORM",
                                     "NON-THUNDERSTORM", "STRONG",
                                     "NON THUNDERSTORM", "STRONG",
                                     "MICROBURST", "THUNDERSTORM",
                                     "MIRCOBURST", "THUNDERSTORM",
                                     "DOWNBURST", "THUNDERSTORM", 
                                     "GUSTNADO", "THUNDERSTORM WIND", 
                                     "THUNDERSNOW", "HEAVY SNOW", 
                                     "LAKE FLOOD", "LAKESHORE FLOOD", 
                                     "HEAVY LAKE SNOW", "LAKE-EFFECT SNOW",
                                     "HEAVY MIX", "HEAVY SNOW",
                                     "WIND/HAIL", "HAIL", 
                                     "WIND AND WAVE", "MARINE HIGH WIND", 
                                     "RAIN (HEAVY)", "HEAVY RAIN",
                                     "RURAL FLOOD", "FLOOD",
                                     "XYZ","XYZ"
                                     )
                                )

# Apply those substitutions
for (i in 1:nrow(sub_strings_fixed)){
    evtypes_map$map_col <- sub(sub_strings_fixed[i,1],sub_strings_fixed[i,2],evtypes_map$map_col,fixed=TRUE)
}

#Define pattern and replacement strings to use with sub(pattern, replacement, ... fixed = FALSE)
sub_strings <- matrix(byrow = TRUE, ncol = 2, 
                            data = c("^WIND", "HIGH WIND", 
                                     "^COLD$", "COLD/WIND/CHILL",
                                     "^FLOOD.*", "FLOOD",
                                     "^HEAT.*", "HEAT",
                                     ".* HEAT", "EXCESSIVE HEAT",
                                     "SEAS$", "SURF",
                                     ".* SURF", "HIGH SURF",
                                     ".*HIGH TIDE", "STORM TIDE",
                                     "MARINE MISHAP", NA,
                                     "^FREEZE$", "FROST/FREEZE",
                                     ".*HURRICANE.*", "HURRICANE/TYPHOON", 
                                     ".*SLIDE.*","DEBRIS FLOW",
                                     ".*H FLOOD.*","FLASH FLOOD",
                                     ".*L FLOOD.*","COASTAL FLOOD",
                                     "^SNOW.*","HEAVY SNOW",
                                     ".*MIXED PRECIP.*","HEAVY SNOW",
                                     "HEAVY PRECIP.*","HEAVY RAIN",
                                     ".*[ABDEFGHIJKLMNOPQRSTUZWXYZ]T SNOW.*","HEAVY SNOW",
                                     ".*[ABCDEFGHIJKLMNOPQRSUZWXYZ] SNOW.*","HEAVY SNOW",
                                     ".*[ABCDFGIJKMNOPQRSTUVWXYZ] FLOOD.*","FLOOD",
                                     ".*HAIL.*","HAIL",
                                     "XYZ","XYZ"
                                     )
                                )
# Apply those substitutions
for (i in 1:nrow(sub_strings)){
    evtypes_map$map_col <- sub(sub_strings[i,1],sub_strings[i,2],evtypes_map$map_col,fixed=FALSE)
}


numtypes <- length(evtypes_map$map_col)
for (itype in 1:numtypes){
    
    matchword <- trimws(evtypes_map$map_col[itype])
    #Remove certain words that are not part of the definitive list to improve amatch performance
    #matchword <- sub("SEVERE","",matchword,fixed=TRUE)
    #matchword <- sub("DRY","",matchword,fixed=TRUE)
    #matchword <- sub("RIVER ","",matchword,fixed=TRUE)
    #matchword <- sub("BREAKUP ","",matchword,fixed=TRUE)
    if (is.na(matchword)){
        evtypes_map$new[itype] <- NA
        next
    }
    
    matchmeth<-"soundex"
    matchdist<-10
    matchresult <- evtypes_def_list[amatch(matchword,evtypes_def_list,maxDist=matchdist, method=matchmeth)]
    # ASTRONOMICAL LOW TIDE only occurs once, but the amatch defaults to it whenever it gets stumped.
    # Try again unless orig was ASTRONOMICAL LOW TIDE
    if ((evtypes_map$orig[itype] != "ASTRONOMICAL LOW TIDE")&&(matchresult == "ASTRONOMICAL LOW TIDE")){
        matchresult<-NA
    }
    #if (is.na(matchresult)){  # If amatch didn't work, try matching against just the first half of the old evtype string.
    #  matchresult <- amatch(substr(matchword,1,ceiling(length(matchword)/2)),evtypes_def_list,maxDist=matchdist, method=matchmeth)
     #  cat ("NANA ")
    #}
    # Use the result to put the evtype from the definitive list in the new column
    evtypes_map$new[itype] <- matchresult
    
    
}



cat(length(unique(evtypes_map$orig)),"original evtypes (after filtering out rows with no economic/health effects).\n")
## 330 original evtypes (after filtering out rows with no economic/health effects).
cat(length(unique(evtypes_map$map_col)),"map_col evtypes.\n")
## 208 map_col evtypes.
cat(length(unique(evtypes_map$new)),"new evtypes.\n")
## 44 new evtypes.
cat(length(unique(evtypes_def_list)),"definitive evtypes.\n")
## 48 definitive evtypes.
cat(sum(is.na(evtypes_map$new)),"unmatched original evtypes.\n")
## 53 unmatched original evtypes.
cat(sum(grepl("ASTRONOMICAL LOW TIDE", evtypes_map$new)),"new ASTRONOMICAL LOW TIDEs.\n")
## 1 new ASTRONOMICAL LOW TIDEs.
cat(sum(grepl("LOW TIDE", evtypes_map$orig)),"orig LOW TIDEs.\n")
## 1 orig LOW TIDEs.
# Join the map to the original dataset
stormdata_clean <- left_join(stormdata_clean,evtypes_map, by = c("EVTYPE" = "orig"))
# Rename the columns so we are going to work with the new EVTYPE instead of the original
stormdata_clean <- rename(stormdata_clean, EVTYPE.orig = EVTYPE, EVTYPE = new )

Prepare data for analysis

We need to summarize the health and environmental effects by EVTYPE

library(dplyr)
library(lubridate)
library(ggplot2)
# Make another copy of the data to use in this code chunk
stormdata_prep <- stormdata_clean


# Group by year and EVTYPE, only use last fifteen years of data for this step
max_year <- max(stormdata_prep$year)
by_year_evtype <- group_by(filter(stormdata_prep,year>(max_year-15)), year, EVTYPE)
# Summarise all effects
eff_per_year <- summarise(by_year_evtype, 
                          HEALTHEFFECTS = sum(HEALTHEFFECTS), 
                          FATALITIES = sum(FATALITIES),
                          INJURIES = sum(INJURIES),
                          CROPDMGTOT = sum(CROPDMGTOT),
                          PROPDMGTOT = sum(PROPDMGTOT),
                          TOTDMG = sum(TOTDMG))

# Group by EVTYPE
by_evtype <- group_by(eff_per_year, EVTYPE)
# Summarise all effects
mean_eff_per_evtype <- summarise(by_evtype, 
                          HEALTHEFFECTS = mean(HEALTHEFFECTS), 
                          FATALITIES = mean(FATALITIES),
                          INJURIES = round(mean(INJURIES)),
                          CROPDMGTOT = mean(CROPDMGTOT),
                          PROPDMGTOT = mean(PROPDMGTOT),
                          TOTDMG = mean(TOTDMG))
sum_eff_per_evtype <- summarise(by_evtype, 
                          HEALTHEFFECTS = sum(HEALTHEFFECTS), 
                          FATALITIES = sum(FATALITIES),
                          INJURIES = sum(INJURIES),
                          CROPDMGTOT = sum(CROPDMGTOT),
                          PROPDMGTOT = sum(PROPDMGTOT),
                          TOTDMG = sum(TOTDMG))

#Filter only those EVTYPE which make up the top 5 by yearly average over the last fifteen years.
inj_evtypes <- head(arrange(mean_eff_per_evtype, desc(INJURIES)),5)$EVTYPE
fat_evtypes <- head(arrange(mean_eff_per_evtype, desc(FATALITIES)),5)$EVTYPE
cd_evtypes  <- head(arrange(mean_eff_per_evtype, desc(CROPDMGTOT)),5)$EVTYPE
pd_evtypes  <- head(arrange(mean_eff_per_evtype, desc(PROPDMGTOT)),5)$EVTYPE

rate_evtypes <- unique(c(inj_evtypes,fat_evtypes,cd_evtypes,pd_evtypes))

# Create colors for these event types 
evtypes_colors <- hcl.colors(length(rate_evtypes))
color_map <- tibble(evtype = rate_evtypes, color = evtypes_colors)
mean_eff_per_evtype <- left_join(mean_eff_per_evtype,color_map, by = c("EVTYPE" = "evtype"))
eff_per_year <- left_join(eff_per_year,color_map, by = c("EVTYPE" = "evtype"))


#inj_per_year <- filter(eff_per_year, EVTYPE %in% inj_evtypes)
#fat_per_year <- filter(eff_per_year, EVTYPE %in% fat_evtypes)
#cd_per_year <- filter(eff_per_year, EVTYPE %in% cd_evtypes)
#pd_per_year <- filter(eff_per_year, EVTYPE %in% pd_evtypes)

Results

Plot the results of the analysis.

Top Five Event Types by Average Annual Health and Economic Effects

Show the top 5 in each category based on greatest average annual effects over the fifteen year period.

library(dplyr)
library(lubridate)
library(ggplot2)
#Plot the Top 5 Event Types by Yearly Average <Effect> as a 2X2 grid of column charts
par(mfrow=c(2,2),pin=c(4.5,3))#   ,mai = c(.2,.2,.2,.2),omi = c(.2,.2,.2,.2))

# Make a table to loop through for the four charts needed with all variations
plot_index <- tibble(column = c("INJURIES","FATALITIES","CROPDMGTOT","PROPDMGTOT"),
                     friendly = c("Injuries","Fatalities","Crop Damage ($M)","Property Damage ($B)"),
                     scale = c(1,1,1e6,1e9),
                     digits = c(-2,-1,-2,0),
                     ylimadj = c(200,20,200,2),
                     textadj = c(50,5,50,0.5),
                     textrd = c(0,0,2,2),
                     text1 = c("","","$","$"),
                     text2 = c("","","M","B"))

for (i in 1:nrow(plot_index)){
    # Make a copy of the data for each plot type
    top5<-head(arrange(mean_eff_per_evtype, desc(!!as.name(plot_index$column[i]))),5)
    # Make the Bar Plot
    top5bar<- barplot(names.arg = top5$EVTYPE,
                      height = top5[[plot_index$column[i]]]/plot_index$scale[i],
                      beside = TRUE,
                      main = paste0("Top 5 Event Types by Average Annual ",
                                    plot_index$friendly[i],"\n(",max_year-14,"-",max_year,")"),
                      #ylab = paste0("Average Annual ",plot_index$friendly[i]),
                      xlab = "Event Type",
                      ylim = c(0, round(max(top5[[plot_index$column[i]]]/plot_index$scale[i]), 
                                        digits = plot_index$digits[i]) + plot_index$ylimadj[i]),
                      col = top5$color,
                      cex.names = 0.7,
                      xpd = FALSE
                      )
    # Put the data labels at the top of each bar
    text(top5bar, top5[[plot_index$column[i]]]/plot_index$scale[i]+plot_index$textadj[i], 
         paste0(plot_index$text1[i],round(top5[[plot_index$column[i]]]/plot_index$scale[i], 
                                          digits = plot_index$textrd[i]),plot_index$text2[i]),cex=1)
    # Include a legend since some of the bar names at the bottom are too long and will get cut off
    legend("topright", legend = top5$EVTYPE, fill = top5$color)
}

Figure 1 - Top Five Event Types by Average Annual Health and Economic Effects

On average, Tornadoes cause the most annual injuries (~1331 per year), but Excessive Heat causes the most deaths (~118 per year).

Drought causes the most Crop Damage (~$857.6M per year), while Flood causes the most property damage (~$9.53B per year).

Effects Over Time for the Top 5 Event Types

Show the annual effects of the Top 5 event types in each category over the last fifteen years as a time series. For the Crop Damage and Property Damage, use a logarithmic scale to make it easier to see differences at the lower end of the range.

library(dplyr)
library(lubridate)
library(ggplot2)
#Plot the Top 5 Event Types by Yearly Average <Effect> as a 2X2 grid of column charts
par(mfrow=c(4,1),pin=c(8,1.7),ylog=TRUE)#   ,mai = c(.2,.2,.2,.2),omi = c(.2,.2,.2,.2))

# Make a table to loop through for the four charts needed with all variations
plot_index <- tibble(column = c("INJURIES","FATALITIES","CROPDMGTOT","PROPDMGTOT"),
                     friendly = c("Injuries","Fatalities","Crop Damage ($M)","Property Damage ($B)"),
                     scale = c(1,1,1e6,1e9),
                     digits = c(-2,-1,-2,0),
                     ylimadj = c(200,20,200,2),
                     textadj = c(50,5,50,0.5),
                     textrd = c(0,0,2,2),
                     text1 = c("","","$","$"),
                     text2 = c("","","M","B"))

for (i in 1:nrow(plot_index)){
    # Make a copy of the data for each plot type
    top5evtypes<-head(arrange(mean_eff_per_evtype, desc(!!as.name(plot_index$column[i]))),5)$EVTYPE
    
    top5<-filter(eff_per_year, EVTYPE %in% top5evtypes)
    
    top5leg <- top5[!duplicated(top5[ , c("EVTYPE")]), ]
        
    firstline = TRUE
    
    for (evt in top5evtypes){
        top5f <- filter(top5, EVTYPE == evt)
        # Make the Line Plot
        if (firstline) {
            top5line<- plot(x = top5f$year,
                    y = top5f[[plot_index$column[i]]]/plot_index$scale[i],
                    main = paste0("Event Type ",plot_index$friendly[i]," each Year\n(",max_year-14,"-",max_year,")"),
                    ylab = plot_index$friendly[i],
                    xlab = "Year",
                    type = "l",
                    #ylim = c(0, round(max(top5[[plot_index$column[i]]]/plot_index$scale[i]), 
                    #                  digits = plot_index$digits[i]) + plot_index$ylimadj[i]),
                    col = top5f$color,
                    lwd = 2,
                    log = "y",
                    #cex.names = 0.7,
                    xpd = FALSE
                    )
            firstline <- FALSE
        } else {
        
            lines(x = top5f$year,
                  y = top5f[[plot_index$column[i]]]/plot_index$scale[i],
                  col = top5f$color,
                  lwd = 2,
                  #log = "y"
                  )
        }
        
     
        # Put the data labels at each point
        text(top5f$year,
               top5f[[plot_index$column[i]]]/plot_index$scale[i],
               top5f[[plot_index$column[i]]]/plot_index$scale[i]
               )
        #text(top5line, top5f[[plot_index$column[i]]]/plot_index$scale[i]+plot_index$textadj[i],
        #     paste0(plot_index$text1[i],round(top5f[[plot_index$column[i]]]/plot_index$scale[i],
        #                                      digits = plot_index$textrd[i]),plot_index$text2[i]),cex=1)
       
    }
    # Include a legend since some of the bar names at the bottom are too long and will get cut off
    legend("topright", legend = top5leg$EVTYPE, fill = top5leg$color)
}

Figure 2 - Effects Over Time for the Top 5 Event Types

The highest average annual effect doesn’t necessarily predict the most damaging event type in any single year. Over the last fifteen years, various event types have taken turns as the most damaging to health (measured by injuries or fatalities) or economics (measured by crop damage or property damage.) For example, Hurricanes caused significant property damage in 2004 and 2005.

End