knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)  # used to match keywords in EVTYPE variable
library(scales) # used to format damage values as dollars

Classifying Weather Events by Harm to Health and Economic Impact

Severe weather and storms can harm public health and cause economic damage in the communities that experience them. We classify weather events to find which cause the most harm to health and which have the worst economic outcome. The data come from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database.

Data Processing

The dataset repdata_data_StormData.csv.bz2 has 902297 observations and 37 variables. For our analysis we are only going to look at EVTYPE (event type), FATALITIES, INJURIES, PROPDMG (property damage), PROPDMGEXP (property damage magnitude). After loading the source data (DATA) we will transform the variables we need and add them to a tidy dataset (data).

# Column classes for the 37 CSV columns. A class of "NULL" makes read.csv()
# skip that column, so start with everything skipped and switch on the
# columns the analysis uses.
col_class <- rep("NULL", 37)
col_class[8] <- "character"    # EVTYPE
col_class[23:25] <- "numeric"  # FATALITIES, INJURIES, PROPDMG
col_class[26] <- "character"   # PROPDMGEXP

# Read the compressed CSV, keeping only the columns selected above
DATA <- read.csv(file = "repdata_data_StormData.csv.bz2", colClasses = col_class)

# Work with lower-case column names from here on
names(DATA) <- tolower(names(DATA))

event

There are 985 unique values in EVTYPE (event type). We reduce them to 8 categories: extreme rainfall, heat wave, fire, hurricane, tornado, tsunami, winter storm and other. We do so by matching keywords and other patterns with stringr::str_detect().

# Collapse the free-text EVTYPE values into 8 broad categories.
# Lower-casing first merges values that differ only by case.
event_type <- tolower(DATA$evtype)

# Keyword patterns (regular expressions) for each category. They include
# common misspellings found in the data (e.g. "torndao", "ligntning").
extreme_rain <- "wind|flood|hail|heavy rains|rainstorm|storm surge|tropical storm|waterspouts|unseasonably wet|mud slide|rain|wet|severe thunderstorms|wnd|tropical depression|rock slide|coastal erosion|lightning|fld|shower|thunders|floooding|waterspout|precipitation|coastalstorm|funnels|high water|mudslide|dam|ligntning|wayterspout|funnel|spout|precipatation|lighting|landslide|rapidly rising"
winter_storm <- "snow|winter storm|blizzard|sleet|freez|cold|ice|low|wintery|winter|frost|cool spell|cool|hyperthermia|icy|avalance|hypothermia|wintry"
heat_wave <- "heat|high temp|warmth|dust|warm|dry|driest|hot|drought"
hurricane <- "hurricane|typhoon"
fire <- "fire|smoke"
tornado <- "tornado|torndao"
tsunami <- "tsunami|wave|surf|tide|swells|coastal surge|high seas"

# Category label -> pattern, in the order the replacements are applied.
# The order matters: the labels are capitalised while every pattern is
# lower-case, so once a value has been relabelled no later pattern can match
# it again. In effect the first matching category wins (e.g. any event
# mentioning "wind" becomes "Extreme Rainfall").
category_patterns <- c(
  "Extreme Rainfall" = extreme_rain,
  "Heat Wave" = heat_wave,
  "Winter Storm" = winter_storm,
  "Hurricane" = hurricane,
  "Tornado" = tornado,
  "Fire" = fire,
  "Tsunami" = tsunami
)

# Replace matching events with their category label
for (label in names(category_patterns)) {
  event_type[str_detect(event_type, category_patterns[[label]])] <- label
}

# Anything that did not receive one of the labels above is marked as other
other <- "Extreme Rainfall|Heat Wave|Winter Storm|Hurricane|Tornado|Fire|Tsunami"
event_type[str_detect(event_type, other, negate = TRUE)] <- "Other"

# Start the tidy dataset with the classified event as its first column
data <- data.frame(event = factor(event_type))

fatalities and injuries

Fatalities and injuries variables are good as is.

# Fatality and injury counts need no cleaning; copy them across unchanged
data[["fatalities"]] <- DATA[["fatalities"]]
data[["injuries"]] <- DATA[["injuries"]]

damages

propdmgexp is a character variable that signifies the magnitude of propdmg; its codes include “K” for thousands, “M” for millions, and “B” for billions. We transform propdmg (property damage) accordingly.

# Recalculate the damage cost in dollars from the magnitude signifier.
# Multiplier for each recognised exponent code. "H" is read as hundreds
# (x100); it was previously multiplied by 10, which does not match that
# reading. Codes are matched case-insensitively so "h"/"m" (and "k"/"b",
# should they occur) are treated like their upper-case forms.
exp_multiplier <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)

# Look each row's code up in the table. Any other code (blank, digits,
# symbols) has no entry and yields NA, exactly as before; those rows are
# dropped later by na.rm = TRUE in the damage summary.
# NOTE(review): blank exponents mostly accompany propdmg == 0, so excluding
# them raises the per-event mean/median -- confirm this is intended.
damages <- DATA$propdmg * unname(exp_multiplier[toupper(DATA$propdmgexp)])

# add it to our tidy dataset
data$damages <- damages

Events Most Harmful to Population Health

Across the United States, which types of events are most harmful with respect to population health? We have two fields describing harm to health, FATALITIES and INJURIES. Let’s see which events cause the most injuries and which cause the most fatalities.

# Fatality statistics per event category: total, median, mean (rounded to
# 4 decimals) and number of events, sorted by total in descending order
fatal_groups <- group_by(data, event)
fatal <- summarise(
  fatal_groups,
  sum = sum(fatalities),
  median = median(fatalities),
  mean = round(mean(fatalities), 4),
  count = n()
)
fatal <- arrange(fatal, desc(sum))

# The same statistics for injuries
injur_groups <- group_by(data, event)
injur <- summarise(
  injur_groups,
  sum = sum(injuries),
  median = median(injuries),
  mean = round(mean(injuries), 4),
  count = n()
)
injur <- arrange(injur, desc(sum))

# Draw two bar plots side by side to show which events have the largest
# share of harm. par() returns the previous settings, which are kept so the
# 1x2 layout and wide bottom margin do not leak into later plots.
old_par <- par(mfrow = c(1, 2), mar = c(10, 4, 6, 2))

# Bar plot of total fatalities; las = 2 turns the category labels
# perpendicular to the axis, and the y-limit leaves 500 of headroom
barplot(fatal$sum,
        names.arg = fatal$event,
        las = 2,
        main = "Fatalities",
        ylim = c(0, max(fatal$sum) + 500))

# Bar plot of total injuries
barplot(injur$sum,
        names.arg = injur$event,
        las = 2,
        main = "Injuries")

# Shared title across both panels, drawn in the outer margin
mtext("Total Fatalities and Injuries by Weather Event", side = 3, line = -1, outer = TRUE)

# Restore the graphics settings that were in effect before this figure
par(old_par)

# Render the fatalities summary as a table
fatal %>%
    knitr::kable(caption = "Summary of fatalities statistics")
Summary of fatalities statistics
event sum median mean count
Tornado 5633 0 0.0928 60685
Extreme Rainfall 4095 0 0.0052 781171
Heat Wave 3205 0 0.5038 6362
Other 908 0 0.2671 3399
Winter Storm 878 0 0.0195 44913
Tsunami 203 0 0.1679 1209
Hurricane 133 0 0.4463 298
Fire 90 0 0.0211 4260

Tornadoes have the highest total fatalities and injuries. Let’s explore more statistics. Even though many people have died from tornadoes and other extreme events, the median for all event types is 0. This tells us that the majority of events end with no fatalities. The mean shows that heat waves, followed by hurricanes, cause on average more fatalities per event than tornadoes.

# Render the injuries summary as a table
injur %>%
    knitr::kable(caption = "Summary of injury statistics")
Summary of injury statistics
event sum median mean count
Tornado 91364 0 1.5055 60685
Extreme Rainfall 27687 0 0.0354 781171
Heat Wave 9758 0 1.5338 6362
Winter Storm 6357 0 0.1415 44913
Other 2036 0 0.5990 3399
Fire 1608 0 0.3775 4260
Hurricane 1333 0 4.4732 298
Tsunami 385 0 0.3184 1209

Again, all event types have a median of 0, indicating that most events end with no injuries. Hurricanes lead to the most injuries on average, followed by heat waves and tornadoes.

Which Types of Events Have the Greatest Economic Consequences?

PROPDMG describes the property damage in dollars from each event. Let’s review which events resulted in the most economic damage. This is a simplistic review that doesn’t take into account inflation, etc.

# Property-damage statistics per event category. Rows whose damage is NA
# are ignored by every statistic except the event count. The total is
# expressed in millions of dollars; median and mean stay in dollars.
damage_groups <- group_by(data, event)
damages <- summarise(
  damage_groups,
  sum = sum(damages, na.rm = TRUE) / 1000000,
  median = median(damages, na.rm = TRUE),
  mean = mean(damages, na.rm = TRUE),
  count = n()
)
# Largest total damage first
damages <- arrange(damages, desc(sum))

# Bar plot of total property damage (in millions of dollars) per category.
# Wide bottom/left margins make room for the rotated category names and the
# dollar-formatted tick labels. The previous settings are kept so they can
# be restored once the figure is drawn.
old_par <- par(mar = c(8, 8, 6, 2), las = 1)
yticks <- seq(0, 300000, by = 50000)

# yaxt = "n" suppresses the default y axis; a dollar-formatted one is
# added below
barplot(damages$sum,
        names.arg = damages$event,
        las = 2,
        yaxt = "n",
        main = "Total Economic Damage by Weather Event",
        ylim = c(0, 300000))
# NOTE(review): axis() sizes tick labels with cex.axis, so cex.lab most
# likely has no effect here -- confirm before changing, since switching to
# cex.axis would shrink the labels in the rendered figure.
axis(2, at = yticks, labels = paste(dollar(yticks), "M" ), cex.lab = 0.5)

# Restore the graphics settings that were in effect before this figure
par(old_par)

# Format the damage summary for display: totals as dollars in millions,
# medians as plain dollars, and means as dollars in thousands
damages_table <- mutate(
  damages,
  sum = dollar(sum, suffix = " M"),
  median = dollar(median),
  mean = dollar(mean / 1000, suffix = " K")
)
knitr::kable(damages_table, caption = "Summary of Economic Damage from Weather Events")
Summary of Economic Damage from Weather Events
event sum median mean count
Extreme Rainfall $262,768 M $1,000 $752 K 781171
Hurricane $85,256 M $6,740,000 $382,316 K 298
Tornado $56,942 M $25,000 $1,098 K 60685
Winter Storm $12,489 M $0 $464 K 44913
Fire $8,497 M $0 $3,678 K 4260
Heat Wave $1,079 M $0 $339 K 6362
Tsunami $255 M $0 $516 K 1209
Other $32 M $0 $20 K 3399

Extreme Rainfall has the highest total economic consequences, $262.768 billion. However, Hurricanes have the highest damage on average, $382,315,740, with a median of $6,740,000 and the lowest number of events observed, making them clearly the most costly per event.

Results

Tornadoes lead to the most harm, with 5,633 fatalities and 91,364 injuries. However, the data also suggest that the majority of events do not cause fatalities or injuries, with a median of 0 for both. If we compare the outcomes regardless of frequency (the average outcome per event), we learn that heat waves cause the most fatalities on average (0.5038 fatalities), followed by hurricanes (0.4463).

Extreme Rainfall has the highest total cost but also the largest number of observations. When we look at the median damage, we can see that at least half of hurricanes cause damage of $6,740,000 or more, and hurricanes have the second-largest total damage cost with a fraction of the number of observations.

Note

The report uses a very crude classification of the events, so there is a good chance that some of the results are misleading. Also, the effects of tornadoes and hurricanes are so large that it is hard to evaluate the other events on the same scale.