knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr) # used to match keywords in EVTYPE variable
library(scales) # used to format damage values as dollars
Severe weather and storms can harm public health and have an economic impact on the communities that experience them. We will try to identify which weather events cause the most harm to health and which have the worst economic outcome. The data used is from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database.
The dataset repdata_data_StormData.csv.bz2 has 902297 observations and 37 variables. For our analysis we are only going to look at EVTYPE (event type), FATALITIES, INJURIES, PROPDMG (property damage), and PROPDMGEXP (property damage magnitude). After loading the source data (DATA), we will transform the variables we need and add them to a tidy dataset (data).
# Build the colClasses vector for read.csv(). The file has 37 columns; a class
# of "NULL" tells read.csv() to skip that column, so start with every column
# skipped and switch on only the five that the analysis uses.
col_class <- rep("NULL", 37)
col_class[8] <- "character"    # EVTYPE
col_class[23:25] <- "numeric"  # FATALITIES, INJURIES, PROPDMG
col_class[26] <- "character"   # PROPDMGEXP
# Skipped columns:
#   1-7   STATE__, BGN_DATE, BGN_TIME, TIME_ZONE, COUNTY, COUNTYNAME, STATE
#   9-22  BGN_RANGE, BGN_AZI, BGN_LOCATI, END_DATE, END_TIME, COUNTY_END,
#         COUNTYENDN, END_RANGE, END_AZI, END_LOCATI, LENGTH, WIDTH, F, MAG
#   27-37 CROPDMG, CROPDMGEXP, WFO, STATEOFFIC, ZONENAMES, LATITUDE,
#         LONGITUDE, LATITUDE_E, LONGITUDE_, REMARKS, REFNUM

# Read the selected columns straight from the bz2-compressed CSV.
DATA <- read.csv(
  file = "repdata_data_StormData.csv.bz2",
  colClasses = col_class
)

# Use lower-case column names from here on.
names(DATA) <- tolower(names(DATA))
There are 985 unique values in EVTYPE (event type). We reduce that to 8 categories: extreme rainfall, heat wave, fire, hurricane, tornado, tsunami, winter storm and other. We do so by matching keywords and other patterns with stringr::str_detect().
# Start from a lower-cased copy of the raw event labels so that case variants
# collapse into a single spelling before matching.
event_type <- tolower(DATA$evtype)

# Keyword patterns (regular-expression alternations) for each category.
extreme_rain <- "wind|flood|hail|heavy rains|rainstorm|storm surge|tropical storm|waterspouts|unseasonably wet|mud slide|rain|wet|severe thunderstorms|wnd|tropical depression|rock slide|coastal erosion|lightning|fld|shower|thunders|floooding|waterspout|precipitation|coastalstorm|funnels|high water|mudslide|dam|ligntning|wayterspout|funnel|spout|precipatation|lighting|landslide|rapidly rising"
winter_storm <- "snow|winter storm|blizzard|sleet|freez|cold|ice|low|wintery|winter|frost|cool spell|cool|hyperthermia|icy|avalance|hypothermia|wintry"
heat_wave <- "heat|high temp|warmth|dust|warm|dry|driest|hot|drought"
hurricane <- "hurricane|typhoon"
fire <- "fire|smoke"
tornado <- "tornado|torndao"
tsunami <- "tsunami|wave|surf|tide|swells|coastal surge|high seas"

# Category label -> pattern, listed in the order the replacements are applied.
# Each pattern is tested against the current state of event_type, which
# already contains the labels written by earlier iterations.
category_patterns <- c(
  "Extreme Rainfall" = extreme_rain,
  "Heat Wave" = heat_wave,
  "Winter Storm" = winter_storm,
  "Hurricane" = hurricane,
  "Tornado" = tornado,
  "Fire" = fire,
  "Tsunami" = tsunami
)
for (label in names(category_patterns)) {
  matched <- str_detect(event_type, category_patterns[[label]])
  event_type[matched] <- label
}

# Anything that does not carry one of the category labels becomes "Other".
other <- paste(names(category_patterns), collapse = "|")
event_type[!str_detect(event_type, other)] <- "Other"

# Begin the tidy dataset with the categorised event as a factor column.
data <- data.frame(event = factor(event_type))
The fatalities and injuries variables can be used as they are.
# Copy the fatality and injury counts into the tidy dataset unchanged.
data[c("fatalities", "injuries")] <- DATA[c("fatalities", "injuries")]
propdmgexp is a character variable that signifies the magnitude of propdmg; its values include “K” for thousands, “M” for millions, and “B” for billions. We transform propdmg (property damage) accordingly.
# Convert propdmg to dollars using the propdmgexp magnitude code.
# Codes are compared case-insensitively:
#   H = hundreds, K = thousands, M = millions, B = billions.
# Rows whose code is none of these (for example an empty string) match no
# branch, so case_when() yields NA for them; the damage summaries later in
# the report exclude those rows through na.rm = TRUE.
magnitude <- toupper(DATA$propdmgexp)
multiplier <- case_when(
  magnitude == "H" ~ 1e2, # hundreds (the earlier code scaled these by 10)
  magnitude == "K" ~ 1e3,
  magnitude == "M" ~ 1e6,
  magnitude == "B" ~ 1e9
)
damages <- DATA$propdmg * multiplier

# add it to our tidy dataset
data$damages <- damages
Across the United States, which types of events are most harmful with respect to population health? We have two fields describing harm to health, FATALITIES and INJURIES. Let's see which events cause the most injuries and which cause the most fatalities.
# Per-event summary of one count column of `tbl`: total, median, mean rounded
# to 4 decimals, and number of observations, sorted by descending total.
# `column` is the name of the column to summarise, given as a string.
summarise_harm <- function(tbl, column) {
  tbl %>%
    group_by(event) %>%
    summarise(
      sum = sum(.data[[column]]),
      median = median(.data[[column]]),
      mean = round(mean(.data[[column]]), 4),
      count = n()
    ) %>%
    arrange(desc(sum))
}

# Fatality and injury summaries share the same shape.
fatal <- summarise_harm(data, "fatalities")
injur <- summarise_harm(data, "injuries")
# Two panels side by side; the enlarged bottom margin leaves room for the
# perpendicular (las = 2) category labels.
par(mfrow = c(1, 2), mar = c(10, 4, 6, 2))

# Left panel: total fatalities per event category, with the y axis extended
# 500 beyond the tallest bar.
fatal_ceiling <- max(fatal$sum) + 500
barplot(
  height = fatal$sum,
  names.arg = fatal$event,
  main = "Fatalities",
  las = 2,
  ylim = c(0, fatal_ceiling)
)

# Right panel: total injuries per event category.
barplot(
  height = injur$sum,
  names.arg = injur$event,
  main = "Injuries",
  las = 2
)

# Shared title written into the outer region above both panels.
mtext(
  text = "Total Fatalities and Injuries by Weather Event",
  side = 3,
  line = -1,
  outer = TRUE
)

# Render the per-event fatality summary as a captioned table.
knitr::kable(x = fatal, caption = "Summary of fatalities statistics")
event | sum | median | mean | count |
---|---|---|---|---|
Tornado | 5633 | 0 | 0.0928 | 60685 |
Extreme Rainfall | 4095 | 0 | 0.0052 | 781171 |
Heat Wave | 3205 | 0 | 0.5038 | 6362 |
Other | 908 | 0 | 0.2671 | 3399 |
Winter Storm | 878 | 0 | 0.0195 | 44913 |
Tsunami | 203 | 0 | 0.1679 | 1209 |
Hurricane | 133 | 0 | 0.4463 | 298 |
Fire | 90 | 0 | 0.0211 | 4260 |
It looks like tornadoes cause the most fatalities and injuries overall. Let’s explore more statistics. Even though many people have died from tornadoes and other extreme events, the median for all events is 0, which tells us that the majority of events end with no fatalities. The mean shows that heat waves, followed by hurricanes, cause more fatalities on average than tornadoes.
# Render the per-event injury summary as a captioned table.
knitr::kable(x = injur, caption = "Summary of injury statistics")
event | sum | median | mean | count |
---|---|---|---|---|
Tornado | 91364 | 0 | 1.5055 | 60685 |
Extreme Rainfall | 27687 | 0 | 0.0354 | 781171 |
Heat Wave | 9758 | 0 | 1.5338 | 6362 |
Winter Storm | 6357 | 0 | 0.1415 | 44913 |
Other | 2036 | 0 | 0.5990 | 3399 |
Fire | 1608 | 0 | 0.3775 | 4260 |
Hurricane | 1333 | 0 | 4.4732 | 298 |
Tsunami | 385 | 0 | 0.3184 | 1209 |
Again, all events have a median of 0, indicating that most events end with no injuries. Hurricanes lead to the most injuries on average, followed by heat waves and tornadoes.
PROPDMG describes the damage in dollars from each event. Let's review which events resulted in the most economic damage. This is a simplistic review which doesn’t take into account inflation etc.
# Per-event damage statistics. Totals are expressed in millions of dollars;
# rows with NA damage are left out of the sum, median and mean, but are still
# included in count.
damages <- data %>%
  group_by(event) %>%
  summarise(
    sum = sum(damages, na.rm = TRUE) / 1e6,
    median = median(damages, na.rm = TRUE),
    mean = mean(damages, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(desc(sum))

# Bar chart of total damage with a hand-drawn y axis labelled in dollars.
par(mar = c(8, 8, 6, 2), las = 1)
yticks <- seq(0, 3e5, by = 5e4)
barplot(
  height = damages$sum,
  names.arg = damages$event,
  main = "Total Economic Damage by Weather Event",
  las = 2,
  yaxt = "n", # suppress the default axis; it is drawn by axis() below
  ylim = range(yticks)
)
tick_labels <- paste(dollar(yticks), "M")
# NOTE(review): axis() sizes tick labels through cex.axis, so cex.lab
# presumably has no visible effect here - confirm the intent.
axis(side = 2, at = yticks, labels = tick_labels, cex.lab = 0.5)

# Format the statistics as dollar strings (total in millions, mean in
# thousands) and render the table.
damage_table <- mutate(
  damages,
  sum = dollar(sum, suffix = " M"),
  median = dollar(median),
  mean = dollar(mean / 1000, suffix = " K")
)
knitr::kable(damage_table, caption = "Summary of Economic Damage from Weather Events")
event | sum | median | mean | count |
---|---|---|---|---|
Extreme Rainfall | $262,768 M | $1,000 | $752 K | 781171 |
Hurricane | $85,256 M | $6,740,000 | $382,316 K | 298 |
Tornado | $56,942 M | $25,000 | $1,098 K | 60685 |
Winter Storm | $12,489 M | $0 | $464 K | 44913 |
Fire | $8,497 M | $0 | $3,678 K | 4260 |
Heat Wave | $1,079 M | $0 | $339 K | 6362 |
Tsunami | $255 M | $0 | $516 K | 1209 |
Other | $32 M | $0 | $20 K | 3399 |
Extreme Rainfall has the highest total economic cost, $262.768 billion. However, Hurricanes have the highest average damage, $382,315,740, with a median of $6,740,000 and the lowest number of events observed, making them clearly the most costly per event.
Tornadoes lead to the most harm, with 5,633 fatalities and 91,364 injuries. However, the data also suggests that the majority of events do not cause fatalities or injuries, with a median of 0 for both. If we compare the events' outcomes regardless of frequency (the average outcome for each event), we learn that heat waves cause the most fatalities on average (0.5038 fatalities), followed by hurricanes (0.4463).
Extreme Rainfall has the highest overall cost but also the largest number of observations. When we look at the median damage, we can see that at least half of hurricanes cause damage of $6,740,000 or more, and hurricanes have the second-largest total damage cost with a fraction of the number of observations.
The report uses a very crude classification of the events, so there is a good chance that some of the results are misleading. Also, the effects of tornadoes and hurricanes are so large that it is hard to evaluate the other events on the same scale.