### Setup
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(reshape2)
The objective of this report is to look through our history of weather events from the last 60 years to identify the types of weather events that cause the greatest degree of harm to the population, and result in significant economic damages.
In short, this analsis addresses two questions:
Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
Across the United States, which types of events have the greatest economic consequences?
The detail of the report identifies the top 10 types of environmental events resulting in harm to the population, and the top 10 events resulting in economic damages.
Tornados are the most significant risk to personal safety, fatalaties and injuries from Tornados are an order of magnitude greater than the next categories of Heat, Thunderstorm Winds, and Flooding.
In order, Floods, Hurricanes and Tornados are the weather events that cause the greatest economic damage. With the majority of the damage caused to personal property.
This report uses data supplied by the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. The dataset was pre-processed by the wonderful people at John Hopkins University in the Biostatistics Department, and is available on their server.
fname <- "StormData.csv.bz2"
if (!file.exists(fname)){
furl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(furl, destfile = fname, method="curl")
# date of download noted.
StormDataDownloadDate <- date()
StormDataDownloadDate
}
## [1] "Sat Jan 14 15:13:26 2017"
data1 <- read.csv(fname, as.is=TRUE)
str(data1)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
From the initial exploration of the data with respect to the questions of concern, it is apparent that a number of the columns in the dataset will not be required. These are removed to improve computer performance. The resulting dataset contains the 7 columns required to address the report’s objectives.
data1 <- select(data1, EVTYPE, FATALITIES, INJURIES, PROPDMG,
PROPDMGEXP, CROPDMG, CROPDMGEXP)
str(data1)
## 'data.frame': 902297 obs. of 7 variables:
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
From the exploratory analysis it became apparent that a number of the event types are split into multiple groups resulting from variations in the labeling applied. Many of these consolidations arise from alternate abbreviations, upper/lower case usage, spelling mistakes. The initial file contained 985 EVTYPE factors. The following groups of event types have been combined. Refer to the R code in the ‘consolidateEvents’ code chunk for specific details.
In the event that a ranking deeper than the top 10 is required, further effort is required to group and consolidate the event types.
# Combine Tornado events
data1$EVTYPE <- sub("(.*)TORNADO(.*)", "TORNADO", data1$EVTYPE)
# Combine Heat events
gr1 <- c('HEAT', 'EXCESSIVE HEAT', 'HEAT WAVE', 'EXTREME HEAT', 'Heat Wave', 'RECORD HEAT', 'UNSEASONABLY WARM AND DRY', 'UNSEASONABLY WARM', 'HEAT WAVE DROUGHT', 'RECORD/EXCESSIVE HEAT', 'HEAT WAVES', 'DROUGHT/EXCESSIVE HEAT', 'WARM WEATHER', 'Record Heat', 'RECORD HEAT WAVE', 'Record High', 'RECORD HIGH', 'RECORD HIGH TEMPERATURE', 'RECORD HIGH TEMPERATURES', 'Record temperature', 'RECORD TEMPERATURE', 'Record Temperatures', 'RECORD TEMPERATURES', 'RECORD WARM', 'RECORD WARM TEMPS.', 'Record Warmth', 'RECORD WARMTH', 'VERY WARM')
data1$EVTYPE[data1$EVTYPE %in% gr1] <- gr1[1]
# Combine Thunderstorm wind events
gr2 <- c('THUNDERSTORM WIND', 'TSTM WIND', 'THUNDERSTORM WINDS', 'THUNDERSTORMW', 'THUNDERSTORM WINDS', 'THUNDERSTORM WINDSS', "LIGHTNING AND THUNDERSTORM WIN", "THUNDERSTORM WIND (G40)", "THUNDERSTORM WIND G52", "THUNDERSTORM WINDS 13", "THUNDERSTORM WINDS/HAIL", "THUNDERSTORMS WINDS", "THUNDERTORM WINDS", "TSTM WIND (G35)", "TSTM WIND (G40)", "TSTM WIND", "TSTM WIND (G45)", "GUSTY THUNDERSTORM WIND", "GUSTY THUNDERSTORM WINDS", "LIGHTNING THUNDERSTORM WINDS", "LIGHTNING THUNDERSTORM WINDSS", "SEVERE THUNDERSTORM", "SEVERE THUNDERSTORM WINDS", "SEVERE THUNDERSTORMS", "THUDERSTORM WINDS", "THUNDEERSTORM WINDS", "THUNDERESTORM WINDS", "THUNDERSTORM DAMAGE", "THUNDERSTORM DAMAGE TO", "THUNDERSTORM W INDS", "Thunderstorm Wind", "THUNERSTORM WINDS", "TUNDERSTORM WIND")
data1$EVTYPE[data1$EVTYPE %in% gr2] <- gr2[1]
# Combine HAIL events
data1$EVTYPE <- sub("(.*)HAIL(.*)", "HAIL", data1$EVTYPE)
# Second pass to combine combine additional THUNDERSTORM and TSTM events
data1$EVTYPE <- sub("(.*)THUNDERSTORM(.*)", "THUNDERSTORM WIND", data1$EVTYPE)
data1$EVTYPE <- sub("(.*)TSTM(.*)", "THUNDERSTORM WIND", data1$EVTYPE)
# Extreme Cold events
gr3 <- c("EXTREME COLD", "EXTREME COLD/WIND CHILL", "COLD/WIND CHILL", "COLD", "EXTREME WINDCHILL", "COLD WEATHER", "Cold", "COLD WAVE", "Cold Temperature", "Extreme Cold", "UNSEASONABLY COLD", "Extended Cold", "RECORD COLD", "SNOW/ BITTER COLD", "EXTREME/RECORD COLD")
data1$EVTYPE[data1$EVTYPE %in% gr3] <- gr3[1]
# Combine Hurricane events
data1$EVTYPE <- sub("(.*)HURRICANE(.*)", "HURRICANE", data1$EVTYPE)
# Tropical Storm events
data1$EVTYPE <- sub("(.*)TROPICAL(.*)", "TROPICAL STORM", data1$EVTYPE)
# Rip Current events
data1$EVTYPE <- sub("(.*)RIP(.*)", "RIP CURRENT", data1$EVTYPE)
# Lightning events
data1$EVTYPE <- sub("(.*)LIGHTN(.*)", "LIGHTNING", data1$EVTYPE)
# Fire events
data1$EVTYPE <- sub("(.*)FIRE(.*)", "WILD/FOREST FIRE", data1$EVTYPE)
# Set EVTYPE as factor
data1$EVTYPE <- as.factor(data1$EVTYPE)
str(data1$EVTYPE)
## Factor w/ 745 levels " HIGH SURF ADVISORY",..: 646 646 646 646 646 646 646 646 646 646 ...
From the dataset codebook, section 2.7 “Damage,”" we find that the property damage estimates PROPDMG need to be multiplied by PROPDMGEXP to arrive at the correct dollar figure. A similar multiplication is required for the crop damage estimates.
The multipliers used in the dataset are as follows (definitions are in the codebook):
levels(as.factor(data1$PROPDMGEXP))
## [1] "" "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K"
## [18] "m" "M"
levels(as.factor(data1$CROPDMGEXP))
## [1] "" "?" "0" "2" "B" "k" "K" "m" "M"
In the following code chunk, the exponential multipliers are set up and then applied to the damage cost estimates to provide the correct economic damages for analysis.
# Convert the PROPDMGEXP and CROPDMGEXP columns to numerical exponents of 10's
data1$PROPDMGEXP <- sub("[H|h]", "2", data1$PROPDMGEXP)
data1$PROPDMGEXP <- sub("[K|k]", "3", data1$PROPDMGEXP)
data1$PROPDMGEXP <- sub("[M|m]", "6", data1$PROPDMGEXP)
data1$PROPDMGEXP <- sub("[B|b]", "9", data1$PROPDMGEXP)
data1$PROPDMGEXP <- sub("[-|+|?]", "0", data1$PROPDMGEXP)
data1$PROPDMGEXP <- sub("", "0", data1$PROPDMGEXP)
data1$PROPDMGEXP <- as.numeric(data1$PROPDMGEXP)
data1$CROPDMGEXP <- sub("[H|h]", "2", data1$CROPDMGEXP)
data1$CROPDMGEXP <- sub("[K|k]", "3", data1$CROPDMGEXP)
data1$CROPDMGEXP <- sub("[M|m]", "6", data1$CROPDMGEXP)
data1$CROPDMGEXP <- sub("[B|b]", "9", data1$CROPDMGEXP)
data1$CROPDMGEXP <- sub("[-|+|?]", "0", data1$CROPDMGEXP)
data1$CROPDMGEXP <- sub("", "0", data1$CROPDMGEXP)
data1$CROPDMGEXP <- as.numeric(data1$CROPDMGEXP)
data1 <- data1 %>%
mutate(cost_property=PROPDMG*10**PROPDMGEXP,
cost_crops=CROPDMG*10**CROPDMGEXP)
# Remove unnecessary columns
data1$PROPDMG <- NULL
data1$CROPDMG <- NULL
data1$PROPDMGEXP <- NULL
data1$CROPDMGEXP <- NULL
Objective 1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
For the purposes of this investigation, fatalities are combined with reportable injuries, giving a total casualty summation that is used for identifying the most harmful weather events. The total represents casualties over the entire reported period, 1950 through 2011.
health_ev <- group_by(data1, EVTYPE) %>%
summarize(Tot_fatal=sum(FATALITIES, na.rm=TRUE),
Max_fatal=max(FATALITIES, na.rm=TRUE),
Mean_fatal=mean(FATALITIES, na.rm=TRUE),
Tot_inj=sum(INJURIES, na.rm=TRUE),
Max_inj=max(INJURIES, na.rm=TRUE),
Mean_inj=mean(INJURIES, na.rm=TRUE) ) %>%
mutate(Tot_casualty=Tot_fatal + Tot_inj) %>%
arrange(desc(Tot_casualty))
# Adjust column ordering
health_ev <- select(health_ev, c(1,8,2:7))
#Create tables for examining most harmful weather events
health_comb <- select(health_ev, c(1,2))
health_fatal <- arrange(health_ev, desc(Tot_fatal)) %>% select(c(1,3:5))
health_injury <- arrange(health_ev, desc(Tot_inj)) %>% select(c(1,6:8))
head(health_comb, 10)
## # A tibble: 10 × 2
## EVTYPE Tot_casualty
## <fctr> <dbl>
## 1 TORNADO 97068
## 2 HEAT 12421
## 3 THUNDERSTORM WIND 10174
## 4 FLOOD 7259
## 5 LIGHTNING 6048
## 6 FLASH FLOOD 2755
## 7 ICE STORM 2064
## 8 WILD/FOREST FIRE 1698
## 9 WINTER STORM 1527
## 10 HAIL 1486
head(health_fatal, 10)
## # A tibble: 10 × 4
## EVTYPE Tot_fatal Max_fatal Mean_fatal
## <fctr> <dbl> <dbl> <dbl>
## 1 TORNADO 5661 158 0.093261944
## 2 HEAT 3178 583 1.068953919
## 3 FLASH FLOOD 978 20 0.018018682
## 4 LIGHTNING 817 5 0.051826947
## 5 THUNDERSTORM WIND 725 11 0.002159525
## 6 RIP CURRENT 577 6 0.742599743
## 7 FLOOD 470 15 0.018558004
## 8 EXTREME COLD 452 10 0.174787316
## 9 HIGH WIND 248 8 0.012269939
## 10 AVALANCHE 224 6 0.580310881
head(health_injury, 10)
## # A tibble: 10 × 4
## EVTYPE Tot_inj Max_inj Mean_inj
## <fctr> <dbl> <dbl> <dbl>
## 1 TORNADO 91407 1700 1.505881384
## 2 THUNDERSTORM WIND 9449 70 0.028145311
## 3 HEAT 9243 519 3.108980827
## 4 FLOOD 6789 800 0.268064440
## 5 LIGHTNING 5231 51 0.331832022
## 6 ICE STORM 1975 1568 0.984546361
## 7 FLASH FLOOD 1777 150 0.032739466
## 8 WILD/FOREST FIRE 1608 150 0.379334749
## 9 HAIL 1466 109 0.005048748
## 10 HURRICANE 1326 780 4.636363636
# melt down the top 10 rows of health_ev
health_top <- health_ev[1:10,]
health_top <- melt(health_top, id="EVTYPE", measure.vars=c("Tot_fatal", "Tot_inj"))
# plot
g <- ggplot(health_top, aes(x= reorder(EVTYPE, -value), y=value, fill=variable))
g <- g + geom_bar(position="stack", stat='identity')
g <- g + theme(axis.text.x=element_text(angle = 60, hjust = 1))
g <- g + labs(title="Population Health Affected By Weather Events",
x="Type of Weather Event",
y="Casualties")
#g <- g + scale_y_log10()
g <- g + scale_fill_manual(name="Health Effect",
values=c("red","skyblue"),
labels=c("Fatalities","Injuries"))
print(g)
Objective 2. Across the United States, which types of events have the greatest economic consequences?
For the purposes of this analysis, property damage is combined with agricultural damages for the total economic losses. The summary chart does show the relative magnitudes of property vs agricultural damages.
financial_ev <- group_by(data1, EVTYPE) %>%
summarize(Tot_prop=sum(cost_property, na.rm=TRUE),
Tot_crop=sum(cost_crops, na.rm=TRUE) ) %>%
mutate(Tot_cost=Tot_prop + Tot_crop) %>%
arrange(desc(Tot_cost))
# Fix column ordering
financial_ev <- select(financial_ev, c(1,4,2,3))
head(financial_ev,10)
## # A tibble: 10 × 4
## EVTYPE Tot_cost Tot_prop Tot_crop
## <fctr> <dbl> <dbl> <dbl>
## 1 FLOOD 150319678257 144657709807 5661968450
## 2 HURRICANE 90271472810 84756180010 5515292800
## 3 TORNADO 59020779447 58603317927 417461520
## 4 STORM SURGE 43323541000 43323536000 5000
## 5 HAIL 19134309410 16022626537 3111682873
## 6 FLASH FLOOD 18243991079 16822673979 1421317100
## 7 DROUGHT 15018672000 1046106000 13972566000
## 8 THUNDERSTORM WIND 12347318414 11140399676 1206918738
## 9 RIVER FLOOD 10148404500 5118945500 5029459000
## 10 ICE STORM 8967041360 3944927860 5022113500
# melt down the top 10 rows of health_ev
financial_top <- financial_ev[1:10,]
financial_top <- melt(financial_top, id="EVTYPE",
measure.vars=c("Tot_prop", "Tot_crop"))
# plot
g <- ggplot(financial_top, aes(x= reorder(EVTYPE, -value), y=value, fill=variable))
g <- g + geom_bar(position="stack", stat='identity')
g <- g + theme(axis.text.x=element_text(angle = 60, hjust = 1))
g <- g + labs(title="Damage Caused By Weather Events",
x="Type of Weather Event",
y="Cummulative Cost")
g <- g + scale_fill_manual(name="Damages",
values=c("darkgreen","royalblue"),
labels=c("Property","Crops"))
print(g)