This study involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the U.S. from 1950 to 2011, as well as estimates of any fatalities, injuries, and property damage.
By using this report, government or municipal managers can better prepare for severe weather events and prioritize resources for different types of weather events.
# Record the R version, platform and loaded packages used for this analysis
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.5.2 magrittr_1.5 tools_3.5.2 htmltools_0.3.6
## [5] yaml_2.2.0 Rcpp_1.0.0 stringi_1.2.4 rmarkdown_1.11
## [9] knitr_1.21 stringr_1.3.1 xfun_0.4 digest_0.6.18
## [13] evaluate_0.12
# Use one name for the archive so the existence check, the download and the
# read all refer to the same file. The names previously differed in case
# ("stormData" vs "StormData"), which fails on case-sensitive file systems.
stormFile <- "stormData.csv.bz2"
# Download the NOAA storm data only when it is not already present locally
if (!file.exists(stormFile)) {
  fileurl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
  download.file(fileurl, stormFile, method = "curl")
}
# read.csv() reads the bzip2-compressed file directly; text columns are kept
# as character vectors rather than being converted to factors
noaaStorm <- read.csv(stormFile, stringsAsFactors = FALSE)
First we check out the data to find the columns needed for processing
# List the column names to find the variables needed for the analysis
names(noaaStorm)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# FATALITIES and INJURIES are stored as numeric counts.
# This means we can sum them up by event type (see section 'Results')
str(noaaStorm$FATALITIES)
## num [1:902297] 0 0 0 0 0 0 0 0 1 0 ...
str(noaaStorm$INJURIES)
## num [1:902297] 15 0 2 2 2 6 1 0 14 0 ...
# The dollar amount of property damage is split over two columns: PROPDMG
# holds the coefficient and PROPDMGEXP holds a character code for the power
# of ten, where "H" = 10^2, "K" = 10^3, "M" = 10^6, "B" = 10^9.
str(noaaStorm$PROPDMG)
## num [1:902297] 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
str(noaaStorm$PROPDMGEXP)
## chr [1:902297] "K" "K" "K" "K" "K" "K" "K" "K" "K" "K" "M" "M" "K" ...
# Crop damage follows the same principle, using CROPDMG and CROPDMGEXP
str(noaaStorm$CROPDMG)
## num [1:902297] 0 0 0 0 0 0 0 0 0 0 ...
str(noaaStorm$CROPDMGEXP)
## chr [1:902297] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ...
We process the data needed for reporting property damages by event type
# We subset the columns needed for summing property damages by event type
subProp <- noaaStorm[, c("EVTYPE", "PROPDMG", "PROPDMGEXP")]
# Since the highest values are in Billions ("B"), the "H" (10^2) and "K" (10^3)
# values are assumed not to affect the top 10 order of damage sum ($) by event.
# NOTE(review): "K" appears to be the most common code in PROPDMGEXP, so the
# combined "K" total may not be negligible -- confirm that dropping these rows
# leaves the ranking unchanged.
# We keep only the rows with "M" = 10^6 or "B" = 10^9. %in% never yields NA,
# so rows with a missing exponent are dropped, just as subset() would do.
sub2Prop <- subProp[subProp$PROPDMGEXP %in% c("B", "M"), ]
# Since we will report the plot values in Billions we can plot the PROPDMG
# column alone once the "M" rows are divided by 1000; "B" rows stay as they
# are. The whole column is rescaled at once instead of looping row by row.
isMillion <- sub2Prop$PROPDMGEXP == "M"
sub2Prop$PROPDMG[isMillion] <- sub2Prop$PROPDMG[isMillion] / 1000
For PROPDMGEXP I repeated the same data processing as above using “b” instead of “B” and “m” instead of “M”, but the resulting damage sum ($) was negligible and does not affect the findings. The same goes for CROPDMGEXP.
We process the data needed for reporting crop damages by event type
# We subset the columns needed for summing crop damages by event type
subCrop <- noaaStorm[, c("EVTYPE", "CROPDMG", "CROPDMGEXP")]
# We keep only the rows recorded in millions ("M" = 10^6) or billions
# ("B" = 10^9), the same principle as used for PROPDMG and PROPDMGEXP.
# NOTE(review): this drops the "K" (10^3) rows on the assumption that their
# combined total cannot change the top 10 -- confirm against the full data.
# %in% never yields NA, so rows with a missing exponent are dropped, just as
# subset() would do.
sub2Crop <- subCrop[subCrop$CROPDMGEXP %in% c("B", "M"), ]
# We prepare the values for plotting in Billions: the "M" rows are divided by
# 1000 and the "B" rows stay as they are. The whole column is rescaled at
# once instead of looping row by row.
isMillion <- sub2Crop$CROPDMGEXP == "M"
sub2Crop$CROPDMG[isMillion] <- sub2Crop$CROPDMG[isMillion] / 1000
# Total the fatalities recorded for each event type
fatalities <- aggregate(FATALITIES ~ EVTYPE, data = noaaStorm, FUN = sum)
# Rank the event types from most to fewest fatalities and keep the first 10
fatalitiesTop10 <- fatalities[order(-fatalities$FATALITIES)[1:10], ]
fatalitiesTop10
## EVTYPE FATALITIES
## 834 TORNADO 5633
## 130 EXCESSIVE HEAT 1903
## 153 FLASH FLOOD 978
## 275 HEAT 937
## 464 LIGHTNING 816
## 856 TSTM WIND 504
## 170 FLOOD 470
## 585 RIP CURRENT 368
## 359 HIGH WIND 248
## 19 AVALANCHE 224
library(ggplot2)
# For plotting events in order of fatality numbers we use factor() with the
# levels taken from the current row order of the top-10 table
fatalitiesTop10$EVTYPE <- factor(fatalitiesTop10$EVTYPE,
                                 levels = fatalitiesTop10$EVTYPE)
# Columns are referenced by bare name inside aes() so that ggplot2 looks them
# up in `data`, matching the other plots in this report
ggplot(data = fatalitiesTop10, aes(x = EVTYPE, y = FATALITIES)) +
  geom_bar(stat = "identity") +
  xlab("Event type") + ylab("Number of fatalities") +
  ggtitle("Top 10 weather events with most fatalities in US, 1950 - 2011") +
  # Rotate the event names so the long labels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Total the injuries recorded for each event type
injuries <- aggregate(INJURIES ~ EVTYPE, data = noaaStorm, FUN = sum)
# Rank the event types from most to fewest injuries and keep the first 10
injuriesTop10 <- injuries[order(-injuries$INJURIES)[1:10], ]
injuriesTop10
## EVTYPE INJURIES
## 834 TORNADO 91346
## 856 TSTM WIND 6957
## 170 FLOOD 6789
## 130 EXCESSIVE HEAT 6525
## 464 LIGHTNING 5230
## 275 HEAT 2100
## 427 ICE STORM 1975
## 153 FLASH FLOOD 1777
## 760 THUNDERSTORM WIND 1488
## 244 HAIL 1361
# Make EVTYPE a factor whose levels follow the current row order of the
# top-10 table, so the bars are drawn in that order
injuriesTop10$EVTYPE <- with(injuriesTop10, factor(EVTYPE, levels = EVTYPE))
ggplot(injuriesTop10, aes(x = EVTYPE, y = INJURIES)) +
  geom_col() +
  labs(x = "Event type", y = "Number of injuries",
       title = "Top 10 weather events with most injuries in US, 1950 - 2011") +
  # Rotate the event names so the long labels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Total the property damage for each event type
property <- aggregate(PROPDMG ~ EVTYPE, data = sub2Prop, FUN = sum)
# Rank the event types by descending damage total and keep the first 10
propertyTop10 <- property[order(-property$PROPDMG)[1:10], ]
propertyTop10
## EVTYPE PROPDMG
## 26 FLOOD 143.77918
## 74 HURRICANE/TYPHOON 69.30387
## 108 TORNADO 53.76218
## 102 STORM SURGE 43.30493
## 41 HAIL 15.05226
## 22 FLASH FLOOD 14.73498
## 69 HURRICANE 11.85897
## 113 TROPICAL STORM 7.65798
## 131 WINTER STORM 6.55734
## 91 RIVER FLOOD 5.10520
# Make EVTYPE a factor whose levels follow the current row order of the
# top-10 table, so the bars are drawn in that order
propertyTop10$EVTYPE <- with(propertyTop10, factor(EVTYPE, levels = EVTYPE))
ggplot(propertyTop10, aes(x = EVTYPE, y = PROPDMG)) +
  geom_col() +
  labs(x = "Event type", y = "Damages (Billion $)",
       title = "Top 10 weather events with most property damages in US, 1950 - 2011") +
  # Rotate the event names so the long labels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Total the crop damage for each event type
crop <- aggregate(CROPDMG ~ EVTYPE, data = sub2Crop, FUN = sum)
# Rank the event types by descending damage total and keep the first 10
cropTop10 <- crop[order(-crop$CROPDMG)[1:10], ]
cropTop10
## EVTYPE CROPDMG
## 7 DROUGHT 13.95112
## 18 FLOOD 5.49943
## 47 RIVER FLOOD 5.02600
## 44 ICE STORM 5.02045
## 38 HURRICANE 2.73931
## 42 HURRICANE/TYPHOON 2.60417
## 26 HAIL 2.44883
## 13 EXTREME COLD 1.28814
## 16 FLASH FLOOD 1.24336
## 25 FROST/FREEZE 1.08814
# Make EVTYPE a factor whose levels follow the current row order of the
# top-10 table, so the bars are drawn in that order
cropTop10$EVTYPE <- with(cropTop10, factor(EVTYPE, levels = EVTYPE))
ggplot(cropTop10, aes(x = EVTYPE, y = CROPDMG)) +
  geom_col() +
  labs(x = "Event type", y = "Damages (Billion $)",
       title = "Top 10 weather events with most crop damages in US, 1950 - 2011") +
  # Rotate the event names so the long labels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))