This analysis will show the effects of most common weather conditions on both fatalities / injuries and financial damage such as property damage and crop damage.
Due to some of the data being free text instead of a forced standardized input format, some data is subject to human error or unclear data (e.g. the type of weather effect is prone to spelling mistakes: THUNDERSTROM instead of THUNDERSTORM, or it is unclear what an exponent of “+” means when calculating damage). Any interpretation is done via script, so you can reproduce it and correct it, if deemed necessary. Depending on your county/location, the occuring events might differ greatly. Before making any decisions, I advice to verify the events are likely to happen in your area.
Download and unzuip the data. For this analysis, the data has been downloaded on 2017-06-18, around 23:00 CET.
library(R.utils)
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!dir.exists("downloads")) {
dir.create("downloads")
}
download.file(url, "downloads/stormdata.csv.bz2")
if (file.exists("downloads/stormdata.csv")) {
file.remove ("downloads/stormdata.csv")
}
## [1] TRUE
bunzip2("downloads/stormdata.csv.bz2")
Load data and remove unnecessary columns. Only keep what is necessary for further analysis.
stormdata <- read.csv("downloads/stormdata.csv")
cols <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
stormdata <- stormdata[,cols]
EVTYPE describes the type of event that occured. Unfortunatly it’s basically a free text field, so the same event can be entered in different ways. We do an interpretation of the most common events and standardize their names. Most common events and their spelling are:
events <- sort(table(stormdata$EVTYPE), decreasing = T)
head(events, 20)
##
## HAIL TSTM WIND THUNDERSTORM WIND
## 288661 219940 82563
## TORNADO FLASH FLOOD FLOOD
## 60652 54277 25326
## THUNDERSTORM WINDS HIGH WIND LIGHTNING
## 20843 20212 15754
## HEAVY SNOW HEAVY RAIN WINTER STORM
## 15708 11723 11433
## WINTER WEATHER FUNNEL CLOUD MARINE TSTM WIND
## 7026 6839 6175
## MARINE THUNDERSTORM WIND WATERSPOUT STRONG WIND
## 5812 3796 3566
## URBAN/SML STREAM FLD WILDFIRE
## 3392 2761
sum(head(events, 20)) / sum(events)
## [1] 0.9602814
We just need to standardize the 20 most common descriptions to catch 96% of the data.
myDict <- list()
myDict[["HAIL"]] <- "HAIL"
myDict[["TSTM WIND"]] <- "THUNDERSTORM"
myDict[["THUNDERSTORM WIND"]] <- "THUNDERSTORM"
myDict[["TORNADO"]] <- "TORNADO"
myDict[["FLASH FLOOD"]] <- "FLOOD"
myDict[["FLOOD"]] <- "FLOOD"
myDict[["THUNDERSTORM WINDS"]] <- "THUNDERSTORM"
myDict[["HIGH WIND"]] <- "WIND"
myDict[["LIGHTNING"]] <- "THUNDERSTORM"
myDict[["HEAVY SNOW"]] <- "WINTER"
myDict[["HEAVY RAIN"]] <- "RAIN"
myDict[["WINTER STORM"]] <- "WINTER"
myDict[["WINTER WEATHER"]] <- "WINTER"
myDict[["FUNNEL CLOUD"]] <- "TORNADO"
myDict[["MARINE TSTM WIND"]] <- "THUNDERSTORM"
myDict[["MARINE THUNDERSTORM WIND"]] <- "THUNDERSTORM"
myDict[["WATERSPOUT"]] <- "TORNADO"
myDict[["STRONG WIND"]] <- "WIND"
myDict[["URBAN/SML STREAM FLD"]] <- "FLOOD"
myDict[["WILDFIRE"]] <- "WILDFIRE"
newEVTYPE <- as.character(sapply(as.character(stormdata$EVTYPE), FUN = function(x) { a <- myDict[[x]] ; if (is.null(a)) { a <- "OTHER" } ; a } ))
stormdata$EVTYPE <- as.factor(newEVTYPE)
To compare damages done by an event, we need to calculate this number based on damage and exponent
exp2number <- function(x) {
e <- NA
if (x == "") e <- 1
if (tolower(x) == "k") e <- 1000
if (tolower(x) == "m") e <- 1000000
if (tolower(x) == "b") e <- 1000000000
e
}
propdmg <- sapply(stormdata$PROPDMGEXP, exp2number) * stormdata$PROPDMG
cropdmg <- sapply(stormdata$CROPDMGEXP, exp2number) * stormdata$CROPDMG
stormdata$PROPDMG <- propdmg
stormdata$CROPDMG <- cropdmg
stormdata <- stormdata[,c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "CROPDMG")]
library(ggplot2)
library(scales)
g <- ggplot(stormdata, aes(x = EVTYPE)) + geom_bar(fill = "blue") + xlab(label = "Event") + ylab(label = "Amount") + ggtitle(label = "Occurances per Event") + scale_y_continuous(labels = comma)
print(g)
library(reshape2)
df_names <- names(table(stormdata$EVTYPE))
df_fatalities <- sapply(df_names, FUN = function(x) { sum(stormdata$FATALITIES[stormdata$EVTYPE == x]) } )
df_injuries <- sapply(df_names, FUN = function(x) { sum(stormdata$INJURIES[stormdata$EVTYPE == x ]) } )
df <- data.frame(EVENT = df_names, FATALITIES = df_fatalities, INJURIES = df_injuries)
dfm <- melt(df[,c("EVENT", "FATALITIES", "INJURIES")], id.vars = 1)
ggplot(dfm, aes(x = EVENT, y = value)) + geom_bar(aes(fill = variable), stat = "identity", position = "dodge") + scale_y_continuous() + xlab(label = "Type of Event") + ylab(label = "Number of hurt people") + ggtitle(label = "Total Fatalities and Injuries split by Weather Events")
df_cropdmg <- sapply(df_names, FUN = function(x) { sum(stormdata$CROPDMG[stormdata$EVTYPE == x], na.rm = T)})
df_propdmg <- sapply(df_names, FUN = function(x) { sum(stormdata$PROPDMG[stormdata$EVTYPE == x], na.rm = T)})
df2 <- data.frame(EVENT = df_names, CROPDMG = df_cropdmg, PROPDMG = df_propdmg)
dfm2 <- melt(df2[,c("EVENT", "CROPDMG", "PROPDMG")], id.vars = 1)
ggplot(dfm2, aes(x = EVENT, y = value)) + geom_bar(aes(fill = variable), stat = "identity", position = "dodge") + xlab(label = "Type of Event") + ylab(label = "Damage in USD") + ggtitle(label = "Total Damage done by weather events") + scale_y_continuous(labels = comma)
While the OTHER events only make up about 4% of all weather events, there is still a major part of the total damages done. I suggest to further analyse those 4% to understand what types of other events have a major impact on total damages.
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.4
##
## locale:
## [1] de_DE.UTF-8/de_DE.UTF-8/de_DE.UTF-8/C/de_DE.UTF-8/de_DE.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reshape2_1.4.2 scales_0.4.1 ggplot2_2.2.1 R.utils_2.5.0
## [5] R.oo_1.21.0 R.methodsS3_1.7.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.9 knitr_1.16 magrittr_1.5 munsell_0.4.3
## [5] colorspace_1.3-2 stringr_1.1.0 plyr_1.8.4 tools_3.3.2
## [9] grid_3.3.2 gtable_0.2.0 htmltools_0.3.6 yaml_2.1.14
## [13] lazyeval_0.2.0 rprojroot_1.2 digest_0.6.12 assertthat_0.1
## [17] tibble_1.2 evaluate_0.10 rmarkdown_1.5 labeling_0.3
## [21] stringi_1.1.2 backports_1.1.0