The goal of this work is to estimate which weather phenomena cause the greatest harm, be it economic damage or harm to public health.
The data comes from the NOAA Storm Database which tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
After cleaning up the event types and the exponents of the damage amounts, the harm figures (economic damage and public-health impact) were summed per event type and plotted to see which types of weather cause the greatest harm.
library(data.table)
library(stringr)
library(ggplot2)
# Location of the compressed storm data set and the local cache path.
srcFile <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
targetFile <- "data/StormData.csv.bz2"
# Make sure the cache directory exists before downloading into it.
if (!dir.exists(dirname(targetFile))) {
  dir.create(dirname(targetFile), recursive = TRUE)
}
# Download only once; the archive is binary, so request a binary transfer
# explicitly instead of relying on platform/extension defaults.
if (!file.exists(targetFile)) {
  download.file(srcFile, targetFile, mode = "wb")
}
# read.csv() reads the bz2-compressed file directly. Keep text columns as
# character regardless of the R version's stringsAsFactors default.
stormData <- as.data.table(read.csv(targetFile, stringsAsFactors = FALSE))
For our research we need only these columns:
# Keep the event type plus the health and damage measures used below.
keepCols <- c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
stormDataClean <- stormData[, keepCols, with = FALSE]
The dataset contains some typos; to prevent common errors we do some cleanup:
# Lower-case only the text columns; calling tolower() on every column would
# also turn the numeric measures (FATALITIES, INJURIES, PROPDMG, CROPDMG)
# into character vectors.
textCols <- c("EVTYPE", "PROPDMGEXP", "CROPDMGEXP")
stormDataClean[, (textCols) := lapply(.SD, tolower), .SDcols = textCols]
# Drop rows whose event type contains "summary".
stormDataClean <- stormDataClean[!str_detect(EVTYPE, "summary")]
## replace all punctuation chars with a whitespace
stormDataClean[, EVTYPE := gsub("[[:punct:]]", " ", EVTYPE)]
# collapse runs of whitespace into a single blank
stormDataClean[, EVTYPE := gsub("\\s+", " ", EVTYPE)]
# remove leading and trailing whitespace
stormDataClean[, EVTYPE := gsub("^\\s+|\\s+$", "", EVTYPE)]
# unify spelling variants of "wind", plural "storms" and the "tstm" abbreviation
stormDataClean[, EVTYPE := gsub("w ?inds?|wins", "wind", EVTYPE)]
stormDataClean[, EVTYPE := gsub("storms", "storm", EVTYPE)]
stormDataClean[, EVTYPE := gsub("tstm", "thunderstorm", EVTYPE)]
Show the result of the cleaning for the event types:
# Count distinct event types in the raw data and report it.
nEventTypesRaw <- length(unique(stormData$EVTYPE))
paste("Number of unique event types before cleaning: ", nEventTypesRaw)
## [1] "Number of unique event types before cleaning: 985"
# Same count on the cleaned event types.
nEventTypesClean <- length(unique(stormDataClean$EVTYPE))
paste("Number of unique event types after cleaning: ", nEventTypesClean)
## [1] "Number of unique event types after cleaning: 735"
A first look at the exponents shows that we have to clean them:
# Distinct exponent codes found in the property damage column.
stormDataClean[, unique(PROPDMGEXP)]
## [1] "k" "m" "" "b" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "-" "1" "8"
# Distinct exponent codes found in the crop damage column.
stormDataClean[, unique(CROPDMGEXP)]
## [1] "" "m" "k" "b" "?" "0" "2"
To clean the exponents I use this mapping:
# Pattern -> replacement pairs, applied in this order to each exponent column:
# punctuation and the empty string become "0"; the letters h, k, m, b become
# the digits 2, 3, 6, 9.
exponentMap <- c(
  "[[:punct:]]" = "0",
  "^$" = "0",
  "h" = "2",
  "k" = "3",
  "m" = "6",
  "b" = "9"
)
# Property damage exponent, then crop damage exponent
for (expCol in c("PROPDMGEXP", "CROPDMGEXP")) {
  codes <- stormDataClean[[expCol]]
  for (pattern in names(exponentMap)) {
    codes <- gsub(pattern, exponentMap[[pattern]], codes)
  }
  set(stormDataClean, j = expCol, value = codes)
}
stormDataClean[, unique(PROPDMGEXP)]
## [1] "3" "6" "0" "9" "5" "4" "2" "7" "1" "8"
stormDataClean[, unique(CROPDMGEXP)]
## [1] "0" "6" "3" "9" "2"
First check for NA values
# Report the number of missing values in each measure column.
stormDataClean[, paste("NA in property damage:", sum(is.na(PROPDMG)))]
## [1] "NA in property damage: 0"
stormDataClean[, paste("NA in crop damage:", sum(is.na(CROPDMG)))]
## [1] "NA in crop damage: 0"
stormDataClean[, paste("NA in fatalities:", sum(is.na(FATALITIES)))]
## [1] "NA in fatalities: 0"
stormDataClean[, paste("NA in injuries:", sum(is.na(INJURIES)))]
## [1] "NA in injuries: 0"
Change the type of the values to numeric
# Convert every measure and exponent column to numeric from ITS OWN values.
# Converting by column name avoids the copy-paste slip in which PROPDMG was
# overwritten with the converted CROPDMG values, which made the property
# damage cost use the crop damage figures.
# NOTE(review): output recorded further down in this document may have been
# produced before this fix - re-run the analysis to refresh it.
numericCols <- c(
  "PROPDMGEXP", "CROPDMGEXP",
  "PROPDMG", "CROPDMG",
  "FATALITIES", "INJURIES"
)
stormDataClean[, (numericCols) := lapply(.SD, as.numeric), .SDcols = numericCols]
Calculate the damage costs
# Scale each damage figure by ten to the power of its exponent and store the
# results as new columns.
stormDataClean[, `:=`(
  PROPDMGCOST = PROPDMG * 10^PROPDMGEXP,
  CROPDMGCOST = CROPDMG * 10^CROPDMGEXP
)]
Prepare data
# Total fatalities per event type, with the result columns named directly.
fatalities <- stormDataClean[
  ,
  .(fatalities = sum(FATALITIES)),
  by = .(Event = EVTYPE)
]
# Total injuries per event type.
injuries <- stormDataClean[
  ,
  .(injuries = sum(INJURIES)),
  by = .(Event = EVTYPE)
]
Select the top ten events by harm to public health
# Sort each summary in descending order of its count and keep the first ten rows.
fatalitiesSorted <- fatalities[order(-fatalities)]
topTenFatalities <- fatalitiesSorted[seq_len(10)]
injuriesSorted <- injuries[order(-injuries)]
topTenInjuries <- injuriesSorted[seq_len(10)]
print(topTenInjuries)
## Event injuries
## 1: tornado 91346
## 2: thunderstorm wind 9364
## 3: flood 6789
## 4: excessive heat 6525
## 5: lightning 5230
## 6: heat 2100
## 7: ice storm 1975
## 8: flash flood 1777
## 9: high wind 1439
## 10: hail 1361
print(topTenFatalities)
## Event fatalities
## 1: tornado 5633
## 2: excessive heat 1903
## 3: flash flood 978
## 4: heat 937
## 5: lightning 817
## 6: thunderstorm wind 701
## 7: flood 470
## 8: rip current 368
## 9: high wind 283
## 10: avalanche 224
Plot the result
# gridExtra supplies grid.arrange(). Load it with library() so a missing
# package stops here with a clear error; require() would only return FALSE
# and let the script fail later at grid.arrange().
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.5
# Bar chart of the ten event types with the most injuries.
plot1 <- ggplot(data = topTenInjuries, aes(x = Event, y = injuries)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(title = "Injuries by Weather event") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Bar chart of the ten event types with the most fatalities.
plot2 <- ggplot(data = topTenFatalities, aes(x = Event, y = fatalities)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(title = "Fatalities by Weather event") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Show both charts side by side.
grid.arrange(plot1, plot2, ncol = 2)
Prepare data
# Combined property and crop damage cost per event type, with the result
# columns named directly.
economicDamage <- stormDataClean[
  ,
  .(damage_cost = sum(PROPDMGCOST + CROPDMGCOST)),
  by = .(Event = EVTYPE)
]
Select the top ten events by economic damage
# Sort by descending damage cost and keep the first ten rows.
economicDamageSorted <- economicDamage[order(-damage_cost)]
topTenEconomicDamage <- economicDamageSorted[seq_len(10)]
print(topTenEconomicDamage)
## Event damage_cost
## 1: hurricane 805623826053
## 2: hurricane typhoon 735376324130
## 3: flood 92913946818
## 4: flash flood 40286456989
## 5: tornado 28684831782
## 6: hail 18342778371
## 7: thunderstorm wind 18224098585
## 8: drought 17416084947
## 9: river flood 10600650725
## 10: hurricane opal high wind 10010000000
Plot the result
# Bar chart of the ten event types with the highest combined damage cost;
# x-axis labels are rotated by 90 degrees.
economicPlot <- ggplot(
  data = topTenEconomicDamage,
  aes(x = Event, y = damage_cost)
)
economicPlot +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(
    title = "Economic damage (property+crop) by Weather event",
    y = "damage cost in dollar"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))