The NOAA Storm database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage between 1950 and 2011. Our analysis of the data identifies the 10 most dangerous event types in terms of fatalities and injuries, as well as the 10 most expensive event types in terms of property and crop damage. From this analysis we conclude that tornadoes are the biggest hazard to public health, and that floods and droughts are the most costly to property owners and farmers, respectively.
# Download the bzip2-compressed NOAA storm data. The download is skipped when
# a local copy already exists, so re-running the analysis does not fetch the
# archive again.
zipUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
zipFile <- "/home/greg/DataScience/R/Coursera/weather.csv.bz2"
if (!file.exists(zipFile)) {
  download.file(zipUrl, zipFile, mode = "wb")
}
# Load the data straight from the compressed archive. The path is taken from
# zipFile rather than repeated as a literal, so the download target and the
# file that is read cannot drift apart.
weather <- read.csv(zipFile)
# Lower-case the event labels so that spellings differing only in case are
# pooled into a single event type.
weather$EVTYPE <- tolower(weather$EVTYPE)

# Total fatalities per event type; keep the ten largest totals.
fatsum <- aggregate(FATALITIES ~ EVTYPE, data = weather, FUN = sum)
fatsum <- fatsum[with(fatsum, order(-FATALITIES))[seq_len(10)], ]

# Total injuries per event type; keep the ten largest totals.
injsum <- aggregate(INJURIES ~ EVTYPE, data = weather, FUN = sum)
injsum <- injsum[with(injsum, order(-INJURIES))[seq_len(10)], ]
# Translate the PROPDMGEXP magnitude code into a numeric multiplier.
# Codes are matched case-insensitively: the previous table listed both cases
# for "h"/"H" and "m"/"M" but only upper-case "K" and "B", so a lower-case
# "k" or "b" was left as NA and its row was then dropped by aggregate()'s
# default na.action. The multiplier assigned to each code is unchanged
# (blank, "-" and "?" count as 0; "+" as 1; any digit 0-8 as 10).
# Codes that are not in the table still yield NA.
prop_exp_codes <- c("", "-", "?", "+", as.character(0:8), "H", "K", "M", "B")
prop_exp_mult <- c(0, 0, 0, 1, rep(10, 9), 100, 1000, 1e+06, 1e+09)
# match() is used rather than name-based indexing because an empty string
# never matches a name in a character subscript.
weather$PROPEXP <- prop_exp_mult[
  match(toupper(as.character(weather$PROPDMGEXP)), prop_exp_codes)
]

# Property damage per record, expressed in billions of US dollars.
weather$prop <- weather$PROPDMG * weather$PROPEXP / 10^9

# Total property damage per event type; keep the ten largest totals.
propsum <- aggregate(prop ~ EVTYPE, data = weather, FUN = sum)
propsum <- propsum[order(-propsum$prop)[seq_len(10)], ]
# Translate the CROPDMGEXP magnitude code into a numeric multiplier.
# Codes are matched case-insensitively: the previous table listed both cases
# for "h"/"H" and "m"/"M" but only upper-case "K" and "B", so a lower-case
# "k" or "b" was left as NA and its row was then dropped by aggregate()'s
# default na.action. The multiplier assigned to each code is unchanged
# (blank, "-" and "?" count as 0; "+" as 1; any digit 0-8 as 10).
# Codes that are not in the table still yield NA.
crop_exp_codes <- c("", "-", "?", "+", as.character(0:8), "H", "K", "M", "B")
crop_exp_mult <- c(0, 0, 0, 1, rep(10, 9), 100, 1000, 1e+06, 1e+09)
# match() is used rather than name-based indexing because an empty string
# never matches a name in a character subscript.
weather$CROPEXP <- crop_exp_mult[
  match(toupper(as.character(weather$CROPDMGEXP)), crop_exp_codes)
]

# Crop damage per record, expressed in billions of US dollars.
weather$crop <- weather$CROPDMG * weather$CROPEXP / 10^9

# Total crop damage per event type; keep the ten largest totals.
cropsum <- aggregate(crop ~ EVTYPE, data = weather, FUN = sum)
cropsum <- cropsum[order(-cropsum$crop)[seq_len(10)], ]
library(gridExtra)
library(ggplot2)
# Bar chart of the ten event types with the most fatalities, tallest bar first.
p1 <- ggplot(
  data = fatsum,
  mapping = aes(x = reorder(EVTYPE, -FATALITIES), y = FATALITIES)
) +
  geom_bar(stat = "identity", fill = "red") +
  labs(
    x = "Event Type",
    y = expression("Number of fatalities"),
    title = "Top 10 Number of fatalities \n per event type in the US"
  ) +
  theme_bw() +
  # Tilt the event labels so the long names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Bar chart of the ten event types with the most injuries, tallest bar first.
p2 <- ggplot(
  data = injsum,
  mapping = aes(x = reorder(EVTYPE, -INJURIES), y = INJURIES)
) +
  geom_bar(stat = "identity", fill = "orange") +
  labs(
    x = "Event Type",
    y = expression("Number of injuries"),
    title = "Top 10 Number of injuries \n per event type in the US"
  ) +
  theme_bw() +
  # Tilt the event labels so the long names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Show both health-impact charts side by side.
grid.arrange(p1, p2, nrow = 1)
The above plots clearly show that tornadoes are by far the most dangerous event type for public health.
# Bar chart of the ten event types with the highest total property damage.
p3 <- ggplot(
  data = propsum,
  mapping = aes(x = reorder(EVTYPE, -prop), y = prop)
) +
  geom_bar(stat = "identity", fill = "red") +
  labs(
    x = "Event Type",
    y = expression("Damage (Billion US Dollar)"),
    title = "Top 10 Property Damage \n per event type in the US"
  ) +
  theme_bw() +
  # Tilt the event labels so the long names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Bar chart of the ten event types with the highest total crop damage.
p4 <- ggplot(
  data = cropsum,
  mapping = aes(x = reorder(EVTYPE, -crop), y = crop)
) +
  geom_bar(stat = "identity", fill = "orange") +
  labs(
    x = "Event Type",
    y = expression("Damage (Billion US Dollar)"),
    title = "Top 10 Crop Damage \n per event type in the US"
  ) +
  theme_bw() +
  # Tilt the event labels so the long names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Show both economic-damage charts side by side.
grid.arrange(p3, p4, nrow = 1)
The above plots show that floods and droughts are the most costly event types for property owners and farmers, respectively.