This report explores the NOAA storm dataset to answer two questions: 1. Which events are most harmful to population health? 2. Which events are associated with the greatest economic consequences?
To perform these tasks, the data is first downloaded and processed into a form that supports the analysis and lets us reach conclusions. The variables FATALITIES and INJURIES represent the impact on human health. The variables PROPDMG and CROPDMG represent the economic consequences. The analysis takes both groups into account to find the events that are significant for their impact. Significance is determined as the top 10 events comprising more than 95% of human health impact and economic consequences.
# Fetch the compressed NOAA storm data only when no local copy exists, so that
# re-running the analysis does not repeat the download.
if (!file.exists("stormdata.csv.bz2")) {
  download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", "stormdata.csv.bz2", method = "curl")
}
# The .bz2 archive is passed straight to read.csv(); `x` holds the raw records.
x <- read.csv("stormdata.csv.bz2")
# Translate damage-exponent codes into powers of ten.
#
# v: vector of exponent codes (character or factor). Recognised codes are
#    "k"/"K" -> 3, "m"/"M" -> 6, "b"/"B" -> 9 and the digits "1" to "8",
#    which map to their own numeric value.
# Returns a numeric vector the same length as `v`. Every other code, including
# "", "0", "9" and NA, maps to 0 (i.e. a multiplier of 10^0 = 1).
#
# The lookup is vectorised, so empty input yields numeric(0) and NA entries do
# not raise an error.
powervar <- function(v) {
  codes <- c("k", "K", "m", "M", "b", "B", as.character(1:8))
  powers <- c(3, 3, 6, 6, 9, 9, 1:8)
  y <- powers[match(as.character(v), codes)]
  # match() gives NA for unrecognised or missing codes; treat them as power 0.
  y[is.na(y)] <- 0
  y
}
# Turn each exponent code into its power-of-ten multiplier, then scale the
# recorded damage figure by that multiplier. After this step the *EXP columns
# hold multipliers and the *DMG columns hold the scaled amounts.
x$CROPDMGEXP <- 10^powervar(as.character(x$CROPDMGEXP))
x$CROPDMG <- x$CROPDMG * x$CROPDMGEXP

x$PROPDMGEXP <- 10^powervar(as.character(x$PROPDMGEXP))
x$PROPDMG <- x$PROPDMG * x$PROPDMGEXP
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# For each impact measure: sum it per event type, then rank the event types
# from the largest total to the smallest.
eventwise_fat <- tapply(x$FATALITIES, x$EVTYPE, sum)
eventwise_fat <- eventwise_fat[order(eventwise_fat, decreasing = TRUE)]

eventwise_inj <- tapply(x$INJURIES, x$EVTYPE, sum)
eventwise_inj <- eventwise_inj[order(eventwise_inj, decreasing = TRUE)]

eventwise_prop <- tapply(x$PROPDMG, x$EVTYPE, sum)
eventwise_prop <- eventwise_prop[order(eventwise_prop, decreasing = TRUE)]

eventwise_crop <- tapply(x$CROPDMG, x$EVTYPE, sum)
eventwise_crop <- eventwise_crop[order(eventwise_crop, decreasing = TRUE)]
To look at the distribution of damage among events, we draw boxplots of the four relevant variables.
# Draw the four per-event totals side by side, one boxplot per measure.
par(mfrow = c(1, 4))
invisible(Map(
  function(totals, title) boxplot(totals, main = title),
  list(eventwise_fat, eventwise_inj, eventwise_prop, eventwise_crop),
  c("fatalities", "injuries", "property damage", "crop damage")
))
The boxplots above show that, for every measure, most of the damage is concentrated in a few event types. To confirm this finding, we compute element-wise quantiles. (Note: 985 is the number of event types, so requesting 985 quantiles yields one quantile per event type.)
# Request one quantile per event type and show the five largest. The number of
# event types is taken from each vector's length rather than hard-coded, so the
# call stays correct if the set of event types changes.
tail(quantile(eventwise_fat, probs = seq(0, 1, length.out = length(eventwise_fat))), 5)
## 99.5934959% 99.6951220% 99.7967480% 99.8983740% 100.0000000%
## 816 937 978 1903 5633
tail(quantile(eventwise_inj, probs = seq(0, 1, length.out = length(eventwise_inj))), 5)
## 99.5934959% 99.6951220% 99.7967480% 99.8983740% 100.0000000%
## 5230 6525 6789 6957 91346
tail(quantile(eventwise_crop, probs = seq(0, 1, length.out = length(eventwise_crop))), 5)
## 99.5934959% 99.6951220% 99.7967480% 99.8983740% 100.0000000%
## 3025954473 5022113500 5029459000 5661968450 13972566000
tail(quantile(eventwise_prop, probs = seq(0, 1, length.out = length(eventwise_prop))), 5)
## 99.5934959% 99.6951220% 99.7967480% 99.8983740% 100.0000000%
## 16822673979 43323536000 56947380677 69305840000 144657709807
The results confirm that the top 5 events contribute more than 90% of the damage in each category.
To examine these events in detail, we subset the data to the top 10 event types for each variable, i.e. fatalities, injuries, property damage and crop damage.
# Names of the ten highest-ranked event types for each measure.
maxfat_events <- names(eventwise_fat)[1:10]
maxinj_events <- names(eventwise_inj)[1:10]
maxprop_events <- names(eventwise_prop)[1:10]
maxcrop_events <- names(eventwise_crop)[1:10]

# Keep the records whose event type is in either health top-10 list ...
x_final <- x[x$EVTYPE %in% c(maxfat_events, maxinj_events), , drop = FALSE]
# ... and, separately, those in either economic top-10 list.
x_final2 <- x[x$EVTYPE %in% c(maxprop_events, maxcrop_events), , drop = FALSE]
The dataset below combines fatalities and injuries into a single column, with a separate variable/column to differentiate between the two. The event type is taken from the original dataset.
# Stack fatalities and injuries into long format: one row per storm record and
# effect type, labelled with the effect type and the event name.
l <- nrow(x_final)
effect_count <- c(x_final$FATALITIES, x_final$INJURIES)
effect_type <- rep(c("FATALITIES", "INJURIES"), each = l)
# Convert EVTYPE to character before repeating it, so the event names are kept
# whether EVTYPE was read as a factor or as character. Concatenating the
# factors directly and mapping integer codes back to level names only works on
# R versions where c() on factors returns the underlying codes.
event_type <- rep(as.character(x_final$EVTYPE), times = 2)
x3 <- data.frame(Effect_count = effect_count, Effect_type = effect_type, Event_type = event_type)
# Only the event types present in the subset become factor levels.
x3$Event_type <- factor(x3$Event_type)
# Bar chart of incident counts per event type, coloured by event type, with one
# panel per effect type. The x-axis tick labels are hidden.
g <- ggplot(x3, aes(x = Event_type, y = Effect_count, fill = Event_type)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ Effect_type, scales = "free") +
  theme(axis.text.x = element_blank()) +
  labs(x = "Event Types", y = "Count of human incidents")
g
Similarly, we carry out the analysis for property damage and crop damage
# Stack property and crop damage into long format: one row per storm record and
# damage type, labelled with the damage type and the event name.
l <- nrow(x_final2)
effect_count <- c(x_final2$PROPDMG, x_final2$CROPDMG)
effect_type <- rep(c("PROPERTY DAMAGE", "CROP DAMAGE"), each = l)
# Convert EVTYPE to character before repeating it, so the event names are kept
# whether EVTYPE was read as a factor or as character. Concatenating the
# factors directly and mapping integer codes back to level names only works on
# R versions where c() on factors returns the underlying codes.
event_type <- rep(as.character(x_final2$EVTYPE), times = 2)
x3 <- data.frame(Effect_count = effect_count, Effect_type = effect_type, Event_type = event_type)
# Only the event types present in the subset become factor levels.
x3$Event_type <- factor(x3$Event_type)
# Plot the total damage per event type, coloured by event type, with one panel
# per damage type. The x-axis tick labels are hidden.
g <- ggplot(x3, aes(x = Event_type, y = Effect_count, fill = Event_type)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ Effect_type, scales = "free") +
  theme(axis.text.x = element_blank()) +
  labs(x = "Event Types", y = "Damage")
g
Based on the analysis and plots above, following conclusions can be drawn -
Tornadoes are, by far, the biggest cause of fatalities and injuries in the population. They constitute more than 50% of both fatalities and injuries.
Excessive Heat is the second major cause of fatalities with flash floods at the third position
For injuries, flash floods and hail come next to tornadoes
For property damage, floods are the biggest cause. Hurricanes/typhoons and tornadoes are the distant second and third causes of property damage.
For crop damage, droughts are the biggest cause. Floods come a distant second, and river floods and ice storm winds are a close third.