This analysis looks at data from the NOAA storm database of storm events in the United States from 1950-2011 and answers two questions: (1) Which types of events are most harmful to population health and (2) Which types of events had the greatest economic consequences? Population health impact is measured in terms of injuries and fatalities. Economic consequences are measured in terms of property and crop damage costs.
Loading data and assessing contents.
# Read the bzip2-compressed storm CSV through a bzfile connection, keeping
# text columns as character vectors rather than factors.
stormdata <- read.csv(
  file = bzfile("repdata_data_StormData.csv.bz2"),
  header = TRUE,
  stringsAsFactors = FALSE
)
# List the column names to see which variables are available.
colnames(stormdata)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
Processing fatalities data is done by aggregating the number of fatalities by event type and subsetting to only event types with any fatalities. Looking at the upper quantiles of this distribution, only the top 5% are selected to be ordered and plotted.
# Total fatalities per event type across the whole dataset.
aggfatal <- aggregate(FATALITIES ~ EVTYPE, stormdata, FUN = sum)
# Keep only event types that caused at least one fatality.
fatal <- subset(aggfatal, FATALITIES >= 1)
# Upper quantiles of the per-event-type totals, printed for reference.
quantile(fatal$FATALITIES, probs = c(.75, .80, .85, .90, .95))
## 75% 80% 85% 90% 95%
## 19.75 34.20 64.00 109.60 239.60
# Derive the 95th-percentile cutoff from the data rather than hard-coding a
# truncated copy of the printed value, so the subset always matches the data.
fatal_cutoff <- quantile(fatal$FATALITIES, probs = 0.95, names = FALSE)
fatal95 <- subset(fatal, FATALITIES >= fatal_cutoff)
# Order from most to fewest fatalities for plotting.
fatal95 <- fatal95[order(fatal95$FATALITIES, decreasing = TRUE), ]
Similarly, processing injuries data is done by aggregating the number of injuries by event type and subsetting to only event types with any injuries. Looking at the upper quantiles of this distribution, only the top 5% are selected to be ordered and plotted.
# Total injuries per event type across the whole dataset.
agginj <- aggregate(INJURIES ~ EVTYPE, stormdata, FUN = sum)
# Keep only event types that caused at least one injury.
inj <- subset(agginj, INJURIES >= 1)
# Upper quantiles of the per-event-type totals, printed for reference.
quantile(inj$INJURIES, probs = c(.75, .80, .85, .90, .95))
## 75% 80% 85% 90% 95%
## 78.50 164.00 305.15 835.90 1531.35
# Derive the 95th-percentile cutoff from the data rather than hard-coding a
# truncated copy of the printed value, so the subset always matches the data.
inj_cutoff <- quantile(inj$INJURIES, probs = 0.95, names = FALSE)
inj95 <- subset(inj, INJURIES >= inj_cutoff)
# Order from most to fewest injuries for plotting.
inj95 <- inj95[order(inj95$INJURIES, decreasing = TRUE), ]
In processing the cost data, the exponent variable has to be converted to the appropriate power of 10 and then applied as a multiplier to the cost variables. According to the documentation, K/k = thousand, M/m = million, B/b = billion. A blank exponent leaves the value unscaled (multiplier of 1); all other values are converted to NA.
# Keep only the columns needed for the cost calculation.
stormcost <- stormdata[, c(
  "EVTYPE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]

# Translate damage exponent codes into numeric multipliers.
#   code: vector of exponent codes (e.g. "K", "m", "").
#   Returns a numeric vector the same length as `code`: 1e3 for K/k, 1e6 for
#   M/m, 1e9 for B/b, 1 for the empty string, and NA for any other code.
exp_multiplier <- function(code) {
  lookup <- c(K = 1e3, M = 1e6, B = 1e9)
  # Upper-casing makes the lookup case-insensitive; unknown codes yield NA.
  mult <- unname(lookup[toupper(code)])
  # An empty exponent means the damage figure is not scaled.
  mult[!is.na(code) & code == ""] <- 1
  mult
}

# Exponent codes present in the property damage column.
unique(stormdata$PROPDMGEXP)
## [1] "K" "M" "" "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
## [18] "1" "8"
# Property damage in dollars; rows with an unrecognised exponent become NA.
stormcost$propcost <- stormcost$PROPDMG * exp_multiplier(stormcost$PROPDMGEXP)

# Exponent codes present in the crop damage column.
unique(stormdata$CROPDMGEXP)
## [1] "" "M" "K" "m" "B" "?" "0" "k" "2"
# Crop damage in dollars; rows with an unrecognised exponent become NA.
stormcost$cropcost <- stormcost$CROPDMG * exp_multiplier(stormcost$CROPDMGEXP)
As is done for fatalities and injuries, costs are aggregated by event type and subsetted to look at events with any costs. Then only the top 5% of costly events are subsetted and ordered to be plotted.
# Total property damage cost per event type. The formula interface of
# aggregate() drops rows whose cost is NA by default.
aggcost_p <- aggregate(propcost ~ EVTYPE, stormcost, FUN = sum)
# Keep only event types with a total cost of at least one dollar.
prop <- subset(aggcost_p, propcost >= 1)
# Upper quantiles of the per-event-type totals, printed for reference.
quantile(prop$propcost, probs = c(.75, .80, .85, .90, .95))
## 75% 80% 85% 90% 95%
## 5000000 7753700 15000000 110500000 1600000000
# Derive the 95th-percentile cutoff from the data rather than copying the
# (possibly rounded) printed value, so the subset always matches the data.
prop_cutoff <- quantile(prop$propcost, probs = 0.95, names = FALSE)
prop95 <- subset(prop, propcost >= prop_cutoff)
# Order from most to least costly for plotting.
prop95 <- prop95[order(prop95$propcost, decreasing = TRUE), ]
# Total crop damage cost per event type. The formula interface of
# aggregate() drops rows whose cost is NA by default.
aggcost_c <- aggregate(cropcost ~ EVTYPE, stormcost, FUN = sum)
# Keep only event types with a total cost of at least one dollar.
crop <- subset(aggcost_c, cropcost >= 1)
# Upper quantiles of the per-event-type totals, printed for reference.
quantile(crop$cropcost, probs = c(.75, .80, .85, .90, .95))
## 75% 80% 85% 90% 95%
## 46625000 106796830 244238677 523204675 1717956025
# Derive the 95th-percentile cutoff from the data rather than copying the
# (possibly rounded) printed value, so the subset always matches the data.
crop_cutoff <- quantile(crop$cropcost, probs = 0.95, names = FALSE)
crop95 <- subset(crop, cropcost >= crop_cutoff)
# Order from most to least costly for plotting.
crop95 <- crop95[order(crop95$cropcost, decreasing = TRUE), ]
Tornados resulted in both the most fatalities and injuries. While floods resulted in the greatest amount of property damage, droughts resulted in the greatest amount of crop damage, followed by floods. In summary, tornados, droughts, and floods had the greatest health and economic impact.
# Two panels side by side, with a deep bottom margin for the rotated labels.
par(mfrow = c(1, 2), mar = c(8, 4, 2, 2))

# Left panel: total fatalities (in thousands) for the selected event types.
barplot(
  height = fatal95[["FATALITIES"]] / 1000,
  names.arg = fatal95[["EVTYPE"]],
  main = "Storm Events with\nTop 5% of Fatalities",
  ylab = "Total Fatalities (in thousands)",
  ylim = c(0, 6),
  col = rainbow(30),
  las = 2
)

# Right panel: total injuries (in thousands) for the selected event types.
barplot(
  height = inj95[["INJURIES"]] / 1000,
  names.arg = inj95[["EVTYPE"]],
  main = "Storm Events with\nTop 5% of Injuries",
  ylab = "Total Injuries (in thousands)",
  ylim = c(0, 100),
  col = rainbow(30),
  las = 2
)
# Property damage (in billions of USD); extra bottom margin for long labels.
par(mar = c(12, 6, 2, 1))
barplot(
  height = prop95[["propcost"]] / 1e9,
  names.arg = prop95[["EVTYPE"]],
  main = "Storm Events with Top 5% of Property Damage",
  ylab = "Total Cost (Billions USD)",
  col = rainbow(30),
  cex.names = 0.8,
  las = 2
)

# Crop damage (in billions of USD), with a slightly smaller bottom margin.
par(mar = c(10, 6, 2, 1))
barplot(
  height = crop95[["cropcost"]] / 1e9,
  names.arg = crop95[["EVTYPE"]],
  main = "Storm Events with Top 5% of Crop Damage",
  ylab = "Total Cost (Billions USD)",
  ylim = c(0, 15),
  col = rainbow(30),
  cex.names = 0.8,
  las = 2
)