This analysis uses NOAA Storm Data from 1950 - 2011 to assess the damage caused by different types of weather events, specifically regarding population health and economic consequences. We find that TORNADO has by far the biggest effect on human casualties, followed by HEAT, WIND, FLOOD and LIGHTNING. In terms of property and crop damage, FLOOD has by far the biggest effect, followed by HURRICANE/TYPHOON, TORNADO, STORM SURGE and HAIL.
The raw data is read directly from the source file, and used to create a secondary data frame with a reformatted date and only the columns relevant for our analysis:
stormraw <- read.csv("repdata%2Fdata%2FStormData.csv.bz2",stringsAsFactors = FALSE)
stormraw$EVDATE <- as.Date(stormraw$BGN_DATE, "%m/%d/%Y")
stormdata <- stormraw[ ,c(38,7,8,23:28)]
names(stormdata)
[1] “EVDATE” “STATE” “EVTYPE” “FATALITIES” “INJURIES”
[6] “PROPDMG” “PROPDMGEXP” “CROPDMG” “CROPDMGEXP”
Some basic data quality checks were also performed (see Appendix).
Finally, a multiplier for the property and crop damage amounts was calculated and applied using a lookup table:
explkup <- rbind(c("k","1e+03"),c("K","1e+03"),
c("m","1e+06"),c("M","1e+06"),
c("b","1e+09"),c("B","1e+09"))
stormdata$PROPDMGMULT <- explkup[match(stormdata$PROPDMGEXP, explkup),2]
stormdata$PROPDMGMULT[is.na(stormdata$PROPDMGMULT)] <- 0
stormdata$PROPDMGX <- stormdata$PROPDMG * as.numeric(stormdata$PROPDMGMULT)
stormdata$CROPDMGMULT <- explkup[match(stormdata$CROPDMGEXP, explkup),2]
stormdata$CROPDMGMULT[is.na(stormdata$CROPDMGMULT)] <- 0
stormdata$CROPDMGX <- stormdata$CROPDMG * as.numeric(stormdata$CROPDMGMULT)
stormdata$EVTYPE <- as.factor(stormdata$EVTYPE)
Note that property and crop damage were considered as zeroes for records with invalid multipliers, as the volume was very low.
human <- stormdata %>% group_by(EVTYPE) %>%
summarise(FATALITIES = sum(FATALITIES),INJURIES = sum(INJURIES),
TOTAL = sum(FATALITIES + INJURIES)) %>%
arrange(desc(TOTAL))
kable(human[1:5, ], "html", caption = "Top 5 weather events affecting population health") %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| EVTYPE | FATALITIES | INJURIES | TOTAL |
|---|---|---|---|
| TORNADO | 5633 | 91346 | 96979 |
| EXCESSIVE HEAT | 1903 | 6525 | 8428 |
| TSTM WIND | 504 | 6957 | 7461 |
| FLOOD | 470 | 6789 | 7259 |
| LIGHTNING | 816 | 5230 | 6046 |
humancutoff <- 100
humanx <- subset(human, TOTAL > humancutoff & !(EVTYPE == "TORNADO"))
humanother <- subset(human, TOTAL <= humancutoff & !(EVTYPE == "TORNADO"))
humanx <- rbind(humanx, c("OTHER", as.numeric(sum(humanother$FATALITIES)),
as.numeric(sum(humanother$INJURIES)),as.numeric(sum(humanother$TOTAL))))
humanlabel <- paste("OTHER: ",count(humanother)," categories with each total less than ",
humancutoff,".",sep = "")
humanmelt <- humanx[,c(1:3)] %>% melt(id.vars = "EVTYPE")
names(humanmelt) <- c("EVTYPE", "HHTYPE", "HHVALUE")
humanmelt$HHVALUE <- as.numeric(humanmelt$HHVALUE)
humanplot <- ggplot(data = humanmelt, aes(x = EVTYPE, y = HHVALUE, fill = HHTYPE)) +
geom_bar(stat = "identity") + coord_flip() +
ggtitle("Number of human casualties by weather event type") +
labs(x = "",y = "") + theme(legend.title = element_blank())
humanplot
OTHER: 943 categories with each total less than 100..
Note that ‘TORNADO’ was excluded from this graph, to avoid crowding out the other results.
econ <- stormdata %>% group_by(EVTYPE) %>%
summarise(PROPDMG = sum(PROPDMGX),CROPDMG = sum(CROPDMGX),
TOTAL = sum(PROPDMGX + CROPDMGX)) %>%
arrange(desc(TOTAL))
kable(econ[1:5, ], "html", caption = "Top 5 weather events by economic consequence",
format.args=list(big.mark = ',')) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| EVTYPE | PROPDMG | CROPDMG | TOTAL |
|---|---|---|---|
| FLOOD | 144,657,709,800 | 5,661,968,450 | 150,319,678,250 |
| HURRICANE/TYPHOON | 69,305,840,000 | 2,607,872,800 | 71,913,712,800 |
| TORNADO | 56,937,160,480 | 414,953,110 | 57,352,113,590 |
| STORM SURGE | 43,323,536,000 | 5,000 | 43,323,541,000 |
| HAIL | 15,732,266,720 | 3,025,954,450 | 18,758,221,170 |
econcutoff <- 1000000000
econx <- subset(econ, TOTAL > econcutoff & !(EVTYPE == "FLOOD"))
econother <- subset(econ, TOTAL <= econcutoff & !(EVTYPE == "FLOOD"))
econx <- rbind(econx, c("OTHER", as.numeric(sum(econother$PROPDMG)),
as.numeric(sum(econother$CROPDMG)),as.numeric(sum(econother$TOTAL))))
econlabel <- paste("OTHER: ",count(econother)," categories with each total less than $",
format(econcutoff, big.mark=",", scientific=FALSE), sep = "")
econmelt <- econx[,c(1:3)] %>% melt(id.vars = "EVTYPE")
names(econmelt) <- c("EVTYPE", "ECONTYPE", "ECONVALUE")
econmelt$ECONVALUE <- as.numeric(econmelt$ECONVALUE)
econmelt$ECONVALUEM <- econmelt$ECONVALUE/1000000
econplot <- ggplot(data = econmelt, aes(x = EVTYPE, y = ECONVALUEM, fill = ECONTYPE)) +
geom_bar(stat = "identity") + coord_flip() +
ggtitle("Property/Crop damage by weather event type ($M)") +
labs(x = "",y = "") + theme(legend.title = element_blank())
econplot
OTHER: 958 categories with each total less than $1,000,000,000.
Note that ‘FLOOD’ was excluded from this graph, to avoid crowding out the other results.
Data quality checks results:
qa <- complete.cases(stormdata)
summary(qa)
Mode TRUE logical 902297
range(stormdata$FATALITIES)
[1] 0 583
range(stormdata$INJURIES)
[1] 0 1700
range(stormdata$PROPDMG)
[1] 0 5000
range(stormdata$CROPDMG)
[1] 0 990
propqa <- stormdata %>% group_by(PROPDMGEXP) %>% summarise(COUNT=n()) %>% arrange(desc(COUNT))
kable(propqa, "html") %>% kable_styling(bootstrap_options = "striped", full_width = F)
| PROPDMGEXP | COUNT |
|---|---|
| 465934 | |
| K | 424665 |
| M | 11330 |
| 0 | 216 |
| B | 40 |
| 5 | 28 |
| 1 | 25 |
| 2 | 13 |
| ? | 8 |
| m | 7 |
| H | 6 |
|
|
5 |
| 7 | 5 |
| 3 | 4 |
| 4 | 4 |
| 6 | 4 |
|
|
1 |
| 8 | 1 |
| h | 1 |
cropqa <- stormdata %>% group_by(CROPDMGEXP) %>% summarise(COUNT=n()) %>% arrange(desc(COUNT))
kable(cropqa, "html") %>% kable_styling(bootstrap_options = "striped", full_width = F)
| CROPDMGEXP | COUNT |
|---|---|
| 618413 | |
| K | 281832 |
| M | 1994 |
| k | 21 |
| 0 | 19 |
| B | 9 |
| ? | 7 |
| 2 | 1 |
| m | 1 |