This analysis finds the most harmful weather events in the USA with respect to population health as well as events that have the greatest economic consequences. The analysis uses the NOAA Storm Database available at the following URL: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2.
To determine the most harmful weather events with respect to population health, the analysis will find the 5 event types that caused the most fatalities and the 5 event types that caused the most injuries. Likewise, to determine the weather events that have the greatest economic consequences, the analysis will find the 5 event types that caused the most property damage and the 5 event types that caused the most crop damage.
# Attach every package the analysis needs, in a fixed order. When a package
# cannot be attached it is installed once and attached again; if that second
# attempt also fails the script stops with an explicit error.
for (pkg in c("lubridate", "dplyr", "ggplot2")) {
  if (require(pkg, character.only = TRUE)) {
    next
  }
  install.packages(pkg)
  if (!require(pkg, character.only = TRUE)) {
    stop(sprintf("Unable to load '%s' library", pkg))
  }
}
# Read the bzip2-compressed storm CSV from the working directory.
stormdata <- "repdata-data-StormData.csv.bz2" %>%
  bzfile() %>%
  read.csv()
# Parse the event start date; only the leading month/day/year part is used.
stormdata[["BGN_DATE"]] <- as.Date(
  stormdata[["BGN_DATE"]],
  format = "%m/%d/%Y"
)
Let’s have a look at the number of distinct event types recorded in each year:
# Cross-tabulate event type (rows) against the year of the start date (columns).
tbl <- table(
  stormdata[["EVTYPE"]],
  year(stormdata[["BGN_DATE"]])
)
# For each year, count the event types that occur at least once.
colSums(tbl > 0)
## 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964
## 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
## 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994
## 3 3 3 3 3 3 3 3 3 3 3 3 3 160 267
## 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
## 387 228 170 126 121 112 122 99 51 38 46 50 46 46 46
## 2010 2011
## 46 46
Obviously, event records starting from 1993 are much more complete, so we will use the period of 1993 - 2011 for our analysis.
# Keep only the records from 1993 onwards and drop the columns that are not
# needed for the analysis (only the start date, the event type and the
# fatalities-to-crop-damage-exponent columns are retained).
# The year is compared as a number: comparing it with the string "1992" would
# silently coerce the year to character and compare lexicographically.
# Column names are resolved inside the data frame by subset() itself.
stormdata <- subset(
  stormdata,
  year(BGN_DATE) > 1992,
  select = c(BGN_DATE, EVTYPE, FATALITIES:CROPDMGEXP)
)
# Total fatalities and total injuries for every event type.
health_damage <- stormdata %>%
  group_by(EVTYPE) %>%
  summarize(
    fatalities_total = sum(FATALITIES),
    injuries_total = sum(INJURIES)
  )

# Sort by each measure in descending order and keep the first five rows,
# together with the event type and the measure that was sorted on.
top5_fatalities <- arrange(health_damage, desc(fatalities_total))[
  1:5, c("EVTYPE", "fatalities_total")
]
top5_injuries <- arrange(health_damage, desc(injuries_total))[
  1:5, c("EVTYPE", "injuries_total")
]
Top 5 event types that caused most fatalities:
# Print the five event types with the highest fatality totals.
top5_fatalities
## Source: local data frame [5 x 2]
##
## EVTYPE fatalities_total
## 1 EXCESSIVE HEAT 1903
## 2 TORNADO 1621
## 3 FLASH FLOOD 978
## 4 HEAT 937
## 5 LIGHTNING 816
Top 5 event types that caused most injuries:
# Print the five event types with the highest injury totals.
top5_injuries
## Source: local data frame [5 x 2]
##
## EVTYPE injuries_total
## 1 TORNADO 23310
## 2 FLOOD 6789
## 3 EXCESSIVE HEAT 6525
## 4 LIGHTNING 5230
## 5 TSTM WIND 3631
Therefore, event types that are most harmful with respect to population health are as follows:
# Event types that appear in either top-5 list, fatalities first, no duplicates.
union(
  as.character(top5_fatalities$EVTYPE),
  as.character(top5_injuries$EVTYPE)
)
## [1] "EXCESSIVE HEAT" "TORNADO" "FLASH FLOOD" "HEAT"
## [5] "LIGHTNING" "FLOOD" "TSTM WIND"
Let’s see how the health damage caused by these most harmful event types changed over the years:
# Yearly fatality totals for the five event types with the most fatalities.
# The event types are compared as character strings, so this works whether
# EVTYPE was read as a factor or as plain text (droplevels() fails on text).
# The year column is named directly in group_by() instead of being renamed by
# position afterwards, and the result is ungrouped before further use.
total_fatalities_df <- stormdata %>%
  filter(EVTYPE %in% as.character(top5_fatalities$EVTYPE)) %>%
  group_by(year = year(BGN_DATE), EVTYPE) %>%
  summarize(harm_total = sum(FATALITIES)) %>%
  ungroup() %>%
  mutate(harm_type = "fatalities")

# Yearly injury totals for the five event types with the most injuries.
total_injuries_df <- stormdata %>%
  filter(EVTYPE %in% as.character(top5_injuries$EVTYPE)) %>%
  group_by(year = year(BGN_DATE), EVTYPE) %>%
  summarize(harm_total = sum(INJURIES)) %>%
  ungroup() %>%
  mutate(harm_type = "injuries")

# Stack both tables; harm_type tells the two kinds of harm apart.
# Columns: year, EVTYPE, harm_total, harm_type.
harm_totals_df <- bind_rows(total_fatalities_df, total_injuries_df)
# Line chart of yearly totals, one line per event type, with one facet row
# per kind of harm. The facets use free scales so that each row gets a y
# range suited to its own totals.
# The y axis shows counts of fatalities/injuries, not counts of events, so it
# is labelled accordingly.
# NOTE(review): `size` for line width is kept for compatibility with older
# ggplot2; ggplot2 >= 3.4.0 prefers `linewidth` - switch once the minimum
# ggplot2 version is confirmed.
ggplot(data = harm_totals_df, aes(x = year, y = harm_total, color = EVTYPE)) +
  geom_line(size = 2, alpha = 0.7) +
  facet_grid(harm_type ~ ., scales = "free") +
  scale_colour_discrete(name = "Event Type") +
  ylab("Number of people affected") +
  ggtitle("Most harmful weather event types with respect to population health")
PROPDMGEXP and CROPDMGEXP variables contain the following values:
# Distinct exponent codes that occur in the property damage records.
unique(stormdata[["PROPDMGEXP"]])
## [1] B K M m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
# Distinct exponent codes that occur in the crop damage records.
unique(stormdata[["CROPDMGEXP"]])
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
We will assume that those values are multipliers for PROPDMG and CROPDMG values and will perform the following conversions:
“1..8” -> 10 (a code of “0” is not converted, so it acts as a multiplier of 0)
“-” -> 0
“” -> 1
“?” -> 0
“+” -> 1
“h” or “H” -> 100
“k” or “K” -> 1000
“m” or “M” -> 1000000
“b” or “B” -> 1000000000
# Translate damage exponent codes into numeric multipliers.
#
# exp_code: vector (character or factor) of exponent codes.
# Returns a numeric vector of the same length:
#   ""  and "+"  -> 1
#   "-" and "?"  -> 0
#   "0"          -> 0    (taken at face value, as the digit itself)
#   "1" .. "8"   -> 10
#   "h"/"H" -> 100, "k"/"K" -> 1000, "m"/"M" -> 1e6, "b"/"B" -> 1e9
# Any other code (including NA) yields NA.
# match() is used instead of indexing a named vector because an empty string
# never matches a name in R.
exp_to_multiplier <- function(exp_code) {
  codes <- c("", "+", "-", "?", "0", as.character(1:8), "H", "K", "M", "B")
  multipliers <- c(1, 1, 0, 0, 0, rep(10, 8), 1e2, 1e3, 1e6, 1e9)
  multipliers[match(toupper(as.character(exp_code)), codes)]
}

# Replace the exponent codes with their multipliers, then scale the damage
# amounts. mutate() evaluates its arguments in order, so the damage columns
# below already see the numeric multipliers.
stormdata <- mutate(
  stormdata,
  PROPDMGEXP = exp_to_multiplier(PROPDMGEXP),
  CROPDMGEXP = exp_to_multiplier(CROPDMGEXP),
  PROPDMG = as.numeric(PROPDMG) * PROPDMGEXP,
  CROPDMG = as.numeric(CROPDMG) * CROPDMGEXP
)
# Total property damage and total crop damage for every event type.
economic_damage <- stormdata %>%
  group_by(EVTYPE) %>%
  summarize(
    prop_dmg_total = sum(PROPDMG),
    crop_dmg_total = sum(CROPDMG)
  )

# Sort by each measure in descending order and keep the first five rows,
# together with the event type and the measure that was sorted on.
top5_prop_dmg <- arrange(economic_damage, desc(prop_dmg_total))[
  1:5, c("EVTYPE", "prop_dmg_total")
]
top5_crop_dmg <- arrange(economic_damage, desc(crop_dmg_total))[
  1:5, c("EVTYPE", "crop_dmg_total")
]
Top 5 event types that caused most property damage:
# Print the five event types with the highest property damage totals.
top5_prop_dmg
## Source: local data frame [5 x 2]
##
## EVTYPE prop_dmg_total
## 1 FLOOD 144657709807
## 2 HURRICANE/TYPHOON 69305840000
## 3 STORM SURGE 43323536000
## 4 TORNADO 26338962995
## 5 FLASH FLOOD 16140812603
Top 5 event types that caused most crop damage:
# Print the five event types with the highest crop damage totals.
top5_crop_dmg
## Source: local data frame [5 x 2]
##
## EVTYPE crop_dmg_total
## 1 DROUGHT 13972566000
## 2 FLOOD 5661968450
## 3 RIVER FLOOD 5029459000
## 4 ICE STORM 5022113500
## 5 HAIL 3025954453
Therefore, the event types that caused the greatest economic damage are as follows:
# Event types that appear in either top-5 list, property damage first,
# without duplicates.
union(
  as.character(top5_prop_dmg$EVTYPE),
  as.character(top5_crop_dmg$EVTYPE)
)
## [1] "FLOOD" "HURRICANE/TYPHOON" "STORM SURGE"
## [4] "TORNADO" "FLASH FLOOD" "DROUGHT"
## [7] "RIVER FLOOD" "ICE STORM" "HAIL"
Let’s see how the economic damage caused by these most harmful event types changed over the years:
# Yearly property damage totals for the five event types with the most
# property damage. The event types are compared as character strings, so this
# works whether EVTYPE was read as a factor or as plain text (droplevels()
# fails on text). The year column is named directly in group_by() instead of
# being renamed by position afterwards, and the result is ungrouped.
total_propdmg_df <- stormdata %>%
  filter(EVTYPE %in% as.character(top5_prop_dmg$EVTYPE)) %>%
  group_by(year = year(BGN_DATE), EVTYPE) %>%
  summarize(harm_total = sum(PROPDMG)) %>%
  ungroup() %>%
  mutate(harm_type = "property damage")

# Yearly crop damage totals for the five event types with the most crop damage.
total_cropdmg_df <- stormdata %>%
  filter(EVTYPE %in% as.character(top5_crop_dmg$EVTYPE)) %>%
  group_by(year = year(BGN_DATE), EVTYPE) %>%
  summarize(harm_total = sum(CROPDMG)) %>%
  ungroup() %>%
  mutate(harm_type = "crop damage")

# Stack both tables; harm_type tells the two kinds of damage apart.
# Columns: year, EVTYPE, harm_total, harm_type.
harm_totals_df <- bind_rows(total_propdmg_df, total_cropdmg_df)
# Line chart of yearly damage totals, one line per event type, with one facet
# row per kind of damage. The facets use free scales so that each row gets a
# y range suited to its own totals.
# The y axis shows damage amounts, not counts of events, so it is labelled
# accordingly; the title typo ("conseqences") is corrected.
# NOTE(review): `size` for line width is kept for compatibility with older
# ggplot2; ggplot2 >= 3.4.0 prefers `linewidth` - switch once the minimum
# ggplot2 version is confirmed.
ggplot(data = harm_totals_df, aes(x = year, y = harm_total, color = EVTYPE)) +
  geom_line(size = 2, alpha = 0.7) +
  facet_grid(harm_type ~ ., scales = "free") +
  scale_colour_discrete(name = "Event Type") +
  ylab("Total damage") +
  ggtitle("Weather event types with greatest economic consequences")