The U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database tracks characteristics of major storms and weather events. Storm data documentation can be found here. The events in the database start in the year 1950 and end in November 2011.
The goal of the analysis is to explore the NOAA storm database and to identify the event types that are most harmful with respect to population health, as well as the event types that have the greatest economic consequences. Only the US area (the 50 states) is taken into account.
Loading raw data:
# Load the raw NOAA storm data from the CSV file in the working directory.
# Text columns are kept as character vectors instead of being converted to
# factors. The argument name is written out in full (stringsAsFactors) rather
# than relying on partial argument matching.
data <- read.csv("repdata_data_StormData.csv", stringsAsFactors = FALSE)
# Report the number of rows and columns that were read.
dim(data)
## [1] 902297 37
Data structure:
# Print a compact overview of the raw data frame: each column's name, type
# and first few values.
str(data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Subsetting data to use only those variables that are needed for the analysis:
# Keep only the location, event-type, casualty and damage-amount columns
# that the rest of the analysis reads.
d <- data[c(
  "COUNTYNAME", "STATE", "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "CROPDMG"
)]
To subset US states only, we use the state.abb dataset from the datasets package that ships with R:
# Show the state abbreviations used as the filter below.
print(state.abb)
## [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "FL" "GA" "HI" "ID" "IL" "IN"
## [15] "IA" "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV"
## [29] "NH" "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN"
## [43] "TX" "UT" "VT" "VA" "WA" "WV" "WI" "WY"
Subsetting the data to US states only (for this purpose we use the state.abb dataset from the datasets package that ships with R):
# Drop every record whose STATE code is not one of the abbreviations in
# state.abb.
d <- d[is.element(d[["STATE"]], state.abb), ]
# Rows and columns remaining after the filter.
dim(d)
## [1] 883186 7
First, we look at a scatterplot of fatalities and injuries without any aggregation:
library(ggplot2)
# Unaggregated view: one point per record in d, fatalities on the x axis and
# injuries on the y axis.
ggplot(data = d, mapping = aes(x = FATALITIES, y = INJURIES)) +
  geom_point() +
  ggtitle("Fatalities and Injuries Scatterplot")
Aggregating harmful events (fatalities and injuries summed by event type):
# Total fatalities and total injuries for each distinct EVTYPE value.
evHarm <- aggregate(
  d[c("FATALITIES", "INJURIES")],
  by = list(d[["EVTYPE"]]),
  FUN = sum
)
# Give the grouping column (column 1) a readable header.
names(evHarm)[1] <- "Event Type"
Fatal events:
# Rank event types by total fatalities (column 2), highest first, keeping
# only the event-type and fatalities columns. TRUE is spelled out because the
# shorthand T is an ordinary variable that can be reassigned.
fatal <- evHarm[order(evHarm[, 2], decreasing = TRUE), c(1, 2)]
# Show the first six rows of the ranking.
head(fatal)
## Event Type FATALITIES
## 834 TORNADO 5633
## 130 EXCESSIVE HEAT 1903
## 153 FLASH FLOOD 978
## 275 HEAT 937
## 464 LIGHTNING 816
## 856 TSTM WIND 504
Injury events:
# Rank event types by total injuries (column 3), highest first, keeping only
# the event-type and injuries columns. TRUE is spelled out because the
# shorthand T is an ordinary variable that can be reassigned.
injur <- evHarm[order(evHarm[, 3], decreasing = TRUE), c(1, 3)]
# Show the first six rows of the ranking.
head(injur)
## Event Type INJURIES
## 834 TORNADO 91346
## 856 TSTM WIND 6957
## 170 FLOOD 6789
## 130 EXCESSIVE HEAT 6525
## 464 LIGHTNING 5230
## 275 HEAT 2100
Total count of fatalities and injuries aggregated by event type:
# Event types that rank in the top 5 by fatalities or in the top 5 by
# injuries (union of both lists, duplicates removed).
top5harm <- union(fatal[1:5, 1], injur[1:5, 1])
# Row positions of those event types in the aggregated table.
top5harm_idx <- which(evHarm[, 1] %in% top5harm)
# Point labels: the event name for highlighted rows, empty text for the rest.
top5harm_labels <- rep("", nrow(evHarm))
top5harm_labels[top5harm_idx] <- evHarm[top5harm_idx, 1]
# Colour groups: 0 for highlighted event types, 1 for all others.
top5harm_col <- rep(1, nrow(evHarm))
top5harm_col[top5harm_idx] <- 0
# One point per event type. ggplot2 is already attached by the first
# scatterplot, so it is not loaded again here. The legend is hidden because
# the text labels already identify the highlighted points.
ggplot(evHarm, aes(FATALITIES, INJURIES)) +
  geom_point(aes(colour = factor(top5harm_col))) +
  geom_text(label = top5harm_labels, size = 3, vjust = -0.25) +
  theme(legend.position = "none") +
  ggtitle("Fatalities and Injuries Ammount by Event Type Scatterplot")
Labeled events in detail:
# Print the fatality and injury totals of every highlighted event type.
evHarm[is.element(evHarm[[1]], top5harm), ]
## Event Type FATALITIES INJURIES
## 130 EXCESSIVE HEAT 1903 6525
## 153 FLASH FLOOD 978 1777
## 170 FLOOD 470 6789
## 275 HEAT 937 2100
## 464 LIGHTNING 816 5230
## 834 TORNADO 5633 91346
## 856 TSTM WIND 504 6957
Property and crop damage data aggregated by event type:
# Sum the PROPDMG and CROPDMG columns for each distinct EVTYPE value.
# NOTE(review): the raw data also has PROPDMGEXP / CROPDMGEXP columns (values
# such as "K"), which are not applied here. If they are magnitude multipliers,
# these sums mix different units -- confirm before relying on the ranking.
dmg <- aggregate(
  d[c("PROPDMG", "CROPDMG")],
  by = list(d[["EVTYPE"]]),
  FUN = sum
)
Property damage amount (top 5 are shown):
# Rank event types by summed PROPDMG (column 2), largest first, keeping the
# event type and the property-damage total. TRUE is spelled out because the
# shorthand T is an ordinary variable that can be reassigned.
dmgProp <- dmg[order(dmg[, 2], decreasing = TRUE), c(1, 2)]
colnames(dmgProp) <- c("Event Type", "Total Property Damage Ammount")
# Show the five event types with the largest totals.
dmgProp[1:5, ]
## Event Type Total Property Damage Ammount
## 811 TORNADO 3211786
## 152 FLASH FLOOD 1402128
## 833 TSTM WIND 1333810
## 167 FLOOD 895925
## 737 THUNDERSTORM WIND 875481
Crop damage amount (top 5 are shown):
# Rank event types by summed CROPDMG (column 3), largest first, keeping the
# event type and the crop-damage total. TRUE is spelled out because the
# shorthand T is an ordinary variable that can be reassigned.
dmgCrop <- dmg[order(dmg[, 3], decreasing = TRUE), c(1, 3)]
colnames(dmgCrop) <- c("Event Type", "Total Crop Damage Amount")
# Show the five event types with the largest totals.
dmgCrop[1:5, ]
## Event Type Total Crop Damage Amount
## 241 HAIL 579596
## 152 FLASH FLOOD 176776
## 167 FLOOD 167990
## 833 TSTM WIND 109203
## 811 TORNADO 100018
Property and Crop Damage Aggregated by Event Type Scatterplot:
# Event types that rank in the top 5 by property damage or in the top 5 by
# crop damage (union of both lists, duplicates removed).
top5 <- union(dmgProp[1:5, 1], dmgCrop[1:5, 1])
# Row positions of those event types in the aggregated damage table.
top5_idx <- which(dmg[, 1] %in% top5)
# Point labels: the event name for highlighted rows, empty text for the rest.
top5_labels <- rep("", nrow(dmg))
top5_labels[top5_idx] <- dmg[top5_idx, 1]
# Colour groups: 0 for highlighted event types, 1 for all others.
top5_col <- rep(1, nrow(dmg))
top5_col[top5_idx] <- 0
# One point per event type. ggplot2 is already attached by the first
# scatterplot, so it is not loaded again here. The legend is hidden because
# the text labels already identify the highlighted points.
ggplot(dmg, aes(PROPDMG, CROPDMG)) +
  geom_point(aes(colour = factor(top5_col))) +
  geom_text(label = top5_labels, size = 3.5, vjust = -0.4) +
  theme(legend.position = "none") +
  ggtitle("Prop and Corp Damage Aggregated by Event Type Scatterplot")