2.3.1 Main data exploration and Date Conversion
There are 902297 Events in reduced Storm Data and 0 NA.
The events in the database start in the year 1950 and ends in 2011.
In the earlier years of the database there are generally fewer events recorded but in this analysis we consider all Events availables.
dim(reducedStormData)
## [1] 902297 9
sum (is.na (reducedStormData))
## [1] 0
# Add a new column YEAR and converting BGN_DATE in numeric
reducedStormData$YEAR <- as.numeric(format(as.Date(reducedStormData$BGN_DATE, format = "%m/%d/%Y %H:%M:%S"), "%Y"))
hist(reducedStormData$YEAR , main = "Fig. 1. Num. of Events per Year",
breaks = 16,
col="steelblue",
xlab="Year",
ylab="Tot. Count")

Fig.1. In the earlier years there are fewer events and most events are available after the 1990.
2.3.2 Property and Crop Damage Conversion
Converting property damage (PROPDMG) and crop damage (CROPDMG) into comparable numerical forms (PROPDAMAGE,CROPDAMAGE).
calculateExp is the function that converts symbol to a power of 10 (using PROPDMGEXP and CROPDMGEXP)
# some EXP level are wrong (- ? +) and some are number
levels(reducedStormData$PROPDMGEXP)
## [1] "" "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K"
## [18] "m" "M"
levels(reducedStormData$CROPDMGEXP)
## [1] "" "?" "0" "2" "B" "k" "K" "m" "M"
# Power of 10 function (value, exponent)
calculateExp <- function(x, exp = "") {
switch(as.character(exp), `-` = x, `?` = x, `+` = x, `1` = x, `2` = x*(10^2), `3` = x*(10^3),
`4` = x*(10^4), `5` = x*(10^5), `6` = x*(10^6), `7` = x*(10^7), `8` = x*(10^8),
H = x*100, K = x*1000, M = x*1e+06, B = x*1e+09,
h = x*100, k = x*1000, m = x*1e+06, b = x*1e+09, x)
}
# Create PROPDAMAGE CROPDAMAGE
reducedStormData$PROPDAMAGE <- mapply(calculateExp, reducedStormData$PROPDMG, reducedStormData$PROPDMGEXP)
reducedStormData$CROPDAMAGE <- mapply(calculateExp, reducedStormData$CROPDMG, reducedStormData$CROPDMGEXP)
2.3.3 Event Types Agregation
There are 985 Event Types in the Storm database.
Many Event Types have “overlapping labels” and can be reduced to similar instances.
The aggregation consider only the most frequent Event types and classify them into the NOAA Data table categories.
New Event Types categories (based on NOAA Data table) are stored in NEWEVTYPE column.
# There are 985 Event types
length(levels(factor(reducedStormData$EVTYPE)))
## [1] 985
# Some Event Type text modification
reducedStormData$NEWEVTYPE <- toupper(reducedStormData$EVTYPE)
reducedStormData$NEWEVTYPE <- gsub('TSTM', 'THUNDERSTORM',reducedStormData$NEWEVTYPE)
reducedStormData$NEWEVTYPE <- gsub('WINDS', 'WIND',reducedStormData$NEWEVTYPE)
reducedStormData$NEWEVTYPE <- gsub('FLD|FLLOODIN|FLDG|FLOODING', 'FLOOD',reducedStormData$NEWEVTYPE)
reducedStormData$NEWEVTYPE <- gsub('CURRENTS', 'CURRENT',reducedStormData$NEWEVTYPE)
reducedStormData$NEWEVTYPE <- gsub('WEATHER/MIX', 'WEATHER',reducedStormData$NEWEVTYPE)
# Aggregation based on Event Types frequency and NOAA Data table
reducedStormData[grepl("URBAN FLOOD|RIVER FLOOD|URBAN/SML STREAM|URBAN/SMALL STREAM", reducedStormData$EVTYPE, ignore.case = TRUE), "NEWEVTYPE"] <- "FLOOD"
reducedStormData[grepl("FLOOD/FLASH FLOOD", reducedStormData$EVTYPE, ignore.case = TRUE), "NEWEVTYPE"] <- "FLASH FLOOD"
reducedStormData[grepl("RECORD HEAT|EXTREME HEAT|HEAT WAVE|RECORD WARMTH|UNSEASONABLY WARM", reducedStormData$EVTYPE, ignore.case = TRUE), "NEWEVTYPE"] <- "EXCESSIVE HEAT"
reducedStormData[grepl("EXTREME COLD|COLD/WIND CHILL|WIND CHILL|EXTREME WINDCHILL", reducedStormData$EVTYPE, ignore.case = TRUE), "NEWEVTYPE"] <- "EXTREME COLD/WIND CHILL"
reducedStormData[grepl("HURRICANE/TYPHOON|HURRICANE", reducedStormData$EVTYPE, ignore.case = TRUE), "NEWEVTYPE"] <- "HURRICANE (TYPHOON)"
reducedStormData[grepl("ASTRONOMICAL HIGH TIDE|STORM SURGE", reducedStormData$EVTYPE, ignore.case = TRUE), "NEWEVTYPE"] <- "STORM SURGE/TIDE"
reducedStormData[grepl("WILD/FOREST FIRE", reducedStormData$EVTYPE, ignore.case = TRUE), "NEWEVTYPE"] <- "WILDFIRE"
reducedStormData[grepl("HEAVY SURF/HIGH SURF", reducedStormData$EVTYPE, ignore.case = TRUE), "NEWEVTYPE"] <- "HIGH SURF"
reducedStormData[grepl("TORNADOES", reducedStormData$EVTYPE, ignore.case = TRUE), "NEWEVTYPE"] <- "TORNADO"
reducedStormData[grepl("TROPICAL STORM", reducedStormData$EVTYPE, ignore.case = TRUE), "NEWEVTYPE"] <- "TROPICAL STORM"
reducedStormData[grepl("RECORD/EXCESSIVE HEAT", reducedStormData$EVTYPE, ignore.case = TRUE), "NEWEVTYPE"] <- "EXCESSIVE HEAT"
reducedStormData$NEWEVTYPE <- factor(reducedStormData$NEWEVTYPE)
After the Events aggregation there are 11 Top Events Types that are over 10.000 Events and candidates as most harmful for the population and with greatest economic consequences.
summary(reducedStormData$NEWEVTYPE, maxsum = 15)
## THUNDERSTORM WIND HAIL TORNADO
## 323355 288661 60655
## FLASH FLOOD FLOOD HIGH WIND
## 55586 29455 21747
## LIGHTNING HEAVY SNOW MARINE THUNDERSTORM WIND
## 15754 15708 11987
## HEAVY RAIN WINTER STORM WINTER WEATHER
## 11742 11433 8149
## FUNNEL CLOUD WILDFIRE (Other)
## 6844 4219 37002
topEvents <- count(reducedStormData$NEWEVTYPE)
colnames(topEvents)[1] <- "NEWEVTYPE"
colnames(topEvents)[2] <- "FREQ"
topEvents <- topEvents[order(topEvents$FREQ, decreasing = TRUE), ]
topEvents$REORDER <- reorder(topEvents$NEWEVTYPE, topEvents$FREQ)
ggplot(topEvents, aes(NEWEVTYPE, FREQ/1000)) +
geom_bar(aes(x=REORDER), data=topEvents[1:11,], stat = "identity", fill="steelblue") +
labs(title="Fig. 2. Top 11 Event Types frequency") +
xlab("Event Type") + ylab("Tot. Count (K)") +
(theme(axis.text.x = element_text(angle=50,hjust=1),
plot.title = element_text(hjust = 0.5)))

Fig.2: The two most frequent Events are Thunderstorm wind and Hail and together are more than the double of the others events sum