##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Loading required package: grid
This report summarizes the top weather events in the United States between the years 1950 and 2011 as reported on the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. The analysis determined the most severe types of events according to their effects on population health and economic cost finding Tornado type of events with the highest impact across all effects under consideration.
The data is freely available from the NOOA site. We first proceed to download the data, select the relevant fields and inspect the format.
DBREMOTEURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
DBLOCALPATH <- "repdata.csv.bz2"
if (! file.exists(DBLOCALPATH)) {
download.file(url = DBREMOTEURL, destfile = DBLOCALPATH)
}
DBDATA <- read.csv(bzfile(DBLOCALPATH)) %>%
select(c(EVTYPE, STATE, COUNTY, COUNTYNAME, FATALITIES, INJURIES, LATITUDE, LONGITUDE, PROPDMG, CROPDMG))
head(DBDATA)
## EVTYPE STATE COUNTY COUNTYNAME FATALITIES INJURIES LATITUDE LONGITUDE
## 1 TORNADO AL 97 MOBILE 0 15 3040 8812
## 2 TORNADO AL 3 BALDWIN 0 0 3042 8755
## 3 TORNADO AL 57 FAYETTE 0 2 3340 8742
## 4 TORNADO AL 89 MADISON 0 2 3458 8626
## 5 TORNADO AL 43 CULLMAN 0 2 3412 8642
## 6 TORNADO AL 77 LAUDERDALE 0 6 3450 8748
## PROPDMG CROPDMG
## 1 25.0 0
## 2 2.5 0
## 3 25.0 0
## 4 2.5 0
## 5 2.5 0
## 6 2.5 0
The number of NA values is sow low that we’ll remove those cases from consideration. After this we introduce a helper column for the total damages for each event.
colMeans(apply(DBDATA, 2, is.na))
## EVTYPE STATE COUNTY COUNTYNAME FATALITIES
## 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## INJURIES LATITUDE LONGITUDE PROPDMG CROPDMG
## 0.000000e+00 5.208928e-05 0.000000e+00 0.000000e+00 0.000000e+00
DBDATA <- DBDATA %>% filter(complete.cases(.)) %>%
mutate(DAMAGES = PROPDMG + CROPDMG, COUNT = 1)
We check the number of events present in the data
str(DBDATA)
## 'data.frame': 902250 obs. of 12 variables:
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DAMAGES : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ COUNT : num 1 1 1 1 1 1 1 1 1 1 ...
We see a very high number of EVTYPE levels. We try to determine subgroup of the rows that lets reduce the dimensionality of the data while still keeping a reasonable degree of accuracy
SUMMRY <- ddply(DBDATA, .(EVTYPE), colwise(sum, .(DAMAGES, FATALITIES, INJURIES, COUNT)))
TOTALS <- apply(select(SUMMRY, c(DAMAGES, FATALITIES, INJURIES, COUNT)), 2, sum)
CUMDMG <- data.frame(
records = cumsum(rep(1, dim(SUMMRY)[1])),
cumdmgpct = cumsum(sort(SUMMRY$DAMAGES, decreasing = T) / TOTALS["DAMAGES"]),
cumfatpct = cumsum(sort(SUMMRY$FATALITIES, decreasing = T) / TOTALS["FATALITIES"]),
cuminjpct = cumsum(sort(SUMMRY$INJURIES, decreasing = T) / TOTALS["INJURIES"]),
cumcntpct = cumsum(sort(SUMMRY$COUNT, decreasing = T) / TOTALS["COUNT"]))
pd0 <- ggplot(CUMDMG, aes(x = records, color = Effect)) +
geom_line(aes(y = cumdmgpct, color = "Damages")) +
geom_line(aes(y = cumfatpct, color = "Fatalities")) +
geom_line(aes(y = cuminjpct, color = "Injuries")) +
geom_line(aes(y = cuminjpct, color = "Events")) +
xlab("Number of event categories") +
ylab("Cummulative effect percentage") +
ggtitle("Cumulative effects vs Number of event categories")
pd1 <- pd0 + xlim(0, 250)
pd2 <- pd0 + xlim(0, 25)
grid.arrange(pd1, pd2, ncol = 1)
We see that in all cases we cummulative distribution of the effects obeys an exponential relationship to the number of event categories. This means that going forward we’ll be able to focus on the top 25 event categories for each variable under study (total damages, fatalities and injuries) while still accounting for over 90% of the effects in each case.
We want to characterize the nominal effect of the events under each EVTYPE level for each variable along with their relative frequency.
We start first by retrieving the top event categories for each effect variable
NUMCAT <- 25
EVTYPE <- head(SUMMRY %>% arrange(desc(DAMAGES)), NUMCAT) %>%
join(head(SUMMRY %>% arrange(desc(FATALITIES)), NUMCAT), by = 'EVTYPE', type = 'full') %>%
join(head(SUMMRY %>% arrange(desc(INJURIES)), NUMCAT), by = 'EVTYPE', type = 'full')
EVTYPE
## EVTYPE DAMAGES FATALITIES INJURIES COUNT
## 1 TORNADO 3312276.68 5633 91346 60652
## 2 FLASH FLOOD 1598825.05 978 1777 54262
## 3 TSTM WIND 1445168.21 504 6957 219940
## 4 HAIL 1268289.66 15 1361 288661
## 5 FLOOD 1067976.36 470 6789 25326
## 6 THUNDERSTORM WIND 943635.62 133 1488 82563
## 7 LIGHTNING 606812.39 816 5230 15751
## 8 THUNDERSTORM WINDS 464978.11 64 908 20843
## 9 HIGH WIND 342014.77 248 1137 20212
## 10 WINTER STORM 134699.58 206 1321 11433
## 11 HEAVY SNOW 124417.71 127 1021 15708
## 12 WILDFIRE 88823.54 75 911 2761
## 13 ICE STORM 67689.62 89 1975 2006
## 14 STRONG WIND 64610.71 103 280 3566
## 15 HEAVY RAIN 61964.94 98 251 11695
## 16 HIGH WINDS 57384.60 35 302 1533
## 17 TROPICAL STORM 54322.80 58 340 690
## 18 WILD/FOREST FIRE 43534.49 12 545 1457
## 19 DROUGHT 37997.67 0 4 2488
## 20 FLASH FLOODING 33623.20 19 8 682
## 21 URBAN/SML STREAM FLD 28845.74 28 79 3392
## 22 BLIZZARD 25490.48 101 805 2719
## 23 HURRICANE 20852.99 61 46 174
## 24 FLOOD/FLASH FLOOD 20580.95 17 15 624
## 25 STORM SURGE 19398.49 13 38 261
## 26 EXCESSIVE HEAT 1954.40 1903 6525 1678
## 27 HEAT 961.20 937 2100 767
## 28 RIP CURRENT 1.00 368 232 470
## 29 AVALANCHE 1623.90 224 170 386
## 30 RIP CURRENTS 162.00 204 297 304
## 31 HEAT WAVE 1524.55 172 309 74
## 32 EXTREME COLD 13778.68 160 231 655
## 33 EXTREME COLD/WIND CHILL 2704.00 125 24 1002
## 34 HIGH SURF 3041.62 101 152 725
## 35 EXTREME HEAT 10.11 96 155 22
## 36 COLD/WIND CHILL 2590.00 95 12 539
## 37 HURRICANE/TYPHOON 10637.85 64 1275 88
## 38 FOG 8849.81 62 734 538
## 39 DUST STORM 6651.00 22 440 427
## 40 WINTER WEATHER 11989.90 33 398 7026
## 41 DENSE FOG 8225.45 18 342 1293
We observe there’s further opportunities to consolidate the number of EVTYPE factors
EVTYPE$EVTYPE <- revalue(EVTYPE$EVTYPE, c("FLASH FLOOD" = "FLOOD/FLASH FLOOD",
"FLOOD" = "FLOOD/FLASH FLOOD",
"FLASH FLOODING" = "FLOOD/FLASH FLOOD",
"URBAN/SML STREAM FLD" = "FLOOD/FLASH FLOOD",
"TSTM WIND" = "THUNDERSTORM WIND",
"THUNDERSTORM WINDS" = "THUNDERSTORM WIND",
"STRONG WIND" = "HIGH WIND",
"HIGH WINDS" = "HIGH WIND",
"WILDFIRE" = "WILD/FOREST FIRE",
"EXCESSIVE HEAT" = "HEAT",
"HEAT WAVE" = "HEAT",
"EXTREME HEAT" = "HEAT",
"HURRICANE" = "HURRICANE/TYPHOON",
"DENSE FOG" = "FOG",
"EXTREME COLD" = "COLD/WIND CHILL",
"EXTREME COLD/WIND CHILL" = "COLD/WIND CHILL",
"RIP CURRENTS" = "RIP CURRENT"))
EVTYPE <- ddply(EVTYPE, .(EVTYPE), colwise(sum, .(DAMAGES, FATALITIES, INJURIES, COUNT)))
EVTYPE
## EVTYPE DAMAGES FATALITIES INJURIES COUNT
## 1 AVALANCHE 1623.90 224 170 386
## 2 BLIZZARD 25490.48 101 805 2719
## 3 COLD/WIND CHILL 19072.68 380 267 2196
## 4 FOG 17075.26 80 1076 1831
## 5 DROUGHT 37997.67 0 4 2488
## 6 DUST STORM 6651.00 22 440 427
## 7 HEAT 4450.26 3108 9089 2541
## 8 FLOOD/FLASH FLOOD 2749851.30 1512 8668 84286
## 9 HAIL 1268289.66 15 1361 288661
## 10 HEAVY RAIN 61964.94 98 251 11695
## 11 HEAVY SNOW 124417.71 127 1021 15708
## 12 HIGH SURF 3041.62 101 152 725
## 13 HIGH WIND 464010.08 386 1719 25311
## 14 HURRICANE/TYPHOON 31490.84 125 1321 262
## 15 ICE STORM 67689.62 89 1975 2006
## 16 LIGHTNING 606812.39 816 5230 15751
## 17 RIP CURRENT 163.00 572 529 774
## 18 STORM SURGE 19398.49 13 38 261
## 19 THUNDERSTORM WIND 2853781.94 701 9353 323346
## 20 TORNADO 3312276.68 5633 91346 60652
## 21 TROPICAL STORM 54322.80 58 340 690
## 22 WILD/FOREST FIRE 132358.03 87 1456 4218
## 23 WINTER STORM 134699.58 206 1321 11433
## 24 WINTER WEATHER 11989.90 33 398 7026
We can graph the relevant statistics per event type to determine their relative effects
EVMELT <- melt(EVTYPE, id.vars ="EVTYPE", variable.name = "Effect", value.name = "Value")
ggplot(EVMELT, aes(EVTYPE, y = Value)) +
geom_bar(stat = "identity") +
facet_wrap(~ Effect, scales = "free_y", ncol = 1) +
theme(axis.text.x = element_text(angle = 90), axis.title.y = element_blank())
We find that the highest impact across all three of the effects under consideration is found for events in category ‘Tornado’, even though it’s only the 4th most frequent category of events among the sample.