This report was produced by the author as a student’s exercise during the online course “Data Science - Reproducible Research” taken in Dezember 2014.
Analysing the data from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database for the years 1950 to 2011, the following event types were found most damaging regarding:
* human fatalities and injuries: Tornadoes are, by far, the worst.
* material damage to properties and crops: Hurricanes, Flash Floods and Tornadoes are the leaders
The database is a mess and needs a lot of consideration regarding cleaning the data. Especially, there are many non standard Event Types and at least one event with largly exagerated damgae values.
StormDataFile <- "StormData.csv.bz2"
library(data.table)
library(plyr)
eval=FALSEurl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(url,StormDataFile,method="curl")
print(paste("the file was downloaded on:", file.info(StormDataFile)$mtime))
## [1] "the file was downloaded on: 2014-12-16 00:50:11"
STORM_RAW <- as.data.table(
read.csv(StormDataFile,
stringsAsFactors=FALSE)
)
# str(STORM_RAW)
head(STORM_RAW, 1)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1: 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1: TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1: NA 0 14 100 3 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1: 15 25 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1: 3040 8812 3051 8806 1
# remove rows without damage reported
STORM <- STORM_RAW[!(FATALITIES==0 & INJURIES==0 & PROPDMG==0 & CROPDMG==0), ]
print(paste("this removed", dim(STORM_RAW)[1]-dim(STORM)[1], "entries, leaving", dim(STORM)[1]))
print("")
# keep only needed columns
STORM <- STORM[, list(BGN_DATE, EVTYPE, FATALITIES, INJURIES,
PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP, REFNUM)]
# for the begin-date, only the year is needed
STORM <- STORM[, EventYear:=sub(".*/([0-9]{4}).*", "\\1", BGN_DATE)][, BGN_DATE:=NULL]
str(STORM)
## [1] "this removed 647664 entries, leaving 254633"
## [1] ""
## Classes 'data.table' and 'data.frame': 254633 obs. of 9 variables:
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
## $ EventYear : chr "1950" "1950" "1951" "1951" ...
## - attr(*, ".internal.selfref")=<externalptr>
2. Tidy up the prop and crop damage values, putting them into one numeric column each
entries <- dim(STORM)[1]
# permitted exponent factors are "",K,M,B but allow also for lowercase
permittedEXP <- list("","K","M","B")
STORM <- STORM[lapply(PROPDMGEXP, toupper) %in% permittedEXP, ]
STORM <- STORM[lapply(CROPDMGEXP, toupper) %in% permittedEXP, ]
# insert the numeric columns
STORM <- STORM[ ,`:=`(PropDamage=as.numeric(PROPDMG),
CropDamage=as.numeric(CROPDMG))]
# multiply by EXP
STORM <- STORM[toupper(PROPDMGEXP)=="K", PropDamage:=1000*PropDamage]
STORM <- STORM[toupper(PROPDMGEXP)=="M", PropDamage:=1000000*PropDamage]
STORM <- STORM[toupper(PROPDMGEXP)=="B", PropDamage:=1000000000*PropDamage]
STORM <- STORM[toupper(CROPDMGEXP)=="K", CropDamage:=1000*CropDamage]
STORM <- STORM[toupper(CROPDMGEXP)=="M", CropDamage:=1000000*CropDamage]
STORM <- STORM[toupper(CROPDMGEXP)=="B", CropDamage:=1000000000*CropDamage]
print(paste("this removed", entries-dim(STORM)[1], "entries, leaving", dim(STORM)[1]))
## [1] "this removed 276 entries, leaving 254357"
summary(STORM[, list(PropDamage, CropDamage)])
## PropDamage CropDamage
## Min. :0.00e+00 Min. :0.000e+00
## 1st Qu.:2.00e+03 1st Qu.:0.000e+00
## Median :1.00e+04 Median :0.000e+00
## Mean :1.68e+06 Mean :1.928e+05
## 3rd Qu.:3.50e+04 3rd Qu.:0.000e+00
## Max. :1.15e+11 Max. :5.000e+09
looking for outliers:
summary(STORM[, list(FATALITIES, INJURIES)])
print(" ")
summary(STORM[, list(PropDamage, CropDamage)])
## FATALITIES INJURIES
## Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000
## Mean : 0.0595 Mean : 0.5523
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :583.0000 Max. :1700.0000
## [1] " "
## PropDamage CropDamage
## Min. :0.00e+00 Min. :0.000e+00
## 1st Qu.:2.00e+03 1st Qu.:0.000e+00
## Median :1.00e+04 Median :0.000e+00
## Mean :1.68e+06 Mean :1.928e+05
## 3rd Qu.:3.50e+04 3rd Qu.:0.000e+00
## Max. :1.15e+11 Max. :5.000e+09
The maximum values for Prop and Crop damages look rather suspicious ($ 115 billions??). Let’s find out where these come from:
# get the REFNUM of the 2 max for prop and crop
pRef <- STORM[which.max(PropDamage), REFNUM]
cRef <- STORM[which.max(CropDamage), REFNUM]
# print the complete event reports
print(STORM_RAW[REFNUM==pRef], )
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1: 6 1/1/2006 0:00:00 12:00:00 AM PST 55 NAPA CA
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME
## 1: FLOOD 0 COUNTYWIDE 1/1/2006 0:00:00 07:00:00 AM
## COUNTY_END COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG
## 1: 0 NA 0 COUNTYWIDE 0 0 NA 0
## FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO
## 1: 0 0 115 B 32.5 M MTR
## STATEOFFIC ZONENAMES LATITUDE LONGITUDE LATITUDE_E LONGITUDE_
## 1: CALIFORNIA, Western 3828 12218 3828 12218
## REMARKS
## 1: Major flooding continued into the early hours of January 1st, before the Napa River finally fell below flood stage and the water receeded. Flooding was severe in Downtown Napa from the Napa Creek and the City and Parks Department was hit with $6 million in damage alone. The City of Napa had 600 homes with moderate damage, 150 damaged businesses with costs of at least $70 million.
## REFNUM
## 1: 605943
# print(STORM_RAW[REFNUM==cRef], )
# millions not billions:
STORM <- STORM[REFNUM==pRef, `:=`(PropDamage=PropDamage/1000,CropDamage=CropDamage/1000)]
3. Storm Data Event Types: As per documentation, table 2.1.1 page 6, the following event types are permitted (manually copy/paste/edited from the .pdf):
permitted_EVTYPES <- list(
"Astronomical Low Tide",
"Avalanche",
"Blizzard",
"Coastal Flood",
"Cold/Wind Chill",
"Debris Flow",
"Dense Fog",
"Dense Smoke",
"Drought",
"Dust Devil",
"Dust Storm",
"Excessive Heat",
"Extreme Cold/Wind Chill",
"Flash Flood",
"Flood",
"Frost/Freeze",
"Funnel Cloud",
"Freezing Fog",
"Hail",
"Heat",
"Heavy Rain",
"Heavy Snow",
"High Surf",
"High Wind",
"Hurricane (Typhoon)",
"Ice Storm",
"Lake-Effect Snow",
"Lakeshore Flood",
"Lightning",
"Marine Hail",
"Marine High Wind",
"Marine Strong Wind",
"Marine Thunderstorm Wind",
"Rip Current",
"Seiche",
"Sleet",
"Storm Surge/Tide",
"Strong Wind",
"Thunderstorm Wind",
"Tornado",
"Tropical Depression",
"Tropical Storm",
"Tsunami",
"Volcanic Ash",
"Waterspout",
"Wildfire",
"Winter Storm",
"Winter Weather"
)
# convert all event types to upper case
STORM <- STORM[ ,EVTYPE:=toupper(EVTYPE)]
permitted_EVTYPES <- lapply(permitted_EVTYPES, toupper)
# estimate what we are missing with the non standard EVTYPE
number_illegal <- dim(STORM[!EVTYPE %in% permitted_EVTYPES,])[1]
ALL <- STORM[, `:=`(sF=sum(FATALITIES),sI=sum(INJURIES),sP=sum(PropDamage),sC=sum(CropDamage))
][1, list(sF,sI,sP,sC)]
NON <- STORM[!EVTYPE %in% permitted_EVTYPES, ][,
`:=`(sF=sum(FATALITIES),sI=sum(INJURIES),sP=sum(PropDamage),sC=sum(CropDamage))
][1, list(sF,sI,sP,sC)]
pctNb <- round(100*number_illegal/dim(STORM)[1], 2) # %entries
pctHD <- round(100*(NON[1,sF]+NON[1,sI])/(ALL[1,sF]+ALL[1,sI]), 2) # %Human damage
pctMD <- round(100*(NON[1,sP]+NON[1,sC])/(ALL[1,sP]+ALL[1,sC]), 2) # %Material damage
There are 81555 entries with non permitted event types. They represent:
So let’s take a closer look at them:
# list the non permitted events
sort(unique(STORM[!EVTYPE %in% permitted_EVTYPES, EVTYPE]))
## [1] "?" "AGRICULTURAL FREEZE"
## [3] "APACHE COUNTY" "ASTRONOMICAL HIGH TIDE"
## [5] "AVALANCE" "BEACH EROSION"
## [7] "BLACK ICE" "BLIZZARD/WINTER STORM"
## [9] "BLOWING DUST" "BLOWING SNOW"
## [11] "BRUSH FIRE" "COASTAL EROSION"
## [13] "COASTAL FLOODING" "COASTAL FLOODING/EROSION"
## [15] "COASTAL FLOODING/EROSION" "COASTALSTORM"
## [17] "COASTAL STORM" "COASTAL SURGE"
## [19] "COLD" "COLD AIR TORNADO"
## [21] "COLD AND SNOW" "COLD AND WET CONDITIONS"
## [23] "COLD TEMPERATURE" "COLD WAVE"
## [25] "COLD WEATHER" "COLD/WINDS"
## [27] "COOL AND WET" "DAMAGING FREEZE"
## [29] "DAM BREAK" "DOWNBURST"
## [31] "DROUGHT/EXCESSIVE HEAT" "DROWNING"
## [33] "DRY MICROBURST" "DRY MIRCOBURST WINDS"
## [35] "DUST DEVIL WATERSPOUT" "DUST STORM/HIGH WINDS"
## [37] "EARLY FROST" "EROSION/CSTL FLOOD"
## [39] "EXCESSIVE RAINFALL" "EXCESSIVE SNOW"
## [41] "EXCESSIVE WETNESS" "EXTENDED COLD"
## [43] "EXTREME COLD" "EXTREME HEAT"
## [45] "EXTREME WINDCHILL" "EXTREME WIND CHILL"
## [47] "FALLING SNOW/ICE" " FLASH FLOOD"
## [49] "FLASH FLOOD/" "FLASH FLOOD/FLOOD"
## [51] "FLASH FLOOD FROM ICE JAMS" "FLASH FLOOD - HEAVY RAIN"
## [53] "FLASH FLOODING" "FLASH FLOODING/FLOOD"
## [55] "FLASH FLOODING/THUNDERSTORM WI" "FLASH FLOOD/LANDSLIDE"
## [57] "FLASH FLOOD LANDSLIDES" "FLASH FLOODS"
## [59] "FLASH FLOOD/ STREET" "FLOOD FLASH"
## [61] "FLOOD/FLASH" "FLOOD/FLASHFLOOD"
## [63] "FLOOD/FLASH FLOOD" "FLOOD/FLASH/FLOOD"
## [65] "FLOOD & HEAVY RAIN" "FLOODING"
## [67] "FLOOD/RAIN/WINDS" "FLOOD/RIVER FLOOD"
## [69] "FLOODS" "FOG"
## [71] "FOG AND COLD TEMPERATURES" "FOREST FIRES"
## [73] "FREEZE" "FREEZING DRIZZLE"
## [75] "FREEZING RAIN" "FREEZING RAIN/SLEET"
## [77] "FREEZING RAIN/SNOW" "FREEZING SPRAY"
## [79] "FROST" "FROST\\FREEZE"
## [81] "GLAZE" "GLAZE ICE"
## [83] "GLAZE/ICE STORM" "GRADIENT WIND"
## [85] "GRASS FIRES" "GROUND BLIZZARD"
## [87] "GUSTNADO" "GUSTY WIND"
## [89] "GUSTY WIND/HAIL" "GUSTY WIND/HVY RAIN"
## [91] "GUSTY WIND/RAIN" "GUSTY WINDS"
## [93] "HAIL 075" "HAIL 0.75"
## [95] "HAIL 100" "HAIL 125"
## [97] "HAIL 150" "HAIL 175"
## [99] "HAIL 200" "HAIL 275"
## [101] "HAIL 450" "HAIL 75"
## [103] "HAIL DAMAGE" "HAILSTORM"
## [105] "HAIL/WIND" "HAIL/WINDS"
## [107] "HARD FREEZE" "HAZARDOUS SURF"
## [109] "HEAT WAVE" "HEAT WAVE DROUGHT"
## [111] "HEAT WAVES" "HEAVY LAKE SNOW"
## [113] "HEAVY MIX" "HEAVY PRECIPITATION"
## [115] "HEAVY RAIN AND FLOOD" "HEAVY RAIN/HIGH SURF"
## [117] "HEAVY RAIN/LIGHTNING" "HEAVY RAINS"
## [119] "HEAVY RAIN/SEVERE WEATHER" "HEAVY RAINS/FLOODING"
## [121] "HEAVY RAIN/SMALL STREAM URBAN" "HEAVY RAIN/SNOW"
## [123] "HEAVY SEAS" "HEAVY SHOWER"
## [125] "HEAVY SNOW AND HIGH WINDS" "HEAVY SNOW AND STRONG WINDS"
## [127] "HEAVY SNOW/BLIZZARD" "HEAVY SNOW/BLIZZARD/AVALANCHE"
## [129] "HEAVY SNOW/FREEZING RAIN" "HEAVY SNOW/HIGH WINDS & FLOOD"
## [131] "HEAVY SNOW/ICE" "HEAVY SNOWPACK"
## [133] "HEAVY SNOW SHOWER" "HEAVY SNOW SQUALLS"
## [135] "HEAVY SNOW-SQUALLS" "HEAVY SNOW/SQUALLS"
## [137] "HEAVY SNOW/WIND" "HEAVY SNOW/WINTER STORM"
## [139] "HEAVY SURF" "HEAVY SURF AND WIND"
## [141] "HEAVY SURF COASTAL FLOODING" "HEAVY SURF/HIGH SURF"
## [143] "HEAVY SWELLS" "HIGH"
## [145] "HIGH SEAS" " HIGH SURF ADVISORY"
## [147] "HIGH SWELLS" "HIGH WATER"
## [149] "HIGH WAVES" "HIGH WIND 48"
## [151] "HIGH WIND/BLIZZARD" "HIGH WIND DAMAGE"
## [153] "HIGH WIND (G40)" "HIGH WIND/HEAVY SNOW"
## [155] "HIGH WINDS" "HIGH WINDS"
## [157] "HIGH WINDS/" "HIGH WINDS/COASTAL FLOOD"
## [159] "HIGH WINDS/COLD" "HIGH WIND/SEAS"
## [161] "HIGH WINDS/HEAVY RAIN" "HIGH WINDS HEAVY RAINS"
## [163] "HIGH WINDS/SNOW" "HURRICANE"
## [165] "HURRICANE EDOUARD" "HURRICANE EMILY"
## [167] "HURRICANE ERIN" "HURRICANE FELIX"
## [169] "HURRICANE-GENERATED SWELLS" "HURRICANE GORDON"
## [171] "HURRICANE OPAL" "HURRICANE OPAL/HIGH WINDS"
## [173] "HURRICANE/TYPHOON" "HVY RAIN"
## [175] "HYPERTHERMIA/EXPOSURE" "HYPOTHERMIA"
## [177] "HYPOTHERMIA/EXPOSURE" "ICE"
## [179] "ICE AND SNOW" "ICE FLOES"
## [181] "ICE JAM" "ICE JAM FLOODING"
## [183] "ICE JAM FLOOD (MINOR" "ICE ON ROAD"
## [185] "ICE ROADS" "ICE STORM/FLASH FLOOD"
## [187] "ICE/STRONG WINDS" "ICY ROADS"
## [189] "LAKE EFFECT SNOW" "LAKE FLOOD"
## [191] "LANDSLIDE" "LANDSLIDES"
## [193] "LANDSLUMP" "LANDSPOUT"
## [195] "LATE SEASON SNOW" "LIGHT FREEZING RAIN"
## [197] "LIGHTING" "LIGHTNING."
## [199] "LIGHTNING AND HEAVY RAIN" "LIGHTNING AND THUNDERSTORM WIN"
## [201] "LIGHTNING FIRE" "LIGHTNING/HEAVY RAIN"
## [203] "LIGHTNING INJURY" "LIGHTNING THUNDERSTORM WINDS"
## [205] "LIGHTNING WAUSEON" "LIGHT SNOW"
## [207] "LIGHT SNOWFALL" "LIGNTNING"
## [209] "LOW TEMPERATURE" "MAJOR FLOOD"
## [211] "MARINE ACCIDENT" "MARINE MISHAP"
## [213] "MARINE TSTM WIND" "MICROBURST"
## [215] "MICROBURST WINDS" "MINOR FLOODING"
## [217] "MIXED PRECIP" "MIXED PRECIPITATION"
## [219] "MUDSLIDE" "MUD SLIDE"
## [221] "MUDSLIDES" "MUD SLIDES"
## [223] "MUD SLIDES URBAN FLOODING" "NON-SEVERE WIND DAMAGE"
## [225] "NON TSTM WIND" "NON-TSTM WIND"
## [227] "OTHER" "RAIN"
## [229] "RAIN/SNOW" "RAINSTORM"
## [231] "RAIN/WIND" "RAPIDLY RISING WATER"
## [233] "RECORD COLD" "RECORD/EXCESSIVE HEAT"
## [235] "RECORD HEAT" "RECORD RAINFALL"
## [237] "RECORD SNOW" "RIP CURRENTS"
## [239] "RIP CURRENTS/HEAVY SURF" "RIVER AND STREAM FLOOD"
## [241] "RIVER FLOOD" "RIVER FLOODING"
## [243] "ROCK SLIDE" "ROGUE WAVE"
## [245] "ROUGH SEAS" "ROUGH SURF"
## [247] "RURAL FLOOD" "SEVERE THUNDERSTORM"
## [249] "SEVERE THUNDERSTORMS" "SEVERE THUNDERSTORM WINDS"
## [251] "SEVERE TURBULENCE" "SLEET/ICE STORM"
## [253] "SMALL HAIL" "SMALL STREAM FLOOD"
## [255] "SNOW" "SNOW ACCUMULATION"
## [257] "SNOW AND HEAVY SNOW" "SNOW AND ICE"
## [259] "SNOW AND ICE STORM" "SNOW/ BITTER COLD"
## [261] "SNOW/BLOWING SNOW" "SNOW/COLD"
## [263] "SNOW FREEZING RAIN" "SNOW/FREEZING RAIN"
## [265] "SNOW/HEAVY SNOW" "SNOW/HIGH WINDS"
## [267] "SNOW/ICE" "SNOW/ ICE"
## [269] "SNOW/ICE STORM" "SNOWMELT FLOODING"
## [271] "SNOW/SLEET" "SNOW/SLEET/FREEZING RAIN"
## [273] "SNOW SQUALL" "SNOW SQUALLS"
## [275] "STORM FORCE WINDS" "STORM SURGE"
## [277] "STRONG WINDS" "THUDERSTORM WINDS"
## [279] "THUNDEERSTORM WINDS" "THUNDERESTORM WINDS"
## [281] "THUNDERSNOW" "THUNDERSTORM"
## [283] "THUNDERSTORM DAMAGE TO" "THUNDERSTORM HAIL"
## [285] "THUNDERSTORMS" "THUNDERSTORMS WIND"
## [287] "THUNDERSTORMS WINDS" "THUNDERSTORMW"
## [289] "THUNDERSTORM WIND." "THUNDERSTORM WIND 60 MPH"
## [291] "THUNDERSTORM WIND 65MPH" "THUNDERSTORM WIND 65 MPH"
## [293] "THUNDERSTORM WIND 98 MPH" "THUNDERSTORM WIND/AWNING"
## [295] "THUNDERSTORM WIND (G40)" "THUNDERSTORM WIND G52"
## [297] "THUNDERSTORM WIND G55" "THUNDERSTORM WIND G60"
## [299] "THUNDERSTORM WIND/HAIL" "THUNDERSTORM WIND/LIGHTNING"
## [301] "THUNDERSTORMWINDS" "THUNDERSTORM WINDS"
## [303] "THUNDERSTORM WINDS" "THUNDERSTORM WINDS."
## [305] "THUNDERSTORM WINDS 13" "THUNDERSTORM WINDS53"
## [307] "THUNDERSTORM WINDS 63 MPH" "THUNDERSTORM WINDS AND"
## [309] "THUNDERSTORM WINDS/ FLOOD" "THUNDERSTORM WINDS/FLOODING"
## [311] "THUNDERSTORM WINDS/FUNNEL CLOU" "THUNDERSTORM WINDS G60"
## [313] "THUNDERSTORM WINDSHAIL" "THUNDERSTORM WINDS HAIL"
## [315] "THUNDERSTORM WINDS/HAIL" "THUNDERSTORM WINDS LIGHTNING"
## [317] "THUNDERSTORM WINDSS" "THUNDERSTORM WIND/ TREE"
## [319] "THUNDERSTORM WIND TREES" "THUNDERSTORM WIND/ TREES"
## [321] "THUNDERSTORM WINS" "THUNDERSTROM WIND"
## [323] "THUNDERTORM WINDS" "THUNERSTORM WINDS"
## [325] "TIDAL FLOODING" "TORNADOES"
## [327] "TORNADOES, TSTM WIND, HAIL" "TORNADO F0"
## [329] "TORNADO F1" "TORNADO F2"
## [331] "TORNADO F3" "TORNDAO"
## [333] "TORRENTIAL RAINFALL" "TROPICAL STORM ALBERTO"
## [335] "TROPICAL STORM DEAN" "TROPICAL STORM GORDON"
## [337] "TROPICAL STORM JERRY" "TSTMW"
## [339] " TSTM WIND" "TSTM WIND"
## [341] "TSTM WIND 40" "TSTM WIND (41)"
## [343] "TSTM WIND 45" "TSTM WIND 55"
## [345] "TSTM WIND 65)" "TSTM WIND AND LIGHTNING"
## [347] "TSTM WIND DAMAGE" "TSTM WIND (G35)"
## [349] "TSTM WIND (G40)" " TSTM WIND (G45)"
## [351] "TSTM WIND G45" "TSTM WIND (G45)"
## [353] "TSTM WIND (G45)" "TSTM WIND G58"
## [355] "TSTM WIND/HAIL" "TSTM WINDS"
## [357] "TUNDERSTORM WIND" "TYPHOON"
## [359] "UNSEASONABLE COLD" "UNSEASONABLY COLD"
## [361] "UNSEASONABLY WARM" "UNSEASONABLY WARM AND DRY"
## [363] "UNSEASONAL RAIN" "URBAN AND SMALL"
## [365] "URBAN AND SMALL STREAM FLOODIN" "URBAN FLOOD"
## [367] "URBAN FLOODING" "URBAN FLOODS"
## [369] "URBAN SMALL" "URBAN/SMALL STREAM"
## [371] "URBAN/SMALL STREAM FLOOD" "URBAN/SML STREAM FLD"
## [373] "WARM WEATHER" "WATERSPOUT-"
## [375] "WATERSPOUT TORNADO" "WATERSPOUT-TORNADO"
## [377] "WATERSPOUT/TORNADO" "WATERSPOUT/ TORNADO"
## [379] "WET MICROBURST" "WHIRLWIND"
## [381] "WILDFIRES" "WILD FIRES"
## [383] "WILD/FOREST FIRE" "WILD/FOREST FIRES"
## [385] "WIND" "WIND AND WAVE"
## [387] "WIND DAMAGE" "WIND/HAIL"
## [389] "WINDS" "WIND STORM"
## [391] "WINTER STORM HIGH WINDS" "WINTER STORMS"
## [393] "WINTER WEATHER MIX" "WINTER WEATHER/MIX"
## [395] "WINTRY MIX"
Correct obvious errors:
# remove leading spaces
STORM <- STORM[ ,EVTYPE:=sub("^ *([^ ].*)", "\\1", EVTYPE)]
# mispelling of THUNDERSTORM and abbrev. TSTM
STORM <- STORM[, EVTYPE:=sub("THUDERSTORM", "THUNDERSTORM", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("THUNDEERSTORM", "THUNDERSTORM", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("THUNDERESTORM", "THUNDERSTORM", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("THUNERSTORM", "THUNDERSTORM", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("THUNDERSTROM", "THUNDERSTORM", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("THUNDERSTORMS", "THUNDERSTORM", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("TUNDERSTORM", "THUNDERSTORM", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("TUNDERTORM", "THUNDERSTORM", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("THUNDERTORM", "THUNDERSTORM", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("TSTM", "THUNDERSTORM", EVTYPE)]
# mispelled TORNADO
STORM <- STORM[, EVTYPE:=sub("TORNDAO", "TORNADO", EVTYPE)]
# some obvious adaptions: remove storm names, clean up floods, etc
STORM <- STORM[, EVTYPE:=sub("^HIGH *WIND.*", "HIGH WIND", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^HURRICANE.*", "HURRICANE (TYPHOON)", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^TYPHOON.*", "HURRICANE (TYPHOON)", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^THUNDERSTORM.*", "THUNDERSTORM WIND", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^SEVERE THUNDERSTORM.*", "THUNDERSTORM WIND", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^TORNADO.*", "TORNADO", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^TROPICAL STORM.*", "TROPICAL STORM", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^FLOOD.*FLASH.*", "FLASH FLOOD", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^FLASH FLOOD.*", "FLASH FLOOD", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^RIVER.*FLOOD.*", "FLOOD", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^URBAN FLOOD.*", "FLOOD", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^FLOOD.*", "FLASH FLOOD", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^RIP CURRENT.*", "RIP CURRENT", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^HAIL.*", "HAIL", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("SMALL HAIL", "HAIL", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^WILD.*", "WILDFIRE", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^STORM SURGE$", "STORM SURGE/TIDE", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^EXTREME COLD$", "EXTREME COLD/WIND CHILL", EVTYPE)]
STORM <- STORM[, EVTYPE:=sub("^FOG$", "DENSE FOG", EVTYPE)]
# estimate what we are missing now
number_illegal <- dim(STORM[!EVTYPE %in% permitted_EVTYPES,])[1]
ALL <- STORM[, `:=`(sF=sum(FATALITIES),sI=sum(INJURIES),sP=sum(PropDamage),sC=sum(CropDamage))
][1, list(sF,sI,sP,sC)]
NON <- STORM[!EVTYPE %in% permitted_EVTYPES, ][,
`:=`(sF=sum(FATALITIES),sI=sum(INJURIES),sP=sum(PropDamage),sC=sum(CropDamage))
][1, list(sF,sI,sP,sC)]
pctNb <- round(100*number_illegal/dim(STORM)[1], 2) # %entries
pctHD <- round(100*(NON[1,sF]+NON[1,sI])/(ALL[1,sF]+ALL[1,sI]), 2) # %Human damage
pctMD <- round(100*(NON[1,sP]+NON[1,sC])/(ALL[1,sP]+ALL[1,sC]), 2) # %Material damage
# remove the non-permitted EVTYPE rows
STORM <- STORM[EVTYPE %in% permitted_EVTYPES, ]
There remained 2489 entries with non permitted event types, representing:
That’s good enough, so they have been removed from the dataset. Let’s begin the analysis.
From the NOOA - Storm Events Database - EventTypes follows that the 48 defined Event Types are only registered since 1996, prior to this only a small subset of event types were recorded. This will affect the list (see tornadoes below) but, for the purpose here, I will leave it at this.
The problem here is to find the most damaging event types for fatalities and injuries together. Just adding them up doesn’t make sense, neither ethically nor economically. First, let’s find out whether the problem really arises:
# get total fatalities and injuries
HUMD <- unique(STORM[ ,`:=`(sF=sum(FATALITIES),sI=sum(INJURIES)),
by=EVTYPE][, list(EVTYPE, sF, sI)],
by="EVTYPE")
print("10 worst event types regarding fatalities:")
setorder(HUMD,-sF)
HUMD[1:10, ]
print("")
print("10 worst event types regarding injuries:")
setorder(HUMD,-sI)
HUMD[1:10, ]
## [1] "10 worst event types regarding fatalities:"
## EVTYPE sF sI
## 1: TORNADO 5655 91339
## 2: EXCESSIVE HEAT 1903 6525
## 3: FLASH FLOOD 1517 8594
## 4: HEAT 937 2100
## 5: LIGHTNING 816 5230
## 6: THUNDERSTORM WIND 711 9507
## 7: RIP CURRENT 577 529
## 8: HIGH WIND 288 1449
## 9: EXTREME COLD/WIND CHILL 287 255
## 10: AVALANCHE 224 170
## [1] ""
## [1] "10 worst event types regarding injuries:"
## EVTYPE sF sI
## 1: TORNADO 5655 91339
## 2: THUNDERSTORM WIND 711 9507
## 3: FLASH FLOOD 1517 8594
## 4: EXCESSIVE HEAT 1903 6525
## 5: LIGHTNING 816 5230
## 6: HEAT 937 2100
## 7: ICE STORM 89 1975
## 8: WILDFIRE 90 1606
## 9: HIGH WIND 288 1449
## 10: HAIL 15 1368
So, yes, the problem is there. What is worse
- avalanche (224 killed, 170 injured), or
- wildfire (90 killed, 1606 injured) ??
It would go far beyond the scope of this exercise to resolve this question. Despite the problematic of doing so, I will add the two together to classify by the total number of people affected.
This is the list and the corresponding bar plot:
# get total fatalities and injuries and total people affected
HUMD <- unique(STORM[ ,`:=`(sF= sum(FATALITIES), sI=sum(INJURIES), totAff=sF+sI), by=EVTYPE]
[, list(EVTYPE, sF, sI, totAff)],
by="EVTYPE")
# limit to 10 worst
setorder(HUMD,-totAff)
HUMD <- HUMD[1:10, ]
HUMD
## EVTYPE sF sI totAff
## 1: TORNADO 5655 91339 96994
## 2: THUNDERSTORM WIND 711 9507 10218
## 3: FLASH FLOOD 1517 8594 10111
## 4: EXCESSIVE HEAT 1903 6525 8428
## 5: LIGHTNING 816 5230 6046
## 6: HEAT 937 2100 3037
## 7: ICE STORM 89 1975 2064
## 8: HIGH WIND 288 1449 1737
## 9: WILDFIRE 90 1606 1696
## 10: WINTER STORM 206 1321 1527
# bar plot
hdm <- as.matrix(HUMD[ ,list(sF,sI)])
colnames(hdm) <- c("Fatalities","Injuries")
rownames(hdm) <- as.vector(HUMD[ ,EVTYPE])
hdm <- t(hdm) # rows <> cols
barplot(hdm,
col = c("navyblue", "royalblue2"),
ylim = c(0, 10000),
beside = FALSE, las = 3,
cex.axis = 0.7, cex.names = 0.4,
main = "Most damaging event types:\nHuman Damage 1950-2011\n(worst 10)",
ylab = "Total number of affected people",
legend.text = rownames(hdm))
This is the list and the corresponding bar plot:
# get total fatalities and injuries and total people affected
MATD <- unique(STORM[ ,`:=`(Prop=sum(PropDamage), Crop=sum(CropDamage)),
by=EVTYPE]
[, list(EVTYPE, Prop, Crop)],
by="EVTYPE")
MATD <- MATD[ ,Total:=Prop+Crop]
# limit to 10 worst
setorder(MATD,-Total)
MATD <- MATD[1:10, ]
# reduce to millions
MATD <- MATD[ ,`:=`(Prop=Prop/1000000, Crop=Crop/1000000, Total=Total/1000000)]
MATD
## EVTYPE Prop Crop Total
## 1: HURRICANE (TYPHOON) 85356.410 5516.1178 90872.528
## 2: FLASH FLOOD 52064.827 12341.5876 64406.415
## 3: TORNADO 58541.757 367.4583 58909.215
## 4: STORM SURGE/TIDE 47964.724 0.8550 47965.579
## 5: HAIL 15974.534 3021.8876 18996.422
## 6: DROUGHT 1046.106 13972.5660 15018.672
## 7: THUNDERSTORM WIND 10970.316 1271.6614 12241.978
## 8: ICE STORM 3944.928 5022.1100 8967.038
## 9: WILDFIRE 8491.564 402.7816 8894.345
## 10: TROPICAL STORM 7714.391 694.8960 8409.287
# bar plot
mdm <- as.matrix(MATD[ ,list(Prop,Crop)])
rownames(mdm) <- as.vector(MATD[ ,EVTYPE])
mdm <- t(mdm) # rows <> cols
barplot(mdm,
col = c("navyblue", "royalblue2"),
# ylim = c(0, 10000),
beside = FALSE, las = 3,
cex.axis = 0.7, cex.names = 0.4,
main = "Most damaging event types:\nMaterial Damage 1950-2011\n(worst 10, in million US$)",
ylab = "Total Damage in Million US$",
legend.text = rownames(mdm))
if (Sys.which("uname") != "") system("uname -srpi", intern=TRUE)
if (Sys.which("lsb_release") != "")
print(paste("Ubuntu:",system("lsb_release -rc", intern=TRUE)[1]))
#print(paste("Rstudio version:", rstudio::versionInfo()$version)) # does not work in Rmd
## [1] "Linux 3.13.0-43-generic x86_64 x86_64"
## [1] "Ubuntu: Release:\t14.04"
sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
##
## locale:
## [1] LC_CTYPE=pt_BR.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=pt_BR.UTF-8 LC_COLLATE=pt_BR.UTF-8
## [5] LC_MONETARY=pt_BR.UTF-8 LC_MESSAGES=pt_BR.UTF-8
## [7] LC_PAPER=pt_BR.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plyr_1.8.1 data.table_1.9.4
##
## loaded via a namespace (and not attached):
## [1] chron_2.3-45 digest_0.6.4 evaluate_0.5.5 formatR_1.0
## [5] htmltools_0.2.6 knitr_1.8 Rcpp_0.11.3 reshape2_1.4.1
## [9] rmarkdown_0.3.10 stringr_0.6.2 tools_3.1.2 yaml_2.1.13
The RMarkdown file has been published as a Github Project
The report was produced using RStudio/knittr
on 2014-12-19 at 22:09:10 (BRT, GMT-0300)