By far the most harmful weather, as judged by the number of people dead or injured in its wake, are tornadoes. It has killed or injured roughly 9 times more people than the second most harmful weather: Thunderstorm winds. Tornadoes’ victims number roughly 95000 people, thunderstom winds a little over 10000. Tornadoes are a tier unto itself. The second tier consists of: thunderstom winds, floods, excessive heat (heat) and lightning. This second tier has about the same number of victims.
In terms of economic – both property and crop – damage floods are by far the most costly. At roughly 180 billions in damage, floods are two or three the cost of the next most damaging extreme weather, namely Hurricanes (Typhoons). Hurricanes (typhoons) are part of a second tier that also include tornadoes and storm surges.
The most important task of this project is to clean the column for the types of extreme weather events. The names are not standardized: They contain typos, errors, long/short descriptions, etc. Here, I attempt to homogenize this column as much as possible. I use a string distance measure to compare and fix the potential errors. But this method is not foolproof either: it sometimes produces false positives. That’s the price I pay to work with more consistent types. This is the part of the project that requires the most care and time. Preferably a domain expert should catalog which of these events belong together or not.
options(digits=2)
#### Change working directory ####
proj_dir <- "C:/Users/Vathy/Documents/scripts_r/Coursera/RepData_PeerAssessment2/"
setwd( proj_dir )
#### Load dataset ####
df <- read.csv( 'repdata_data_StormData.csv.bz2', header = TRUE, stringsAsFactors = FALSE )
# Convert to data.table
library( data.table )
assign( "depthtrigger", 4, data.table:::.global ) # To stop printing of data.tables
setDT( df )
# Keep only subset of the dataset we need for analysis
df <- df[ , list(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) ]
#### Clean event type ####
library( stringr )
# Uppercase event type
df[ , EVTYPE := toupper( EVTYPE ) ];
# Remove all punctuation by whitespace
df[ , EVTYPE := str_replace_all( EVTYPE, '[[:punct:]]', ' ' ) ];
# Strip all whitespaces
df[ , EVTYPE := str_trim( EVTYPE ) ];
# Replace TSTM with THUNDERSTORM
df[ str_detect( EVTYPE, 'TSTM' ),
EVTYPE := str_replace_all( EVTYPE, 'TSTM', 'THUNDERSTORM' ) ];
# Create new column for cleaning event type
df[ , EVTYPE_CLEAN := EVTYPE ];
# Remove all numbers from event type
df[ , EVTYPE_CLEAN := str_replace_all( EVTYPE_CLEAN, '[[:digit:]]', '' ) ];
df[ , EVTYPE_CLEAN := str_trim( EVTYPE_CLEAN ) ];
df[ , EVTYPE_CLEAN := str_replace_all( EVTYPE_CLEAN, ' +', ' ' ) ];
# Create dataframe of most common event type
evtype_freq <- df[ , as.data.table( table( EVTYPE_CLEAN ) ) ];
# Order by count
evtype_freq <- as.data.table( evtype_freq );
evtype_freq <- evtype_freq[ order( N, decreasing = TRUE ) ];
# Inspect frequency dataset
evtype_freq
## EVTYPE_CLEAN N
## 1: THUNDERSTORM WIND 302532
## 2: HAIL 288761
## 3: TORNADO 60652
## 4: FLASH FLOOD 54279
## 5: FLOOD 25327
## ---
## 703: WIND CHILL HIGH WIND 1
## 704: WIND HAIL 1
## 705: WIND STORM 1
## 706: WINTER STORM HIGH WIND 1
## 707: WND 1
# Extract unique event type
evtype <- evtype_freq[ , EVTYPE_CLEAN ];
#### Manually clean dataset based on common event type ####
evtype_freq[1:20] # Top 20
## EVTYPE_CLEAN N
## 1: THUNDERSTORM WIND 302532
## 2: HAIL 288761
## 3: TORNADO 60652
## 4: FLASH FLOOD 54279
## 5: FLOOD 25327
## 6: THUNDERSTORM WINDS 20868
## 7: HIGH WIND 20217
## 8: LIGHTNING 15756
## 9: HEAVY SNOW 15708
## 10: MARINE THUNDERSTORM WIND 11987
## 11: HEAVY RAIN 11742
## 12: WINTER STORM 11433
## 13: WINTER WEATHER 7045
## 14: FUNNEL CLOUD 6845
## 15: WATERSPOUT 3808
## 16: STRONG WIND 3569
## 17: URBAN SML STREAM FLD 3392
## 18: WILDFIRE 2761
## 19: BLIZZARD 2719
## 20: DROUGHT 2488
df[ str_detect( EVTYPE_CLEAN, 'THUNDERSTORM WIND' ),
EVTYPE_CLEAN := 'THUNDERSTORM WIND' ]
df[ str_detect( EVTYPE_CLEAN, 'FLOOD' ),
EVTYPE_CLEAN := 'FLOOD' ]
# Create function to extract fuzzy matches in a list
# based on string distances
library( stringdist )
fuzzymatch <- function( str, threshold = 3.0, weight = c( d = 0.05, i = 1, s = 1, t = 1 ) ){
# Create empty list for duplicates
dups <- list()
for ( idx in 1:length( str ) ){
scores <- stringdist( str[idx], str[-idx], method = "dl", weight = weight )
if ( min( scores ) > threshold ) next
idx_found <- which( scores == min( scores ) )
# Find best possible matches
fuzzymatches <- str[-idx][ idx_found ]
dups[[ str[idx] ]] <- fuzzymatches
}
return( dups )
}
# Create list of potential duplicates
dups <- fuzzymatch( evtype, threshold = 2.0 )
# Inspect first 20
str( head( dups, 20 ) )
## List of 20
## $ THUNDERSTORM WIND : chr [1:2] "THUNDERSTORM WINDS" "THUNDERSTORMS WIND"
## $ HAIL : chr [1:4] "HAIL WIND" "HAILSTORM" "DEEP HAIL" "WIND HAIL"
## $ TORNADO : chr "TORNADOS"
## $ FLASH FLOOD : chr "FLASH FLOODS"
## $ FLOOD : chr "FLOODS"
## $ THUNDERSTORM WINDS : chr [1:6] "THUNDERSTORM WINDSS" "THUNDERSTORMS WINDS" "THUNDERSTORMW WINDS" "THUNDEERSTORM WINDS" ...
## $ HIGH WIND : chr "HIGH WINDS"
## $ LIGHTNING : chr "LIGHTNING FIRE"
## $ HEAVY SNOW : chr [1:4] "HEAVY SNOW ICE" "HEAVY SNOW AND" "HEAVY SNOWPACK" "HEAVY WET SNOW"
## $ MARINE THUNDERSTORM WIND: chr "THUNDERSTORM WIND"
## $ HEAVY RAIN : chr "HEAVY RAINS"
## $ WINTER STORM : chr "WINTER STORMS"
## $ WINTER WEATHER : chr "WINTER WEATHER MIX"
## $ FUNNEL CLOUD : chr "FUNNEL CLOUDS"
## $ WATERSPOUT : chr [1:3] "WATERSPOUTS" "WATER SPOUT" "WAYTERSPOUT"
## $ STRONG WIND : chr "STRONG WINDS"
## $ URBAN SML STREAM FLD : chr "URBAN SML STREAM FLDG"
## $ WILDFIRE : chr "WILDFIRES"
## $ BLIZZARD : chr "GROUND BLIZZARD"
## $ DROUGHT : chr [1:2] "SNOW DROUGHT" "HEAT DROUGHT"
# Inspect last 20
str( tail( dups, 20 ) )
## List of 20
## $ URBAN SML STREAM FLDG : chr "URBAN SMALL STREAM FLOODING"
## $ VERY WARM : chr "RECORD WARM"
## $ VOG : chr "FOG"
## $ VOLCANIC ASH PLUME : chr [1:2] "ICE" "SMOKE"
## $ WALL CLOUD FUNNEL CLOUD: chr "FUNNEL CLOUD"
## $ WARM DRY CONDITIONS : chr "DRY CONDITIONS"
## $ WARM WEATHER : chr [1:3] "DRY WEATHER" "HOT WEATHER" "WET WEATHER"
## $ WATER SPOUT : chr "LANDSPOUT"
## $ WATERSPOUT FUNNEL CLOUD: chr "FUNNEL CLOUD"
## $ WAYTERSPOUT : chr "WATERSPOUT"
## $ WET MICOBURST : chr "WET MICROBURST"
## $ WET SNOW : chr [1:2] "SNOW" "SLEET SNOW"
## $ WET WEATHER : chr "HOT WEATHER"
## $ WILD FOREST FIRES : chr "FOREST FIRES"
## $ WIND AND WAVE : chr "COLD WAVE"
## $ WIND CHILL HIGH WIND : chr "HIGH WIND"
## $ WIND HAIL : chr "HAIL"
## $ WIND STORM : chr "DUST STORM"
## $ WINTER STORM HIGH WIND : chr "WINTER STORM HIGH WINDS"
## $ WND : chr "WIND"
#### Create function to replace event types by most common spelling ####
most_common <- function( dups ){
l <- list()
for ( idx in 1:length( dups ) ){
all_matches <- c( dups[[ idx ]], names( dups[ idx ] ) )
common_df <- evtype_freq[ EVTYPE_CLEAN %in% all_matches ][ order( N, decreasing = TRUE ) ]
most_common <- common_df[ 1, EVTYPE_CLEAN ]
l[[ idx ]] <- data.table( EVTYPE_CLEAN = all_matches, EVTYPE_RPL = most_common )
}
return( unique( rbindlist( l ) ) )
}
# Get event type replacements to homogeneize event type
rep <- most_common( dups );
# Remove events containg some words because of false matches
rep <- rep[ !str_detect( EVTYPE_CLEAN, 'SUMMARY' ) ];
rep <- rep[ !str_detect( EVTYPE_CLEAN, 'WEATHER' ) ];
# Should remove more false positives if time permits
# Remove rows if it contains the word summary
# Inspect random rows of replacement data.table
set.seed( 943 )
rep[ sample.int( nrow( rep ), 30 ) ]
## EVTYPE_CLEAN EVTYPE_RPL
## 1: LOW TEMPERATURE RECORD LOW TEMPERATURE
## 2: FREEZING RAIN SNOW FREEZING RAIN SNOW
## 3: ICY ROADS ICY ROADS
## 4: LIGHTNING AND HEAVY RAIN HEAVY RAIN
## 5: SLEET SNOW HEAT
## 6: LACK OF SNOW SNOW
## 7: COLD AIR TORNADO TORNADO
## 8: HEAVY RAIN URBAN FLOOD HEAVY RAIN URBAN FLOOD
## 9: VOG SUMMARY NOV
## 10: HEAVY SNOW AND ICE STORM HEAVY SNOW AND ICE STORM
## 11: TORNADOES TORNADOES
## 12: HAIL WINDS HIGH WINDS
## 13: HEAVY SNOW AND HIGH WINDS HEAVY SNOW AND HIGH WINDS
## 14: MILD DRY PATTERN MILD DRY PATTERN
## 15: MILD PATTERN MILD DRY PATTERN
## 16: HIGH WINDS AND WIND CHILL WIND CHILL
## 17: RIP CURRENTS HEAVY SURF RIP CURRENTS
## 18: EXTREME HEAT HEAT
## 19: BLOW OUT TIDE ASTRONOMICAL LOW TIDE
## 20: ICE FOG FOG
## 21: UNSEASONABLY COOL UNSEASONABLY COOL
## 22: FLOODS FLOODS
## 23: COLD WIND CHILL TEMPERATURES EXTREME WINDCHILL TEMPERATURES
## 24: SEVERE COLD EXTREME COLD
## 25: ICE AND SNOW SNOW
## 26: RECORD SNOW RECORD SNOW
## 27: DRY FLOOD
## 28: HURRICANE EMILY DRY
## 29: WINTER STORM WINTER STORM
## 30: FLOOD FLASHFLOOD FLOOD FLASH
## EVTYPE_CLEAN EVTYPE_RPL
#### Add replacements for event types to data ####
idx_match <- match( df$EVTYPE_CLEAN, rep$EVTYPE_CLEAN );
df[ , EVTYPE_RPL := rep$EVTYPE_RPL[ idx_match ] ];
df[ is.na( EVTYPE_RPL ) , EVTYPE_RPL := EVTYPE_CLEAN ];
which types of events are most harmful with respect to population health?
which types of events have the greatest economic consequences?
#### Create columns for analysis ####
df[ , dead_injured := FATALITIES + INJURIES ];
# Convert damage to billions of dollar amounts
df[ str_detect( PROPDMGEXP, 'K' ), PROPDMG := PROPDMG*1000 ];
df[ str_detect( PROPDMGEXP, 'M' ), PROPDMG := PROPDMG*1000000 ];
df[ str_detect( PROPDMGEXP, 'B' ), PROPDMG := PROPDMG*1000000000 ];
df[ str_detect( CROPDMGEXP, 'K' ), CROPDMG := CROPDMG*1000 ];
df[ str_detect( CROPDMGEXP, 'M' ), CROPDMG := CROPDMG*1000000 ];
df[ str_detect( CROPDMGEXP, 'B' ), CROPDMG := CROPDMG*1000000000 ];
df[ , PROPDMGinBillion := PROPDMG/1000000000 ];
df[ , CROPDMGinBillion := CROPDMG/1000000000 ];
df[ , totaldmg := PROPDMGinBillion + CROPDMGinBillion ]
q <- quote( list( sum_dmg = sum( totaldmg, na.rm = TRUE ),
sum_health = sum( dead_injured, na.rm = TRUE )
)
)
dt <- df[ , eval( q ), by = EVTYPE_RPL ];
# Inspect
dt
## EVTYPE_RPL sum_dmg sum_health
## 1: TORNADO 57.3918 97025
## 2: THUNDERSTORM WIND 12.6202 10257
## 3: HAIL 19.0161 1431
## 4: FREEZING RAIN 0.0112 31
## 5: LAKE EFFECT SNOW 0.0550 36
## ---
## 249: SNOW ICE 0.0000 2
## 250: HEAT DROUGHT 0.0000 0
## 251: ASTRONOMICAL HIGH TIDE 0.0094 0
## 252: EXTREME COLD WIND CHILL 0.0113 256
## 253: SMOKE 0.0001 0
#### Create bar plots ####
# top 15 events that has killed or injured the most people
topMost <- 15
health <- dt[ order( sum_health, decreasing = TRUE ) ][ 1:topMost ];
health
## EVTYPE_RPL sum_dmg sum_health
## 1: TORNADO 5.7e+01 97025
## 2: THUNDERSTORM WIND 1.3e+01 10257
## 3: FLOOD 1.8e+02 10218
## 4: EXCESSIVE HEAT 5.0e-01 8447
## 5: LIGHTNING 9.5e-01 6048
## 6: HEAT 4.3e-01 4238
## 7: ICE STORM 9.0e+00 2064
## 8: HIGH WIND 6.6e+00 1725
## 9: WINTER STORM 6.7e+00 1554
## 10: HAIL 1.9e+01 1431
## 11: HURRICANE TYPHOON 7.3e+01 1344
## 12: HEAVY SNOW 1.1e+00 1158
## 13: RIP CURRENT 1.6e-04 1101
## 14: WILDFIRE 5.2e+00 986
## 15: BLIZZARD 7.7e-01 906
# top 15 events that has cost the most over time
econ <- dt[ order( sum_dmg, decreasing = TRUE ) ][ 1:topMost ];
econ
## EVTYPE_RPL sum_dmg sum_health
## 1: FLOOD 179.9 10218
## 2: HURRICANE TYPHOON 72.5 1344
## 3: TORNADO 57.4 97025
## 4: STORM SURGE 48.0 67
## 5: HAIL 19.0 1431
## 6: HURRICANE 18.2 116
## 7: DROUGHT 15.0 23
## 8: THUNDERSTORM WIND 12.6 10257
## 9: ICE STORM 9.0 2064
## 10: TROPICAL STORM 8.4 398
## 11: WINTER STORM 6.7 1554
## 12: HIGH WIND 6.6 1725
## 13: WILDFIRE 5.2 986
## 14: WILD FOREST FIRE 3.1 557
## 15: HEAVY RAIN SEVERE WEATHER 2.5 0
# Show most damaging for health
library( ggplot2 )
h <- ggplot( health, aes( x = EVTYPE_RPL, y = sum_health ) )
h <- h + geom_bar( stat = "identity" ) + coord_flip( )
h <- h + xlab( '' ) + ylab( 'Number of people injured or dead' )
h <- h + ggtitle( 'The Top 15 Most Lethal Weather Events' )
print( h )
# Show most damaging to economy
e <- ggplot( econ, aes( x = EVTYPE_RPL, y = sum_dmg ) )
e <- e + geom_bar( stat = "identity" ) + coord_flip( )
e <- e + xlab( '' ) + ylab( 'Billion of Dollars ($)' )
e <- e + ggtitle( 'The Top 15 Most Costly Weather Events' )
print( e )