Synopsis

By far the most harmful weather, as judged by the number of people dead or injured in its wake, are tornadoes. It has killed or injured roughly 9 times more people than the second most harmful weather: Thunderstorm winds. Tornadoes’ victims number roughly 95000 people, thunderstom winds a little over 10000. Tornadoes are a tier unto itself. The second tier consists of: thunderstom winds, floods, excessive heat (heat) and lightning. This second tier has about the same number of victims.

In terms of economic – both property and crop – damage floods are by far the most costly. At roughly 180 billions in damage, floods are two or three the cost of the next most damaging extreme weather, namely Hurricanes (Typhoons). Hurricanes (typhoons) are part of a second tier that also include tornadoes and storm surges.

Data Processing

The most important task of this project is to clean the column for the types of extreme weather events. The names are not standardized: They contain typos, errors, long/short descriptions, etc. Here, I attempt to homogenize this column as much as possible. I use a string distance measure to compare and fix the potential errors. But this method is not foolproof either: it sometimes produces false positives. That’s the price I pay to work with more consistent types. This is the part of the project that requires the most care and time. Preferably a domain expert should catalog which of these events belong together or not.

options(digits=2)

#### Change working directory ####
proj_dir <- "C:/Users/Vathy/Documents/scripts_r/Coursera/RepData_PeerAssessment2/"
setwd( proj_dir )

#### Load dataset ####
df <- read.csv( 'repdata_data_StormData.csv.bz2', header = TRUE, stringsAsFactors = FALSE )

# Convert to data.table
library( data.table )
assign( "depthtrigger", 4, data.table:::.global ) # To stop printing of data.tables
setDT( df )

# Keep only subset of the dataset we need for analysis
df <- df[ , list(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) ]

#### Clean event type ####
library( stringr )
# Uppercase event type
df[ , EVTYPE := toupper( EVTYPE ) ];
# Remove all punctuation by whitespace
df[ , EVTYPE := str_replace_all( EVTYPE, '[[:punct:]]', ' ' ) ];
# Strip all whitespaces
df[ , EVTYPE := str_trim( EVTYPE ) ];
# Replace TSTM with THUNDERSTORM
df[ str_detect( EVTYPE, 'TSTM' ), 
    EVTYPE := str_replace_all( EVTYPE, 'TSTM', 'THUNDERSTORM' ) ];
# Create new column for cleaning event type
df[ , EVTYPE_CLEAN := EVTYPE ];
# Remove all numbers from event type
df[ , EVTYPE_CLEAN := str_replace_all( EVTYPE_CLEAN, '[[:digit:]]', '' ) ];
df[ , EVTYPE_CLEAN := str_trim( EVTYPE_CLEAN ) ];
df[ , EVTYPE_CLEAN := str_replace_all( EVTYPE_CLEAN, ' +', ' ' ) ];

# Create dataframe of most common event type
evtype_freq <- df[ , as.data.table( table( EVTYPE_CLEAN ) ) ];
# Order by count
evtype_freq <- as.data.table( evtype_freq );
evtype_freq <- evtype_freq[ order( N, decreasing = TRUE ) ];
# Inspect frequency dataset
evtype_freq

##                EVTYPE_CLEAN      N
##   1:      THUNDERSTORM WIND 302532
##   2:                   HAIL 288761
##   3:                TORNADO  60652
##   4:            FLASH FLOOD  54279
##   5:                  FLOOD  25327
##  ---                              
## 703:   WIND CHILL HIGH WIND      1
## 704:              WIND HAIL      1
## 705:             WIND STORM      1
## 706: WINTER STORM HIGH WIND      1
## 707:                    WND      1

# Extract unique event type
evtype <- evtype_freq[ , EVTYPE_CLEAN ];

#### Manually clean dataset based on common event type ####
evtype_freq[1:20] # Top 20

##                 EVTYPE_CLEAN      N
##  1:        THUNDERSTORM WIND 302532
##  2:                     HAIL 288761
##  3:                  TORNADO  60652
##  4:              FLASH FLOOD  54279
##  5:                    FLOOD  25327
##  6:       THUNDERSTORM WINDS  20868
##  7:                HIGH WIND  20217
##  8:                LIGHTNING  15756
##  9:               HEAVY SNOW  15708
## 10: MARINE THUNDERSTORM WIND  11987
## 11:               HEAVY RAIN  11742
## 12:             WINTER STORM  11433
## 13:           WINTER WEATHER   7045
## 14:             FUNNEL CLOUD   6845
## 15:               WATERSPOUT   3808
## 16:              STRONG WIND   3569
## 17:     URBAN SML STREAM FLD   3392
## 18:                 WILDFIRE   2761
## 19:                 BLIZZARD   2719
## 20:                  DROUGHT   2488

df[ str_detect( EVTYPE_CLEAN, 'THUNDERSTORM WIND' ),
    EVTYPE_CLEAN := 'THUNDERSTORM WIND' ]
df[ str_detect( EVTYPE_CLEAN, 'FLOOD' ),
    EVTYPE_CLEAN := 'FLOOD' ]

# Create function to extract fuzzy matches in a list
# based on string distances
library( stringdist )
fuzzymatch <- function( str, threshold = 3.0, weight = c( d = 0.05, i = 1, s = 1, t = 1 ) ){
  # Create empty list for duplicates
  dups <- list()
  for ( idx in 1:length( str ) ){
    scores <- stringdist( str[idx], str[-idx], method = "dl", weight = weight )
    if ( min( scores ) > threshold ) next
    idx_found <- which( scores == min( scores ) )
    # Find best possible matches
    fuzzymatches <- str[-idx][ idx_found ]
    dups[[ str[idx] ]] <- fuzzymatches
  }
  return( dups )
}

# Create list of potential duplicates
dups <- fuzzymatch( evtype, threshold = 2.0 )
# Inspect first 20
str( head( dups, 20 ) )

## List of 20
##  $ THUNDERSTORM WIND       : chr [1:2] "THUNDERSTORM WINDS" "THUNDERSTORMS WIND"
##  $ HAIL                    : chr [1:4] "HAIL WIND" "HAILSTORM" "DEEP HAIL" "WIND HAIL"
##  $ TORNADO                 : chr "TORNADOS"
##  $ FLASH FLOOD             : chr "FLASH FLOODS"
##  $ FLOOD                   : chr "FLOODS"
##  $ THUNDERSTORM WINDS      : chr [1:6] "THUNDERSTORM WINDSS" "THUNDERSTORMS WINDS" "THUNDERSTORMW WINDS" "THUNDEERSTORM WINDS" ...
##  $ HIGH WIND               : chr "HIGH WINDS"
##  $ LIGHTNING               : chr "LIGHTNING FIRE"
##  $ HEAVY SNOW              : chr [1:4] "HEAVY SNOW ICE" "HEAVY SNOW AND" "HEAVY SNOWPACK" "HEAVY WET SNOW"
##  $ MARINE THUNDERSTORM WIND: chr "THUNDERSTORM WIND"
##  $ HEAVY RAIN              : chr "HEAVY RAINS"
##  $ WINTER STORM            : chr "WINTER STORMS"
##  $ WINTER WEATHER          : chr "WINTER WEATHER MIX"
##  $ FUNNEL CLOUD            : chr "FUNNEL CLOUDS"
##  $ WATERSPOUT              : chr [1:3] "WATERSPOUTS" "WATER SPOUT" "WAYTERSPOUT"
##  $ STRONG WIND             : chr "STRONG WINDS"
##  $ URBAN SML STREAM FLD    : chr "URBAN SML STREAM FLDG"
##  $ WILDFIRE                : chr "WILDFIRES"
##  $ BLIZZARD                : chr "GROUND BLIZZARD"
##  $ DROUGHT                 : chr [1:2] "SNOW DROUGHT" "HEAT DROUGHT"

# Inspect last 20
str( tail( dups, 20 ) )

## List of 20
##  $ URBAN SML STREAM FLDG  : chr "URBAN SMALL STREAM FLOODING"
##  $ VERY WARM              : chr "RECORD WARM"
##  $ VOG                    : chr "FOG"
##  $ VOLCANIC ASH PLUME     : chr [1:2] "ICE" "SMOKE"
##  $ WALL CLOUD FUNNEL CLOUD: chr "FUNNEL CLOUD"
##  $ WARM DRY CONDITIONS    : chr "DRY CONDITIONS"
##  $ WARM WEATHER           : chr [1:3] "DRY WEATHER" "HOT WEATHER" "WET WEATHER"
##  $ WATER SPOUT            : chr "LANDSPOUT"
##  $ WATERSPOUT FUNNEL CLOUD: chr "FUNNEL CLOUD"
##  $ WAYTERSPOUT            : chr "WATERSPOUT"
##  $ WET MICOBURST          : chr "WET MICROBURST"
##  $ WET SNOW               : chr [1:2] "SNOW" "SLEET SNOW"
##  $ WET WEATHER            : chr "HOT WEATHER"
##  $ WILD FOREST FIRES      : chr "FOREST FIRES"
##  $ WIND AND WAVE          : chr "COLD WAVE"
##  $ WIND CHILL HIGH WIND   : chr "HIGH WIND"
##  $ WIND HAIL              : chr "HAIL"
##  $ WIND STORM             : chr "DUST STORM"
##  $ WINTER STORM HIGH WIND : chr "WINTER STORM HIGH WINDS"
##  $ WND                    : chr "WIND"

#### Create function to replace event types by most common spelling ####
most_common <- function( dups ){
  l <- list()
  for ( idx in 1:length( dups ) ){
    all_matches <- c( dups[[ idx ]], names( dups[ idx ] ) )
    common_df <- evtype_freq[ EVTYPE_CLEAN %in% all_matches ][ order( N, decreasing = TRUE ) ]
    most_common <- common_df[ 1, EVTYPE_CLEAN ] 
    l[[ idx ]] <- data.table( EVTYPE_CLEAN = all_matches, EVTYPE_RPL = most_common )
  }
  return( unique( rbindlist( l ) ) )
}

# Get event type replacements to homogeneize event type
rep <- most_common( dups );

# Remove events containg some words because of false matches
rep <- rep[ !str_detect( EVTYPE_CLEAN, 'SUMMARY' ) ];
rep <- rep[ !str_detect( EVTYPE_CLEAN, 'WEATHER' ) ];
# Should remove more false positives if time permits

# Remove rows if it contains the word summary
# Inspect random rows of replacement data.table
set.seed( 943 )
rep[ sample.int( nrow( rep ), 30 ) ]

##                     EVTYPE_CLEAN                     EVTYPE_RPL
##  1:       LOW TEMPERATURE RECORD                LOW TEMPERATURE
##  2:           FREEZING RAIN SNOW             FREEZING RAIN SNOW
##  3:                    ICY ROADS                      ICY ROADS
##  4:     LIGHTNING AND HEAVY RAIN                     HEAVY RAIN
##  5:                   SLEET SNOW                           HEAT
##  6:                 LACK OF SNOW                           SNOW
##  7:             COLD AIR TORNADO                        TORNADO
##  8:       HEAVY RAIN URBAN FLOOD         HEAVY RAIN URBAN FLOOD
##  9:                          VOG                    SUMMARY NOV
## 10:     HEAVY SNOW AND ICE STORM       HEAVY SNOW AND ICE STORM
## 11:                    TORNADOES                      TORNADOES
## 12:                   HAIL WINDS                     HIGH WINDS
## 13:    HEAVY SNOW AND HIGH WINDS      HEAVY SNOW AND HIGH WINDS
## 14:             MILD DRY PATTERN               MILD DRY PATTERN
## 15:                 MILD PATTERN               MILD DRY PATTERN
## 16:    HIGH WINDS AND WIND CHILL                     WIND CHILL
## 17:      RIP CURRENTS HEAVY SURF                   RIP CURRENTS
## 18:                 EXTREME HEAT                           HEAT
## 19:                BLOW OUT TIDE          ASTRONOMICAL LOW TIDE
## 20:                      ICE FOG                            FOG
## 21:            UNSEASONABLY COOL              UNSEASONABLY COOL
## 22:                       FLOODS                         FLOODS
## 23: COLD WIND CHILL TEMPERATURES EXTREME WINDCHILL TEMPERATURES
## 24:                  SEVERE COLD                   EXTREME COLD
## 25:                 ICE AND SNOW                           SNOW
## 26:                  RECORD SNOW                    RECORD SNOW
## 27:                          DRY                          FLOOD
## 28:              HURRICANE EMILY                            DRY
## 29:                 WINTER STORM                   WINTER STORM
## 30:             FLOOD FLASHFLOOD                    FLOOD FLASH
##                     EVTYPE_CLEAN                     EVTYPE_RPL

#### Add replacements for event types to data ####
idx_match <- match( df$EVTYPE_CLEAN, rep$EVTYPE_CLEAN );
df[ , EVTYPE_RPL := rep$EVTYPE_RPL[ idx_match ] ];
df[ is.na( EVTYPE_RPL ) , EVTYPE_RPL := EVTYPE_CLEAN ];

Results

which types of events are most harmful with respect to population health?

which types of events have the greatest economic consequences?

#### Create columns for analysis ####
df[ , dead_injured := FATALITIES + INJURIES ];
# Convert damage to billions of dollar amounts
df[ str_detect( PROPDMGEXP, 'K' ), PROPDMG := PROPDMG*1000 ];
df[ str_detect( PROPDMGEXP, 'M' ), PROPDMG := PROPDMG*1000000 ];
df[ str_detect( PROPDMGEXP, 'B' ), PROPDMG := PROPDMG*1000000000 ];
df[ str_detect( CROPDMGEXP, 'K' ), CROPDMG := CROPDMG*1000 ];
df[ str_detect( CROPDMGEXP, 'M' ), CROPDMG := CROPDMG*1000000 ];
df[ str_detect( CROPDMGEXP, 'B' ), CROPDMG := CROPDMG*1000000000 ];
df[ , PROPDMGinBillion := PROPDMG/1000000000 ];
df[ , CROPDMGinBillion := CROPDMG/1000000000 ];
df[ , totaldmg := PROPDMGinBillion + CROPDMGinBillion ]
q <- quote( list( sum_dmg = sum( totaldmg, na.rm = TRUE ),
                  sum_health = sum( dead_injured, na.rm = TRUE )
                  ) 
            )
dt <- df[ , eval( q ), by = EVTYPE_RPL ];
# Inspect
dt

##                   EVTYPE_RPL sum_dmg sum_health
##   1:                 TORNADO 57.3918      97025
##   2:       THUNDERSTORM WIND 12.6202      10257
##   3:                    HAIL 19.0161       1431
##   4:           FREEZING RAIN  0.0112         31
##   5:        LAKE EFFECT SNOW  0.0550         36
##  ---                                           
## 249:                SNOW ICE  0.0000          2
## 250:            HEAT DROUGHT  0.0000          0
## 251:  ASTRONOMICAL HIGH TIDE  0.0094          0
## 252: EXTREME COLD WIND CHILL  0.0113        256
## 253:                   SMOKE  0.0001          0

#### Create bar plots ####

# top 15 events that has killed or injured the most people
topMost <- 15
health <- dt[ order( sum_health, decreasing = TRUE ) ][ 1:topMost ];
health

##            EVTYPE_RPL sum_dmg sum_health
##  1:           TORNADO 5.7e+01      97025
##  2: THUNDERSTORM WIND 1.3e+01      10257
##  3:             FLOOD 1.8e+02      10218
##  4:    EXCESSIVE HEAT 5.0e-01       8447
##  5:         LIGHTNING 9.5e-01       6048
##  6:              HEAT 4.3e-01       4238
##  7:         ICE STORM 9.0e+00       2064
##  8:         HIGH WIND 6.6e+00       1725
##  9:      WINTER STORM 6.7e+00       1554
## 10:              HAIL 1.9e+01       1431
## 11: HURRICANE TYPHOON 7.3e+01       1344
## 12:        HEAVY SNOW 1.1e+00       1158
## 13:       RIP CURRENT 1.6e-04       1101
## 14:          WILDFIRE 5.2e+00        986
## 15:          BLIZZARD 7.7e-01        906

# top 15 events that has cost the most over time
econ <- dt[ order( sum_dmg, decreasing = TRUE ) ][ 1:topMost ];
econ

##                    EVTYPE_RPL sum_dmg sum_health
##  1:                     FLOOD   179.9      10218
##  2:         HURRICANE TYPHOON    72.5       1344
##  3:                   TORNADO    57.4      97025
##  4:               STORM SURGE    48.0         67
##  5:                      HAIL    19.0       1431
##  6:                 HURRICANE    18.2        116
##  7:                   DROUGHT    15.0         23
##  8:         THUNDERSTORM WIND    12.6      10257
##  9:                 ICE STORM     9.0       2064
## 10:            TROPICAL STORM     8.4        398
## 11:              WINTER STORM     6.7       1554
## 12:                 HIGH WIND     6.6       1725
## 13:                  WILDFIRE     5.2        986
## 14:          WILD FOREST FIRE     3.1        557
## 15: HEAVY RAIN SEVERE WEATHER     2.5          0

# Show most damaging for health
library( ggplot2 )
h <- ggplot( health, aes( x = EVTYPE_RPL, y = sum_health ) )
h <- h + geom_bar( stat = "identity" ) + coord_flip( )
h <- h + xlab( '' ) + ylab( 'Number of people injured or dead' )
h <- h + ggtitle( 'The Top 15 Most Lethal Weather Events' )
print( h )

# Show most damaging to economy
e <- ggplot( econ, aes( x = EVTYPE_RPL, y = sum_dmg ) )
e <- e + geom_bar( stat = "identity" ) + coord_flip( )
e <- e + xlab( '' ) + ylab( 'Billion of Dollars ($)' )
e <- e + ggtitle( 'The Top 15 Most Costly Weather Events' )
print( e )

Severe weather events and their impacts

Synopsis

Data Processing

Results