Top Weather events affecting the United States

Synopsis

Weather events have great economic and health consequences to the United States population and economy. With data submited to the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database, we are able to identify the top type of weather events that affect the United States.

Data Processing

The U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm data consists of events submited from 1950 to 2011. The data set is available in a coma delimited (.csv) file, which we will download form Amazon’s cloudfront.net.

Obtaining the storm data

We downloaded and read the sample Storm Data into R, ensuring all column names are R friendly. Because of the size of the set, we subset only the columns relevant to this analysis.

library(knitr)

# get data file
if (!file.exists("repdata_data_StormData.csv.bz2")) {
    download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2")
}

# read the Storm Data compressed bzfile
storm.data <- read.csv(bzfile("repdata_data_StormData.csv.bz2"))

# remove all fields irrelevant to the analysis
storm.data <- storm.data[, c("BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES", 
    "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP", "REFNUM")]

# ensure colmn names are R friendly
names(storm.data) <- make.names(names(storm.data), unique = TRUE)

The columns relevant to this analysis consist of the BGN_DATE, EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP, REFNUM columns.

Standardizing the values submited in the Event Types column

The event types (EVTYPE) values were entered manually by the record submiters. This approach created inconsistencies across a large percent of records. To address this issue, we cleaned the values by standardizing to a set of general events.

Events that spanned more than a month and were entered as one record (as summary) were omitted because separating costs would be too inacurate.

To preserve the original values we created a copy of the EVTYPE column named EVTYPE.COPY. This new column (EVTYPE.COPY) is where we will perform all cleaning and perform the analysis.

library(stringr)

storm.data$EVTYPE.COPY <- storm.data$EVTYPE  # make working copy of EVTYPE
storm.data$EVTYPE.COPY <- str_trim(storm.data$EVTYPE.COPY)  # trim the values
storm.data$EVTYPE.COPY <- toupper(storm.data$EVTYPE.COPY)  # convert values to upper case

gsub.EVTYPE.COPY <- function( replace.with , patern.list ){
    for(i.patern in patern.list){
        storm.data[grep(i.patern, storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- 
            gsub(i.patern , replace.with ,
                 storm.data[grep(i.patern,storm.data$EVTYPE.COPY),]$EVTYPE.COPY 
                 )
    }
    assign('storm.data',storm.data,envir=.GlobalEnv)
}

# function to fix special characters in EVTYPE.COPY ####
fix.special.characters <- function(){

    gsub.EVTYPE.COPY("",c("[(|)|.|:|&|0-9|?]","MPH","AGRICULTURAL "," AND$"," DAMAGE","EROSION","^NONE$","INJURY","MAJOR","MINOR","RECORD","RURAL","SEVERE ","UNSEASONAL","SPELL","(NO WEATHER)","UNSEASONAL","UNSEASONABLE","UNSEASONABLY","STREET","URBAN","EROSIN","DENSE","UNUSUALLY","(UNUSUAL)","DEPRESSION","TREE[S]","^/","/$"," .$"," -$"))
    storm.data$EVTYPE.COPY <- gsub(" {2,}", " ", storm.data$EVTYPE.COPY)
    gsub.EVTYPE.COPY("/",c(" AND ","-",","," /","/ ","\\\\","- "))
    storm.data$EVTYPE.COPY <- gsub(" {2,}", " ", storm.data$EVTYPE.COPY)
    storm.data$EVTYPE.COPY <- str_trim(storm.data$EVTYPE.COPY)

    assign('storm.data',storm.data,envir=.GlobalEnv)
}
fix.special.characters()

storm.data[grep("^SUMMARY ",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- ""  # delete all summary records

# STANDARDIZE SPELLING OF EVTYPE.COPY ####
storm.data$EVTYPE.COPY <- sub("AVALANCE","AVALANCHE", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("CURRENTS", "CURRENT", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("DEVEL", "DEVIL", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("DUSTSTORM", "DUST STORM", storm.data$EVTYPE.COPY)
gsub.EVTYPE.COPY("FLOOD",c("FLOODS","FLOODING","FLOODIN","FLD","FLDG"))
storm.data$EVTYPE.COPY <- sub("FLOWS", "FLOW", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("LANDSLIDES|LANDSLUMP", "LANDSLIDE", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("LIGHTING", "LIGHTNING", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("RAINS", "RAIN", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("ROADS", "ROAD", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("SNOWFALL", "SNOW", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("SQUALLS", "SQUALL", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("STORMS", "STORM", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("TEMPERATURES", "TEMPERATURE", storm.data$EVTYPE.COPY)
gsub.EVTYPE.COPY("TORNADO",c("TORNDAO","TORNADOES","TORNADOS","TORNDAO"))
gsub.EVTYPE.COPY("THUNDERSTORM", c("THUNDERSTORMS","THUNDERSNOW","THUNDERSNOW","THUNDERSTORMW","THUNDERSTROM","THUNDERSTROM","TUNDERSTORM","TSTMW","TSTM","THUNDESTORM","THUNERSTORM","THUNDERTSORM","THUNDEERSTORM","THUNDERESTORM","THUDERSTORM","THUNDERTORM"))
storm.data$EVTYPE.COPY <- sub("TIDES", "TIDE", storm.data$EVTYPE.COPY)
gsub.EVTYPE.COPY("WILDFIRE",c("WILDFIRES","WILD FIRES","FIRES"))
gsub.EVTYPE.COPY("WIND",c("WINDS","WINS","WINDD","WND","W INDS","WIND G"))
gsub.EVTYPE.COPY("WINTER",c("WINTRY","WINTERY"))
storm.data$EVTYPE.COPY <- sub("FUNNEL CLOUD","TORNADO", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("VOG","FOG", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("WAVES","WAVE", storm.data$EVTYPE.COPY)

fix.special.characters()

# STANDARDIZE EVTYPE.COPY ####
storm.data[grep("(TORNADO)|(FUNNEL)|NADO|ROTATING|LANDSPOUT",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "TORNADO"
storm.data[grep("(HURRIC)|(TYPHO)|(FLOYD)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HURRICANE"
storm.data[grep("BLIZZARD",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "BLIZZARD"
storm.data[grep("THUNDERS|(METRO STORM)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "THUNDERSTORM_WIND"
storm.data[grep("TROPICAL",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "TROPICAL_STORM"
storm.data[grep("(WAYTERSPOUT)|(WATER SPOU)|(WATERSPOU)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "WATERSPOUT"
storm.data[grep("FLOOD|DAM|SURGE|(FLASH FLOO)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "FLOOD"
storm.data[grep("HEAVY RAIN|RAIN|SHOWER|WET|PRECIPITATION|PRECIP",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HEAVY_RAIN"
storm.data[grep("SNOW",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HEAVY_SNOW"
storm.data[grep("HAIL|ICE PELLETS",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HAIL"
storm.data[grep("MICROBURST",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "DOWNBURST"
storm.data[grep("DRY|LOW RAIN|DRIEST",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "DROUGHT"
storm.data[grep("HEAT|HOT|WARM|HIGH TEMP",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HEAT"
storm.data[grep("COLD|LOW TEMP|HYPOTHERMIA|COOL|HYPERTHERMIA|(EXTREME WINDCHILL TEMPERATURE)",
                storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "COLD"
storm.data[grep("FROST|FREEZE|FREEZ",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "FROST"
storm.data[grep("(MUD)|(DEBR)|(LANDSLIDE)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "DEBRIS_FLOW"
storm.data[grep("VOLCAN",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "VOLCANIC_ERUPTION"
storm.data[grep("FIRE",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "WILDFIRE"
storm.data[grep("WINTER WEAT|WINTER MIX",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "WINTER_WEATHER"
storm.data[grep("WINTER STORM",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "WINTER_STORM"
storm.data[grep("(WIND CHILL)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "COLD_WIND_CHILL"
storm.data[grep("SURF|SEA|SWELL|(HIGH TIDE)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HIGH_SURF"
storm.data[grep("FOG",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "FOG"
storm.data[grep("SLEET|ROAD|BLACK ICE|PATCHY ICE",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "SLEET"
storm.data[grep("COASTALSTORM|COASTAL",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "COASTAL_STORM"
storm.data[grep("LIGHTNING",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "LIGHTNING"
storm.data[grep("ICE FLOES|ICE STORM|(ICE/STRONG WIND)|(GLAZE ICE)", storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "ICE_STORM"
storm.data[grep("(WIND STORM)|^WIND$|(WIND ADVISORY)|(WIND ADVISORY)|(HIGH WIND)|(GUSTY WIND)|STRONG WIND|(STORM FORCE WIND)", storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HIGH_WIND"

storm.data$EVTYPE.COPY <- gsub("_"," ", storm.data$EVTYPE.COPY)  # remove underscore
storm.data <- storm.data[storm.data$EVTYPE.COPY!='',]  # remove records without EVTYPE.COPY value

In addition to cleaning the event values (in EVTYPE.COPY), we also

  • cast the begining date of the event (BGN_DATE) as a date R class and
  • created a column to hold the year for BGN_DATE, which we will call YEAR.
storm.data$BGN_DATE <- as.Date(storm.data$BGN_DATE,"%m/%d/%Y 0:00:00")  # fix begining date
storm.data$YEAR <- as.POSIXlt(storm.data$BGN_DATE)$year + 1900  # add year column

Because in the earlier years of the database there were fewer events recorded, most likely due to a lack of good records, we will use more recent years in the analysis. Records from 1995 to 2011 will be used to determine events with the greatest effect on population health and economic cost.

events.by.year <- aggregate(REFNUM ~ YEAR , data=storm.data, FUN=length)  # total values by EVTYPE
plot(events.by.year$YEAR, events.by.year$REFNUM, xlab="Year", ylab="Events", type="l", main="Events Submited by Year", las=1, , cex.axis=0.7)
abline(v="1995", col="red")  # add 1995 as break point for more complete data
Count of events submited by year

Count of events submited by year

Health Effect Analysis

To identify which events are most harmful to the population health, we will use the FATALITIES and INJURIES columns, which are in count of occurences.
We first total annual fatalities and injuries by event type from 1995 to 2011. Then, we will select the top 3 events for fatalities and injuries.

fatality.injuries.total <- aggregate(cbind(FATALITIES,INJURIES) ~ EVTYPE.COPY , data=storm.data[storm.data$YEAR>=2000,], FUN=sum)

Fatalities

The top 3 event types with fatal consequences include

# top event types for fatalities
(top.fatality.events <- head(fatality.injuries.total[order(-fatality.injuries.total$FATALITIES),] ,
                             n=n.top.events)[,c("EVTYPE.COPY","FATALITIES")])  
##    EVTYPE.COPY FATALITIES
## 22        HEAT       1244
## 41     TORNADO       1193
## 15       FLOOD        893

Injuries

The top 3 event types causing injuries to the US pupulation include

# top event types for injuries
(top.injuries.events <- head(fatality.injuries.total[order(-fatality.injuries.total$INJURIES),] , 
                             n=n.top.events)[,c("EVTYPE.COPY","INJURIES")])  
##          EVTYPE.COPY INJURIES
## 41           TORNADO    15213
## 22              HEAT     4933
## 40 THUNDERSTORM WIND     3225

Economic Told Analysis

To determine events across the United States with the greatest economic consequences we looked at Property and Crop damages in the Us.

Property Damage

To determine the events, which produce the most property damages, we used the following columns

  • PROPDMG - Property Damage Cost
  • PROPDMGEXP - Property Damage Explanation, which is a multiplier for Property Damage Cost PROPDMG. The values included for this column consist of B (Billions), M (Millions), K (Thousands), and H (Hundreds). Other values were excluded.
  • PROPDMGEXP.COPY - a cleaned working copy of PROPDMGEXP
  • PROPERTY.DAMAGE.COST - the event’s net cost calculated as PROPDMGEXP.COPY x PROPDMG
library(stringr)
storm.data$PROPDMGEXP <- toupper(storm.data$PROPDMGEXP)  # convert values to upper case
storm.data$PROPDMGEXP <- str_trim(storm.data$PROPDMGEXP)  # trim the values
storm.data$PROPDMGEXP.COPY <- NA
storm.data[grep("[B|M|K|H]",storm.data$PROPDMGEXP),]$PROPDMGEXP.COPY <- storm.data[grep("[B|M|K|H]",storm.data$PROPDMGEXP),]$PROPDMGEXP
storm.data[grep("[B]",storm.data$PROPDMGEXP.COPY),]$PROPDMGEXP.COPY <- 1000000000  # Billion
storm.data[grep("[M]",storm.data$PROPDMGEXP.COPY),]$PROPDMGEXP.COPY <- 1000000  # Million
storm.data[grep("[K]",storm.data$PROPDMGEXP.COPY),]$PROPDMGEXP.COPY <- 1000  # Thousand
storm.data[grep("[H]",storm.data$PROPDMGEXP.COPY),]$PROPDMGEXP.COPY <- 100  # Hundred
storm.data[is.na(storm.data$PROPDMGEXP.COPY),]$PROPDMGEXP.COPY <- 1
storm.data$PROPERTY.DAMAGE.COST <- as.numeric(storm.data$PROPDMGEXP.COPY) * storm.data$PROPDMG / 1000000  # in $1,000,000

The top 3 event types contributing to property damages include

property.total.cost <- aggregate(PROPERTY.DAMAGE.COST ~ EVTYPE.COPY, data = storm.data[storm.data$YEAR >= 
    1995, ], FUN = sum)  # total values by EVTYPE
(top.property.damage.events <- head(property.total.cost[order(-property.total.cost$PROPERTY.DAMAGE.COST), 
    ], n = n.top.events)[, c("EVTYPE.COPY", "PROPERTY.DAMAGE.COST")])  # top event types for injuries
##    EVTYPE.COPY PROPERTY.DAMAGE.COST
## 20       FLOOD               208054
## 36   HURRICANE                85250
## 63     TORNADO                24931

Crop Damage

To determine the events, which produce the most crop damages, we used the following columns

  • CROPDMG - Crop Damage Cost
  • CROPDMGEXP - Crop Damage Explanation, which is a multiplier for Crop Damage Cost CROPDMG. The values included for this column consist of B (Billions), M (Millions), K (Thousands), and H (Hundreds). Other values were excluded.
  • CROPDMGEXP.COPY - a cleaned working copy of the CROPDMGEXP
  • CROP.DAMAGE.COST - the event’s net cost calculated as CROPDMGEXP.COPY x CROPDMG
storm.data$CROPDMGEXP <- toupper(storm.data$CROPDMGEXP)  # convert values to upper case
storm.data$CROPDMGEXP <- str_trim(storm.data$CROPDMGEXP)  # trim the values
storm.data$CROPDMGEXP.COPY <- NA
storm.data[grep("[B|M|K|H]",storm.data$CROPDMGEXP),]$CROPDMGEXP.COPY <- storm.data[grep("[B|M|K|H]",storm.data$CROPDMGEXP),]$CROPDMGEXP
storm.data[grep("[B]",storm.data$CROPDMGEXP.COPY),]$CROPDMGEXP.COPY <- 1000000000  # Billion
storm.data[grep("[M]",storm.data$CROPDMGEXP.COPY),]$CROPDMGEXP.COPY <- 1000000  # Million
storm.data[grep("[K]",storm.data$CROPDMGEXP.COPY),]$CROPDMGEXP.COPY <- 1000  # Thousand
## storm.data[grep("[H]",storm.data$CROPDMGEXP.COPY),]$CROPDMGEXP.COPY <- 100  # Hundred
storm.data[is.na(storm.data$CROPDMGEXP.COPY),]$CROPDMGEXP.COPY <- 1
storm.data$CROP.DAMAGE.COST <- as.numeric(storm.data$CROPDMGEXP.COPY) * storm.data$CROPDMG / 1000000  # in $1,000,000

The top 3 event types contributing to crop damages include

crop.total.cost <- aggregate(CROP.DAMAGE.COST ~ EVTYPE.COPY, data = storm.data[storm.data$YEAR >= 
    1995, ], FUN = sum)  # total values by EVTYPE
(top.crop.damage.events <- head(crop.total.cost[order(-crop.total.cost$CROP.DAMAGE.COST), 
    ], n = n.top.events)[, c("EVTYPE.COPY", "CROP.DAMAGE.COST")])  # top event types for injuries
##    EVTYPE.COPY CROP.DAMAGE.COST
## 14     DROUGHT            13922
## 20       FLOOD             7338
## 36   HURRICANE             5516

Results

The top weather events affecting the United States, between 1995 and 2011, were identified as

  • Heat, Tornadoes, and Floods as having caused the most fatalities in the US between 1995 and 2011
  • Tornadoes, Heat, and Thunderstorm Winds as having caused the most injuries in the US between 1995 and 2011
  • Floods, Hurricanes, and Tornadoes as having caused the most property damages in the US between 1995 and 2011
  • Drought, Floods, and Hurricanes as having caused the most crop damages in the US between 1995 and 2011
par(mfrow=c(2,2), mar=c(6,5,2.1,2.1))
barplot(height=top.fatality.events$FATALITIES, names.arg=top.fatality.events$EVTYPE.COPY, 
        las=1, ylab="Fatalities", xlab="Top Weather Events contributing to Fatalities", cex.axis=0.7)
barplot(height=top.injuries.events$INJURIES, names.arg=top.injuries.events$EVTYPE.COPY, 
        las=1, ylab="Injuries", xlab="Top Weather Events contributing to Injuries", cex.axis=0.7)
barplot(height=top.property.damage.events$PROPERTY.DAMAGE.COST, names.arg=top.property.damage.events$EVTYPE.COPY, 
        las=1, ylab="Property Damages ($1M)", xlab="Top Weather Events contributing to Property Damages", cex.axis=0.7)
barplot(height=top.crop.damage.events$CROP.DAMAGE.COST, names.arg=top.crop.damage.events$EVTYPE.COPY, 
        las=1, ylab="Crop Damages ($1M)", xlab="Top Weather Events contributing to Crop Damaages", cex.axis=0.7)
Top Events affecting to population health and economic impact

Top Events affecting to population health and economic impact