Abstract

The research is based on statistics from U.S. National Oceanic and Atmospheric Administration’s (NOAA) database. Data contains statistics of severe weather events at USA territory since 1950 to 2011. The analysis here has an objective to answer two questions:
1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
2. Across the United States, which types of events have the greatest economic consequences?
Brief exploration of data allowed us to make following conclusions:
+ Tornado caused more inguries, fatalities and damage to property then other types of severe weather events.
+ Hail is the main cause of damage to crops.
+ There are only 5 types of weather events caused more than 5% fatalities each: TORNADO, EXCESSIVE HEAT, FLASH FLOOD, HEAT, LIGHTNING.
+ ‘Short list’ of weather events each caused more than 5% damage to property includes 6 positions: TORNADO, FLASH FLOOD, THUNDERSTORM WIND, FLOOD, HAIL, LIGHTNING.

Data Processing

Data was downloaded, processed and analysed in R version 3.2.3 (2015-12-10) via RStudio 0.99.441. List of used packages is shown below.

library('plyr')
library('dplyr')
library('stringr')
library('lubridate')
library('reshape')
library('data.table')
library('ggplot2')

Data set was loaded from URL and imported as data table object.

# load data file
fileURL <- 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
if(!file.exists('./data')) dir.create('./data')
if(!file.exists('./data/repdata_Fdata_FStormData.csv.bz2')) {
    download.file(fileURL, destfile = './data/repdata_Fdata_FStormData.csv.bz2')
}
# read data
if(!exists('df.raw')) {
    df.raw <- data.table(read.csv(bzfile('./data/repdata_Fdata_FStormData.csv.bz2'), 
                                  as.is = T, na.strings = ""))
}

Raw data table has 902297 rows and 37 columns. Variables were identified using Storm Data Documentation. Column names were formatted for better understanding. Before further processing missing values were counted.

# rename columns
nms <- colnames(df.raw)
nms[nms == 'STATE'] <- 'state.name'
nms[nms == 'LONGITUDE_'] <- 'LONGITUDE_E'
nms[nms == 'F'] <- 'f.scale'
nms <- gsub('_', '.', nms)
nms <- tolower(nms)
nms <- gsub('state', '.state.', nms)
nms <- gsub('county', '.county.', nms)
nms <- gsub('type', '.type.', nms)
nms <- gsub('zone', '.zone.', nms)
nms <- gsub('end', '.end.', nms)
nms <- gsub('mag', '.magnitude.', nms)
nms <- gsub('dmg', '.damage.', nms)
nms <- gsub('prop', '.property.', nms)
nms <- gsub('crop', '.crops.', nms)
nms <- gsub('[.]+', '.', nms)
nms <- gsub('[.]+$', '', nms)
nms <- gsub('^[.]+', '', nms)
colnames(df.raw) <- nms

# count NAs
na.counts <- sapply(lapply(df.raw, is.na), sum)
sort(na.counts[na.counts > 0], decreasing = T)
##        county.end.n             f.scale             end.azi 
##              902297              843563              724837 
##    crops.damage.exp          zone.names             bgn.azi 
##              618413              594029              547332 
##          end.locati property.damage.exp          bgn.locati 
##              499225              465934              287743 
##             remarks         state.offic            end.date 
##              287433              248769              243411 
##            end.time                 wfo         county.name 
##              238978              142069                1589 
##            latitude          latitude.e 
##                  47                  40

As we can see, a lot of variables have missing values. Still, columns which contain estimates of fatalities, injuries, property and crops damage have no NA’s. To continue the analysis only important variables were selected. Among all date and time variables only date when event began was selected to calculate year.

# names of variables to select
cols.to.select <- c('state', 'state.name', 'county', 'county.name',
                 'bgn.date', 'ev.type',
                 'f.scale', 'magnitude',  
                 'length', 'width', 'fatalities', 'injuries',
                 'property.damage', 'property.damage.exp', 
                 'crops.damage', 'crops.damage.exp')
# new, 'clean', data file
df.data <- select_(df.raw, .dots = cols.to.select)
rm(nms, cols.to.select, df.raw)
# find year of the event
df.data[, ev.year := year(strptime(bgn.date, format = '%m/%d/%Y %H:%M:%S'))]
# columns with '.exp' postfix contain units of estimated damage
df.data[, property.damage.exp := toupper(property.damage.exp)]
df.data[, crops.damage.exp := toupper(crops.damage.exp)]
# this operation removes minor misspellings in data units
df.data[!(property.damage.exp %in% c('K', 'M', 'B')), property.damage.exp := NA]
df.data[!(crops.damage.exp %in% c('K', 'M', 'B')), crops.damage.exp := NA]
# all damage estimates to thousands USD
df.data[property.damage.exp == 'K', property.damage.exp := '1']
df.data[property.damage.exp == 'M', property.damage.exp := '1000']
df.data[property.damage.exp == 'B', property.damage.exp := '1000000']
df.data[, property.damage.exp := as.numeric(property.damage.exp)]
df.data[crops.damage.exp == 'K', crops.damage.exp := '1']
df.data[crops.damage.exp == 'M', crops.damage.exp := '1000']
df.data[crops.damage.exp == 'B', crops.damage.exp := '1000000']
df.data[, crops.damage.exp := as.numeric(crops.damage.exp)]
df.data[!is.na(crops.damage), ][, crops.damage := crops.damage * crops.damage.exp]
# remove misspelling from event type
df.data[ev.type == 'TSTM WIND', ev.type := 'THUNDERSTORM WIND']

# remove columns with unit names
df.data <- df.data[, which(!grepl("^property.damage.exp$", colnames(df.data))), 
                   with = FALSE]
df.data <- df.data[, which(!grepl("^crops.damage.exp$", colnames(df.data))), 
                   with = FALSE]
# see what's inside df.data
str(df.data)
## Classes 'data.table' and 'data.frame':   902297 obs. of  15 variables:
##  $ state          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ state.name     : chr  "AL" "AL" "AL" "AL" ...
##  $ county         : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ county.name    : chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ bgn.date       : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ ev.type        : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ f.scale        : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ magnitude      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ length         : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ width          : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ fatalities     : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ injuries       : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ property.damage: num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ crops.damage   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ev.year        : int  1950 1950 1951 1951 1951 1951 1951 1952 1952 1952 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Cleaned dataset includes 902297 observations and 15 variables (in brackets corresponding names from raw data file are shown):
* state (STATE__) - A unique number is assigned to the county where event occured by NIST
* state.name (STATE) - the state name where the event occurred
* county (COUNTY) - unique number assigned to the county (by NIST) where event started
* county.name (COUNTYNAME) - County/Parish, Zone or Marine Name assigned to the county FIPS number or NWS Forecast Zone
* bgn.date (BGN_DATE) - date of beginning of the event formatted as ‘m/d/Y 0:00:00’
* ev.year - calculated year of the event
* ev.type (EVTYPE) - name of the event’s type
* f (F) - category of tornado by it’s intensity and area in Fujita scale (0:5)
* magnitude (MAG) - measured extent of the magnitude type; only used for wind speeds and hail size
* length (LENGTH) - length of the event’s path in miles
* width (WIDTH) - width of the event’s path in miles
* fatalities (FATALITIES) - number of fatalities related to the weather event
* injuries (INJURIES) - number of injuries related to the weather event
* property.damage (PROPDMG) - the estimated amount of damage to property incurred by the weather event, thousands USD
* crops.damage (CROPDMG) - the estimated amount of damage to crops incurred by the weather event, thousands USD

Only two variables in cleaned data file have missing values.

# count NA's in clean data
na.counts <- sapply(lapply(df.data, is.na), sum)
sort(na.counts[na.counts > 0], decreasing = T)
##     f.scale county.name 
##      843563        1589

Results

Two plots were produced in order to understand, what types of events are more dangerous for national health and economy of U.S. First couple of plots show events which cause more fatalities and injuries than other.

# group data by event type
data.by.ev <- group_by(df.data, ev.type)
# select data for plots
df.plot <- summarise(data.by.ev, sum.fatalities = sum(fatalities),
                     sum.injuries = sum(injuries))
df.plot <- df.plot[, pc.sum.f := sum.fatalities / sum(sum.fatalities)]
df.plot <- df.plot[, pc.sum.i := sum.injuries / sum(sum.injuries)]
# FATALITIES
df.plot.1 <- select(df.plot[order(sum.fatalities)], 
                    ev.type, sum.fatalities, pc.sum.f)
# less than 5% fatalities >> 'OTHER'
df.plot.1 <- rbind(list(paste('OTHER*'), 
                        sum(df.plot[pc.sum.f < 0.05, sum.fatalities]),
                        sum(df.plot[pc.sum.f < 0.05, pc.sum.f])),
                   df.plot.1)
df.plot.1 <- df.plot.1[pc.sum.f >= 0.05, ]
# INJURIES
df.plot.2 <- select(df.plot[order(sum.injuries)], 
                    ev.type, sum.injuries, pc.sum.i)
# less than 2% injuries >> 'OTHER'
df.plot.2 <- rbind(list(paste('OTHER*'), 
                        sum(df.plot[pc.sum.i < 0.02, sum.injuries]),
                        sum(df.plot[pc.sum.i < 0.02, pc.sum.i])),
                   df.plot.2)
df.plot.2 <- df.plot.2[pc.sum.i >= 0.02, ]

par(mar = c(3, 2, 3, 2), oma = c(1, 9, 1, 2), mfrow = c(2, 1))
# pane1: fatalities
with(df.plot.1, barplot(sum.fatalities, names.arg = ev.type, horiz = T, las = 1,
                        col = 'tan1', axes = F))
with(df.plot.1, text(sum.fatalities / 2, seq(0.7, 6.7, length = 6),
                     as.character(sum.fatalities), cex = 0.7))
mtext(paste('Tornadoes caused the greatest number of fatalities than other',
            ' \n types of weather events during the period from',
            min(df.data$ev.year), 'to',
            max(df.data$ev.year)), side = 3, line = 1, font = 2, cex = 0.9)
mtext('Total fatalities, USA', side = 1, line = 0)
mtext(paste('*"OTHER" include ', length(df.plot[pc.sum.f < 0.05, sum.fatalities]),
      'types of weather events'), side = 1, line = 1, font = 3, cex = 0.8)
# pane2: injuries
with(df.plot.2, barplot(sum.injuries, names.arg = ev.type, horiz = T, las = 1,
                        col = 'thistle1', axes = F))
with(df.plot.2, text(c(sum.injuries[1] / 2, sum.injuries[2:5] + 7000, sum.injuries[6] / 2), 
                     seq(0.7, 6.7, length = 6),
                     as.character(sum.injuries), cex = 0.7))
mtext(paste('Tornadoes caused the greatest number of injuries than other', 
            ' \n types of weather events during the period from',
            min(df.data$ev.year), 'to',
            max(df.data$ev.year)), side = 3, line = 1, font = 2, cex = 0.9)
mtext('Total injuries, USA', side = 1, line = 0)
mtext(paste('*"OTHER" include ', length(df.plot[pc.sum.i < 0.02, sum.injuries]),
            'types of weather events'), side = 1, line = 1, font = 3, cex = 0.8)

# remove temporary objects
rm(df.plot, df.plot.1, df.plot.2)

Second plot illustrates damage to property and crops.

# select data for plots
df.plot <- summarise(data.by.ev, sum.prop.dmg = sum(property.damage, na.rm = T),
                     sum.crops.dmg = sum(crops.damage, na.rm = T))
df.plot <- df.plot[, sum.prop.dmg := round(sum.prop.dmg / 10^6, 1)]
df.plot <- df.plot[, sum.crops.dmg := round(sum.crops.dmg / 10^6, 1)]
df.plot <- df.plot[, pc.sum.p := sum.prop.dmg / sum(sum.prop.dmg)]
df.plot <- df.plot[, pc.sum.c := sum.crops.dmg / sum(sum.crops.dmg)]
# PROPERTY
df.plot.1 <- select(df.plot[order(sum.prop.dmg)], 
                    ev.type, sum.prop.dmg, pc.sum.p)
# less than 5% fatalities >> 'OTHER'
df.plot.1 <- rbind(list(paste('OTHER*'), 
                        sum(df.plot[pc.sum.p < 0.05, sum.prop.dmg]),
                        sum(df.plot[pc.sum.p < 0.05, pc.sum.p])),
                   df.plot.1)
df.plot.1 <- df.plot.1[pc.sum.p >= 0.05, ]
# CROPS
df.plot.2 <- select(df.plot[order(sum.crops.dmg)], 
                    ev.type, sum.crops.dmg, pc.sum.c)
# less than 2% injuries >> 'OTHER'
df.plot.2 <- rbind(list(paste('OTHER*'), 
                        sum(df.plot[pc.sum.c < 0.02, sum.crops.dmg]),
                        sum(df.plot[pc.sum.c < 0.02, pc.sum.c])),
                   df.plot.2)
df.plot.2 <- df.plot.2[pc.sum.c >= 0.02, ]

par(mar = c(3, 2, 3, 2), oma = c(1, 9, 1, 2), mfrow = c(2, 1))
# pane1: property dmg
with(df.plot.1, barplot(sum.prop.dmg, names.arg = ev.type, horiz = T, las = 1,
                        col = 'yellowgreen', axes = F))
with(df.plot.1, text(sum.prop.dmg / 2, seq(0.7, 8, length = 7),
                     as.character(sum.prop.dmg), cex = 0.7))
mtext(paste('Tornadoes caused more damage to property than other',
            ' \n types of weather events during the period from',
            min(df.data$ev.year), 'to',
            max(df.data$ev.year)), side = 3, line = 1, font = 2, cex = 0.9)
mtext('Total property damage, bln USD', side = 1, line = 0)
mtext(paste('*"OTHER" include ', length(df.plot[pc.sum.p < 0.05, sum.prop.dmg]),
            'types of weather events'), side = 1, line = 1, font = 3, cex = 0.8)
# pane2: crops dmg
with(df.plot.2, barplot(sum.crops.dmg, names.arg = ev.type, horiz = T, las = 1,
                        col = 'wheat1', axes = F))
with(df.plot.2, text(sum.crops.dmg / 2, 
                     seq(0.7, 9.1, length = 8),
                     as.character(sum.crops.dmg), cex = 0.7))
mtext(paste('Hail caused more damage to crops than other',
            ' \n types of weather events during the period from',
            min(df.data$ev.year), 'to',
            max(df.data$ev.year)), side = 3, line = 1, font = 2, cex = 0.9)
mtext('Total crops damage, bln USD', side = 1, line = 0)
mtext(paste('*"OTHER" include ', length(df.plot[pc.sum.c < 0.02, sum.crops.dmg]),
            'types of weather events'), side = 1, line = 1, font = 3, cex = 0.8)

# remove temporary objects
rm(df.plot, df.plot.1, df.plot.2)