April 25, 2015
Data on Severe Storm Events from the National Weather Service for the United States from 1950 to November, 2011, was analyzed for their impact on population health (in the form of the total of fatalities and injuries to humans) and on the US economy (in the form of total dollar value damage to property and crops). We found that Tornados by far caused the greatest number of fatalities and injuries with more than ten times the impact of the next highest ranked event type Excessive Heat. The greatest negative economic impact was due to Floods with more than twice the impact in dollar cost in property and crop damage of the next highest ranked event type Hurricane/Typhoon.
# Data Science - Reproducible Research Project #2
Download, uncompress, and read in the complete dataset from a Web source.
# First, download the compressed (.bz2) data set file into directory ../data/
if( !file.exists("../data/repdata_data_StormData.csv.bz2") )
{
dir.create("../data") # ignore any warning that the directory already exists
fileURL <-
"https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file( fileURL,
destfile="../data/repdata_data_StormData.csv.bz2",
mode = "wb"
)
}
We provide an option to read an abbreviated data set for testing.
# If read_bz2 is TRUE, we will read (using read.csv( bzfile(), ) below) the
# full data set from the above .bz2 file. If this value is FALSE we will,
# for debugging and testing purposes, read from an abbreviated file.
read_bz2 <- TRUE
if ( read_bz2 )
{ # Set up file name to indicate we will read.csv() from the above .bz2 file
input_file_name <- "../data/repdata_data_StormData.csv.bz2"
} else
{
# For testing don't read from the full compressed data file.
# Read in a sampled version of the full data set to save time and file space
# during development and debugging
input_file_name <- "../data/repdata_data_StormData_100000_rows.csv"
}
We only read in the data set columns that we will use in our analysis.
# Construct a vector, ctr, to be able to control which columns of the
# .csv file to actually read in using a read.csv() further below:
# Columns to NOT read in are indicated by NULL values.
# Columns to read in are indicated by NA values.
# First, read in a "type-representative" head of our data file
if ( read_bz2 )
{ temp_data_df <- read.csv( bzfile(input_file_name), header=TRUE, nrows=10 )
} else
{ temp_data_df <- read.csv( input_file_name, header=TRUE, nrows=10 )
}
# ctr == columns to read - we know these specific column numbers from inspecting
# the data set documentation, Storm Data Documentation (a .pdf) .
ctr <- c( 8, # EVTYPE
23, # FATALITIES
24, # INJURIES
25, # PROPDMG
26, # PROPDMGEXP
27, # CROPDMG
28 # CROPDMGEXP
)
# start off colCs indicating not reading any cols
colCs <- c( rep( "NULL", length(temp_data_df) ) )
# now overlay colCs in the columns we DO want to read in (with default type
# conversions)
colCs[ ctr ] <- NA
remove( temp_data_df ) # not needed any more
Now read in a full data set - but only the columns, colCs, we’re interested in.
if ( read_bz2 )
{ # read in the compressed data set
# bzfile() will uncompress the complete dataset as it's read into a data
# frame by read.csv()
data_df <-
read.csv( bzfile(input_file_name), header=TRUE, colClasses=colCs )
} else
{ # read in an abbreviated version of the full data set
data_df <-
read.csv( input_file_name, header=TRUE, colClasses=colCs )
}
Compute the total number of fatalities for each event type (NAs are removed for this computation).
FATALITIES_by_EVTYPE <- by( data_df$FATALITIES, data_df$EVTYPE,
sum, na.rm=TRUE, simplify=TRUE )
# Coerce the list FATALITIES_by_EVTYPE into a numeric vector,
# FATALITIES_by_EVTYPE_num, for use below
FATALITIES_by_EVTYPE_num <- as.numeric( FATALITIES_by_EVTYPE )
# Compute the total number of injuries for each event type
INJURIES_by_EVTYPE <- by( data_df$INJURIES, data_df$EVTYPE,
sum, na.rm=TRUE, simplify=FALSE )
INJURIES_by_EVTYPE_num <- as.numeric( INJURIES_by_EVTYPE )
Construct a data frame with population health-related variables from data_df .
health_df <- data.frame( fEVTYPE = names(FATALITIES_by_EVTYPE),
fval = FATALITIES_by_EVTYPE_num,
iEVTYPE = names(INJURIES_by_EVTYPE),
ival = INJURIES_by_EVTYPE_num
)
# Add a column with overall health impact to data frame health_df
# the sum of fatalities and injuries
health_df$overall_h_impact <- health_df$fval + health_df$ival
Note, below, near the end of this file, we plot the health impact of the top 10 event types.
Note: dollar amounts for property and crop damage are not included directly in the original data set; they must be computed by us:
# PROPDMGEXP can have values:
# blanks, "H", "K", "M" & "B", as well as "0", "1", "3", ..., "9".
# We assume that "H", "K", "M" & "B" stand for scale factors of 10^2,
# 10^3, 10^6 & 10^9, respectively.
# We assume that the numeric characters represent scale factors of
# 10^0 = 1, 10^1 = 10, 10^3 = 1000, etc.
# Blanks represent a scale factor of 1.
data_df$PROPDMG_multiplier <-
ifelse(
(data_df$PROPDMGEXP == "0" ), 1,
ifelse(
(data_df$PROPDMGEXP == "1" ), 10,
ifelse(
(data_df$PROPDMGEXP == "2" ), 100,
ifelse(
(toupper(data_df$PROPDMGEXP) == "H" ), 100,
ifelse(
(data_df$PROPDMGEXP == "3" ), 1000,
ifelse(
(toupper(data_df$PROPDMGEXP) == "K" ), 1000,
ifelse(
(data_df$PROPDMGEXP == "4" ), 10000,
ifelse(
(data_df$PROPDMGEXP == "5" ), 100000,
ifelse(
(data_df$PROPDMGEXP == "6" ), 1000000,
ifelse(
(toupper(data_df$PROPDMGEXP) == "M" ), 1000000,
ifelse(
(data_df$PROPDMGEXP == "7" ), 10000000,
ifelse(
(data_df$PROPDMGEXP == "8" ), 100000000,
ifelse(
(data_df$PROPDMGEXP == "9" ), 1000000000,
ifelse(
(toupper(data_df$PROPDMGEXP) == "B" ), 1000000000,
(1)
))))))))))))))
# Property damage is PROPDMG scaled by the multiplier
data_df$PROPDMG_in_dollars <- data_df$PROPDMG * data_df$PROPDMG_multiplier
# CROPDMGEXP can have the same values as PROPDMGEXP
data_df$CROPDMG_multiplier <-
ifelse(
(data_df$CROPDMGEXP == "0" ), 1,
ifelse(
(data_df$CROPDMGEXP == "1" ), 10,
ifelse(
(data_df$CROPDMGEXP == "2" ), 100,
ifelse(
(toupper(data_df$CROPDMGEXP) == "H" ), 100,
ifelse(
(data_df$CROPDMGEXP == "3" ), 1000,
ifelse(
(toupper(data_df$CROPDMGEXP) == "K" ), 1000,
ifelse(
(data_df$CROPDMGEXP == "4" ), 10000,
ifelse(
(data_df$CROPDMGEXP == "5" ), 100000,
ifelse(
(data_df$CROPDMGEXP == "6" ), 1000000,
ifelse(
(toupper(data_df$CROPDMGEXP) == "M" ), 1000000,
ifelse(
(data_df$CROPDMGEXP == "7" ), 10000000,
ifelse(
(data_df$CROPDMGEXP == "8" ), 100000000,
ifelse(
(data_df$CROPDMGEXP == "9" ), 1000000000,
ifelse(
(toupper(data_df$CROPDMGEXP) == "B" ), 1000000000,
(1)
))))))))))))))
# Crop damage is CROPDMG scaled by the multiplier
data_df$CROPDMG_in_dollars <- data_df$CROPDMG * data_df$CROPDMG_multiplier
Now we can compute the total property damage dollar cost for each event type (NAs are removed for this computation).
PROPDMG_by_EVTYPE <- by( data_df$PROPDMG_in_dollars, data_df$EVTYPE,
sum, na.rm=TRUE, simplify=FALSE )
PROPDMG_by_EVTYPE_num <- as.numeric( PROPDMG_by_EVTYPE )
Similarly, compute the total crop damage dollar cost for each event type.
CROPDMG_by_EVTYPE <- by( data_df$CROPDMG_in_dollars, data_df$EVTYPE,
sum, na.rm=TRUE, simplify=FALSE )
CROPDMG_by_EVTYPE_num <- as.numeric( CROPDMG_by_EVTYPE )
Construct a data frame with economic-related variables from data_df.
econ_df <- data.frame( pEVTYPE = names(PROPDMG_by_EVTYPE),
pval = PROPDMG_by_EVTYPE_num,
cEVTYPE = names(CROPDMG_by_EVTYPE),
cval = CROPDMG_by_EVTYPE_num
)
# Add a column with overall economic impact to data frame econ_df -
# the sum of property damage and crop damage
econ_df$overall_e_impact <- econ_df$pval + econ_df$cval
We analyze our data with respect to these two questions:
1 Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
2 Across the United States, which types of events have the greatest economic consequences?
First, order health_df by column overall_h_impact (the sum of fatalities and injuries).
hdf_ordered_by_overall_h_impact <- order( health_df$overall_h_impact,
decreasing=TRUE )
We only plot the top 10 event types as a bar plot.
top_h_num_events <- 10
top_h_impact <- hdf_ordered_by_overall_h_impact[1:top_h_num_events]
xlabel_h <- paste0( "Top ", top_h_num_events, " Severe Weather Event Types" )
ylimit_h <- c( 0, max( health_df$overall_h_impact[top_h_impact] ) )
barplot( height = health_df$overall_h_impact[ top_h_impact ],
names.arg = health_df$fEVTYPE[ top_h_impact ],
main = paste0("Health Impact by Fatalities & Injuries ",
"Due to Severe Weather Events (1950-2011)"),
xlab = xlabel_h,
cex.names = .6,
ylab = "Total of Fatalities & Injuries",
ylim = ylimit_h,
col = "red"
)
We see in the plot of Health Impact just above that the answer to Question 1 is that Tornados are the most harmful event type with respect to population health. In fact, Tornados have more than twice the impact of the next nine lesser impactful event types all together.
First, order econ_df by column overall_e_impact (the sum of property and crop damages).
edf_ordered_by_overall_e_impact <- order( econ_df$overall_e_impact,
decreasing=TRUE )
We only plot the top 10 event types as a bar plot.
top_e_num_events <- 10
top_e_impact <- edf_ordered_by_overall_e_impact[1:top_e_num_events]
xlabel_e <- paste0( "Top ", top_e_num_events, " Severe Weather Event Types" )
ylimit_e <- c( 0, max( econ_df$overall_e_impact[ top_e_impact ] / 1000000000 ) )
barplot( height = econ_df$overall_e_impact[ top_e_impact ] / 1000000000,
names.arg = econ_df$pEVTYPE[ top_e_impact ],
main = paste0("Economic Impact of Property & Crop Damage ",
"Due to Severe Weather Events (1950-2011)"),
xlab = xlabel_e,
cex.names = .6,
ylab = "Total of Property & Crop Damage (billions of dollars)",
ylim = ylimit_e,
col = "brown"
)
We see in the plot of Economic Impact immediately above that the answer to Question 2 is that Floods are the most costly event type with respect to property and crop dollar damage. The next three lesser impactful event types, Hurricane/Typhoons, Tornados, and Storm Surge, are together only 15% more damaging than Floods alone.