Synopsis

This study analyses the storm data set collected by the U.S. National Oceanic and Atmospheric Administration (NOAA) from 1950 to 2011 and identifies the events that are most harmful to population health and those that have the highest economic impact.

Loading data

The code checks whether the compressed data file exists and, if it does not, downloads it from the web. The data are then read with either read.csv (from the compressed file) or readRDS (from an existing RDS copy), depending on which file is available. The required libraries are loaded as well:

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.2.3

library(plyr)
library(gridExtra)

# Load the NOAA storm data into `data`.
#
# The first run downloads the compressed CSV (if it is not already on disk),
# parses it, and stores the parsed data frame as an RDS file. Later runs read
# the RDS file directly, which avoids both the download and the slow CSV parse.

# Work from the author's project directory when it exists; otherwise stay in
# the current working directory so the analysis still runs on other machines.
project_dir <- "~/Desktop/Reproducible Research/Projects/RepData_PeerAssessment2/"
if (dir.exists(project_dir)) {
    setwd(project_dir)
}

data_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
data_zip <- "StormData.csv.bz2"
data_rds <- "StormData.RDS"

# TRUE when the data came from the RDS cache, FALSE when parsed from the CSV.
RDSloaded <- file.exists(data_rds)

if (RDSloaded) {
    data <- readRDS(data_rds)
} else {
    # The compressed CSV is only needed when there is no cached copy.
    if (!file.exists(data_zip)) {
        download.file(url = data_url,
                      destfile = data_zip,
                      method = "curl")
    }
    data <- read.csv(file = bzfile(data_zip), strip.white = TRUE)
    # Write the cache so that the readRDS() branch is taken next time.
    saveRDS(data, file = data_rds)
}

Data processing (and cleaning)

Reducing the dataset to the columns of interest:

Events: EVTYPE
Effects on population health: FATALITIES, INJURIES (harm to people)
Economic effects: PROPDMGxPROPDMGEXP, CROPDMGxCROPDMGEXP
We are only interested in events that occurred in the United States, so records whose STATE is not a U.S. state abbreviation are filtered out:

# Keep only the records whose STATE code is one of R's built-in U.S. state
# abbreviations (state.abb).
data_US <- data[data$STATE %in% state.abb, , drop = FALSE]

Next, the event types are normalised:

# Collapse event-type variants into common labels. The rules run in order:
# anything containing "WIND" becomes "THUNDERSTORM WIND"; anything containing
# "HEAT" or "WARM" becomes "HEAT".
data_US$EVTYPE <- replace(data_US$EVTYPE, grepl("WIND", data_US$EVTYPE), "THUNDERSTORM WIND")
data_US$EVTYPE <- replace(data_US$EVTYPE, grepl("HEAT", data_US$EVTYPE), "HEAT")
data_US$EVTYPE <- replace(data_US$EVTYPE, grepl("WARM", data_US$EVTYPE), "HEAT")

Two different datasets are then created, one for the analysis about health data, the other to focus the analysis on economic damage-related data:

# Split the U.S. records into the two column sets used by the analysis:
# health impact (fatalities, injuries) and economic impact (damage figures
# together with their exponent codes).
health_cols <- c("EVTYPE", "FATALITIES", "INJURIES")
economy_cols <- c("EVTYPE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
health.df <- data_US[health_cols]
economy.df <- data_US[economy_cols]

There are no missing data in the columns of FATALITIES,INJURIES, PROPDMG and CROPDMG:

# Count the missing values in each numeric column used by the analysis and
# print the four counts together.
na_fatalities <- sum(is.na(health.df[["FATALITIES"]]))
na_injuries   <- sum(is.na(health.df[["INJURIES"]]))
na_propdmg    <- sum(is.na(economy.df[["PROPDMG"]]))
na_cropdmg    <- sum(is.na(economy.df[["CROPDMG"]]))
print(c(na_fatalities, na_injuries, na_propdmg, na_cropdmg))

## [1] 0 0 0 0

However, the fields PROPDMGEXP and CROPDMGEXP have to be cleaned because they contain a few invalid entries, as can be seen in the following:

# Show which exponent codes occur in the property-damage records.
print(summary(economy.df[["PROPDMGEXP"]]))

##             -      ?      +      0      1      2      3      4      5 
## 456190      1      8      5    216     25     13      4      4     28 
##      6      7      8      B      h      H      K      m      M 
##      4      5      1     39      1      6 415375      7  11254

# Show which exponent codes occur in the crop-damage records.
print(summary(economy.df[["CROPDMGEXP"]]))

##             ?      0      2      B      k      K      m      M 
## 607897      7     19      1      9     21 273262      1   1969

The cleaning strategy is the following:

h (or H), k (or K), m (or M) and b (or B) correspond to a factor of 100, 1000, 1000000 and 1000000000 respectively
The empty string and any other unexpected characters correspond to a multiplication factor of 1

# Convert a damage figure and its exponent code into an absolute amount.
#
# dmg:    numeric damage value(s), e.g. PROPDMG or CROPDMG.
# dmgexp: exponent code(s), character or factor, matched case-insensitively:
#         H = 100, K = 1000, M = 1000000, B = 1000000000. Any other value
#         (empty string, digits, "?", "+", "-", NA) counts as a factor of 1.
# Returns dmg multiplied by the matching factor.
#
# The lookup is vectorised, so the function accepts whole columns as well as
# the single values it receives when called element by element via mapply().
calc_amount <- function(dmg, dmgexp) {
    multipliers <- c(H = 100, K = 1000, M = 1000000, B = 1000000000)
    # Indexing by name yields NA for codes that are not in the table
    # (including "" and NA); those fall back to a factor of 1 below.
    multiplier <- unname(multipliers[toupper(as.character(dmgexp))])
    multiplier[is.na(multiplier)] <- 1
    dmg * multiplier
}
# Absolute damage per record: each damage figure is scaled by its exponent
# code, pairing the two columns element by element. PDMG holds property
# damage and CDMG holds crop damage.
economy.df[["PDMG"]] <- mapply(
    FUN = calc_amount,
    economy.df[["PROPDMG"]],
    economy.df[["PROPDMGEXP"]]
)
economy.df[["CDMG"]] <- mapply(
    FUN = calc_amount,
    economy.df[["CROPDMG"]],
    economy.df[["CROPDMGEXP"]]
)

Results

Most harmful (top 10) events with respect to population health

Fatalities and injuries are aggregated by event type, summing the number of fatalities and the number of injuries for each type.

# Total fatalities per event type, then the ten event types with the highest
# totals (rows renumbered 1..10).
fatalities <- aggregate(
    x = health.df["FATALITIES"],
    by = list(EVTYPE = health.df[["EVTYPE"]]),
    FUN = sum
)
top10fatalities <- head(
    fatalities[order(fatalities[["FATALITIES"]], decreasing = TRUE), , drop = FALSE],
    n = 10
)
rownames(top10fatalities) <- NULL

# Same procedure for injuries.
injuries <- aggregate(
    x = health.df["INJURIES"],
    by = list(EVTYPE = health.df[["EVTYPE"]]),
    FUN = sum
)
top10injuries <- head(
    injuries[order(injuries[["INJURIES"]], decreasing = TRUE), , drop = FALSE],
    n = 10
)
rownames(top10injuries) <- NULL

The results are:

# Display the ten event types with the highest fatality totals.
top10fatalities

##               EVTYPE FATALITIES
## 1            TORNADO       5633
## 2               HEAT       3156
## 3  THUNDERSTORM WIND       1408
## 4        FLASH FLOOD        939
## 5          LIGHTNING        806
## 6              FLOOD        464
## 7        RIP CURRENT        343
## 8          AVALANCHE        224
## 9       WINTER STORM        205
## 10      RIP CURRENTS        175

and

# Display the ten event types with the highest injury totals.
top10injuries

##               EVTYPE INJURIES
## 1            TORNADO    91346
## 2  THUNDERSTORM WIND    11373
## 3               HEAT     8856
## 4              FLOOD     6786
## 5          LIGHTNING     5212
## 6          ICE STORM     1975
## 7        FLASH FLOOD     1767
## 8               HAIL     1361
## 9       WINTER STORM     1321
## 10        HEAVY SNOW     1021

These results can be visualized in the following plots:

# Bar charts of the ten event types with the most fatalities and the most
# injuries, drawn side by side. The plots are built with ggplot() + geom_bar()
# instead of the deprecated qplot(); the weight aesthetic makes each bar's
# height equal to the event type's total.
plot_fatalities <- ggplot(top10fatalities, aes(x = EVTYPE, weight = FATALITIES)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_y_continuous("N° of fatalities") +
    xlab("Event Types") +
    ggtitle("Fatalities caused by\n the top 10 natural events")

plot_injuries <- ggplot(top10injuries, aes(x = EVTYPE, weight = INJURIES)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_y_continuous("Nos. of injuries") +
    xlab("Event Types") +
    ggtitle("Injuries caused by\n the top 10 natural events")

grid.arrange(plot_fatalities, plot_injuries, ncol = 2)

Events that have the most severe impact on the US economy (top 10)

The same steps as above are repeated to analyse the economic impact:

# Total property damage per event type, then the ten event types with the
# highest totals (rows renumbered 1..10).
properties <- aggregate(
    x = data.frame(propDamage = economy.df[["PDMG"]]),
    by = list(EVTYPE = economy.df[["EVTYPE"]]),
    FUN = sum
)
top10propdmg <- head(
    properties[order(properties[["propDamage"]], decreasing = TRUE), , drop = FALSE],
    n = 10
)
rownames(top10propdmg) <- NULL

# Same procedure for crop damage.
crops <- aggregate(
    x = data.frame(cropDamage = economy.df[["CDMG"]]),
    by = list(EVTYPE = economy.df[["EVTYPE"]]),
    FUN = sum
)
top10cropdmg <- head(
    crops[order(crops[["cropDamage"]], decreasing = TRUE), , drop = FALSE],
    n = 10
)
rownames(top10cropdmg) <- NULL

The results are:

# Display the ten event types with the highest total property damage.
top10propdmg

##               EVTYPE   propDamage
## 1              FLOOD 144531318807
## 2  HURRICANE/TYPHOON  69033100000
## 3            TORNADO  56936688779
## 4        STORM SURGE  43323461000
## 5  THUNDERSTORM WIND  17710742880
## 6        FLASH FLOOD  15868170417
## 7               HAIL  15732260543
## 8          HURRICANE   9913998010
## 9     TROPICAL STORM   7475121550
## 10      WINTER STORM   6688492251

and

# Display the ten event types with the highest total crop damage.
top10cropdmg

##               EVTYPE  cropDamage
## 1            DROUGHT 13972361000
## 2              FLOOD  5613968450
## 3        RIVER FLOOD  5029459000
## 4          ICE STORM  5022113500
## 5               HAIL  3025954473
## 6  HURRICANE/TYPHOON  2603500800
## 7          HURRICANE  2189930000
## 8  THUNDERSTORM WIND  2158958538
## 9        FLASH FLOOD  1406905100
## 10      EXTREME COLD  1292973000

The results are shown in the following plots:

# Bar charts of the ten event types with the highest property damage and the
# highest crop damage, drawn side by side. The plots are built with
# ggplot() + geom_bar() instead of the deprecated qplot(); the weight
# aesthetic makes each bar's height equal to the event type's total.
plot_propdmg <- ggplot(top10propdmg, aes(x = EVTYPE, weight = propDamage)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_y_continuous("Property Damages [$]") +
    xlab("Event Types") +
    ggtitle("Total property damages\n caused by top 10 natural events")

# The title previously repeated itself ("... top 10 natural events top 10
# weather events"); it now matches the wording of the property-damage plot.
plot_cropdmg <- ggplot(top10cropdmg, aes(x = EVTYPE, weight = cropDamage)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_y_continuous("Crop Damages in USD") +
    xlab("Event Types") +
    ggtitle("Total crop damages\n caused by top 10 natural events")

grid.arrange(plot_propdmg, plot_cropdmg, ncol = 2)

Conclusion

The conclusions of this study are:

Tornado is the natural phenomenon that caused the highest number of both fatalities and injuries
Flood is the natural event that caused the highest damage to properties, while drought had the biggest impact on crops

Reproducible Research: Project Assignment 2

af

16 February 2016