The consequences of severe weather events across the United States

Abstract

In this project, we explore the U.S. National Oceanic and Atmospheric Administration’s (NOAA) Storm Database. The purpose of this project is to extract information about both the public health and the economic consequences of major storms and weather events. This exploratory analysis concerns weather events that occurred across the United States.

This project has to answer each of the following questions:

Across the United States, which types of events are most harmful with respect to population health?
Across the United States, which types of events have the greatest economic consequences?

Data processing

Material

# Print the details of the R build that evaluates this chunk.
R.version

##                _                           
## platform       x86_64-pc-linux-gnu         
## arch           x86_64                      
## os             linux-gnu                   
## system         x86_64, linux-gnu           
## status                                     
## major          3                           
## minor          2.3                         
## year           2015                        
## month          12                          
## day            10                          
## svn rev        69752                       
## language       R                           
## version.string R version 3.2.3 (2015-12-10)
## nickname       Wooden Christmas-Tree

Packages used

library(plyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

Processing

Get the data and put it into a cache

We need to get the data from the course web site.

# Download the NOAA storm data only when no local copy exists yet, so that
# repeated runs reuse the file already on disk.
if (!file.exists('data.csv.bz2')) {
  # mode = 'wb' requests a binary transfer so the compressed archive is not
  # altered on platforms that distinguish text files from binary files.
  status <- tryCatch(
    download.file('https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2', destfile = 'data.csv.bz2', mode = 'wb'),
    error = function(e) {
      # Remove any partial file: otherwise the file.exists() test above would
      # accept a truncated archive as a valid cached download on the next run.
      unlink('data.csv.bz2')
      stop(e)
    }
  )
  # Some download methods report failure through a non-zero status instead
  # of raising an error.
  if (status != 0) {
    unlink('data.csv.bz2')
    stop('Download of the storm data failed (status ', status, ').')
  }
  rm(status)
}

# Load the whole data set; read.csv() reads the bz2-compressed file directly.
cache <- read.csv('data.csv.bz2')

Cleaning the health consequences data of natural disasters

We need a dataset called ‘health_data’ in order to plot an exploratory analysis of the health consequences of natural disasters.

Step 1 : Getting the cache

# Start the health analysis from the loaded storm data; 'health_data' is
# reassigned in the following steps while 'cache' itself is left as loaded.
health_data <- cache

Step 2 : Sum and get the total

Sum the number of injured and killed persons for each type of natural disaster.

# Total number of injuries per event type, tagged with its damage category.
injuries <- aggregate(health_data[['INJURIES']], by = list(EVTYPE = health_data[['EVTYPE']]), FUN = sum)
injuries <- cbind(injuries, DMG = rep('Injuried', nrow(injuries)))

# Total number of fatalities per event type, tagged the same way.
fatalities <- aggregate(health_data[['FATALITIES']], by = list(EVTYPE = health_data[['EVTYPE']]), FUN = sum)
fatalities <- cbind(fatalities, DMG = rep('Killed', nrow(fatalities)))

# Stack both tables into long format: one row per (event type, category).
# Both tables carry the aggregate() default column name 'x' for the totals,
# which is renamed to 'COST' here.
health_data <- rbind(injuries, fatalities)
names(health_data) <- c('EVTYPE', 'COST', 'DMG')

# Drop the temporaries so only 'health_data' is left behind.
rm(injuries, fatalities)

Step 3 : Create a rank column and arrange the result

To make the results easier to read, we create a ‘RANK’ column that ranks each event type by its total ‘COST’, and then sort the rows by ‘RANK’ and by ‘DMG’.

# Rank every event type by its combined COST (all DMG categories summed);
# rank 1 is the event type with the highest total.
# The temporary is called 'totals' so that it does not shadow base::subset().
totals <- aggregate(health_data[ , 'COST'], by = list(EVTYPE = health_data[ , 'EVTYPE']), FUN = sum)
names(totals) <- c('EVTYPE', 'COST')
totals <- totals[order(-totals[ , 'COST']), ]

# seq_len() gives an empty sequence for an empty table, whereas 1:nrow()
# would produce c(1, 0).
totals <- cbind(totals, RANK = seq_len(nrow(totals)))

# Attach the rank of its event type to every row of the long table.
totals <- totals[ , c('EVTYPE', 'RANK')]
health_data <- merge(health_data, totals, by = 'EVTYPE')

# Sort by rank, then by damage category within each event type.
health_data <- health_data[ with(health_data, order(RANK, DMG)), ]

rm(totals)

head(health_data)

##              EVTYPE  COST      DMG RANK
## 1659        TORNADO 91346 Injuried    1
## 1660        TORNADO  5633   Killed    1
## 245  EXCESSIVE HEAT  6525 Injuried    2
## 246  EXCESSIVE HEAT  1903   Killed    2
## 1708      TSTM WIND  6957 Injuried    3
## 1707      TSTM WIND   504   Killed    3

Cleaning the economic consequences data of natural disasters

We need a dataset called ‘eco_data’ in order to plot an exploratory analysis of the economic consequences of natural disasters.

Step 1 : Getting the cache

# Start the economic analysis from the loaded storm data; 'eco_data' is
# reassigned in the following steps while 'cache' itself is left as loaded.
eco_data <- cache

Step 2 : Getting the absolute economic costs

The variables ‘PROPDMGEXP’ and ‘CROPDMGEXP’ hold the powers of ten for the values stored in ‘PROPDMG’ and ‘CROPDMG’. We clean these exponent variables (“H” = hundreds, for property damage only, “K” = thousands, “M” = millions, “B” = billions; a digit is used directly as the power of ten) in order to get the absolute economic cost of each natural disaster event in a single column. Rows with an empty or unrecognised value in ‘PROPDMGEXP’ or in ‘CROPDMGEXP’ are excluded from our analysis.

# Convert one damage column to absolute US dollars.
#
# data          : data frame holding the damage columns
# value_col     : name of the column with the damage figure (e.g. 'PROPDMG')
# exp_col       : name of the column with the exponent code (e.g. 'PROPDMGEXP')
# letter_powers : named numeric vector mapping letter codes to powers of ten
#
# Returns 'data' restricted to the rows whose exponent code (compared
# case-insensitively) is one of the names of 'letter_powers' or a single
# digit, with 'value_col' multiplied by 10^power and 'exp_col' removed.
scale_damage <- function(data, value_col, exp_col, letter_powers) {
  codes <- toupper(as.character(data[[exp_col]]))

  # Logical masks are used instead of negative grep() indices: with an empty
  # match set, data[-integer(0), ] selects no rows at all, so invalid codes
  # would silently survive the clean-up.
  is_letter <- codes %in% names(letter_powers)
  is_digit <- grepl('^[0-9]$', codes)
  keep <- is_letter | is_digit

  # A digit code is used directly as the power of ten; a letter code is
  # looked up in 'letter_powers'.
  power <- numeric(length(codes))
  power[is_letter] <- unname(letter_powers[codes[is_letter]])
  power[is_digit] <- as.numeric(codes[is_digit])

  data[[value_col]] <- data[[value_col]] * 10^power
  data <- data[keep, , drop = FALSE]
  data[[exp_col]] <- NULL
  data
}

# NOTE(review): the two filters are applied one after the other, so a row is
# kept only when BOTH exponent codes are valid. A row with property damage
# but an empty crop exponent is therefore dropped together with its property
# damage. This matches the exclusion rule stated in the text of this step;
# confirm that it is really the intended rule.
eco_data <- scale_damage(eco_data, 'PROPDMG', 'PROPDMGEXP', c(H = 2, K = 3, M = 6, B = 9))
eco_data <- scale_damage(eco_data, 'CROPDMG', 'CROPDMGEXP', c(K = 3, M = 6, B = 9))

# The helper is only needed for this step.
rm(scale_damage)

Step 3 : Sum and get the total

Sum the property and crop costs for each type of natural disaster.

# Total property damage per event type, tagged with its damage category.
property <- aggregate(eco_data[['PROPDMG']], by = list(EVTYPE = eco_data[['EVTYPE']]), FUN = sum)
property <- cbind(property, DMG = rep('Property', nrow(property)))

# Total crop damage per event type, tagged the same way.
crop <- aggregate(eco_data[['CROPDMG']], by = list(EVTYPE = eco_data[['EVTYPE']]), FUN = sum)
crop <- cbind(crop, DMG = rep('Crop', nrow(crop)))

# Stack both tables into long format: one row per (event type, category).
# Both tables carry the aggregate() default column name 'x' for the totals,
# which is renamed to 'COST' here.
eco_data <- rbind(property, crop)
names(eco_data) <- c('EVTYPE', 'COST', 'DMG')

# Drop the temporaries so only 'eco_data' is left behind.
rm(property, crop)

Step 4 : Create a rank column and arrange the result

To make the results easier to read, we create a ‘RANK’ column that ranks each event type by its total ‘COST’, and then sort the rows by ‘RANK’ and by ‘DMG’.

# Rank every event type by its combined COST (all DMG categories summed);
# rank 1 is the event type with the highest total.
# The temporary is called 'totals' so that it does not shadow base::subset().
totals <- aggregate(eco_data[ , 'COST'], by = list(EVTYPE = eco_data[ , 'EVTYPE']), FUN = sum)
names(totals) <- c('EVTYPE', 'COST')
totals <- totals[order(-totals[ , 'COST']), ]

# seq_len() gives an empty sequence for an empty table, whereas 1:nrow()
# would produce c(1, 0).
totals <- cbind(totals, RANK = seq_len(nrow(totals)))

# Attach the rank of its event type to every row of the long table.
totals <- totals[ , c('EVTYPE', 'RANK')]
eco_data <- merge(eco_data, totals, by = 'EVTYPE')

# Sort by rank, then by damage category within each event type.
eco_data <- eco_data[ with(eco_data, order(RANK, DMG)), ]

rm(totals)

Step 5 : Express the economic costs in billions of US dollars

# Rescale COST from US dollars to billions of US dollars.
eco_data$COST <- eco_data$COST / 1e9

head(eco_data)

##                EVTYPE        COST      DMG RANK
## 46              FLOOD 132.8364891 Property    1
## 45              FLOOD   5.1709555     Crop    1
## 123 HURRICANE/TYPHOON  26.7402950 Property    2
## 124 HURRICANE/TYPHOON   2.6078728     Crop    2
## 198           TORNADO  16.1669467 Property    3
## 197           TORNADO   0.4033796     Crop    3

Results

The ten worst natural disasters in terms of health consequences

# Keep the first 20 rows. The table is sorted by RANK and holds two rows
# (one per DMG category) for each event type, so these rows are the ten
# top-ranked event types. head() returns only the rows that exist, whereas
# indexing with 1:20 pads a shorter table with NA rows.
health_data <- head(health_data, 20)

# Stacked bar chart of the totals per event type, bars ordered by RANK.
# The ordering is computed inside aes() from the plot data, so the plot
# object does not depend on a temporary global variable and can be printed
# again later.
health_plot <- ggplot(health_data, aes(x = reorder(EVTYPE, RANK), y = COST, fill = DMG)) +
  ggtitle('Health consequences') +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_discrete(name = 'Damage') +
  labs(x = 'Natural disaster', y = 'Number of person') +
  theme(axis.text.x = element_text(angle = 90)) +
  geom_bar(stat = 'identity')
print(health_plot)

The ten worst natural disasters in terms of economic consequences

# Keep the first 20 rows. The table is sorted by RANK and holds two rows
# (one per DMG category) for each event type, so these rows are the ten
# top-ranked event types. head() returns only the rows that exist, whereas
# indexing with 1:20 pads a shorter table with NA rows.
eco_data <- head(eco_data, 20)

# Stacked bar chart of the costs per event type, bars ordered by RANK.
# The ordering is computed inside aes() from the plot data, so the plot
# object does not depend on a temporary global variable and can be printed
# again later.
eco_plot <- ggplot(eco_data, aes(x = reorder(EVTYPE, RANK), y = COST, fill = DMG)) +
  ggtitle('Economics consequences') +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_discrete(name = 'Damage') +
  labs(x = 'Natural disaster', y = 'Cost in Billion US Dollars (Billion USD)') +
  theme(axis.text.x = element_text(angle = 90)) +
  geom_bar(stat = 'identity')
print(eco_plot)

Discussion

Across the United States, the worst natural disaster in terms of health consequences is the tornado, and the worst in terms of economic consequences is the flood. In order to reduce the impact of these two weather events, we could carry out further exploratory (or inferential) analyses to help design preventive actions.