Abstract



In this project, we explore the U.S. National Oceanic and Atmospheric Administration’s (NOAA) Storm Database. The purpose of this project is to extract information about the public health and economic consequences of major storms and weather events. This exploratory analysis covers weather events that occurred across the United States.


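This project has to answer two questions: which types of weather events are the most harmful to population health, and which types of weather events have the greatest economic consequences?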



Data processing



Material


# Show the R version and platform this report was generated with.
R.version
##                _                           
## platform       x86_64-pc-linux-gnu         
## arch           x86_64                      
## os             linux-gnu                   
## system         x86_64, linux-gnu           
## status                                     
## major          3                           
## minor          2.3                         
## year           2015                        
## month          12                          
## day            10                          
## svn rev        69752                       
## language       R                           
## version.string R version 3.2.3 (2015-12-10)
## nickname       Wooden Christmas-Tree


Packages used


library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)



Processing


Get the data and put it into a cache


We need to get the data from the course web site.

# Download the compressed storm database only when no local copy exists, so
# repeated runs of the report do not hit the network again.
if (!file.exists('data.csv.bz2')) {
  download.file(
    url = 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2',
    destfile = 'data.csv.bz2'
  )
}

# Load the raw data once; the health and economic analyses each start from
# this `cache` object.
cache <- read.csv(file = 'data.csv.bz2')



Cleaning the health consequences data of natural disasters


We need a dataset called ‘health_data’ in order to plot an exploratory analysis of the health consequences of natural disasters.


Step 1 : Getting the cache


# Start the health analysis from the raw data; `cache` itself is left
# unmodified because the economic analysis starts from it as well.
health_data <- cache


Step 2 : Sum and get the total


Sum the number of injured and killed people for each type of natural disaster.

# Total injuries per event type, tagged with the damage label 'Injuried'.
injuries <- aggregate(
  health_data[['INJURIES']],
  by = list(EVTYPE = health_data[['EVTYPE']]),
  FUN = sum
)
injuries <- cbind(injuries, DMG = rep('Injuried', nrow(injuries)))

# Total fatalities per event type, tagged with the damage label 'Killed'.
deaths <- aggregate(
  health_data[['FATALITIES']],
  by = list(EVTYPE = health_data[['EVTYPE']]),
  FUN = sum
)
deaths <- cbind(deaths, DMG = rep('Killed', nrow(deaths)))

# Stack both tables in long format: one row per (event type, damage label),
# with the count stored in COST.
health_data <- rbind(injuries, deaths)
names(health_data) <- c('EVTYPE', 'COST', 'DMG')

# Drop the temporaries so they do not linger in the workspace.
rm(injuries, deaths)


Step 3 : Creation of a rank column, and arrange the result


To make the results easier to read, we create a ‘RANK’ column that ranks each event type by its total ‘COST’, and then sort the rows by ‘RANK’ and ‘DMG’.

# Rank every event type by its combined COST (injuries + fatalities);
# rank 1 is the event type with the largest total.
# The temporary is deliberately not called `subset`, which would mask
# base::subset() while it exists.
evtype_rank <- aggregate(
  health_data[['COST']],
  by = list(EVTYPE = health_data[['EVTYPE']]),
  FUN = sum
)
names(evtype_rank) <- c('EVTYPE', 'COST')
evtype_rank <- evtype_rank[order(-evtype_rank[['COST']]), ]
# seq_len() stays correct for an empty table, where 1:nrow() gives c(1, 0).
evtype_rank[['RANK']] <- seq_len(nrow(evtype_rank))

# Attach the rank to every row (one per damage label) of each event type.
health_data <- merge(
  health_data,
  evtype_rank[ , c('EVTYPE', 'RANK')],
  by = 'EVTYPE'
)

# Highest-ranked event types first; within an event type, order by DMG.
health_data <- health_data[order(health_data[['RANK']], health_data[['DMG']]), ]

rm(evtype_rank)

head(health_data)
##              EVTYPE  COST      DMG RANK
## 1659        TORNADO 91346 Injuried    1
## 1660        TORNADO  5633   Killed    1
## 245  EXCESSIVE HEAT  6525 Injuried    2
## 246  EXCESSIVE HEAT  1903   Killed    2
## 1708      TSTM WIND  6957 Injuried    3
## 1707      TSTM WIND   504   Killed    3



Cleaning the economic consequences data of natural disasters


We need a dataset called ‘eco_data’ in order to plot an exploratory analysis of the economic consequences of natural disasters.


Step 1 : Getting the cache


# Start the economic analysis from the raw data loaded into `cache`.
eco_data <- cache


Step 2 : Getting the absolute economics costs


The variables ‘PROPDMGEXP’ and ‘CROPDMGEXP’ hold the powers of ten for the values stored in ‘PROPDMG’ and ‘CROPDMG’. We clean these power variables ( “H” = hundreds, “K” = thousands, “M” = millions, “B” = billions, and a digit is used directly as the power of ten ) in order to get the economic value of each natural disaster event into a single column. Rows with empty or unrecognised values in ‘PROPDMGEXP’ or ‘CROPDMGEXP’ are excluded from our analysis.

# Convert the PROPDMG / CROPDMG values into absolute amounts using their
# exponent columns, dropping every row whose exponent is not recognised.

# Rescale one damage column by its exponent column.
#   data           data frame holding both columns
#   value_col      name of the numeric damage column (e.g. 'PROPDMG')
#   exp_col        name of the exponent column (e.g. 'PROPDMGEXP')
#   letter_powers  named numeric vector: exponent letter -> power of ten
# Returns `data` restricted to rows with a usable exponent, with `value_col`
# multiplied by 10^power and `exp_col` removed.
scale_damage <- function(data, value_col, exp_col, letter_powers) {
  exps <- toupper(as.character(data[[exp_col]]))

  # A logical mask from an anchored pattern replaces `x[-grep(...), ]`:
  # negative indexing with an empty grep() result selects nothing, so no row
  # would be discarded when nothing matches. Anchoring also rejects malformed
  # multi-character codes that would otherwise turn into NA further down.
  pattern <- paste0(
    '^([', paste(names(letter_powers), collapse = ''), ']|[0-9]+)$'
  )
  usable <- grepl(pattern, exps)
  data <- data[usable, , drop = FALSE]
  exps <- exps[usable]

  # Letters go through the lookup table; digits are the power of ten itself.
  is_letter <- exps %in% names(letter_powers)
  powers <- numeric(length(exps))
  powers[is_letter] <- letter_powers[exps[is_letter]]
  powers[!is_letter] <- as.numeric(exps[!is_letter])

  data[[value_col]] <- data[[value_col]] * 10^powers
  data[[exp_col]] <- NULL
  data
}

# Property damage accepts H(undreds), K, M, B and digits.
eco_data <- scale_damage(
  eco_data, 'PROPDMG', 'PROPDMGEXP',
  c(H = 2, K = 3, M = 6, B = 9)
)

# Crop damage accepts K, M, B and digits.
eco_data <- scale_damage(
  eco_data, 'CROPDMG', 'CROPDMGEXP',
  c(K = 3, M = 6, B = 9)
)

rm(scale_damage)


Step 3 : Sum and get the total


Sum the property and crop costs for each type of natural disaster.

# Total property damage per event type, tagged with the label 'Property'.
property_cost <- aggregate(
  eco_data[['PROPDMG']],
  by = list(EVTYPE = eco_data[['EVTYPE']]),
  FUN = sum
)
property_cost <- cbind(
  property_cost,
  DMG = rep('Property', nrow(property_cost))
)

# Total crop damage per event type, tagged with the label 'Crop'.
crop_cost <- aggregate(
  eco_data[['CROPDMG']],
  by = list(EVTYPE = eco_data[['EVTYPE']]),
  FUN = sum
)
crop_cost <- cbind(crop_cost, DMG = rep('Crop', nrow(crop_cost)))

# Stack both tables in long format: one row per (event type, damage label),
# with the amount stored in COST. Property rows are stacked first.
eco_data <- rbind(property_cost, crop_cost)
names(eco_data) <- c('EVTYPE', 'COST', 'DMG')

# Drop the temporaries so they do not linger in the workspace.
rm(property_cost, crop_cost)


Step 4 : Creation of a rank column, and arrange the result


To make the results easier to read, we create a ‘RANK’ column that ranks each event type by its total ‘COST’, and then sort the rows by ‘RANK’ and ‘DMG’.

# Rank every event type by its combined COST (property + crop damage);
# rank 1 is the event type with the largest total.
# The temporary is deliberately not called `subset`, which would mask
# base::subset() while it exists.
evtype_rank <- aggregate(
  eco_data[['COST']],
  by = list(EVTYPE = eco_data[['EVTYPE']]),
  FUN = sum
)
names(evtype_rank) <- c('EVTYPE', 'COST')
evtype_rank <- evtype_rank[order(-evtype_rank[['COST']]), ]
# seq_len() stays correct for an empty table, where 1:nrow() gives c(1, 0).
evtype_rank[['RANK']] <- seq_len(nrow(evtype_rank))

# Attach the rank to every row (one per damage label) of each event type.
eco_data <- merge(
  eco_data,
  evtype_rank[ , c('EVTYPE', 'RANK')],
  by = 'EVTYPE'
)

# Highest-ranked event types first; within an event type, order by DMG.
eco_data <- eco_data[order(eco_data[['RANK']], eco_data[['DMG']]), ]

rm(evtype_rank)


Step 5 : Express the economic costs in billions of US dollars


# Rescale COST to billions so the plot axis stays readable.
eco_data[['COST']] <- eco_data[['COST']] / 1e9

# Preview the first rows of the result.
head(eco_data)
##                EVTYPE        COST      DMG RANK
## 46              FLOOD 132.8364891 Property    1
## 45              FLOOD   5.1709555     Crop    1
## 123 HURRICANE/TYPHOON  26.7402950 Property    2
## 124 HURRICANE/TYPHOON   2.6078728     Crop    2
## 198           TORNADO  16.1669467 Property    3
## 197           TORNADO   0.4033796     Crop    3



Results



The ten worst natural disasters in terms of health consequences



# Keep the first 20 rows: the 10 top-ranked event types, each with one
# 'Injuried' row and one 'Killed' row. head() never pads with NA rows when
# fewer than 20 rows are available, unlike `[1:20, ]`.
health_data <- head(health_data, 20)

# Stacked bars of casualties per event type, ordered by rank. The x ordering
# is computed inside aes() from the plot data itself, so `health_plot` no
# longer depends on a temporary global vector and can be re-rendered later.
health_plot <- ggplot(
  health_data,
  aes(x = reorder(EVTYPE, RANK), y = COST, fill = DMG)
) +
  geom_bar(stat = 'identity') +
  scale_fill_discrete(name = 'Damage') +
  ggtitle('Health consequences') +
  labs(x = 'Natural disaster', y = 'Number of person') +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90)
  )
print(health_plot)



The ten worst natural disasters in terms of economic consequences



# Keep the first 20 rows: the 10 top-ranked event types, each with one
# 'Property' row and one 'Crop' row. head() never pads with NA rows when
# fewer than 20 rows are available, unlike `[1:20, ]`.
eco_data <- head(eco_data, 20)

# Stacked bars of cost per event type, ordered by rank. The x ordering is
# computed inside aes() from the plot data itself, so `eco_plot` no longer
# depends on a temporary global vector and can be re-rendered later.
eco_plot <- ggplot(
  eco_data,
  aes(x = reorder(EVTYPE, RANK), y = COST, fill = DMG)
) +
  geom_bar(stat = 'identity') +
  scale_fill_discrete(name = 'Damage') +
  ggtitle('Economics consequences') +
  labs(
    x = 'Natural disaster',
    y = 'Cost in Billion US Dollars (Billion USD)'
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90)
  )
print(eco_plot)



Discussion



Across the United States, the worst natural disaster in terms of health consequences is the tornado, and the worst in terms of economic consequences is the flood. In order to reduce the impact of these two weather events, further exploratory (or inferential) analyses could be carried out to help design preventive actions.