In this project, we explore the U.S. National Oceanic and Atmospheric Administration’s (NOAA) Storm Database. The purpose of this project is to extract information about the public health and economic consequences of major storms and weather events. This exploratory analysis concerns weather events that occur across the United States.
This project has to answer each of these questions:
Across the United States, which types of events are most harmful with respect to population health ?
Across the United States, which types of events have the greatest economic consequences ?
# Print the details of the R build this analysis was run with.
R.version
## _
## platform x86_64-pc-linux-gnu
## arch x86_64
## os linux-gnu
## system x86_64, linux-gnu
## status
## major 3
## minor 2.3
## year 2015
## month 12
## day 10
## svn rev 69752
## language R
## version.string R version 3.2.3 (2015-12-10)
## nickname Wooden Christmas-Tree
Packages used
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Get the data and put it into a cache
We need to get the data from the course web site.
# Fetch the compressed storm data once; later runs reuse the local copy.
if (!file.exists('data.csv.bz2')) {
  download.file(
    url = 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2',
    destfile = 'data.csv.bz2'
  )
}
# read.csv() reads the bzip2-compressed file directly.
cache <- read.csv(file = 'data.csv.bz2')
We need a dataset called ‘health_data’ in order to plot an exploratory analysis of the health consequences of natural disasters.
Step 1 : Getting the cache
# Start the health analysis from the data held in `cache`.
health_data <- cache
Step 2 : Sum and get the total
Summing the number of injured and killed persons for each natural disaster.
# Build one long table: total injuries per event type followed by total
# fatalities per event type, each row tagged with the kind of harm it counts.
sum_by_event <- function(column, label) {
  totals <- aggregate(
    health_data[ , column],
    by = list(EVTYPE = health_data[ , 'EVTYPE']),
    FUN = sum
  )
  cbind(totals, DMG = rep(label, nrow(totals)))
}
health_data <- rbind(
  sum_by_event('INJURIES', 'Injuried'),
  sum_by_event('FATALITIES', 'Killed')
)
names(health_data) <- c('EVTYPE', 'COST', 'DMG')
# The helper is only needed here; keep the workspace clean.
rm(sum_by_event)
Step 3 : Creation of a rank column, and arrange the result
To make the results easier to read, we create a ‘RANK’ column that ranks each event type by its total ‘COST’, then sort the rows by ‘RANK’ and by ‘DMG’.
# Rank every event type by its combined total (injured + killed), then attach
# that rank to each row so that both rows of an event type share it.
# The temporary table is called `event_rank` rather than `subset`, which would
# mask base::subset().
event_rank <- aggregate(
  health_data[ , 'COST'],
  by = list(EVTYPE = health_data[ , 'EVTYPE']),
  FUN = sum
)
names(event_rank) <- c('EVTYPE', 'COST')
# Largest total first; order() is stable, so ties keep their EVTYPE order.
event_rank <- event_rank[order(-event_rank[ , 'COST']), ]
# seq_len() stays correct for zero rows, where 1:nrow() would give c(1, 0).
event_rank[ , 'RANK'] <- seq_len(nrow(event_rank))
event_rank <- event_rank[ , c('EVTYPE', 'RANK')]
health_data <- merge(health_data, event_rank, by = 'EVTYPE')
health_data <- health_data[ with(health_data, order(RANK, DMG)), ]
rm(event_rank)
head(health_data)
## EVTYPE COST DMG RANK
## 1659 TORNADO 91346 Injuried 1
## 1660 TORNADO 5633 Killed 1
## 245 EXCESSIVE HEAT 6525 Injuried 2
## 246 EXCESSIVE HEAT 1903 Killed 2
## 1708 TSTM WIND 6957 Injuried 3
## 1707 TSTM WIND 504 Killed 3
We need a dataset called ‘eco_data’ in order to plot an exploratory analysis of the economic consequences of natural disasters.
Step 1 : Getting the cache
# Start the economic analysis from the data held in `cache`.
eco_data <- cache
Step 2 : Getting the absolute economics costs
The variables ‘PROPDMGEXP’ and ‘CROPDMGEXP’ hold the powers of ten for the values in ‘PROPDMG’ and ‘CROPDMG’. We will clean these power variables ( “K” = thousands, “M” = millions, and “B” = billions ) in order to get the economic value of each natural disaster event in a single column. The rows with empty values in ‘PROPDMGEXP’ or ‘CROPDMGEXP’ will be excluded from our analysis.
# Convert the damage figures to absolute amounts.
# Each *DMGEXP column holds an exponent code: a letter (upper or lower case)
# mapped to a power of ten, or a digit that is used directly as the power.
# Rows whose code is not recognised (for example an empty string) are dropped.
# NOTE(review): a row is dropped when EITHER code is unrecognised, so an event
# with a valid property figure but no crop code loses its property damage too.
scale_damage <- function(data, value_col, exp_col, letter_powers) {
  codes <- toupper(as.character(data[ , exp_col]))
  powers <- c(letter_powers, setNames(0:9, as.character(0:9)))
  # Exact matching through a logical mask: a negated grep() index selects
  # nothing when there is no match at all, and an unanchored pattern would
  # also accept longer strings that merely contain a valid character.
  known <- codes %in% names(powers)
  data <- data[known, ]
  data[ , value_col] <- data[ , value_col] * 10^unname(powers[codes[known]])
  # The exponent column has been folded into the value column; drop it.
  data[ , names(data) != exp_col, drop = FALSE]
}
eco_data <- scale_damage(
  eco_data, 'PROPDMG', 'PROPDMGEXP',
  c(H = 2, K = 3, M = 6, B = 9)
)
eco_data <- scale_damage(
  eco_data, 'CROPDMG', 'CROPDMGEXP',
  c(K = 3, M = 6, B = 9)
)
# The helper is only needed here; keep the workspace clean.
rm(scale_damage)
Step 3 : Sum and get the total
Summing the property and crop costs for each natural disaster.
# Build one long table: total property damage per event type followed by total
# crop damage per event type, each row tagged with the kind of damage.
sum_by_event <- function(column, label) {
  totals <- aggregate(
    eco_data[ , column],
    by = list(EVTYPE = eco_data[ , 'EVTYPE']),
    FUN = sum
  )
  cbind(totals, DMG = rep(label, nrow(totals)))
}
eco_data <- rbind(
  sum_by_event('PROPDMG', 'Property'),
  sum_by_event('CROPDMG', 'Crop')
)
names(eco_data) <- c('EVTYPE', 'COST', 'DMG')
# The helper is only needed here; keep the workspace clean.
rm(sum_by_event)
Step 4 : Creation of a rank column, and arrange the result
To make the results easier to read, we create a ‘RANK’ column that ranks each event type by its total ‘COST’, then sort the rows by ‘RANK’ and by ‘DMG’.
# Rank every event type by its combined cost (property + crop), then attach
# that rank to each row so that both rows of an event type share it.
# The temporary table is called `event_rank` rather than `subset`, which would
# mask base::subset().
event_rank <- aggregate(
  eco_data[ , 'COST'],
  by = list(EVTYPE = eco_data[ , 'EVTYPE']),
  FUN = sum
)
names(event_rank) <- c('EVTYPE', 'COST')
# Largest total first; order() is stable, so ties keep their EVTYPE order.
event_rank <- event_rank[order(-event_rank[ , 'COST']), ]
# seq_len() stays correct for zero rows, where 1:nrow() would give c(1, 0).
event_rank[ , 'RANK'] <- seq_len(nrow(event_rank))
event_rank <- event_rank[ , c('EVTYPE', 'RANK')]
eco_data <- merge(eco_data, event_rank, by = 'EVTYPE')
eco_data <- eco_data[ with(eco_data, order(RANK, DMG)), ]
rm(event_rank)
Step 5 : Express the economic costs in billions of US dollars
# Divide every cost by one billion, then preview the top rows.
eco_data[['COST']] <- eco_data[['COST']] / 1e9
head(eco_data)
## EVTYPE COST DMG RANK
## 46 FLOOD 132.8364891 Property 1
## 45 FLOOD 5.1709555 Crop 1
## 123 HURRICANE/TYPHOON 26.7402950 Property 2
## 124 HURRICANE/TYPHOON 2.6078728 Crop 2
## 198 TORNADO 16.1669467 Property 3
## 197 TORNADO 0.4033796 Crop 3
# Keep the first 20 rows: the ten top-ranked event types, two rows each
# (injured and killed). head() never pads with NA rows when fewer exist.
health_data <- head(health_data, 20)
# The x ordering is computed inside aes() from the data itself, so the plot
# object does not depend on a temporary global and can be re-printed later.
health_plot <- ggplot(
  health_data,
  aes(x = reorder(EVTYPE, RANK), y = COST, fill = DMG)
) +
  geom_bar(stat = 'identity') +
  scale_fill_discrete(name = 'Damage') +
  ggtitle('Health consequences') +
  labs(x = 'Natural disaster', y = 'Number of person') +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90)
  )
print(health_plot)
# Keep the first 20 rows: the ten top-ranked event types, two rows each
# (property and crop). head() never pads with NA rows when fewer exist.
eco_data <- head(eco_data, 20)
# The x ordering is computed inside aes() from the data itself, so the plot
# object does not depend on a temporary global and can be re-printed later.
eco_plot <- ggplot(
  eco_data,
  aes(x = reorder(EVTYPE, RANK), y = COST, fill = DMG)
) +
  geom_bar(stat = 'identity') +
  scale_fill_discrete(name = 'Damage') +
  ggtitle('Economics consequences') +
  labs(x = 'Natural disaster', y = 'Cost in Billion US Dollars (Billion USD)') +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90)
  )
print(eco_plot)
Across the United States, the worst natural disaster in terms of health consequences is the tornado, and the worst in terms of economic consequences is the flood. In order to reduce the impact of these two weather events, we could carry out further exploratory (or inferential) analyses to design preventive actions.