Synopsis

In this analysis we examine the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. The goal is to identify, based on the historical data, the weather event types with the greatest impact in terms of deaths, injuries and total damage in dollars.

The explored dataset describes weather events in several respects for the period 1950 to 2011. In the study we extract the relevant data fields and aggregate the data, summing up the deaths, the injuries and the total damage in dollars to property and crops. Three bar charts visualise the total deaths, injuries and total damage over the period by weather event type.

As can be seen, tornadoes have the highest impact in terms of deaths and injuries. The highest economic impact is caused by floods.

Libraries used

library(ggplot2)
library(reshape)

Data processing

In this section we download the data and apply the necessary data processing steps. The data is downloaded from the Storm Data file (see the URL in the download step below). The documentation of the data is available in the National Weather Service Storm Data Documentation.

Downloading the data

# Download the compressed Storm Data file once; later runs reuse the local copy.
if (!file.exists("repdata_data_StormData.csv.bz2")) {
    url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
    # No explicit `method`: the previous method = "curl" fails on machines that
    # have no curl executable on the PATH, while R's default chooses a backend
    # itself. mode = "wb" keeps the bz2 archive byte-exact on Windows.
    download.file(url, destfile = "repdata_data_StormData.csv.bz2", mode = "wb")
}

Reading the downloaded data and caching it in RDS format if the RDS file does not exist

# Build the RDS cache only when it is missing, so the CSV is parsed once.
if (!file.exists("repdata_data_StormData.RDS")) {
    # header = TRUE and sep = "," are read.csv()'s defaults, so they are omitted.
    data <- read.csv("repdata_data_StormData.csv.bz2")
    # Persist the parsed data frame next to the downloaded archive.
    saveRDS(data, file = "repdata_data_StormData.RDS")
}

Loading the data from the RDS file for quicker reading

# Load the cached data frame and report its size (rows, columns).
data <- readRDS("repdata_data_StormData.RDS")
dim(data)
## [1] 902297     37

Subsetting the data to the relevant columns

# Keep only the event type, the casualty counts and the damage value/unit columns.
data <- data[c("EVTYPE",
               "FATALITIES", "INJURIES",
               "PROPDMG", "PROPDMGEXP",
               "CROPDMG", "CROPDMGEXP")]

Creating factor variables for event type

# Treat the event type as a categorical variable for the grouped sums below.
data[["EVTYPE"]] <- as.factor(data[["EVTYPE"]])

Aggregate deaths and injuries by weather event type

# Total fatalities and total injuries per event type.
fatalities_agr <- setNames(aggregate(data$FATALITIES, list(data$EVTYPE), sum),
                           c("Event", "Count"))
injuries_agr <- setNames(aggregate(data$INJURIES, list(data$EVTYPE), sum),
                         c("Event", "Count"))
# Both aggregates are grouped by the same EVTYPE column, so their rows are
# combined side by side into a single summary table.
eventsdisasters <- data.frame(Event_type = fatalities_agr$Event,
                              Fatailities_count = fatalities_agr$Count,
                              Injuries_count = injuries_agr$Count)

Showing the 6 most impactful event types by deaths and injuries

# Sort by fatalities (descending); injuries (descending) break ties.
eventsdisasters <- eventsdisasters[with(eventsdisasters,
                                        order(-Fatailities_count, -Injuries_count)), ]
# Print the top of the ranking (head() shows 6 rows).
head(eventsdisasters, n = 6)
##         Event_type Fatailities_count Injuries_count
## 834        TORNADO              5633          91346
## 130 EXCESSIVE HEAT              1903           6525
## 153    FLASH FLOOD               978           1777
## 275           HEAT               937           2100
## 464      LIGHTNING               816           5230
## 856      TSTM WIND               504           6957

Aggregated weather events by economic damage

In this section we review the economic damage, converting the recorded damage values to dollars using the appropriate unit multipliers.

# Convert damage figures to dollars using their unit codes.
#
# coef:  damage value(s); anything as.numeric() can convert (numbers or
#        numeric strings).
# scale: unit code(s), one per value. "K"/"k" means thousands, "M"/"m" millions
#        and "B"/"b" billions. NA, "?", "" and every other code leave the value
#        unscaled (multiplier 1).
#
# Returns a numeric vector: as.numeric(coef) times the matching multiplier.
# Works on whole columns as well as on single values, so callers are not
# forced to go row by row.
unitscale <- function(coef, scale){
    multipliers <- c(K = 10^3, M = 10^6, B = 10^9)
    # Case-insensitive lookup; match() yields NA for NA and for unknown codes.
    idx <- match(toupper(as.character(scale)), names(multipliers))
    mult <- unname(multipliers[idx])
    # Anything that is not K/M/B keeps the value as recorded.
    mult[is.na(mult)] <- 1
    as.numeric(coef) * mult
}

# Convert each row's recorded damage value and unit code into dollars.
# apply() hands every row to the callback as a character vector named by column.
data$property_damage <- apply(
    data[, c("PROPDMG", "PROPDMGEXP")], 1,
    function(row) unitscale(row[["PROPDMG"]], row[["PROPDMGEXP"]])
)
data$crop_damage <- apply(
    data[, c("CROPDMG", "CROPDMGEXP")], 1,
    function(row) unitscale(row[["CROPDMG"]], row[["CROPDMGEXP"]])
)

Aggregating damage values in dollars

# Total property damage and total crop damage (in dollars) per event type.
propoerty_damage_agr <- setNames(
    aggregate(data$property_damage, list(data$EVTYPE), sum),
    c("Event", "Property_damage_value")
)
crop_damage_agr <- setNames(
    aggregate(data$crop_damage, list(data$EVTYPE), sum),
    c("Event", "Crop_damage_value")
)

Creating the summary dataset and showing the 6 highest-impact event types

# Combine the two aggregates side by side; both are grouped by the same
# EVTYPE column.
eventsdamage_value <- data.frame(Event_type = propoerty_damage_agr[[1]],
                                 Property_damage_value = propoerty_damage_agr[[2]],
                                 Crop_damage_value = crop_damage_agr[[2]])
# Total damage = property damage + crop damage.
eventsdamage_value$Total_damage_value <- with(eventsdamage_value,
                                              Property_damage_value + Crop_damage_value)
# Rank event types from the most to the least costly and print the top rows.
eventsdamage_value <- eventsdamage_value[order(-eventsdamage_value$Total_damage_value), ]
head(eventsdamage_value, n = 6)
##            Event_type Property_damage_value Crop_damage_value
## 170             FLOOD          144657709807        5661968450
## 411 HURRICANE/TYPHOON           69305840000        2607872800
## 834           TORNADO           56937160779         414953270
## 670       STORM SURGE           43323536000              5000
## 244              HAIL           15732267048        3025954473
## 153       FLASH FLOOD           16140812067        1421317100
##     Total_damage_value
## 170       150319678257
## 411        71913712800
## 834        57352114049
## 670        43323541000
## 244        18758221521
## 153        17562129167

Results

In this section we present 3 bar charts showing the highest-impact weather event types in the corresponding categories.

# Order the event-type factor by injuries so the bars appear sorted in the chart.
eventsdisasters$events_order_by_injuries_count <- reorder(eventsdisasters$Event_type,
                                                          eventsdisasters$Injuries_count)
# eventsdisasters is sorted by fatalities, so its first 10 rows are the deadliest
# event types, not the ones with the most injuries. Rank by injuries before
# taking the top 10; eventsdisasters itself keeps its fatalities ordering.
top_injuries <- head(eventsdisasters[order(-eventsdisasters$Injuries_count), ], 10)
p1 <- ggplot(data = top_injuries, aes(x = events_order_by_injuries_count,
                                      y = Injuries_count))
# Horizontal bars: one bar per event type, length = total injuries.
p1 + geom_bar(stat ='identity') + 
    coord_flip() +
    labs(title = '10 Highest impact types of weather\n measured by injuries 1950 - 2011',
         y = 'Number of People', x = 'Weather event type')

As can be clearly seen, tornadoes cause by far the most injuries.

# Order the event-type factor by fatalities so the bars appear sorted in the chart.
eventsdisasters$events_order_by_deaths_count <- with(
    eventsdisasters, reorder(Event_type, Fatailities_count)
)
# Plot the first 10 rows of the summary table as horizontal bars.
p2 <- ggplot(eventsdisasters[1:10, ],
             aes(x = events_order_by_deaths_count, y = Fatailities_count))
p2 +
    geom_bar(stat = "identity") +
    coord_flip() +
    labs(x = "Weather event type",
         y = "Number of People",
         title = "10 Highest impact types of weather\n measured by deaths 1950 - 2011")

As can be clearly seen, tornadoes cause by far the most deaths.

# Order the event-type factor by total damage so the bars appear sorted in the chart.
eventsdamage_value$events_order_by_damage <- with(
    eventsdamage_value, reorder(Event_type, Total_damage_value)
)
# Plot the first 10 rows of the damage table; the y value is scaled to
# billions of dollars.
p3 <- ggplot(eventsdamage_value[1:10, ],
             aes(x = events_order_by_damage, y = Total_damage_value/10^9))
p3 +
    geom_bar(stat = "identity") +
    coord_flip() +
    labs(x = "Weather event type",
         y = "Damage in Billion dollars",
         title = "10 Highest damage events types of weather\n measured by total property and crop damage 1950 - 2011")