Reproducible Research: Peer Assessment 1

Severe weather can impose huge costs on both public health and the economy. Events such as tornadoes, floods, hail and so on should therefore be considered in the risk-mitigation plan of every region that might be affected by such disasters.

This analysis tries to answer the first question asked by any city or regional manager: “Which weather events cause the most damage, in terms of both human casualties and property damage?”

Loading and preprocessing the data

We used open historical data covering all types of weather events, including minor ones (such as “cold/warm weather”).

Because the raw data contains misprints (e.g., 10 different variants of the word “Lightning” in the event type), duplicates (monthly summary records) and damage values encoded in two fields (value and exponent), we have to clean the raw data substantially before starting the analysis:

# Fetch the compressed storm data set and load it into `storm_df`.
# The archive is downloaded only when it is not already on disk, and always in
# binary mode ("wb") so the bz2 file cannot be damaged by a text-mode transfer
# on Windows.
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_file <- "stormdata.csv.bz2"
if (!file.exists(storm_file)) {
  download.file(storm_url, destfile = storm_file, mode = "wb")
}
# NOTE(review): the original render printed a mis-encoded error message after
# the download call (apparently an "unsupported URL scheme" failure) - confirm
# that https downloads work in the target R build.

# read.csv() works fine with compressed files, too - no manual unpacking needed
storm_df <- read.csv(storm_file, stringsAsFactors = FALSE)

# Cleaning the data
# Normalise event names: strip leading & trailing spaces, then upper-case them
# so that spelling variants of the same event compare equal.
storm_df$EVTYPE <- toupper(gsub("(^ +)|( +$)", "", storm_df$EVTYPE))

# Keep a row only when it is not a "SUMMARY ..." record and it reports at
# least some fatalities, injuries, property damage or crop damage.
# which() discards NA results, mirroring how subset() treats them.
keep_rows <- !grepl("^SUMMARY", storm_df$EVTYPE) &
  with(storm_df, FATALITIES + INJURIES + PROPDMG + CROPDMG > 0)
storm_df <- storm_df[which(keep_rows), , drop = FALSE]

# Calculating property and crop damage on one scale.
# The *DMGEXP columns hold a power-of-ten code: "-", "?" and "+" become 0,
# H/h become 2, K/k become 3, M/m become 6 and B/b become 9. Digit characters
# pass through chartr() unchanged and are used as the exponent directly.
# Anything that still fails to parse as a number (e.g. an empty string) is
# treated as exponent 0.
exponent_columns <- c("PROPDMGEXP", "CROPDMGEXP")
storm_df[exponent_columns] <- lapply(storm_df[exponent_columns], function(code) {
  power <- as.numeric(chartr("-?+HhKkMmBb", "00022336699", code))
  power[is.na(power)] <- 0
  power
})

# Damage value = mantissa * 10^exponent
PropertyDamage <- with(storm_df, PROPDMG * 10^PROPDMGEXP)
CropDamage <- with(storm_df, CROPDMG * 10^CROPDMGEXP)

# Append both scaled values as new columns of the data frame
storm_df$PropertyDamage <- PropertyDamage
storm_df$CropDamage <- CropDamage


# Variants of misprints of "THUNDERSTORM" found at the start of EVTYPE values
ThunderstormPattern <- "^(THUNDERSTORM|TUNDERSTORM|THUNDERTORM|THUNERSTORM|THUNDERSTROM|THUNDERTSORM|THUNDESTORM|THUNDEERSTORM|THUNDERESTORM|THUDERSTORM)"

# Adding a new string column, appropriate for our analysis: it starts as a copy
# of EVTYPE and is then overwritten with a category label wherever EVTYPE
# matches one of the patterns below.
EventType <- storm_df$EVTYPE
storm_df$EventType <- as.character(EventType)

# Category label -> regular expression matched against the start of EVTYPE.
# All patterns are anchored on distinct prefixes, so the order of application
# does not matter.
# "TSTM" is the abbreviation of "thunderstorm" used in the raw data (e.g.
# "TSTM WIND"), so it is merged into THUNDERSTORM instead of being kept as a
# separate category that would split thunderstorm totals in two.
event_categories <- c(
  "THUNDERSTORM"   = paste(ThunderstormPattern, "^TSTM", sep = "|"),
  "LIGHTNING"      = "^(LIGHTNING|LIGNTNING)",
  "HURRICANE"      = "^HURRICANE",
  "HIGH WIND"      = "^HIGH WIND",
  "TORNADO"        = "^TORNADO",
  "HEAVY SNOW"     = "^HEAVY SNOW",
  "HEAVY RAIN"     = "^HEAVY RAIN",
  "HAIL"           = "^HAIL",
  "FLOOD"          = "^(FLASH )?FLOOD", # flash floods are counted as floods
  "DRY MICROBURST" = "^DRY MICROBURST"
)
for (category in names(event_categories)) {
  storm_df$EventType[grepl(event_categories[[category]], storm_df$EVTYPE)] <- category
}

# Total fatalities, injuries, property damage and crop damage per category
storm_df_aggregated <- aggregate(
  cbind(FATALITIES, INJURIES, PropertyDamage, CropDamage) ~ EventType,
  data = storm_df,
  FUN = sum
)

Which event types affect people's health the most?

First of all, we need to separate the events that matter most in terms of human casualties from “All other”. If we focus only on those that caused more than 100 fatalities or more than 1000 injuries, we still cover more than 88% of all casualties in our analysis.

You can check this with the pie charts of fatalities and injuries below:

# Group the event types for the casualty charts: every type at or below the
# cut-off (100 fatalities / 1000 injuries) is relabelled "ALL OTHER", and the
# counts are then re-summed per label.
events_most_fatalities <- storm_df_aggregated
events_most_fatalities$EventType <- replace(
  events_most_fatalities$EventType,
  which(events_most_fatalities$FATALITIES <= 100),
  "ALL OTHER"
)
events_most_fatalities <- aggregate(
  FATALITIES ~ EventType,
  data = events_most_fatalities,
  FUN = sum
)

events_most_injuries <- storm_df_aggregated
events_most_injuries$EventType <- replace(
  events_most_injuries$EventType,
  which(events_most_injuries$INJURIES <= 1000),
  "ALL OTHER"
)
events_most_injuries <- aggregate(
  INJURIES ~ EventType,
  data = events_most_injuries,
  FUN = sum
)

library(ggplot2)
library(RColorBrewer)
library(scales)

# Pie chart of fatalities per event type
# One fill colour is needed per row, so the 12-colour "Paired" palette is
# interpolated to that many colours.
pallette_length_f <- nrow(events_most_fatalities)
total_fatalities <- sum(events_most_fatalities$FATALITIES)
ggplot(events_most_fatalities, aes(x = "", y = FATALITIES, fill = EventType)) +
  geom_bar(width = 1, stat = "identity") +
  # a single stacked bar drawn in polar coordinates becomes a pie chart
  coord_polar(theta = "y", start = pi / 2) +
  scale_fill_manual(values = colorRampPalette(brewer.pal(12, "Paired"))(pallette_length_f)) +
  # tick marks every 2000 fatalities around the circumference
  scale_y_continuous(breaks = seq(0, total_fatalities, 2000)) +
  labs(title = "Fatalities distribution per Event Type")

plot of chunk peoplehealthaffect

# Pie chart of injuries per event type
pallette_length_i <- nrow(events_most_injuries)
total_injuries <- sum(events_most_injuries$INJURIES)
# One fill colour per row, interpolated from the 12-colour "Paired" palette
injury_colours <- colorRampPalette(brewer.pal(12, "Paired"))(pallette_length_i)
# Tick marks every 15000 injuries around the circumference
injury_breaks <- seq(from = 0, to = total_injuries, by = 15000)
ggplot(events_most_injuries, aes(x = "", y = INJURIES, fill = EventType)) +
  geom_bar(stat = "identity", width = 1) +
  # a single stacked bar drawn in polar coordinates becomes a pie chart
  coord_polar(theta = "y", start = pi / 2) +
  scale_fill_manual(values = injury_colours) +
  scale_y_continuous(breaks = injury_breaks) +
  labs(title = "Injuries distribution per Event Type")

plot of chunk peoplehealthaffect

# Same for ppl casualties! +OVERALL SUM / 88% explanation

Which event types affect the country's economy the most?

To cover most of the total economic loss, we can focus on the event types that caused more than 10 billion dollars of damage (property and crop damage combined).

Here is the pie chart:

# Group the event types for the damage chart: every type whose combined
# property + crop damage is at most 1e10 dollars is relabelled "ALL OTHER",
# and both damage columns are then re-summed per label.
events_most_damage <- storm_df_aggregated
minor_damage <- with(events_most_damage, PropertyDamage + CropDamage <= 1e10)
events_most_damage$EventType <- replace(
  events_most_damage$EventType,
  which(minor_damage),
  "ALL OTHER"
)
events_most_damage <- aggregate(
  cbind(PropertyDamage, CropDamage) ~ EventType,
  data = events_most_damage,
  FUN = sum
)

library(ggplot2)
library(RColorBrewer)
library(scales)

# Pie chart of total (property + crop) damage per event type
# One fill colour is needed per row, so the 12-colour "Paired" palette is
# interpolated to that many colours.
pallette_length_d <- nrow(events_most_damage)
total_damage <- sum(events_most_damage$PropertyDamage) + sum(events_most_damage$CropDamage)
# Damage is plotted in units of 1e10 dollars, i.e. tens of billions
ggplot(events_most_damage, aes(x = "", y = (PropertyDamage + CropDamage) / 1e10, fill = EventType)) +
  geom_bar(width = 1, stat = "identity") +
  # a single stacked bar drawn in polar coordinates becomes a pie chart
  coord_polar(theta = "y", start = pi / 2) +
  scale_fill_manual(values = colorRampPalette(brewer.pal(12, "Paired"))(pallette_length_d)) +
  scale_y_continuous("Total damage, Tens of Billions $", breaks = seq(0, total_damage / 1e10, 10)) +
  labs(title = "Total damage distribution per Event Type")

plot of chunk economicalaffect

Results

As we can see from the given plots:

  1. Tornadoes account for the largest share of harm to people's health, so our efforts on public alerts, timely evacuation, etc. should be focused on protection from tornadoes. The given dataset doesn't contain a usable classification of tornadoes into categories, so a more detailed analysis is impossible - but I guess that F4-F5 tornadoes are the most dangerous ones.
  2. The financial loss analysis gives a different result: floods are the most “expensive” weather event according to our observation data. So infrastructure improvements aimed at enhancing the protection of buildings from floods look like the most reasonable approach to minimizing the economic effect of weather disasters.