Synopsis

The goal of this analysis is to answer the following two questions using the NOAA Storm Database. Across the United States: 1. Which types of events are most harmful with respect to population health? 2. Which types of events have the greatest economic consequences?

The analysis shows that tornadoes are, by far, the type of event that causes the most fatalities and injuries in the population.

The types of event that cause the most damage to property and crops are the various kinds of floods, followed by hurricanes and then tornadoes.

Data Processing

The data is downloaded as a bzip2-compressed CSV file: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2. We read in the raw data as follows (note that we later cache the prepared data so that we don’t have to repeat this expensive operation).

# Fetch the compressed NOAA storm data once, then load it into a data frame.
data_file <- "repdata_data_StormData.csv.bz2"
data_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!file.exists(data_file)) {
  # download.file() requires an explicit destination; without `destfile` the
  # call fails. mode = "wb" keeps the binary bzip2 archive intact on Windows.
  download.file(data_url, destfile = data_file, mode = "wb")
}
# The .bz2 archive is passed straight to read.csv(), as in the original code.
storm.data <- read.csv(data_file, stringsAsFactors = FALSE)

Looking further into the EVTYPE column, we notice some minor inconsistencies, such as leading and trailing spaces, upper-case versus lower-case versions of the same event, and some misspelled or very similar events. We group similar weather events into new categories that reflect a consolidated view of the event types. These categories are stored in a new column named eventgroup.

# Normalise the raw labels: lower-case them, then strip surrounding
# whitespace, so each pattern below only has to cover one spelling case.
storm.data$EVTYPE <- trimws(tolower(storm.data$EVTYPE))

# Ordered map of consolidated group -> regular expression matched against
# EVTYPE. Order matters: groups are applied top to bottom and a later match
# overwrites an earlier one (e.g. a label matching both "ice" and "flood"
# ends up in "Flood").
# NOTE(review): "tstm" is assigned to "Tropical storm" here; it looks like an
# abbreviation of "thunderstorm" - confirm the intended group.
event_patterns <- c(
  "Thunderstorm" = "thunderstorm|thundeerstorm|thuderstorm|thunderestorm|thunderstrom|thundertorm|thundertsorm|thundestorm|thunerstorm|tunderstorm",
  "Winter precipitation" = "blizzard|frost|freeze|freezing|snow|winter|ice|cold|glaze",
  "High wind" = "high wind|strong wind|extreme wind",
  "Flood" = "flood|fldg|fld",
  "Tornado" = "tornado|torndao",
  "Tropical storm" = "tropical storm|tstm",
  "Drought" = "drought|dry",
  "Lightning" = "lightning|lighting|ligntning",
  "Heavy rain" = "heavy rain|heavy shower",
  "Hail" = "hail",
  "Waterspout" = "waterspout|water spout|wayterspout",
  "Heat" = "heat|warm",
  "Hurricane" = "hurricane",
  "Wildfire" = "wildfire|wild fire|forest fire",
  "Rip current" = "rip current",
  "Avalanche" = "avalanche",
  "High surf" = "high surf|heavy surf",
  "Dust storm" = "dust storm",
  "Fog" = "fog",
  "Storm surge" = "storm surge"
)

# Start with every row unassigned, then stamp each group onto its matches.
storm.data$eventgroup <- ""
for (group_name in names(event_patterns)) {
  matched_rows <- grep(event_patterns[[group_name]], storm.data$EVTYPE)
  storm.data$eventgroup[matched_rows] <- group_name
}

# Anything no pattern claimed falls into the catch-all group.
storm.data$eventgroup[storm.data$eventgroup == ""] <- "Other"

Replace the symbol values in the EXP columns with numeric exponents (powers of ten).
* H - Hundred - 10^2
* K - Kilo - 10^3
* M - Mega - 10^6
* B - Billion - 10^9

# Power of ten (as text) -> the symbols that stand for it. Blank, "-", "+",
# "?" and missing values are treated as exponent 0.
exponent_codes <- list(
  "0" = c("", "-", "+", "?", NA),
  "2" = c("h", "H"),
  "3" = c("k", "K"),
  "6" = c("m", "M"),
  "9" = c("b", "B")
)

# Replace each known symbol with its power of ten and return a numeric
# vector; values not listed above are passed to as.numeric() unchanged.
decode_exponent <- function(symbol) {
  for (power in names(exponent_codes)) {
    symbol[symbol %in% exponent_codes[[power]]] <- power
  }
  as.numeric(symbol)
}

storm.data$PROPDMGEXP <- decode_exponent(storm.data$PROPDMGEXP)
storm.data$CROPDMGEXP <- decode_exponent(storm.data$CROPDMGEXP)

Calculate two new columns with the property and crop cost.

# Absolute damage = reported mantissa scaled by ten to the decoded exponent.
storm.data[["propcost"]] <- with(storm.data, PROPDMG * 10^PROPDMGEXP)
storm.data[["cropcost"]] <- with(storm.data, CROPDMG * 10^CROPDMGEXP)

Results

Identify the top 10 most dangerous weather events using the sum of FATALITIES and INJURIES. In this analysis we use the new groups created in the Data Processing section.

# Total fatalities and injuries per consolidated event group, ordered from
# the largest combined casualty count to the smallest.
health <- storm.data %>%
  group_by(eventgroup) %>%
  summarise(
    fatalities = sum(FATALITIES),
    injuries = sum(INJURIES)
  ) %>%
  arrange(desc(fatalities + injuries))
head(health, n = 10)
## # A tibble: 10 x 3
##    eventgroup           fatalities injuries
##    <chr>                     <dbl>    <dbl>
##  1 Tornado                    5633    91364
##  2 Heat                       3178     9243
##  3 Flood                      1552     8683
##  4 Tropical storm              580     7353
##  5 Winter precipitation       1093     6527
##  6 Lightning                   817     5232
##  7 Thunderstorm                211     2477
##  8 High wind                   439     1851
##  9 Wildfire                     90     1606
## 10 Hail                         45     1467

Order the data and plot a graph showing the top 10 most harmful weather event types in terms of killing or injuring people.

# Combined casualty count per event group; this is the quantity plotted.
health$impact <- health$fatalities + health$injuries

# `health` is already sorted by fatalities + injuries, so the first ten rows
# are the ten groups with the highest impact.
top_health <- head(health, 10)

# Bars are ordered from the highest to the lowest impact. The y axis shows
# people killed or injured, not a number of events, so it is labelled so.
ggplot(top_health, aes(x = reorder(eventgroup, -impact), y = impact)) +
  geom_col() +
  labs(
    x = "Event Types",
    y = "Total Fatalities and Injuries",
    title = "Top 10 Weather Events Harmful to Population Health (1950-2011)"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, size = 7, hjust = 1),
    legend.position = "none"
  )

The logic used to calculate the economic impact is the same as that used in the previous section. We calculate the cost of damage to property and crops per group of event types (not the original types) and then plot a graph showing the 10 groups with the most damage, in order.

# Property and crop damage per consolidated event group, in billions
# (sums are divided by 10^9), ordered by total damage.
# The sort key is property + crop cost; the earlier `propcost + propcost`
# ignored crop losses and mis-ranked crop-heavy groups. Re-knit the report to
# refresh the printed table after this change.
damage <- storm.data %>%
  group_by(eventgroup) %>%
  summarise(
    propcost = sum(propcost) / 10^9,
    cropcost = sum(cropcost) / 10^9
  ) %>%
  arrange(desc(propcost + cropcost))
head(damage, n = 10)
## # A tibble: 10 x 3
##    eventgroup           propcost  cropcost
##    <chr>                   <dbl>     <dbl>
##  1 Flood                  168.   12.4     
##  2 Hurricane               84.8   5.52    
##  3 Tornado                 57.0   0.415   
##  4 Storm surge             48.0   0.000855
##  5 Hail                    17.6   3.11    
##  6 Winter precipitation    12.5   8.72    
##  7 Tropical storm          12.2   1.25    
##  8 Wildfire                 8.50  0.403   
##  9 Thunderstorm             6.64  0.653   
## 10 High wind                6.24  0.778
# Total economic cost per event group (property + crop), in billions.
damage$cost <- damage$propcost + damage$cropcost

# Pick the ten costliest groups by the plotted quantity itself, rather than
# relying on whatever order `damage` happens to be in.
top_damage <- head(damage[order(-damage$cost), ], 10)

# Bars are ordered from the highest to the lowest total cost.
# NOTE(review): the axis label assumes the damage figures are US dollars -
# confirm against the data documentation.
ggplot(top_damage, aes(x = reorder(eventgroup, -cost), y = cost)) +
  geom_col() +
  labs(
    x = "Event Types",
    y = "Total Cost (billions of USD)",
    title = "Top 10 Weather Events Harmful to Economic Health (1950-2011)"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, size = 7, hjust = 1),
    legend.position = "none"
  )