Introduction

Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.

This project explores the NOAA Storm Database to identify which types of severe weather events are most harmful with respect to population health and which have the greatest economic consequences.

Research questions:

  1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
  2. Across the United States, which types of events have the greatest economic consequences?

Set up

Load packages

library(ggplot2)
library(readr)
library(dplyr)

Load data

# Read the raw NOAA storm data.
# The damage-exponent columns are pinned to character. With readr's default
# type guessing CROPDMGEXP was parsed as logical (only NA/FALSE survived), which
# discards exponent codes such as "K", "M" and "B". PROPDMGEXP is pinned too so
# that both columns are always read the same way.
# NOTE(review): the path is absolute and machine-specific; consider a
# project-relative path so the report can be reproduced elsewhere.
storm <- read_csv(
  "C:/Users/tuuye/Desktop/Data Science course/Reproducible Research/repdata_data_StormData.csv",
  col_types = cols(
    PROPDMGEXP = col_character(),
    CROPDMGEXP = col_character()
  )
)

Exploratory Data Analysis

First, we will check the dimensions as well as the first six rows of the dataset.

# Number of rows and columns in the raw dataset
dim(storm)
## [1] 902297     37

The dataset has 902297 rows and 37 variables

# Preview the first six rows together with the parsed column types
head(storm)
## # A tibble: 6 x 37
##   STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
##     <dbl> <chr>    <chr>    <chr>      <dbl> <chr>      <chr> <chr> 
## 1       1 4/18/19~ 0130     CST           97 MOBILE     AL    TORNA~
## 2       1 4/18/19~ 0145     CST            3 BALDWIN    AL    TORNA~
## 3       1 2/20/19~ 1600     CST           57 FAYETTE    AL    TORNA~
## 4       1 6/8/195~ 0900     CST           89 MADISON    AL    TORNA~
## 5       1 11/15/1~ 1500     CST           43 CULLMAN    AL    TORNA~
## 6       1 11/15/1~ 2000     CST           77 LAUDERDALE AL    TORNA~
## # ... with 29 more variables: BGN_RANGE <dbl>, BGN_AZI <lgl>,
## #   BGN_LOCATI <lgl>, END_DATE <lgl>, END_TIME <lgl>, COUNTY_END <dbl>,
## #   COUNTYENDN <lgl>, END_RANGE <dbl>, END_AZI <lgl>, END_LOCATI <lgl>,
## #   LENGTH <dbl>, WIDTH <dbl>, F <dbl>, MAG <dbl>, FATALITIES <dbl>,
## #   INJURIES <dbl>, PROPDMG <dbl>, PROPDMGEXP <chr>, CROPDMG <dbl>,
## #   CROPDMGEXP <lgl>, WFO <lgl>, STATEOFFIC <lgl>, ZONENAMES <lgl>,
## #   LATITUDE <dbl>, LONGITUDE <dbl>, LATITUDE_E <dbl>, LONGITUDE_ <dbl>,
## #   REMARKS <lgl>, REFNUM <dbl>

The analysis of damage is based on EVTYPE (event type), FATALITIES, INJURIES, PROPDMG (property damage), PROPDMGEXP (property damage exponent), CROPDMG (crop damage), and CROPDMGEXP (crop damage exponent). So, we will prepare the data for analysis as follows:

# Keep only the event type plus the health and damage columns used below
data <- storm[c(
  "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)]

# Inspect the reduced dataset
head(data)
## # A tibble: 6 x 7
##   EVTYPE  FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
##   <chr>        <dbl>    <dbl>   <dbl> <chr>        <dbl> <lgl>     
## 1 TORNADO          0       15    25   K                0 NA        
## 2 TORNADO          0        0     2.5 K                0 NA        
## 3 TORNADO          0        2    25   K                0 NA        
## 4 TORNADO          0        2     2.5 K                0 NA        
## 5 TORNADO          0        2     2.5 K                0 NA        
## 6 TORNADO          0        6     2.5 K                0 NA

1. Across the United States, which types of events are most harmful with respect to population health?

# Total fatalities and total injuries recorded for each event type
fatalities <- aggregate(FATALITIES ~ EVTYPE, data = data, FUN = sum)
injuries <- aggregate(INJURIES ~ EVTYPE, data = data, FUN = sum)

# Keep the 20 event types with the most fatalities, largest first, and fix the
# factor levels in that order so that plots preserve the ranking
fatalities <- fatalities[order(-fatalities[["FATALITIES"]]), ][seq_len(20), ]
fatalities$EVTYPE <- factor(fatalities$EVTYPE, levels = fatalities$EVTYPE)
head(fatalities)
##             EVTYPE FATALITIES
## 826        TORNADO       5633
## 122 EXCESSIVE HEAT       1903
## 145    FLASH FLOOD        978
## 267           HEAT        937
## 456      LIGHTNING        816
## 848      TSTM WIND        504
# Keep the 20 event types with the most injuries, largest first, and fix the
# factor levels in that order so that plots preserve the ranking
injuries <- injuries[order(-injuries[["INJURIES"]]), ][seq_len(20), ]
injuries$EVTYPE <- factor(injuries$EVTYPE, levels = injuries$EVTYPE)
head(injuries)
##             EVTYPE INJURIES
## 826        TORNADO    91346
## 848      TSTM WIND     6957
## 162          FLOOD     6789
## 122 EXCESSIVE HEAT     6525
## 456      LIGHTNING     5230
## 267           HEAT     2100
# Use the black-and-white theme for these plots. theme_set() is called as a
# statement of its own: passed inside aes() it is captured as an unnamed
# aesthetic mapping instead of being applied as a theme.
theme_set(theme_bw())

# Fatalities for the 20 deadliest event types (bars follow the factor order)
p1 <- ggplot(fatalities, aes(x = EVTYPE, y = FATALITIES)) +
  geom_col(fill = "blue") +
  labs(
    title = "Fatalities by top 20 Weather Event Types",
    x = "Event Type",
    y = "Fatalities"
  ) +
  theme(
    plot.title = element_text(size = 10),
    axis.text.x = element_text(angle = 90, hjust = 1, size = 6)
  )

# Injuries for the 20 most injurious event types
p2 <- ggplot(injuries, aes(x = EVTYPE, y = INJURIES)) +
  geom_col(fill = "darkgreen") +
  labs(
    title = "Injuries by top 20 Weather Event Types",
    x = "Event Type",
    y = "Injuries"
  ) +
  theme(
    plot.title = element_text(size = 10),
    axis.text.x = element_text(angle = 90, hjust = 1, size = 6)
  )

# Show both charts side by side under a common heading
library(gridExtra)
grid.arrange(
  p1, p2,
  ncol = 2,
  top = "Most Harmful Events with Respect to Population Health"
)

—> Tornadoes are the event type with the highest number of both fatalities and injuries.

2. Across the United States, which types of events have the greatest economic consequences?

# List the distinct codes present in each damage-exponent column
unique(data$PROPDMGEXP)
##  [1] "K" "M" NA  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
## [18] "1" "8"
unique(data$CROPDMGEXP)
## [1]    NA FALSE

We can see that both numerical and alphabetical characters are used to represent the power of ten. For example, “8” would be 10^8 and “H” or “h” would be hundreds. We will now convert the PROPDMGEXP and CROPDMGEXP fields to tangible numbers, where H (hundreds = 10^2), K (thousands = 10^3), M (millions = 10^6), and B (billions = 10^9), based on the Wikipedia power-of-10 table.

# Exponent codes found in PROPDMGEXP / CROPDMGEXP and the power of ten each one
# stands for: digits map to themselves, H/h = 2, K/k = 3, M/m = 6, B/b = 9.
symbol <- c(
  "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
  "H", "K", "M", "B", "h", "k", "m", "b"
)
# The powers are built inline so that no variable named `factor` masks
# base::factor(), which is called further down.
multiplier <- data.frame(
  symbol = symbol,
  factor = c(0:9, 2, 3, 6, 9, 2, 3, 6, 9),
  stringsAsFactors = FALSE
)

# Convert a damage amount and its exponent code into dollars.
#   amount:        numeric vector of damage figures
#   exponent_code: vector of exponent codes, same length as `amount`
# Returns `amount * 10^power`. Missing or unrecognised codes (NA, "+", "?",
# "-") use a power of 0, i.e. a multiplier of 1, so the row is kept instead of
# turning into NA and being dropped by aggregate().
damage_in_dollars <- function(amount, exponent_code) {
  code_position <- match(as.character(exponent_code), multiplier$symbol)
  power <- multiplier$factor[code_position]
  power[is.na(power)] <- 0
  amount * 10^power
}

# Exponent codes can only be decoded from a character column. If CROPDMGEXP was
# parsed as another type (e.g. logical), say so instead of failing silently.
if (!is.character(data$CROPDMGEXP)) {
  warning(
    "CROPDMGEXP is not a character column; its exponent codes cannot be ",
    "decoded and crop damage will be left unscaled.",
    call. = FALSE
  )
}

data$PROPDMGDOLLARS <- damage_in_dollars(data$PROPDMG, data$PROPDMGEXP)
# Crop damage is scaled by its own exponent column, not by PROPDMGEXP.
data$CROPDMGDOLLARS <- damage_in_dollars(data$CROPDMG, data$CROPDMGEXP)

# Organize Property & Crop to Event Type and store in object called "economicconsequenses"
economicconsequenses <- aggregate(
  PROPDMGDOLLARS + CROPDMGDOLLARS ~ EVTYPE,
  data = data,
  FUN = sum
)
names(economicconsequenses) <- c("EVENT_TYPE", "TOTAL_DAMAGE")

# Sort: keep the 20 costliest event types, largest first, and fix the factor
# levels in that order so that the plot preserves the ranking
economicconsequenses <- economicconsequenses[
  order(-economicconsequenses$TOTAL_DAMAGE),
][1:20, ]
economicconsequenses$EVENT_TYPE <- factor(
  economicconsequenses$EVENT_TYPE,
  levels = economicconsequenses$EVENT_TYPE
)

## Check headers
head(economicconsequenses)
##            EVENT_TYPE TOTAL_DAMAGE
## 168         HURRICANE 814750235010
## 176 HURRICANE/TYPHOON 802074291330
## 58              FLOOD 231909682070
## 330           TORNADO  85217252847
## 46        FLASH FLOOD  55687860812
## 278       STORM SURGE  43328536000
# Plot
# Use the black-and-white theme. theme_set() is called as a statement of its
# own: passed inside aes() it is captured as an unnamed aesthetic mapping
# instead of being applied as a theme.
theme_set(theme_bw())

# Total property + crop damage for the 20 costliest event types
ggplot(economicconsequenses, aes(x = EVENT_TYPE, y = TOTAL_DAMAGE)) +
  geom_col(fill = "purple") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 6)) +
  xlab("Event Type") +
  ylab("Total Damage in $USD") +
  ggtitle("Total Property & Crop Damage by top 20 Weather Events")