1: Data Introduction

The data for this assignment come from the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database, which describes the impact of storms in terms of fatalities, injuries and property damage. It lists when each storm occurred, together with estimates of any resulting fatalities, injuries and property damage.

The data set can be downloaded from the dataset link.

For a detailed explanation of the individual parameters, please refer to the documentation link.

2: Data Processing

2.1: Environment Setup

Obtain the raw data file and load its contents into a data frame for filtering.

## load necessary packages
library(data.table)
library(ggplot2)
library(dplyr)

# Download the file once; later runs reuse the local copy instead of
# fetching the archive again.
PA02 <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!file.exists("StormData.csv.bz2")) {
  # The archive is binary, so request a binary transfer explicitly.
  download.file(PA02, "StormData.csv.bz2", mode = "wb")
}

# read.csv() is handed the compressed file directly.
stormF <- read.csv("StormData.csv.bz2")

# Converting data.frame to data.table
stormT <- as.data.table(stormF)

2.2: Verify Data Structure & Column Names

# Keep at most the first 547,362 rows of the extracted file and drop the rest.
# NOTE(review): the cutoff is hard-coded; confirm it still marks the rows
# that should be discarded if the source file ever changes.
# seq_len(min(...)) prevents the subset from being padded with all-NA rows
# when the table holds fewer rows than the cutoff.
stormT <- stormT[seq_len(min(547362L, nrow(stormT))), ]

# Convert the crop/property damage figures and the casualty counts into
# numeric values. (as.numeric() has no na.rm argument, so none is passed.)
numCols <- c("PROPDMG", "CROPDMG", "FATALITIES", "INJURIES")
stormT[, (numCols) := lapply(.SD, as.numeric), .SDcols = numCols]

str(stormT)
## Classes 'data.table' and 'data.frame':   547362 obs. of  37 variables:
##  $ STATE__   : chr  "1.00" "1.00" "1.00" "1.00" ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : chr  "97.00" "3.00" "57.00" "89.00" ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : chr  "0.00" "0.00" "0.00" "0.00" ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: chr  "0.00" "0.00" "0.00" "0.00" ...
##  $ COUNTYENDN: chr  "" "" "" "" ...
##  $ END_RANGE : chr  "0.00" "0.00" "0.00" "0.00" ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : chr  "14.00" "2.00" "0.10" "0.00" ...
##  $ WIDTH     : chr  "100.00" "150.00" "123.00" "100.00" ...
##  $ F         : chr  "3" "2" "2" "2" ...
##  $ MAG       : chr  "0.00" "0.00" "0.00" "0.00" ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : chr  "3040.00" "3042.00" "3340.00" "3458.00" ...
##  $ LONGITUDE : chr  "8812.00" "8755.00" "8742.00" "8626.00" ...
##  $ LATITUDE_E: chr  "3051.00" "0.00" "0.00" "0.00" ...
##  $ LONGITUDE_: chr  "8806.00" "0.00" "0.00" "0.00" ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : chr  "1.00" "2.00" "3.00" "4.00" ...
##  - attr(*, ".internal.selfref")=<externalptr>

2.3: Data Wrangling

Subset the data to the parameters of interest only: in this case the event type, fatalities, injuries, and property and crop damage.

# Drop the placeholder event type "?" and every record that reports neither
# casualties nor damage, then keep only the columns used in the analysis.
stormT <- stormT %>%
  filter(
    EVTYPE != "?",
    INJURIES > 0 | FATALITIES > 0 | PROPDMG > 0 | CROPDMG > 0
  ) %>%
  select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)

# Upper-case both damage-exponent columns so that mixed-case codes
# (e.g. "k" and "K") are treated as the same code.
exp <- c("PROPDMGEXP", "CROPDMGEXP")
stormT[, (exp) := lapply(.SD, toupper), .SDcols = exp]
              
# Lookup table: property-damage exponent code -> numeric multiplier.
# Note the first key is the literal two-character string `""` (two quote
# marks), not the empty string.
propDmgExp <- c(
  # Codes that leave the reported figure unchanged.
  setNames(rep(10^0, 4), c("\"\"", "-", "+", "0")),
  # Digit codes: "n" means 10^n.
  setNames(10^(1:9), as.character(1:9)),
  # Letter codes: hundreds, thousands, millions, billions.
  c("H" = 10^2, "K" = 10^3, "M" = 10^6, "B" = 10^9)
)

# Lookup table: crop-damage exponent code -> numeric multiplier.
# As with the property table, the first key is the literal string `""`
# (two quote marks), not the empty string.
cropDmgExp <- setNames(
  c(10^0, 10^0, 10^0, 10^3, 10^6, 10^9),
  c("\"\"", "?", "0", "K", "M", "B")
)
                
# Property damage: swap each exponent code for its multiplier. Codes with no
# entry in the lookup table come back as NA and are then given the default
# multiplier of 1 (10^0).
stormT[, PROPDMGEXP := propDmgExp[as.character(PROPDMGEXP)]]
stormT[is.na(PROPDMGEXP), PROPDMGEXP := 10^0]

# Crop damage: same two steps with the crop lookup table.
stormT[, CROPDMGEXP := cropDmgExp[as.character(CROPDMGEXP)]]
stormT[is.na(CROPDMGEXP), CROPDMGEXP := 10^0]

2.4: Calculate Economic/Injury Summary.

# Add per-record cost columns: the reported damage figure times its multiplier.
stormT <- stormT %>%
  mutate(Cost_Prop = as.numeric(PROPDMG) * PROPDMGEXP,
         Cost_Crop = as.numeric(CROPDMG) * CROPDMGEXP)

# Summarise property cost, crop cost and their total per event type.
# na.rm = TRUE keeps a single missing damage figure from turning an event
# type's whole total into NA. Inside .(), Cost_Prop and Cost_Crop refer to
# the per-record columns, not to the summary values being built.
Cost_Total <- stormT[, .(Cost_Prop = sum(Cost_Prop, na.rm = TRUE),
                         Cost_Crop = sum(Cost_Crop, na.rm = TRUE),
                         Cost_Total = sum(Cost_Prop, na.rm = TRUE) +
                           sum(Cost_Crop, na.rm = TRUE)),
                     by = .(EVTYPE)]

# Sort by total cost, highest first, and keep the top 10 event types.
# head() returns fewer rows when fewer than 10 event types exist, whereas
# [1:10, ] would pad the result with NA rows.
Cost_Total <- Cost_Total %>%
  arrange(desc(Cost_Total))

Cost_Total <- head(Cost_Total, 10)

# Check the structure by only showing the top 5 EVTYPE which caused high costs.
head(Cost_Total, 5)
##               EVTYPE   Cost_Prop  Cost_Crop  Cost_Total
## 1:           TORNADO 40987926487  216014370 41203940857
## 2: HURRICANE/TYPHOON 19403415000  586770800 19990185800
## 3:             FLOOD 12338106477 2333949550 14672056027
## 4:         HURRICANE  9400719010 2561400000 11962119010
## 5:           DROUGHT   845298000 9860245000 10705543000
# Summarise fatalities, injuries and their combined total per event type.
# na.rm = TRUE keeps a single missing count from turning an event type's
# whole total into NA. Inside .(), FATALITIES and INJURIES refer to the
# per-record columns, not to the summary values being built.
Inj_Tot <- stormT[, .(FATALITIES = sum(FATALITIES, na.rm = TRUE),
                      INJURIES = sum(INJURIES, na.rm = TRUE),
                      TOTAL_INJURIES = sum(FATALITIES, na.rm = TRUE) +
                        sum(INJURIES, na.rm = TRUE)),
                  by = .(EVTYPE)]

# Sort by fatality count, highest first, and keep the top 10 event types.
# head() returns fewer rows when fewer than 10 event types exist, whereas
# [1:10, ] would pad the result with NA rows.
Inj_Tot <- Inj_Tot[order(-FATALITIES), ]
Inj_Tot <- head(Inj_Tot, 10)
head(Inj_Tot, 5)
##            EVTYPE FATALITIES INJURIES TOTAL_INJURIES
## 1:        TORNADO       4658    80084          84742
## 2: EXCESSIVE HEAT       1416     4354           5770
## 3:           HEAT        708      878           1586
## 4:      LIGHTNING        562     3628           4190
## 5:    FLASH FLOOD        559     1407           1966

3: Results

3.1: Economic Chart Summary

This chart displays and ranks the events that caused the highest economic impact in terms of crop and property damage.

# Reshape the cost summary to long format: one row per event type and cost
# column, so each cost column can be drawn as its own bar.
eimpact <- melt(
  data = Cost_Total,
  id.vars = "EVTYPE",
  variable.name = "Damage_Type",
  value.name = "value"
)
head(eimpact, n = 5)
##               EVTYPE Damage_Type       value
## 1:           TORNADO   Cost_Prop 40987926487
## 2: HURRICANE/TYPHOON   Cost_Prop 19403415000
## 3:             FLOOD   Cost_Prop 12338106477
## 4:         HURRICANE   Cost_Prop  9400719010
## 5:           DROUGHT   Cost_Prop   845298000
# Generate Chart
# Dodged bars per event type, one bar per Damage_Type level, with events
# ordered along the x axis by reorder(EVTYPE, -value).
# The y axis shows the dollar cost computed above, so it is labelled as a
# cost rather than as a frequency.
econChart <- ggplot(eimpact, aes(x = reorder(EVTYPE, -value), y = value)) +
  geom_bar(stat = "identity", aes(fill = Damage_Type), position = "dodge") +
  labs(x = "Event Types", y = "Damage Cost (USD)",
       title = "Top 10 US Storm Events with Strong Economic Impact") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
        axis.text.y = element_text(size = 8),
        axis.title = element_text(size = 8),
        plot.title = element_text(hjust = 0.5, size = 11),
        legend.title = element_text(size = 7),
        legend.text = element_text(size = 7)) +
  # One colour per Damage_Type level.
  scale_fill_manual(values = c("#45B39D", "#8A14B3", "#E02A5F"))

econChart

3.2: Fatality Chart Summary

This chart displays and ranks the events that caused the most severe fatalities and injuries among the population.

# Reshape the casualty summary to long format: one row per event type and
# casualty column, so each column can be drawn as its own bar.
chaos <- melt(
  data = Inj_Tot,
  id.vars = "EVTYPE",
  variable.name = "Chaos_Level",
  value.name = "value"
)
head(chaos, n = 5)
##            EVTYPE Chaos_Level value
## 1:        TORNADO  FATALITIES  4658
## 2: EXCESSIVE HEAT  FATALITIES  1416
## 3:           HEAT  FATALITIES   708
## 4:      LIGHTNING  FATALITIES   562
## 5:    FLASH FLOOD  FATALITIES   559
# Generate Chart
# Dodged bars per event type, one bar per Chaos_Level level, with events
# ordered along the x axis by reorder(EVTYPE, -value).
# The y axis shows summed fatality/injury counts, so it is labelled as a
# number of people rather than as a frequency.
InjuryChart <- ggplot(chaos, aes(x = reorder(EVTYPE, -value), y = value)) +
  geom_bar(stat = "identity", aes(fill = Chaos_Level), position = "dodge") +
  labs(x = "Event Types", y = "Number of People",
       title = "Top 10 US Fatalities Events") +
  theme(plot.title = element_text(hjust = 0.5, size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
        axis.text.y = element_text(size = 8),
        axis.title = element_text(size = 8),
        legend.title = element_text(size = 7),
        legend.text = element_text(size = 7)) +
  # One colour per Chaos_Level level.
  scale_fill_manual(values = c("#45B39D", "#8A14B3", "#E02A5F"))

InjuryChart