Synopsis

This reproducible research aims at exploring the NOAA Storm Database and answer some basic questions about severe weather events:

  1. Across the United States, which types of events are most harmful with respect to population health?
  2. Across the United States, which types of events have the greatest economic consequences?

National Weather Service Storm Data Documentation, which shows us how some of the variables are constructed/defined. (https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf)

1.1 - Data Preparation

setwd("C:/Users/Anhuynh/Desktop/Data Science_Cousera/Reproducible Research/Assignment/data 2")
library(readr)
dataStorm <- read.csv("repdata_data_StormData.csv.bz2", header = TRUE, sep = ",", quote = "\"")
str(dataStorm)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

1.2 - Data Processing

1.2.1 - Across the United States, which types of events are most harmful with respect to population health?

library(plyr)

# Count on number of fatalities and figure out the most harmful event

subgroup_fat <- ddply(dataStorm, ~EVTYPE, summarise, num_fatalities = sum(FATALITIES))
data.frame(subgroup_fat[which.max(subgroup_fat$num_fat), ])
##      EVTYPE num_fatalities
## 834 TORNADO           5633
# Count on number of injuries and figure out the most harmful event

subgroup_inj <- ddply(dataStorm, ~EVTYPE, summarise, num_injuries = sum(INJURIES))
data.frame(subgroup_inj[which.max(subgroup_inj$num_injuries), ])
##      EVTYPE num_injuries
## 834 TORNADO        91346

1.2.2 - Result

  • Compared to another events, which are listed in 2.1.1-Storm Data Event Table, Tornado was the most harmful event with respect to population health.
  • This event accounted for 5633 number of fatalities and 91346 number of injuries across the states in the US.

1.2.3 - Data Visualization

The below graphs demonstrate the serious influences of Tornado on number of fatalities and injuries in comparison with the second and the third harmful ones, Excessive Heat and Flood respectively.

library(ggplot2)

# Create subsets for comparison of events

subA <- grep("^[A-Ia-i]", dataStorm$EVTYPE, value=TRUE)
subB <- grep("^[L-Wl-w]", dataStorm$EVTYPE, value=TRUE)

subgroupA <- subset(dataStorm, EVTYPE %in% subA)
subgroupB <- subset(dataStorm, EVTYPE %in% subB)

# Figure out the 1st and 2nd harmful event on group of fatalities

subgroupA_fat <- ddply(subgroupA, ~EVTYPE, summarise, num_fatalities = sum(FATALITIES, na.rm=TRUE))
A <- data.frame(subgroupA_fat[which.max(subgroupA_fat$num_fatalities), ])

subgroupB_fat <- ddply(subgroupB, ~EVTYPE, summarise, num_fatalities = sum(FATALITIES, na.rm=TRUE))
B <- data.frame(subgroupB_fat[which.max(subgroupB_fat$num_fatalities), ])

AB <- rbind(A, B)
AB
##             EVTYPE num_fatalities
## 121 EXCESSIVE HEAT           1903
## 399        TORNADO           5633
# Figure out the 1st and 2nd harmful event on group of injuries

subgroupC_inj <- ddply(subgroupA, ~EVTYPE, summarise, num_injuries = sum(INJURIES, na.rm=TRUE))
C <- data.frame(subgroupC_inj[which.max(subgroupC_inj$num_injuries), ])

subgroupD_inj <- ddply(subgroupB, ~EVTYPE, summarise, num_injuries = sum(INJURIES, na.rm=TRUE))
D <- data.frame(subgroupD_inj[which.max(subgroupD_inj$num_injuries), ])

CD <- rbind(C, D)
CD
##      EVTYPE num_injuries
## 161   FLOOD         6789
## 399 TORNADO        91346

1.2.4 - Data visualization with bar graph.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
colnames(AB) <- c("EVTYPE", "num_event")
colnames(CD) <- c("EVTYPE", "num_event")
com_ABCD <- rbind(AB, CD)

com_ABCD <- tapply(com_ABCD$num_event, com_ABCD$EVTYPE, FUN=sum)
com_ABCD = data.frame(EVTYPE = c("EXCESSIVE HEAT", " FLOOD ", "TORNADO "), 
                      num_event = c(1903, 6789, 96979) )

com_ABCD <- com_ABCD %>%
  group_by(EVTYPE) %>%
  mutate(cum_ev = cumsum(num_event)) %>%
  mutate(sum_ev = sum(num_event) )

ggplot(data=com_ABCD, aes(x = reorder(EVTYPE, -sum_ev), y=num_event, fill=EVTYPE)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=cum_ev, label=num_event), vjust=1, color="black", size=3) +
labs(title="Top 3 Events impacting Population Health") +
labs(x="Event Type",y="Total number of Fatalities and Injuries (cases)") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))

Figure 01: Top 3 Events impacting Population Health

2. Across the United States, which types of events have the greatest economic consequences?

2.1 - Data Processing

# Create new subset for calculating economic damages.

sub_dataStorm <- subset(dataStorm, select = c("STATE", 
                                              "EVTYPE", 
                                              "FATALITIES", 
                                              "INJURIES", 
                                              "PROPDMG", 
                                              "PROPDMGEXP", 
                                              "CROPDMG", 
                                              "CROPDMGEXP"))
# Replace numeric value to PROPDMGEXP 

pEXP <- replace(sub_dataStorm$PROPDMGEXP, sub_dataStorm$PROPDMGEXP %in% c("\"\"",
                                                                       "-",
                                                                       "?",
                                                                       "+",
                                                                       "0",
                                                                       "1",
                                                                       "2",
                                                                       "3",
                                                                       "4",
                                                                       "5",
                                                                       "6",
                                                                       "7",
                                                                       "8",
                                                                       "B",
                                                                       "h",
                                                                       "H",
                                                                       "K",
                                                                       "m",
                                                                       "M"), c(0,
                                                                               0,
                                                                               0,
                                                                               10^0,
                                                                               10^1,
                                                                               10^1,
                                                                               10^1,
                                                                               10^1,
                                                                               10^1,
                                                                               10^1,
                                                                               10^1,
                                                                               10^1,
                                                                               10^1,
                                                                               10^9,
                                                                               10^2,
                                                                               10^2,
                                                                               10^3,
                                                                               10^6,
                                                                               10^6))
## Warning in x[list] <- values: number of items to replace is not a multiple of
## replacement length
# Replace numeric value to CROPDMGEXP

cEXP <- replace(sub_dataStorm$CROPDMGEXP, sub_dataStorm$CROPDMGEXP %in% c("\"\"",
                                                                          "?",
                                                                          "0",
                                                                          "2",
                                                                          "B",
                                                                          "k",
                                                                          "K",
                                                                          "m",
                                                                          "M"), c(0,
                                                                                  0,
                                                                                  10^1,
                                                                                  10^1,
                                                                                  10^9,
                                                                                  10^3,
                                                                                  10^3,
                                                                                  10^6,
                                                                                  10^6)) 
## Warning in x[list] <- values: number of items to replace is not a multiple of
## replacement length
# Create data with new values to PROPDMGEXP (Property) and CROPDMGEXP (Crop)

final_dataStorm <- mutate(sub_dataStorm, PROPDMGEXP = pEXP, CROPDMGEXP = cEXP)
final_dataStorm <- mutate(final_dataStorm, valuePROP = PROPDMG * as.numeric(as.character(PROPDMGEXP)))
final_dataStorm <- mutate(final_dataStorm, valueCROP = CROPDMG * as.numeric(as.character(CROPDMGEXP)))

final_dataStorm <- final_dataStorm[which(final_dataStorm$CROPDMG >0 & final_dataStorm$PROPDMG >0), ]

head(final_dataStorm)
##        STATE                    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP
## 187566    AL HURRICANE OPAL/HIGH WINDS          2        0     0.1        100
## 187571    AL        THUNDERSTORM WINDS          0        0     5.0       1000
## 187581    AL            HURRICANE ERIN          0        0    25.0          0
## 187583    AL            HURRICANE OPAL          0        0    48.0          0
## 187584    AL            HURRICANE OPAL          0        0    20.0          1
## 187653    AL        THUNDERSTORM WINDS          0        0    50.0         10
##        CROPDMG CROPDMGEXP valuePROP valueCROP
## 187566      10          0        10     0e+00
## 187571     500          0      5000     0e+00
## 187581       1         10         0     1e+01
## 187583       4         10         0     4e+01
## 187584      10      1e+09        20     1e+10
## 187653      50       1000       500     5e+04

2.2- Result

Compute Property and Crop damage values across the event.

library(dplyr)
require(ggplot2)
require(ggmap)
## Loading required package: ggmap
## Warning: package 'ggmap' was built under R version 4.0.3
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
require(maps)
## Loading required package: maps
## 
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
## 
##     ozone
library(usdata)
## Warning: package 'usdata' was built under R version 4.0.3
prop <- aggregate(valuePROP ~ EVTYPE, final_dataStorm, sum, na.rm=TRUE)
prop_ev = prop[order(prop$valuePROP, decreasing = TRUE), ]
sort_prop_ev <- head(prop_ev, 10)
sort_prop_ev
##                EVTYPE    valuePROP
## 10        FLASH FLOOD 1.173593e+13
## 25               HAIL 1.119754e+13
## 74            TORNADO 9.764030e+12
## 14              FLOOD 7.623682e+12
## 81          TSTM WIND 6.640148e+12
## 65 THUNDERSTORM WINDS 4.613926e+12
## 64  THUNDERSTORM WIND 3.176501e+12
## 77     TROPICAL STORM 7.009258e+11
## 85        URBAN FLOOD 6.001702e+11
## 86     URBAN FLOODING 5.052504e+11
crop <- aggregate(valueCROP ~ EVTYPE, final_dataStorm, sum, na.rm=TRUE)
crop_ev = crop[order(crop$valueCROP, decreasing = TRUE), ]
sort_crop_ev <- head(crop_ev, 10)
sort_crop_ev
##                EVTYPE    valueCROP
## 25               HAIL 4.234878e+13
## 10        FLASH FLOOD 2.174994e+13
## 14              FLOOD 1.731630e+13
## 74            TORNADO 1.188096e+13
## 64  THUNDERSTORM WIND 9.637671e+12
## 81          TSTM WIND 9.091257e+12
## 38          HIGH WIND 1.759873e+12
## 65 THUNDERSTORM WINDS 1.753841e+12
## 82     TSTM WIND/HAIL 1.466429e+12
## 12     FLASH FLOODING 1.165821e+12

Compute Total amount of Property and Crop damage values in millions, across the event.

subgroup_ev <- ddply(final_dataStorm, .(EVTYPE), summarise, 
                     damage_val = sum(valuePROP + valueCROP)/1000000)
data.frame(subgroup_ev[which.max(subgroup_ev$damage_val), ])
##    EVTYPE damage_val
## 14  FLOOD   24939981

Data visualization

# Sorting data for creating graph chart.

subgroup_ev = subgroup_ev[order(subgroup_ev$damage_val, decreasing = TRUE), ]
sort_subgroup_ev <- head(subgroup_ev, 10)

# Data visualization

sort_subgroup_ev$value <- round(sort_subgroup_ev$damage_val)

sort_subgroup_ev <- sort_subgroup_ev %>%
  group_by(EVTYPE) %>%
  mutate(cum_val = cumsum(damage_val)) %>%
  mutate(sum_val = sum(damage_val) )

ggplot(data=sort_subgroup_ev, aes(x = reorder(EVTYPE, -sum_val), y=damage_val, fill=EVTYPE)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=cum_val, label=""), vjust=1, color="black", size=3) +
labs(title="Top 10 Events impacting Economy in the US") +
labs(x="Event Type",y="Total amount of property and crop damages (millions)") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))

Figure 02: Top 10 Events impacting the economy (Properties and Crops) in the US

  • In terms of economic impacts, Flood was responsible for the greatest damages when caused the worst consequences to Property assets and Crop values in the US.
  • Flash Flood accounted for the greatest damage values in Properties while Hail was responsible for the biggest loss in Crops.

Compute total amount of Property and Crop damage values in millions, across the states in the US.

subgroup_ev <- ddply(final_dataStorm, .(EVTYPE, STATE), summarise, 
                     damage_val = sum(valuePROP + valueCROP)/1000000)
data.frame(subgroup_ev[which.max(subgroup_ev$damage_val), ])
##     EVTYPE STATE damage_val
## 159   HAIL    NE   23093439

Among the states, Nebraska was the most suffering from severe Hail when damage values reached to 23,093,439 (US Dollar).

Data visualization with “states Map”

states_map <- map_data("state")

df <- data.frame(state = tolower(abbr2state(subgroup_ev$STATE)), subgroup_ev$damage_val)
names(df) <- c("region", "value")

join_df <- left_join(df, states_map, by = "region")


mp <- ggplot() + 
        geom_polygon(data=states_map, aes(x=long, y=lat, group = group), fill="grey92" ) +                          ggtitle("Economic Damages States Map") +
        geom_point(data=join_df, aes(x=long, y=lat, size = value, color = value)) +
        guides(size=guide_legend("Event Index")) +
        theme_void()
mp
## Warning: Removed 3991 rows containing missing values (geom_point).

Figure 03: Economic Damages (Properties and Crops loss) States Map