Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.
This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
The following code loads the dataset:
# Read the raw NOAA storm CSV from the working directory and preview its
# first rows.
storm_path <- "repdata_data_StormData.csv"
storm <- read.csv(file = storm_path, header = TRUE, sep = ",")
head(storm, n = 6L)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
The following code extracts only the variables relevant to this project:
# Keep the event type plus the casualty and damage columns, then show the
# structure of the reduced data frame.
keep_cols <- c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
data <- storm[, keep_cols]
str(data)
## 'data.frame': 902297 obs. of 7 variables:
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
# Preview the first six rows of the reduced data frame.
head(data, n = 6L)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 K 0
## 2 TORNADO 0 0 2.5 K 0
## 3 TORNADO 0 2 25.0 K 0
## 4 TORNADO 0 2 2.5 K 0
## 5 TORNADO 0 2 2.5 K 0
## 6 TORNADO 0 6 2.5 K 0
Total harm to the population is computed as the sum of fatalities and injuries:
# Per-record human harm: fatalities plus injuries.
data <- data %>%
  mutate(TTL_INJURIES = FATALITIES + INJURIES)
Checking the distinct values of the variable PROPDMGEXP:
# List the distinct property-damage exponent codes. One unique() call is
# enough: applying it a second time returns the same values.
unique(data$PROPDMGEXP)
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
Inspecting the rows that carry invalid exponent codes:
# Show up to 15 rows whose property-damage exponent is the "+" code.
is_plus <- data$PROPDMGEXP == "+"
head(data[is_plus, ], n = 15L)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG
## 188780 BREAKUP FLOODING 0 0 20 + 0
## 189001 HIGH WIND 0 0 20 + 0
## 192262 FLOODING/HEAVY RAIN 0 0 2 + 0
## 216755 HIGH WINDS 0 0 15 + 0
## 216802 TORNADO 0 0 60 + 0
## CROPDMGEXP TTL_INJURIES
## 188780 0
## 189001 0
## 192262 0
## 216755 0
## 216802 0
# Show up to 15 rows whose property-damage exponent is empty.
is_blank <- data$PROPDMGEXP == ""
head(data[is_blank, ], n = 15L)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 54 TSTM WIND 0 0 0 0
## 55 HAIL 0 0 0 0
## 56 HAIL 0 0 0 0
## 57 TSTM WIND 0 0 0 0
## 58 HAIL 0 0 0 0
## 59 TSTM WIND 0 0 0 0
## 60 TSTM WIND 0 0 0 0
## 61 HAIL 0 0 0 0
## 62 HAIL 0 0 0 0
## 63 HAIL 0 0 0 0
## 64 TSTM WIND 0 0 0 0
## 65 TSTM WIND 0 0 0 0
## 66 TSTM WIND 0 0 0 0
## 67 HAIL 0 0 0 0
## 69 TSTM WIND 0 0 0 0
## TTL_INJURIES
## 54 0
## 55 0
## 56 0
## 57 0
## 58 0
## 59 0
## 60 0
## 61 0
## 62 0
## 63 0
## 64 0
## 65 0
## 66 0
## 67 0
## 69 0
# Show up to 15 rows whose property-damage exponent is the "?" code.
is_qmark <- data$PROPDMGEXP == "?"
head(data[is_qmark, ], n = 15L)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG
## 198689 THUNDERSTORM WINDS 0 0 0 ? 0
## 225254 FLASH FLOOD 0 0 0 ? 0
## 227409 FLASH FLOOD 0 0 0 ? 0
## 232016 THUNDERSTORM WIND 0 0 0 ? 0
## 233746 HAIL 0 0 0 ? 0
## 233747 HAIL 0 0 0 ? 0
## 233748 HAIL 0 0 0 ? 0
## 247617 THUNDERSTORM WINDS 0 0 0 ? 0
## CROPDMGEXP TTL_INJURIES
## 198689 0
## 225254 0
## 227409 0
## 232016 0
## 233746 0
## 233747 0
## 233748 0
## 247617 0
# Show up to 15 rows whose property-damage exponent is the "-" code.
is_dash <- data$PROPDMGEXP == "-"
head(data[is_dash, ], n = 15L)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 229327 HIGH WIND 2 0 15 - 0
## TTL_INJURIES
## 229327 2
The transformation of the exponent codes follows: https://rpubs.com/gcctang1/271126
Invalid exponent codes ("?", "-", "") are assigned a multiplier of 0, as is the code "0"; the code "+" is assigned a multiplier of 1.
# Translate each PROPDMGEXP code into a numeric multiplier via a lookup
# table, then scale PROPDMG by it. match() is used instead of a named vector
# because the empty-string code cannot be looked up by name.
prop_exp_codes <- c(
  "K", "3", "M", "m", "6", "B", "+", "5", "4",
  "2", "h", "H", "7", "1", "8", "0", "?", "-", ""
)
prop_exp_mult <- c(
  1e+3, 1e+3, 1e+6, 1e+6, 1e+6, 1e+9, 1, 1e+5, 1e+4,
  1e+2, 1e+2, 1e+2, 1e+7, 1e+1, 1e+8, 0, 0, 0, 0
)
prop_exp_idx <- match(as.character(data$PROPDMGEXP), prop_exp_codes)
# Codes absent from the table get an NA multiplier.
data$PROPDMGEXP_1 <- prop_exp_mult[prop_exp_idx]
data$PROPDMG_1 <- data$PROPDMG * data$PROPDMGEXP_1
Checking the distinct values of the variable CROPDMGEXP:
# List the distinct crop-damage exponent codes. One unique() call is enough:
# applying it a second time returns the same values.
unique(data$CROPDMGEXP)
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
Inspecting the rows that carry invalid exponent codes:
# Show up to 15 rows whose crop-damage exponent is the "?" code.
crop_is_qmark <- data$CROPDMGEXP == "?"
head(data[crop_is_qmark, ], n = 15L)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG
## 192467 FLASH FLOOD WINDS 0 0 0.41 0
## 197066 THUNDERSTORM WINDS 0 0 0.50 K 0
## 197331 THUNDERSTORM WINDS 0 0 0.50 K 0
## 220300 THUNDERSTORM WINDS 0 0 0.00 0
## 220877 FLOOD/FLASH FLOOD 0 0 400.00 K 0
## 232901 FLOOD/FLASH FLOOD 0 0 0.50 M 0
## 242953 THUNDERSTORM WINDS 0 0 80.00 K 0
## CROPDMGEXP TTL_INJURIES PROPDMGEXP_1 PROPDMG_1
## 192467 ? 0 0e+00 0e+00
## 197066 ? 0 1e+03 5e+02
## 197331 ? 0 1e+03 5e+02
## 220300 ? 0 0e+00 0e+00
## 220877 ? 0 1e+03 4e+05
## 232901 ? 0 1e+06 5e+05
## 242953 ? 0 1e+03 8e+04
Invalid exponent codes ("?", "-", "") are assigned a multiplier of 0, as is the code "0".
# Translate each CROPDMGEXP code into a numeric multiplier via a lookup
# table, then scale CROPDMG by it. match() is used instead of a named vector
# because the empty-string code cannot be looked up by name.
crop_exp_codes <- c(
  "M", "m", "6", "K", "k", "3", "B",
  "0", "?", "-", "", "2", "h", "H"
)
crop_exp_mult <- c(
  1e+6, 1e+6, 1e+6, 1e+3, 1e+3, 1e+3, 1e+9,
  0, 0, 0, 0, 1e+2, 1e+2, 1e+2
)
crop_exp_idx <- match(as.character(data$CROPDMGEXP), crop_exp_codes)
# Codes absent from the table get an NA multiplier.
data$CROPDMGEXP_1 <- crop_exp_mult[crop_exp_idx]
data$CROPDMG_1 <- data$CROPDMG * data$CROPDMGEXP_1
Adding a variable for total damage (the sum of property damage and crop damage):
# Total economic damage per record: scaled property damage plus scaled crop
# damage.
data$TTL_DMG <- data$PROPDMG_1 + data$CROPDMG_1
The charts below show the seven event types that caused the highest total fatalities and the highest total injuries.
# Look shared by both bar charts: centred title, vertical x-axis labels and
# a 0.5 cm margin on every side.
health_theme <- theme(
  plot.title = element_text(hjust = 0.5),
  axis.text.x = element_text(angle = 90, hjust = 1),
  plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm")
)

# Total fatalities per event type, largest first; plot the first seven rows.
fatal_agg <- aggregate(FATALITIES ~ EVTYPE, data = data, FUN = sum) %>%
  arrange(desc(FATALITIES))
g <- ggplot(
  fatal_agg[seq_len(7), ],
  aes(x = reorder(EVTYPE, -FATALITIES), y = FATALITIES)
)
a <- g +
  geom_col(fill = "#FF6666") +
  health_theme +
  labs(
    x = "Event Type",
    y = "Total Fatalities",
    title = "Top Seven Fatalities Event"
  )

# Total injuries per event type, largest first; plot the first seven rows.
Injury_agg <- aggregate(INJURIES ~ EVTYPE, data = data, FUN = sum) %>%
  arrange(desc(INJURIES))
g1 <- ggplot(
  Injury_agg[seq_len(7), ],
  aes(x = reorder(EVTYPE, -INJURIES), y = INJURIES)
)
b <- g1 +
  geom_col(fill = "SKY BLUE") +
  health_theme +
  labs(
    x = "Event Type",
    y = "Total Injuries",
    title = "Top Seven Injuries Event"
  )

# Draw the two charts side by side.
grid.arrange(a, b, ncol = 2)
The table below summarizes the seven event types that caused the most casualties (fatalities + injuries). The percentage is each event type's share of the total casualties across all event types in the study.
# Put the per-event fatality and injury totals under a common CASUALTY
# column, tag each row with its source, and stack them in long form.
fatal_agg_1 <- fatal_agg %>%
  rename(CASUALTY = FATALITIES) %>%
  mutate(type = "fatal")
Injury_agg_1 <- Injury_agg %>%
  rename(CASUALTY = INJURIES) %>%
  mutate(type = "injury")
Total_casualty <- rbind(fatal_agg_1, Injury_agg_1)

# Combined casualties per event type, largest first, with each event's share
# of all casualties as a percentage rounded to two decimals.
casualty_sums <- aggregate(CASUALTY ~ EVTYPE, data = Total_casualty, FUN = sum)
casualty_ranked <- arrange(casualty_sums, desc(CASUALTY))
Total_casualty_1 <- mutate(
  casualty_ranked,
  PERCENTAGE = round(CASUALTY / sum(CASUALTY) * 100, digits = 2)
)
head(Total_casualty_1, n = 7)
## EVTYPE CASUALTY PERCENTAGE
## 1 TORNADO 96979 62.30
## 2 EXCESSIVE HEAT 8428 5.41
## 3 TSTM WIND 7461 4.79
## 4 FLOOD 7259 4.66
## 5 LIGHTNING 6046 3.88
## 6 HEAT 3037 1.95
## 7 FLASH FLOOD 2755 1.77
The chart below shows the top casualty events (fatalities and injuries combined):
For comparison, the chart uses the events that appear in the charts above.
# Events to compare: every event type that appears in either the top-seven
# fatalities ranking or the top-seven injuries ranking. The set is derived
# from the aggregates instead of being typed in by hand, so it stays in sync
# with the data. Because it is a union, it can hold more than seven types.
top_events <- union(
  head(as.character(fatal_agg$EVTYPE), 7),
  head(as.character(Injury_agg$EVTYPE), 7)
)

# Stacked bars: one bar per event type, split by casualty type and ordered by
# casualty count, largest first.
g3 <- ggplot(
  Total_casualty[Total_casualty$EVTYPE %in% top_events, ],
  aes(x = reorder(EVTYPE, -CASUALTY), y = CASUALTY, fill = type)
)
# NOTE(review): the variable name `c` is the same as base R's c() function;
# the name is kept so any later reference to this plot object still works.
c <- g3 +
  geom_bar(stat = "identity") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90, hjust = 1),
    plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Event Type") +
  ylab("Total Casualty") +
  ggtitle("Top Seven Casualty Event")
plot(c)
In conclusion, TORNADO has the highest total casualties (96,979; 62.30%) of all event types.
The charts below show the five event types that caused the highest total damage, the highest property damage, and the highest crop damage.
# Total economic damage per event type, largest first, with each event's
# share of all damage as a percentage rounded to two decimals.
dmg_sums <- aggregate(TTL_DMG ~ EVTYPE, data = data, FUN = sum)
dmg_ranked <- arrange(dmg_sums, desc(TTL_DMG))
TTL_dmg_agg <- mutate(
  dmg_ranked,
  PERCENTAGE = round(TTL_DMG / sum(TTL_DMG) * 100, digits = 2)
)
head(TTL_dmg_agg, n = 6L)
## EVTYPE TTL_DMG PERCENTAGE
## 1 FLOOD 150319678250 31.49
## 2 HURRICANE/TYPHOON 71913712800 15.07
## 3 TORNADO 57362333650 12.02
## 4 STORM SURGE 43323541000 9.08
## 5 HAIL 18761221670 3.93
## 6 FLASH FLOOD 18243990610 3.82
# Look shared by the three damage charts: centred title, vertical x-axis
# labels and a 0.5 cm margin on every side.
dmg_theme <- theme(
  plot.title = element_text(hjust = 0.5),
  axis.text.x = element_text(angle = 90, hjust = 1),
  plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm")
)

# Top five event types by total (property + crop) damage.
g6 <- ggplot(
  TTL_dmg_agg[seq_len(5), ],
  aes(x = reorder(EVTYPE, -TTL_DMG), y = TTL_DMG)
)
f <- g6 +
  geom_col(fill = "LIGHT GREEN") +
  dmg_theme +
  xlab("Event Type") +
  ylab("Total Economic Damage") + # label typo "Damange" corrected
  ggtitle("Top Total DMG")

# Top five event types by property damage.
Prop_agg <- aggregate(PROPDMG_1 ~ EVTYPE, data = data, FUN = sum) %>%
  arrange(desc(PROPDMG_1))
g4 <- ggplot(
  Prop_agg[seq_len(5), ],
  aes(x = reorder(EVTYPE, -PROPDMG_1), y = PROPDMG_1)
)
d <- g4 +
  geom_col(fill = "#FF6666") +
  dmg_theme +
  xlab("Event Type") +
  ylab("Total Property Damage") +
  ggtitle("Top Property DMG")

# Top five event types by crop damage.
Crop_agg <- aggregate(CROPDMG_1 ~ EVTYPE, data = data, FUN = sum) %>%
  arrange(desc(CROPDMG_1))
g5 <- ggplot(
  Crop_agg[seq_len(5), ],
  aes(x = reorder(EVTYPE, -CROPDMG_1), y = CROPDMG_1)
)
e <- g5 +
  geom_col(fill = "SKY BLUE") +
  dmg_theme +
  xlab("Event Type") +
  ylab("Total Crop Damage") + # label typo "Damange" corrected
  ggtitle("Top Crop DMG")

# Draw the three charts side by side.
grid.arrange(f, d, e, ncol = 3)
From the charts, the highest property damage is caused by FLOOD, while the highest crop damage is caused by DROUGHT.
When comparing total damage, FLOOD has the highest damage, followed by HURRICANE/TYPHOON and TORNADO.
Thus, the three events that contribute most to property damage (FLOOD, HURRICANE/TYPHOON and TORNADO) also have the highest impact on total economic damage.