Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.
This project explores the NOAA Storm Database to identify which types of severe weather events are most harmful with respect to population health and which have the greatest economic consequences.
library(ggplot2)
library(readr)
library(dplyr)
# Read the NOAA storm data.
# The two damage-exponent columns are forced to character. With pure type
# guessing CROPDMGEXP comes back as a logical column, so its exponent
# symbols cannot be recovered later. All other columns are still guessed.
# NOTE(review): the path is absolute and machine-specific; consider a
# project-relative path so the report can be re-run elsewhere.
storm <- read_csv(
  "C:/Users/tuuye/Desktop/Data Science course/Reproducible Research/repdata_data_StormData.csv",
  col_types = cols(
    .default = col_guess(),
    PROPDMGEXP = col_character(),
    CROPDMGEXP = col_character()
  )
)
First, we will check the dimensions as well as the first six rows of the dataset.
# Number of rows and number of columns of the raw dataset
c(nrow(storm), ncol(storm))
## [1] 902297 37
The dataset has 902297 rows and 37 variables
# Preview the first six rows of the raw dataset
head(storm, n = 6)
## # A tibble: 6 x 37
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 1 4/18/19~ 0130 CST 97 MOBILE AL TORNA~
## 2 1 4/18/19~ 0145 CST 3 BALDWIN AL TORNA~
## 3 1 2/20/19~ 1600 CST 57 FAYETTE AL TORNA~
## 4 1 6/8/195~ 0900 CST 89 MADISON AL TORNA~
## 5 1 11/15/1~ 1500 CST 43 CULLMAN AL TORNA~
## 6 1 11/15/1~ 2000 CST 77 LAUDERDALE AL TORNA~
## # ... with 29 more variables: BGN_RANGE <dbl>, BGN_AZI <lgl>,
## # BGN_LOCATI <lgl>, END_DATE <lgl>, END_TIME <lgl>, COUNTY_END <dbl>,
## # COUNTYENDN <lgl>, END_RANGE <dbl>, END_AZI <lgl>, END_LOCATI <lgl>,
## # LENGTH <dbl>, WIDTH <dbl>, F <dbl>, MAG <dbl>, FATALITIES <dbl>,
## # INJURIES <dbl>, PROPDMG <dbl>, PROPDMGEXP <chr>, CROPDMG <dbl>,
## # CROPDMGEXP <lgl>, WFO <lgl>, STATEOFFIC <lgl>, ZONENAMES <lgl>,
## # LATITUDE <dbl>, LONGITUDE <dbl>, LATITUDE_E <dbl>, LONGITUDE_ <dbl>,
## # REMARKS <lgl>, REFNUM <dbl>
The analysis of damage is based on EVTYPE (event type), FATALITIES, INJURIES, PROPDMG (property damage), PROPDMGEXP (property damage exponent), CROPDMG (crop damage), and CROPDMGEXP (crop damage exponent). So, we will prepare the data for analysis as follows:
# Keep only the event type plus the casualty and damage columns
data <- storm[c(
  "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)]
head(data, n = 6)
## # A tibble: 6 x 7
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <lgl>
## 1 TORNADO 0 15 25 K 0 NA
## 2 TORNADO 0 0 2.5 K 0 NA
## 3 TORNADO 0 2 25 K 0 NA
## 4 TORNADO 0 2 2.5 K 0 NA
## 5 TORNADO 0 2 2.5 K 0 NA
## 6 TORNADO 0 6 2.5 K 0 NA
# Total fatalities and total injuries for each event type
fatalities <- aggregate(FATALITIES ~ EVTYPE, data = data, FUN = sum)
injuries <- aggregate(INJURIES ~ EVTYPE, data = data, FUN = sum)
# Keep the 20 event types with the most fatalities, highest first
fatalities <- fatalities[with(fatalities, order(-FATALITIES))[1:20], ]
# Set the factor levels in ranked order so the ranking is carried by EVTYPE
fatalities$EVTYPE <- factor(fatalities$EVTYPE, levels = fatalities$EVTYPE)
head(fatalities, n = 6)
## EVTYPE FATALITIES
## 826 TORNADO 5633
## 122 EXCESSIVE HEAT 1903
## 145 FLASH FLOOD 978
## 267 HEAT 937
## 456 LIGHTNING 816
## 848 TSTM WIND 504
# Keep the 20 event types with the most injuries, highest first
injuries <- injuries[with(injuries, order(-INJURIES))[1:20], ]
# Set the factor levels in ranked order so the ranking is carried by EVTYPE
injuries$EVTYPE <- factor(injuries$EVTYPE, levels = injuries$EVTYPE)
head(injuries, n = 6)
## EVTYPE INJURIES
## 826 TORNADO 91346
## 848 TSTM WIND 6957
## 162 FLOOD 6789
## 122 EXCESSIVE HEAT 6525
## 456 LIGHTNING 5230
## 267 HEAT 2100
# Select the black-and-white theme as the session default.
# This call used to sit inside aes() as an extra unnamed argument, where it
# is captured as an aesthetic mapping instead of being a theme setting.
theme_set(theme_bw())

# Bar chart of the 20 event types with the most fatalities
p1 <- ggplot(fatalities, aes(x = EVTYPE, y = FATALITIES)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Fatalities by top 20 Weather Event Types", x = "Event Type", y = "Fatalities") +
  theme(plot.title = element_text(size = 10), axis.text.x = element_text(angle = 90, hjust = 1, size = 6))

# Bar chart of the 20 event types with the most injuries
p2 <- ggplot(injuries, aes(x = EVTYPE, y = INJURIES)) +
  geom_bar(stat = "identity", fill = "darkgreen") +
  labs(title = "Injuries by top 20 Weather Event Types", x = "Event Type", y = "Injuries") +
  theme(plot.title = element_text(size = 10), axis.text.x = element_text(angle = 90, hjust = 1, size = 6))

# Draw both charts side by side under one shared title
library(gridExtra)
grid.arrange(p1, p2, ncol = 2, top = "Most Harmful Events with Respect to Population Health")
Tornado is the event type with the highest number of both fatalities and injuries.
# Distinct exponent symbols used for property damage
unique(data[["PROPDMGEXP"]])
## [1] "K" "M" NA "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
## [18] "1" "8"
# Distinct exponent symbols used for crop damage
unique(data[["CROPDMGEXP"]])
## [1] NA FALSE
We can see that both numeric and alphabetic characters are used to represent the magnitude (power of ten) of the damage figures. For example, “8” would be 10^8 and “H” or “h” would be hundreds. We will now convert the PROPDMGEXP and CROPDMGEXP fields to tangible numbers, where H (hundreds) = 10^2, K (thousands) = 10^3, M (millions) = 10^6, and B (billions) = 10^9, based on the Wikipedia powers-of-10 table.
# Lookup table from a damage-exponent symbol to its power of ten.
# A digit is read as the exponent itself; H/K/M/B (either case) stand for
# hundreds, thousands, millions and billions.
# The columns are built inline so no global named `factor` is created,
# which would shadow base::factor().
multiplier <- data.frame(
  symbol = c(as.character(0:9), "H", "K", "M", "B", "h", "k", "m", "b"),
  factor = c(0:9, 2, 3, 6, 9, 2, 3, 6, 9)
)

# Convert a vector of exponent symbols into numeric multipliers (10^power).
# Missing or unrecognised symbols (NA, "+", "-", "?") fall back to a
# multiplier of 1. Otherwise the product becomes NA, and an NA in either
# dollar column removes the whole row from the later formula aggregate().
exp_to_multiplier <- function(exp_symbol) {
  power <- multiplier$factor[match(as.character(exp_symbol), multiplier$symbol)]
  power[is.na(power)] <- 0
  10^power
}

data$PROPDMGDOLLARS <- data$PROPDMG * exp_to_multiplier(data$PROPDMGEXP)
# Crop damage is scaled by its own exponent column, not the property one
data$CROPDMGDOLLARS <- data$CROPDMG * exp_to_multiplier(data$CROPDMGEXP)
# Total property plus crop damage per event type, stored in
# "economicconsequenses"
economicconsequenses <- aggregate(
  PROPDMGDOLLARS + CROPDMGDOLLARS ~ EVTYPE,
  data = data,
  FUN = sum
)
names(economicconsequenses) <- c("EVENT_TYPE", "TOTAL_DAMAGE")
# Keep the 20 event types with the largest total damage, highest first
economicconsequenses <- economicconsequenses[
  with(economicconsequenses, order(-TOTAL_DAMAGE))[1:20],
]
# Set the factor levels in ranked order so the ranking is carried by EVENT_TYPE
economicconsequenses$EVENT_TYPE <- factor(
  economicconsequenses$EVENT_TYPE,
  levels = economicconsequenses$EVENT_TYPE
)
# Preview the top rows
head(economicconsequenses, n = 6)
## EVENT_TYPE TOTAL_DAMAGE
## 168 HURRICANE 814750235010
## 176 HURRICANE/TYPHOON 802074291330
## 58 FLOOD 231909682070
## 330 TORNADO 85217252847
## 46 FLASH FLOOD 55687860812
## 278 STORM SURGE 43328536000
# Plot total damage for the 20 costliest event types.
# The black-and-white theme is set as an explicit statement. It used to sit
# inside aes() as an extra unnamed argument, where it is captured as an
# aesthetic mapping instead of being a theme setting.
theme_set(theme_bw())
ggplot(economicconsequenses, aes(x = EVENT_TYPE, y = TOTAL_DAMAGE)) +
  geom_bar(stat = "identity", fill = "purple") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 6)) +
  xlab("Event Type") + ylab("Total Damage in $USD") + ggtitle("Total Property & Crop Damage by top 20 Weather Events")