Exploration the NOAA Storm Database and answering the following questions: 1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health? 2. Across the United States, which types of events have the greatest economic consequences?
Answering the questions above I will consider different severe events as following: 1. Event of Which type produces maximum average harm per event of that type? 1. Events of Which type produce maximum overall harm (all events of that type together)?
#Data Processing section
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
filePath <- './data/repdata_data_StormData.csv.bz2'
df <- read.csv(filePath)
df <- df[, c("EVTYPE", "INJURIES" , "PROPDMG" , "PROPDMGEXP" ,"CROPDMG" , "CROPDMGEXP")]
head(df)
## EVTYPE INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 15 25.0 K 0
## 2 TORNADO 0 2.5 K 0
## 3 TORNADO 2 25.0 K 0
## 4 TORNADO 2 2.5 K 0
## 5 TORNADO 2 2.5 K 0
## 6 TORNADO 6 2.5 K 0
print(dim(df))
## [1] 902297 6
print(names(df))
## [1] "EVTYPE" "INJURIES" "PROPDMG" "PROPDMGEXP" "CROPDMG"
## [6] "CROPDMGEXP"
Now let’s declare the rule of my research: Then I consider population health harm, I take in consideration only INJURIES (FATALITIES aren’t considered).
Let’s add new column - count of events of that type
df1 <- df %>%
group_by(EVTYPE) %>%
mutate(EVTYPE.count = n()) %>%
ungroup()
head(df1)
## # A tibble: 6 × 7
## EVTYPE INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP EVTYPE.count
## <chr> <dbl> <dbl> <chr> <dbl> <chr> <int>
## 1 TORNADO 15 25 K 0 "" 60652
## 2 TORNADO 0 2.5 K 0 "" 60652
## 3 TORNADO 2 25 K 0 "" 60652
## 4 TORNADO 2 2.5 K 0 "" 60652
## 5 TORNADO 2 2.5 K 0 "" 60652
## 6 TORNADO 6 2.5 K 0 "" 60652
df1 <- df1 %>%
group_by(EVTYPE) %>%
mutate(total.INJURIES.count = sum(INJURIES)) %>%
ungroup()
print(df1)
## # A tibble: 902,297 × 8
## EVTYPE INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP EVTYPE.count
## <chr> <dbl> <dbl> <chr> <dbl> <chr> <int>
## 1 TORNADO 15 25 K 0 "" 60652
## 2 TORNADO 0 2.5 K 0 "" 60652
## 3 TORNADO 2 25 K 0 "" 60652
## 4 TORNADO 2 2.5 K 0 "" 60652
## 5 TORNADO 2 2.5 K 0 "" 60652
## 6 TORNADO 6 2.5 K 0 "" 60652
## 7 TORNADO 1 2.5 K 0 "" 60652
## 8 TORNADO 0 2.5 K 0 "" 60652
## 9 TORNADO 14 25 K 0 "" 60652
## 10 TORNADO 0 25 K 0 "" 60652
## # ℹ 902,287 more rows
## # ℹ 1 more variable: total.INJURIES.count <dbl>
df2 <- df1%>%
distinct(EVTYPE, total.INJURIES.count , EVTYPE.count) %>%
mutate(avg.INJURES.count = total.INJURIES.count / EVTYPE.count) %>%
arrange(desc(avg.INJURES.count)) %>%
mutate(avg.INJURES.count = round(avg.INJURES.count, 1)) %>%
select(1,4)
head(df2,20)
## # A tibble: 20 × 2
## EVTYPE avg.INJURES.count
## <chr> <dbl>
## 1 Heat Wave 70
## 2 TROPICAL STORM GORDON 43
## 3 WILD FIRES 37.5
## 4 THUNDERSTORMW 27
## 5 HIGH WIND AND SEAS 20
## 6 SNOW/HIGH WINDS 18
## 7 WINTER STORM HIGH WINDS 15
## 8 GLAZE/ICE STORM 15
## 9 HEAT WAVE DROUGHT 15
## 10 HURRICANE/TYPHOON 14.5
## 11 WINTER WEATHER MIX 11.3
## 12 EXTREME HEAT 7
## 13 NON-SEVERE WIND DAMAGE 7
## 14 GLAZE 6.8
## 15 TSUNAMI 6.4
## 16 WINTER STORMS 5.7
## 17 TORNADO F2 5.3
## 18 WATERSPOUT/TORNADO 5.2
## 19 EXCESSIVE RAINFALL 5.2
## 20 HEAT WAVE 4.2
df3 <- df1 %>%
select(EVTYPE, total.INJURIES.count) %>%
distinct(EVTYPE, total.INJURIES.count) %>%
arrange(desc(total.INJURIES.count)) %>%
mutate(overall.INJURIES.count = sum(total.INJURIES.count)) %>%
mutate(total.INJURIES.count = total.INJURIES.count / overall.INJURIES.count * 100) %>%
rename(INJURIES.percentage = total.INJURIES.count) %>%
select('EVTYPE','INJURIES.percentage') %>%
mutate(INJURIES.percentage = round(INJURIES.percentage,2))
head(df3, 20)
## # A tibble: 20 × 2
## EVTYPE INJURIES.percentage
## <chr> <dbl>
## 1 TORNADO 65
## 2 TSTM WIND 4.95
## 3 FLOOD 4.83
## 4 EXCESSIVE HEAT 4.64
## 5 LIGHTNING 3.72
## 6 HEAT 1.49
## 7 ICE STORM 1.41
## 8 FLASH FLOOD 1.26
## 9 THUNDERSTORM WIND 1.06
## 10 HAIL 0.97
## 11 WINTER STORM 0.94
## 12 HURRICANE/TYPHOON 0.91
## 13 HIGH WIND 0.81
## 14 HEAVY SNOW 0.73
## 15 WILDFIRE 0.65
## 16 THUNDERSTORM WINDS 0.65
## 17 BLIZZARD 0.57
## 18 FOG 0.52
## 19 WILD/FOREST FIRE 0.39
## 20 DUST STORM 0.31
Let’s consider events which have the greatest economic consequences.
First of all let’s exclude those events which have corrupted data in PROPDMG and CROPDMG columns:
#Correct data subset:
df4 <- df %>%
select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) %>%
filter((PROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B') & CROPDMG == 0 )|
(PROPDMG == 0 & CROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B')) |
(PROPDMG == 0 & CROPDMG == 0) |
(PROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B') &
CROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B')))
#Corrupted data subset:
df5 <- df %>%
select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) %>%
filter(!((PROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B') & CROPDMG == 0 )|
(PROPDMG == 0 & CROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B')) |
(PROPDMG == 0 & CROPDMG == 0) |
(PROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B') &
CROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B'))))
dim(df4); dim(df5)
## [1] 901955 5
## [1] 342 5
As we can see from information above, only 342 rows contain corrupted data, whereas 901955 rows contain correct data. We will work only with correct data subset.
# Preparing coefficients:
df4 <- df4 %>%
mutate(k1 = ifelse((PROPDMGEXP=='K')|(PROPDMGEXP=='k'),
1000,
ifelse((PROPDMGEXP=='M')|(PROPDMGEXP=='m'),
1000000,
ifelse((PROPDMGEXP=='B')|(PROPDMGEXP=='b'),
1000000000,
1))),
k2 = ifelse((CROPDMGEXP=='K')|(CROPDMGEXP=='k'),
1000,
ifelse((CROPDMGEXP=='M')|(CROPDMGEXP=='m'),
1000000,
ifelse((CROPDMGEXP=='B')|(CROPDMGEXP=='b'),
1000000000,
1))),
)
#Calculatuion:
df4 <- df4 %>%
mutate(total.dmg = PROPDMG * k1 + CROPDMG * k2)
df4 <- df4 %>%
group_by(EVTYPE) %>%
mutate(count.EVTYPE = n(),
total.damage.EVTYPE = sum(total.dmg)) %>%
ungroup() %>%
mutate(avg.dmg.per.event = total.damage.EVTYPE / count.EVTYPE) %>%
distinct(EVTYPE, total.damage.EVTYPE, avg.dmg.per.event)
df5 <- df4[, c(1, 3)] %>%
arrange(desc(avg.dmg.per.event))
head(df5, 20)
## # A tibble: 20 × 2
## EVTYPE avg.dmg.per.event
## <chr> <dbl>
## 1 TORNADOES, TSTM WIND, HAIL 1602500000
## 2 HEAVY RAIN/SEVERE WEATHER 1250000000
## 3 HURRICANE/TYPHOON 817201282.
## 4 HURRICANE OPAL 354649556.
## 5 STORM SURGE 165990579.
## 6 WILD FIRES 156025000
## 7 EXCESSIVE WETNESS 142000000
## 8 HURRICANE OPAL/HIGH WINDS 110000000
## 9 SEVERE THUNDERSTORM 92735385.
## 10 HURRICANE 83966833.
## 11 HAILSTORM 80333333.
## 12 COLD AND WET CONDITIONS 66000000
## 13 WINTER STORM HIGH WINDS 65000000
## 14 RIVER FLOOD 58661298.
## 15 HURRICANE ERIN 56301429.
## 16 TYPHOON 54641364.
## 17 HURRICANE EMILY 50000000
## 18 DAMAGING FREEZE 45016667.
## 19 Early Frost 42000000
## 20 MAJOR FLOOD 35000000
df6 <- df4[, c(1, 2)] %>%
arrange(desc(total.damage.EVTYPE)) %>%
mutate(percentage = sum(total.damage.EVTYPE)) %>%
mutate(percentage = round(total.damage.EVTYPE / percentage * 100, 2))
head(df6, 20)
## # A tibble: 20 × 3
## EVTYPE total.damage.EVTYPE percentage
## <chr> <dbl> <dbl>
## 1 FLOOD 150319678250 31.6
## 2 HURRICANE/TYPHOON 71913712800 15.1
## 3 TORNADO 57301940590 12.0
## 4 STORM SURGE 43323541000 9.1
## 5 HAIL 18733211170 3.93
## 6 FLASH FLOOD 17561528610 3.69
## 7 DROUGHT 15018672000 3.15
## 8 HURRICANE 14610229010 3.07
## 9 RIVER FLOOD 10148404500 2.13
## 10 ICE STORM 8967037810 1.88
## 11 TROPICAL STORM 8382236550 1.76
## 12 WINTER STORM 6715441250 1.41
## 13 HIGH WIND 5908617560 1.24
## 14 WILDFIRE 5060586800 1.06
## 15 TSTM WIND 5038935790 1.06
## 16 STORM SURGE/TIDE 4642038000 0.97
## 17 THUNDERSTORM WIND 3897964190 0.82
## 18 HURRICANE OPAL 3191846000 0.67
## 19 WILD/FOREST FIRE 3108626330 0.65
## 20 HEAVY RAIN/SEVERE WEATHER 2500000000 0.52
df23 <- df3 %>%
full_join(df2, by = join_by(EVTYPE))
df23$EVTYPE <- factor(df23$EVTYPE,
levels = df23$EVTYPE,
ordered = T)
par(mfrow = c(2,1), mar = c(6,4,2,1))
barplot(height = df23$INJURIES.percentage[1:20],
names.arg = df23$EVTYPE[1:20],
width = rep(1,20),
space = 0,
main = "Total injuries percentage by all events of that type",
ylab = 'Injuries percentage',
las = 2,
cex.names = 0.5,
xaxt = 'n',
yaxt = 'n')
axis(side = 1,
padj = 1,
labels = F,
tick = T, at = 1:20 - 0.5, tck=-0.03,
cex.axis=0.5)
axis(side = 2,
padj = 1,
labels = T,
tick = T,
cex.axis= 0.7)
text(x = (1:20) - 0.5 ,
y = par("usr")[3] - 2,
labels = df23$EVTYPE[0:20],
xpd = T,
srt = 45,
cex = 0.5,
adj = 1)
title(xlab="Event type", line=4)
barplot(height = df23$avg.INJURES.count[1:20],
names.arg = df23$EVTYPE[1:20],
width = rep(1,20),
space = 0,
main = "Average injuries count for events of that type",
ylab = 'Average injuries count',
las = 2,
cex.names = 0.5,
xaxt = 'n',
yaxt = 'n',
col = 'red')
axis(side = 1,
padj = 1,
labels = F,
tick = T, at = 1:20 - 0.5, tck=-0.03,
cex.axis=0.5)
axis(side = 2,
padj = 1,
labels = T,
tick = T,
cex.axis= 0.7)
text(x = (1:20) - 0.5 ,
y = par("usr")[3] - 2,
labels = df23$EVTYPE[0:20],
xpd = T,
srt = 45,
cex = 0.5,
adj = 1)
title(xlab="Event type", line=4)
df65 <- df6 %>%
left_join(df5, by = join_by(EVTYPE)) %>%
select(EVTYPE, percentage, avg.dmg.per.event)
df65$EVTYPE <- factor(df65$EVTYPE,
levels = df65$EVTYPE,
ordered = T)
par(mfrow = c(2,1), mar = c(6,4,2,1))
barplot(height = df65$percentage[1:20],
names.arg = df65$EVTYPE[1:20],
width = rep(1,20),
space = 0,
main = "Damage percentage by all events of that type",
ylab = 'Damage percentage',
las = 2,
cex.names = 0.5,
xaxt = 'n',
yaxt = 'n')
axis(side = 1,
padj = 1,
labels = F,
tick = T, at = 1:20 - 0.5, tck=-0.03,
cex.axis=0.5)
axis(side = 2,
padj = 1,
labels = T,
tick = T,
cex.axis= 0.7)
text(x = (1:20) - 0.5 ,
y = par("usr")[3] - 2,
labels = df65$EVTYPE[0:20],
xpd = T,
srt = 45,
cex = 0.5,
adj = 1)
title(xlab="Event type", line=4)
barplot(height = df65$avg.dmg.per.event[1:20],
names.arg = df65$EVTYPE[1:20],
width = rep(1,20),
space = 0,
main = "Average damage for events of that type",
ylab = 'Average damage',
las = 2,
cex.names = 0.5,
xaxt = 'n',
yaxt = 'n',
col = 'red')
axis(side = 1,
padj = 1,
labels = F,
tick = T, at = 1:20 - 0.5, tck=-0.03,
cex.axis=0.5)
axis(side = 2,
padj = 1,
labels = T,
tick = T,
cex.axis= 0.7)
text(x = (1:20) - 0.5 ,
y = par("usr")[3] - 2,
labels = df65$EVTYPE[0:20],
xpd = T,
srt = 45,
cex = 0.5,
adj = 1)
title(xlab="Event type", line=4)