TITLE:

Exploration the NOAA Storm Database and answering the following questions: 1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health? 2. Across the United States, which types of events have the greatest economic consequences?

Synopsis:

Answering the questions above I will consider different severe events as following: 1. Event of Which type produces maximum average harm per event of that type? 1. Events of Which type produce maximum overall harm (all events of that type together)?

#Data Processing section

  1. Loading libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
  1. Reading table from the file:
filePath <- './data/repdata_data_StormData.csv.bz2'
df <- read.csv(filePath)
  1. Choosing columns and printing a few rows from the table:
df <- df[, c("EVTYPE", "INJURIES" ,  "PROPDMG"   , "PROPDMGEXP" ,"CROPDMG"  ,  "CROPDMGEXP")]
head(df)
##    EVTYPE INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO       15    25.0          K       0           
## 2 TORNADO        0     2.5          K       0           
## 3 TORNADO        2    25.0          K       0           
## 4 TORNADO        2     2.5          K       0           
## 5 TORNADO        2     2.5          K       0           
## 6 TORNADO        6     2.5          K       0
  1. Printing some table attributes
print(dim(df))
## [1] 902297      6
print(names(df))
## [1] "EVTYPE"     "INJURIES"   "PROPDMG"    "PROPDMGEXP" "CROPDMG"   
## [6] "CROPDMGEXP"
  1. Now let’s declare the rule of my research: Then I consider population health harm, I take in consideration only INJURIES (FATALITIES aren’t considered).

  2. Let’s add new column - count of events of that type

df1 <- df %>% 
        group_by(EVTYPE) %>% 
        mutate(EVTYPE.count = n()) %>%
        ungroup()
head(df1)
## # A tibble: 6 × 7
##   EVTYPE  INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP EVTYPE.count
##   <chr>      <dbl>   <dbl> <chr>        <dbl> <chr>             <int>
## 1 TORNADO       15    25   K                0 ""                60652
## 2 TORNADO        0     2.5 K                0 ""                60652
## 3 TORNADO        2    25   K                0 ""                60652
## 4 TORNADO        2     2.5 K                0 ""                60652
## 5 TORNADO        2     2.5 K                0 ""                60652
## 6 TORNADO        6     2.5 K                0 ""                60652
  1. Let’s add new column - total count of INJURIES caused by that type of event
df1 <- df1 %>% 
        group_by(EVTYPE) %>% 
        mutate(total.INJURIES.count = sum(INJURIES)) %>%
        ungroup()
print(df1)
## # A tibble: 902,297 × 8
##    EVTYPE  INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP EVTYPE.count
##    <chr>      <dbl>   <dbl> <chr>        <dbl> <chr>             <int>
##  1 TORNADO       15    25   K                0 ""                60652
##  2 TORNADO        0     2.5 K                0 ""                60652
##  3 TORNADO        2    25   K                0 ""                60652
##  4 TORNADO        2     2.5 K                0 ""                60652
##  5 TORNADO        2     2.5 K                0 ""                60652
##  6 TORNADO        6     2.5 K                0 ""                60652
##  7 TORNADO        1     2.5 K                0 ""                60652
##  8 TORNADO        0     2.5 K                0 ""                60652
##  9 TORNADO       14    25   K                0 ""                60652
## 10 TORNADO        0    25   K                0 ""                60652
## # ℹ 902,287 more rows
## # ℹ 1 more variable: total.INJURIES.count <dbl>
  1. Let’s distinct rows and introduce new variable avg.INJURES.per.event = INJURIES.count / EVTYPE.count The following table contain TOP-20 EVENTS, which could be considered as the most harmful in terms of average count of injures caused by that type of event.
df2 <- df1%>% 
        distinct(EVTYPE, total.INJURIES.count , EVTYPE.count) %>%
        mutate(avg.INJURES.count = total.INJURIES.count / EVTYPE.count) %>%
        arrange(desc(avg.INJURES.count)) %>%
        mutate(avg.INJURES.count = round(avg.INJURES.count, 1)) %>%
        select(1,4)
head(df2,20)
## # A tibble: 20 × 2
##    EVTYPE                  avg.INJURES.count
##    <chr>                               <dbl>
##  1 Heat Wave                            70  
##  2 TROPICAL STORM GORDON                43  
##  3 WILD FIRES                           37.5
##  4 THUNDERSTORMW                        27  
##  5 HIGH WIND AND SEAS                   20  
##  6 SNOW/HIGH WINDS                      18  
##  7 WINTER STORM HIGH WINDS              15  
##  8 GLAZE/ICE STORM                      15  
##  9 HEAT WAVE DROUGHT                    15  
## 10 HURRICANE/TYPHOON                    14.5
## 11 WINTER WEATHER MIX                   11.3
## 12 EXTREME HEAT                          7  
## 13 NON-SEVERE WIND DAMAGE                7  
## 14 GLAZE                                 6.8
## 15 TSUNAMI                               6.4
## 16 WINTER STORMS                         5.7
## 17 TORNADO F2                            5.3
## 18 WATERSPOUT/TORNADO                    5.2
## 19 EXCESSIVE RAINFALL                    5.2
## 20 HEAT WAVE                             4.2
  1. Some events happen often, and all together they may cause a lot of injures even though one separate event of that type doesn’t harm a lot in terms of average count of injures for that type of event. Let’s print the table, containing:
df3 <- df1 %>% 
        select(EVTYPE, total.INJURIES.count) %>%
        distinct(EVTYPE, total.INJURIES.count) %>%
        arrange(desc(total.INJURIES.count)) %>%
        mutate(overall.INJURIES.count = sum(total.INJURIES.count)) %>%
        mutate(total.INJURIES.count = total.INJURIES.count / overall.INJURIES.count * 100) %>%
        rename(INJURIES.percentage = total.INJURIES.count) %>%
        select('EVTYPE','INJURIES.percentage') %>%
        mutate(INJURIES.percentage = round(INJURIES.percentage,2))

head(df3, 20)
## # A tibble: 20 × 2
##    EVTYPE             INJURIES.percentage
##    <chr>                            <dbl>
##  1 TORNADO                          65   
##  2 TSTM WIND                         4.95
##  3 FLOOD                             4.83
##  4 EXCESSIVE HEAT                    4.64
##  5 LIGHTNING                         3.72
##  6 HEAT                              1.49
##  7 ICE STORM                         1.41
##  8 FLASH FLOOD                       1.26
##  9 THUNDERSTORM WIND                 1.06
## 10 HAIL                              0.97
## 11 WINTER STORM                      0.94
## 12 HURRICANE/TYPHOON                 0.91
## 13 HIGH WIND                         0.81
## 14 HEAVY SNOW                        0.73
## 15 WILDFIRE                          0.65
## 16 THUNDERSTORM WINDS                0.65
## 17 BLIZZARD                          0.57
## 18 FOG                               0.52
## 19 WILD/FOREST FIRE                  0.39
## 20 DUST STORM                        0.31
  1. Let’s consider events which have the greatest economic consequences.

  2. First of all let’s exclude those events which have corrupted data in PROPDMG and CROPDMG columns:

#Correct data subset:
df4 <- df %>% 
        select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) %>%
        filter((PROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B') & CROPDMG  == 0 )|
                       (PROPDMG  == 0 & CROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B')) |
                       (PROPDMG == 0 & CROPDMG  == 0) | 
                       (PROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B') & 
                                CROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B')))
#Corrupted data subset:
df5 <- df %>% 
        select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) %>%
        filter(!((PROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B') & CROPDMG  == 0 )|
                       (PROPDMG  == 0 & CROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B')) |
                       (PROPDMG == 0 & CROPDMG  == 0) | 
                       (PROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B') & 
                                CROPDMGEXP %in% c('k', 'K', 'm','M', 'b','B'))))

dim(df4); dim(df5) 
## [1] 901955      5
## [1] 342   5

As we can see from information above, only 342 rows contain corrupted data, whereas 901955 rows contain correct data. We will work only with correct data subset.

  1. The second step is creation variable total.dmg , considering “magnitude” coefficient, corresponded to PROPDMGEXP and CROPDMGEXP variables. Ones we prepare these coefficients, we can calculate total damage caused by an event.
# Preparing coefficients:
df4 <- df4 %>%
        mutate(k1 = ifelse((PROPDMGEXP=='K')|(PROPDMGEXP=='k'), 
                           1000, 
                           ifelse((PROPDMGEXP=='M')|(PROPDMGEXP=='m'),
                                  1000000, 
                                  ifelse((PROPDMGEXP=='B')|(PROPDMGEXP=='b'),
                                         1000000000,
                                         1))),
               k2 = ifelse((CROPDMGEXP=='K')|(CROPDMGEXP=='k'), 
                           1000, 
                           ifelse((CROPDMGEXP=='M')|(CROPDMGEXP=='m'),
                                  1000000, 
                                  ifelse((CROPDMGEXP=='B')|(CROPDMGEXP=='b'),
                                         1000000000, 
                                         1))),
               )

#Calculatuion:
df4 <- df4 %>% 
        mutate(total.dmg = PROPDMG * k1 + CROPDMG * k2)
  1. Let’s create variable total.demage.EVTYPE (total demage caused by all events of that type) and avg.dmg.per.event (average damage for that type of event)
df4 <- df4 %>%
        group_by(EVTYPE) %>%
        mutate(count.EVTYPE = n(), 
               total.damage.EVTYPE = sum(total.dmg)) %>%
        ungroup() %>%
        mutate(avg.dmg.per.event = total.damage.EVTYPE / count.EVTYPE) %>%
        distinct(EVTYPE, total.damage.EVTYPE, avg.dmg.per.event)
  1. The TOP-20 of the most harmful events in terms of an average damage per event of that type:
df5 <- df4[, c(1, 3)] %>%
        arrange(desc(avg.dmg.per.event)) 
head(df5, 20)
## # A tibble: 20 × 2
##    EVTYPE                     avg.dmg.per.event
##    <chr>                                  <dbl>
##  1 TORNADOES, TSTM WIND, HAIL       1602500000 
##  2 HEAVY RAIN/SEVERE WEATHER        1250000000 
##  3 HURRICANE/TYPHOON                 817201282.
##  4 HURRICANE OPAL                    354649556.
##  5 STORM SURGE                       165990579.
##  6 WILD FIRES                        156025000 
##  7 EXCESSIVE WETNESS                 142000000 
##  8 HURRICANE OPAL/HIGH WINDS         110000000 
##  9 SEVERE THUNDERSTORM                92735385.
## 10 HURRICANE                          83966833.
## 11 HAILSTORM                          80333333.
## 12 COLD AND WET CONDITIONS            66000000 
## 13 WINTER STORM HIGH WINDS            65000000 
## 14 RIVER FLOOD                        58661298.
## 15 HURRICANE ERIN                     56301429.
## 16 TYPHOON                            54641364.
## 17 HURRICANE EMILY                    50000000 
## 18 DAMAGING FREEZE                    45016667.
## 19 Early Frost                        42000000 
## 20 MAJOR FLOOD                        35000000
  1. The TOP-20 of the most harmful events (with maximum total harm produced by all events of that type):
df6 <- df4[, c(1, 2)] %>% 
        arrange(desc(total.damage.EVTYPE)) %>% 
        mutate(percentage = sum(total.damage.EVTYPE)) %>% 
        mutate(percentage = round(total.damage.EVTYPE / percentage * 100, 2)) 
head(df6, 20)
## # A tibble: 20 × 3
##    EVTYPE                    total.damage.EVTYPE percentage
##    <chr>                                   <dbl>      <dbl>
##  1 FLOOD                            150319678250      31.6 
##  2 HURRICANE/TYPHOON                 71913712800      15.1 
##  3 TORNADO                           57301940590      12.0 
##  4 STORM SURGE                       43323541000       9.1 
##  5 HAIL                              18733211170       3.93
##  6 FLASH FLOOD                       17561528610       3.69
##  7 DROUGHT                           15018672000       3.15
##  8 HURRICANE                         14610229010       3.07
##  9 RIVER FLOOD                       10148404500       2.13
## 10 ICE STORM                          8967037810       1.88
## 11 TROPICAL STORM                     8382236550       1.76
## 12 WINTER STORM                       6715441250       1.41
## 13 HIGH WIND                          5908617560       1.24
## 14 WILDFIRE                           5060586800       1.06
## 15 TSTM WIND                          5038935790       1.06
## 16 STORM SURGE/TIDE                   4642038000       0.97
## 17 THUNDERSTORM WIND                  3897964190       0.82
## 18 HURRICANE OPAL                     3191846000       0.67
## 19 WILD/FOREST FIRE                   3108626330       0.65
## 20 HEAVY RAIN/SEVERE WEATHER          2500000000       0.52

Results

df23 <- df3 %>%
        full_join(df2, by = join_by(EVTYPE))
df23$EVTYPE <- factor(df23$EVTYPE,
                           levels = df23$EVTYPE,
                           ordered = T)

par(mfrow = c(2,1), mar = c(6,4,2,1))

barplot(height = df23$INJURIES.percentage[1:20],
        names.arg = df23$EVTYPE[1:20], 
        width = rep(1,20),
        space = 0,
        main = "Total injuries percentage by all events of that type", 
        ylab = 'Injuries percentage', 
        las = 2, 
        cex.names = 0.5, 
        xaxt = 'n',
        yaxt = 'n') 
axis(side = 1,
     padj = 1,  
     labels = F, 
     tick = T, at = 1:20 - 0.5, tck=-0.03,
     cex.axis=0.5)
axis(side = 2,
     padj = 1,  
     labels = T, 
     tick = T,
     cex.axis= 0.7)
text(x = (1:20) - 0.5 ,
     y = par("usr")[3] - 2,
     labels = df23$EVTYPE[0:20],
     xpd = T,
     srt = 45,
     cex = 0.5, 
     adj = 1)
title(xlab="Event type", line=4)

        barplot(height = df23$avg.INJURES.count[1:20],
        names.arg = df23$EVTYPE[1:20], 
        width = rep(1,20),
        space = 0,
        main = "Average injuries count for events of that type", 
        ylab = 'Average injuries count', 
        las = 2, 
        cex.names = 0.5, 
        xaxt = 'n',
        yaxt = 'n', 
        col = 'red') 
axis(side = 1,
     padj = 1,  
     labels = F, 
     tick = T, at = 1:20 - 0.5, tck=-0.03,
     cex.axis=0.5)
axis(side = 2,
     padj = 1,  
     labels = T, 
     tick = T,
     cex.axis= 0.7)
text(x = (1:20) - 0.5 ,
     y = par("usr")[3] - 2,
     labels = df23$EVTYPE[0:20],
     xpd = T,
     srt = 45,
     cex = 0.5, 
     adj = 1)
title(xlab="Event type", line=4)

df65 <- df6 %>% 
        left_join(df5, by = join_by(EVTYPE)) %>%
        select(EVTYPE, percentage, avg.dmg.per.event)

df65$EVTYPE <- factor(df65$EVTYPE,
                           levels = df65$EVTYPE,
                           ordered = T)

par(mfrow = c(2,1), mar = c(6,4,2,1))

barplot(height = df65$percentage[1:20],
        names.arg = df65$EVTYPE[1:20], 
        width = rep(1,20),
        space = 0,
        main = "Damage percentage by all events of that type", 
        ylab = 'Damage percentage', 
        las = 2, 
        cex.names = 0.5, 
        xaxt = 'n',
        yaxt = 'n') 
axis(side = 1,
     padj = 1,  
     labels = F, 
     tick = T, at = 1:20 - 0.5, tck=-0.03,
     cex.axis=0.5)
axis(side = 2,
     padj = 1,  
     labels = T, 
     tick = T,
     cex.axis= 0.7)
text(x = (1:20) - 0.5 ,
     y = par("usr")[3] - 2,
     labels = df65$EVTYPE[0:20],
     xpd = T,
     srt = 45,
     cex = 0.5, 
     adj = 1)
title(xlab="Event type", line=4)

barplot(height = df65$avg.dmg.per.event[1:20],
        names.arg = df65$EVTYPE[1:20], 
        width = rep(1,20),
        space = 0,
        main = "Average damage for events of that type", 
        ylab = 'Average damage', 
        las = 2, 
        cex.names = 0.5, 
        xaxt = 'n',
        yaxt = 'n', 
        col = 'red') 
axis(side = 1,
     padj = 1,  
     labels = F, 
     tick = T, at = 1:20 - 0.5, tck=-0.03,
     cex.axis=0.5)
axis(side = 2,
     padj = 1,  
     labels = T, 
     tick = T,
     cex.axis= 0.7)
text(x = (1:20) - 0.5 ,
     y = par("usr")[3] - 2,
     labels = df65$EVTYPE[0:20],
     xpd = T,
     srt = 45,
     cex = 0.5, 
     adj = 1)
title(xlab="Event type", line=4)