This analysis uses the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to determine which types of natural events have the greatest impact on human health and on the economy in the United States.

Data Processing

Data Importing

The raw data are loaded into R.

# Read the bzip2-compressed storm CSV through a bzfile connection.
data <- read.csv(
  bzfile('repdata-data-StormData.csv.bz2'),
  header = TRUE,
  sep = ','
)

Data manipulation

The raw data are transformed into a data set suitable for the analysis.

EVTYPE variable manipulation

The EVTYPE variable is converted to upper case so that the same event type written with different combinations of lowercase and uppercase letters is counted only once.

# Number of distinct event labels as recorded
length(unique(data[['EVTYPE']]))
## [1] 985
# Number of distinct event labels once case is ignored
length(unique(toupper(data[['EVTYPE']])))
## [1] 898
# Keep the upper-cased labels
data[['EVTYPE']] <- toupper(data[['EVTYPE']])

PROPDMGEXP and CROPDMGEXP variables

The PROPDMGEXP and CROPDMGEXP variables hold the exponent symbols of the damage amounts and are recoded according to a fixed rule. Each symbol is replaced with the numeric multiplier shown below (multiplier <- symbols):

  • 0 <- ‘-’, ‘?’, 0
  • 1 <- ‘’ (empty string), ‘+’
  • 10 <- numbers from 1 to 8
  • 100 <- ‘h’, ‘H’
  • 1000 <- ‘k’, ‘K’
  • 1000000 <- ‘m’, ‘M’
  • 1000000000 <- ‘B’
# Recode the property-damage exponent symbols into numeric multipliers.
# Each list name is the multiplier; its value lists the symbols mapped to it.
data$PROPDMGEXP <- as.factor(data$PROPDMGEXP)
levels(data$PROPDMGEXP) <- list('0' = c('-', '?', '0'), 
                                '1' = c('', '+'), 
                                '10' = c('1', '2', '3', '4', '5', '6', '7', '8'), 
                                '100' = c('h', 'H'), 
                                # Accept both cases of 'k', as for 'h' and 'm';
                                # a symbol left out of the list gets no multiplier.
                                '1000' = c('k', 'K'), 
                                '1000000' = c('m', 'M'), 
                                '1000000000' = 'B')

# Go through as.character() so the level labels themselves, not the
# factor's internal integer codes, are converted to numbers.
data$PROPDMGEXP <- as.numeric(as.character(data$PROPDMGEXP))
# Recode the crop-damage exponent symbols into numeric multipliers.
# The work is done on a temporary factor inside local() so that nothing but
# the final numeric column is written back.
data$CROPDMGEXP <- local({
  crop_exp <- as.factor(data$CROPDMGEXP)
  # Each list name is the multiplier; its value lists the symbols mapped to it.
  levels(crop_exp) <- list(
    '0' = c('?', '0'),
    '1' = '',
    '10' = '2',
    '1000' = c('k', 'K'),
    '1000000' = c('m', 'M'),
    '1000000000' = 'B'
  )
  # Convert the level labels (not the integer codes) to numbers.
  as.numeric(as.character(crop_exp))
})

Results

Result 1 - Types of events that are most harmful with respect to population health

To obtain the answer, the data are grouped by the EVTYPE variable and summarized in a new data set that shows the total number of injuries (from the INJURIES variable) for each event type.

# Total injuries per event type.
data1 <- summarise(group_by(data, EVTYPE), Tot.Injuries = sum(INJURIES))

# Discard event types whose injury total is zero.
data1 <- data1[data1[['Tot.Injuries']] != 0, ]

The ten event types with the highest totals are shown below:

# Order event types from most to fewest injuries and print the first ten.
result1 <- data1 %>% arrange(desc(Tot.Injuries))
head(result1, n = 10)
## # A tibble: 10 x 2
##    EVTYPE            Tot.Injuries
##    <chr>                    <dbl>
##  1 TORNADO                  91346
##  2 TSTM WIND                 6957
##  3 FLOOD                     6789
##  4 EXCESSIVE HEAT            6525
##  5 LIGHTNING                 5230
##  6 HEAT                      2100
##  7 ICE STORM                 1975
##  8 FLASH FLOOD               1777
##  9 THUNDERSTORM WIND         1488
## 10 HAIL                      1361
# Turn EVTYPE into a factor whose levels follow the sorted row order, so the
# bars are drawn in ranking order rather than alphabetically.
result1$EVTYPE <- factor(result1$EVTYPE, levels = result1$EVTYPE)

# Bar chart of the first ten rows; injuries are plotted in thousands and the
# fill colour follows the injury total.
p1 <- ggplot(result1[seq_len(10), ], aes(EVTYPE, Tot.Injuries/1000))
p1 <- p1 +
        geom_bar(aes(fill = Tot.Injuries), stat = 'identity', show.legend = FALSE)
p1 <- p1 + scale_fill_gradient(low = 'gold', high = 'firebrick4')
p1 <- p1 +
        labs(title = 'Top-10 impact events', x = '', y = 'Number of injuries (thousands)')
p1 <- p1 +
        theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
              axis.text.x = element_text(angle = 45, hjust = 1))

p1

Result 2 - Types of events that have the greatest economic consequences

To obtain the answer, the data are grouped by the EVTYPE variable and summarized in a new data set that shows the total economic damage for each event type. The total economic damage is the sum of the property damage and the crop damage (obtained as PROPDMG times PROPDMGEXP and CROPDMG times CROPDMGEXP, respectively).

# Per event type: total property damage and total crop damage, each computed
# as the recorded amount times its numeric multiplier.
data2 <- summarise(
  group_by(data, EVTYPE),
  Tot.PropDmg = sum(PROPDMG * PROPDMGEXP),
  Tot.CropDmg = sum(CROPDMG * CROPDMGEXP)
)

# Overall economic damage is property damage plus crop damage.
data2[['Tot.EconDmg']] <- data2[['Tot.PropDmg']] + data2[['Tot.CropDmg']]

# Discard event types whose total economic damage is zero.
data2 <- data2[data2[['Tot.EconDmg']] != 0, ]

The ten event types with the highest totals are shown below:

# Order event types from highest to lowest economic damage and print the
# first ten.
result2 <- data2 %>% arrange(desc(Tot.EconDmg))
head(result2, n = 10)
## # A tibble: 10 x 4
##    EVTYPE             Tot.PropDmg Tot.CropDmg  Tot.EconDmg
##    <chr>                    <dbl>       <dbl>        <dbl>
##  1 FLOOD             144657709807  5661968450 150319678257
##  2 HURRICANE/TYPHOON  69305840000  2607872800  71913712800
##  3 TORNADO            56937161565   414953110  57352114675
##  4 STORM SURGE        43323536000        5000  43323541000
##  5 HAIL               15732267577  3025954453  18758222030
##  6 FLASH FLOOD        16140812603  1421317100  17562129703
##  7 DROUGHT             1046106000 13972566000  15018672000
##  8 HURRICANE          11868319010  2741910000  14610229010
##  9 RIVER FLOOD         5118945500  5029459000  10148404500
## 10 ICE STORM           3944927810  5022113500   8967041310
# Turn EVTYPE into a factor whose levels follow the sorted row order, so the
# bars are drawn in ranking order rather than alphabetically.
result2$EVTYPE <- factor(result2$EVTYPE, levels = result2$EVTYPE)

# Bar chart of the first ten rows; damage is plotted in billions and the fill
# colour follows the damage total.
p2 <- ggplot(result2[seq_len(10), ], aes(EVTYPE, Tot.EconDmg/1000000000))
p2 <- p2 +
        geom_bar(aes(fill = Tot.EconDmg), stat = 'identity', show.legend = FALSE)
p2 <- p2 + scale_fill_gradient(low = 'gold', high = 'firebrick4')
p2 <- p2 +
        labs(title = 'Top-10 impact events', x = '', y = 'Economic damages (billions)')
p2 <- p2 +
        theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
              axis.text.x = element_text(angle = 45, hjust = 1))

p2