First section : beginning of treatment



Synopsis

The basic goal of this sheet is to explore the NOAA Storm Database and answer two basic questions about severe weather events.The database is used to answer the questions below : - Across the United States, which types of events are most harmful with respect to population health? - Across the United States, which types of events have the greatest economic consequences?

Initialization code

knitr::opts_chunk$set(echo = TRUE)
library(tidyr)
library(dplyr)
## 
## Attachement du package : 'dplyr'
## Les objets suivants sont masqués depuis 'package:stats':
## 
##     filter, lag
## Les objets suivants sont masqués depuis 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attachement du package : 'lubridate'
## Les objets suivants sont masqués depuis 'package:base':
## 
##     date, intersect, setdiff, union
library(stringr)
library(ggplot2)
library(patchwork)

Second section : Data Processing



Download and open data

  1. Unzip file, open and store csv file into the object data_set
URL<-"https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# download.file(URL,destfile = "./my_file.bz2",method = "curl")

data_set <- read.csv("my_file.bz2", header = TRUE, row.names = NULL,stringsAsFactors=FALSE)
# Conversion of the varibale BGN_DATE into Date 
data_set$BGN_DATE <- mdy_hms(data_set$BGN_DATE)
  1. The object data_set contains 37 variables :
names(data_set)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
  1. There are a few variables of interest in this project :

Therefore, a subset of object data_set is made :

my_cols <- c('BGN_DATE', 'EVTYPE', 'FATALITIES', 'INJURIES', 'PROPDMG', 'PROPDMGEXP',
             'CROPDMG',  'CROPDMGEXP')
data_set_mini <- data_set[,my_cols]
  1. Conversion of PROPDMG and CROPDMG variables to numeric and calculation of values taking into account PROPDMGXP and PROPDMGXP variables :
# Conversion of the character-exponents into numeric format
calcul_exp <- function(u) {
  u <- switch(u,
         '+' = {0}, '-' = {0}, '?' = {0},
         'h' = {2}, 'H' = {2},
         'k' = {3}, 'K' = {3},
         'm' = {6}, 'M' = {6},
         'b' = {9}, 'B' = {9},
         u)
  if(u == '' | u == ' ') 0
  else as.integer(u)
}

# Conversion of empty/space value to zero and any other strings to numeric value
convert_num <- function(x) {
  if(x == '' | x == ' ') 0
  else as.numeric(x)
}

# calculate the actual property damage by multiplying PROPDMG with 10^PROPDMGEXP
data_set_mini$PROPDMG <- apply(data.frame(data_set_mini$PROPDMG, data_set_mini$PROPDMGEXP), 
                               1, function(x) { convert_num(x[1])*10^calcul_exp(x[2]) })

# calculate the actual property damage by multiplying CROPDMG with 10^CROPDMGEXP
data_set_mini$CROPDMG <- apply(data.frame(data_set_mini$CROPDMG, data_set_mini$CROPDMGEXP), 
                               1, function(x) { convert_num(x[1])*10^calcul_exp(x[2]) })
  1. By using regexpp, the content of the EVTYPE variable is tidy so as to reduce the number of possible types :
# Capitalization for all events and supression of extra spaces.
data_set_mini$EVTYPE <- toupper(data_set_mini$EVTYPE)
data_set_mini$EVTYPE <- trimws(x = data_set_mini$EVTYPE, which = "both", whitespace = " ")


data_set_mini$EVTYPE[grepl(pattern = "ICESTORM", x = data_set_mini$EVTYPE)] <- "ICESTORM"
data_set_mini$EVTYPE[grepl(pattern = "BLIZZARD", x = data_set_mini$EVTYPE)] <- "BLIZZARD"
data_set_mini$EVTYPE[grepl(pattern = "COASTAL", x = data_set_mini$EVTYPE) & 
                   grepl(pattern = "FLOOD", x = data_set_mini$EVTYPE)] <- "COASTAL FLOOD"
data_set_mini$EVTYPE[grepl(pattern = "FLAS", x = data_set_mini$EVTYPE) &
                       grepl(pattern = "FLOOD", x = data_set_mini$EVTYPE)] <-
  "FLASH FLOOD"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*FLOOD)(?!.*COASTAL)(?!.*FLASH)", 
                           x = data_set_mini$EVTYPE,perl = TRUE)] <- "FLOOD"
data_set_mini$EVTYPE[grepl(pattern = "FOG", x = data_set_mini$EVTYPE)] <- "DENSE FOG"
data_set_mini$EVTYPE[grepl(pattern = "SMOKE", x = data_set_mini$EVTYPE)] <- "DENSE SMOKE"
data_set_mini$EVTYPE[grepl(pattern = "DROUGHT", x = data_set_mini$EVTYPE)] <- "DROUGHT"
data_set_mini$EVTYPE[grepl(pattern = "FROST|FREEZE", x = data_set_mini$EVTYPE)] <-
  "FROST/FREEZE"
data_set_mini$EVTYPE[grepl(pattern = "FUNNEL", x = data_set_mini$EVTYPE)] <- 
  "FUNNEL CLOUD"
data_set_mini$EVTYPE[grepl("^(?=.*HAIL)(?!.*MARINE)", x = data_set_mini$EVTYPE, 
                           perl = TRUE)] <- "HAIL"
data_set_mini$EVTYPE[grepl(pattern = "EXCESSIVE HEAT|RECORD HEAT|HEATBURST", 
                           x = data_set_mini$EVTYPE)] <- "EXCESSIVE HEAT"
data_set_mini$EVTYPE[grepl(pattern = "RAIN", x = data_set_mini$EVTYPE)] <- 
  "HEAVY RAIN"
data_set_mini$EVTYPE[grepl(pattern = "HURRICANE", x = data_set_mini$EVTYPE)] <-
  "HURRICANE (TYPHONE)"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*SNOW)(?=.*LAKE)", x = data_set_mini$EVTYPE,
                           perl = TRUE)] <- "LAKE-EFFECT SNOW"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*SNOW)(?!.*LAKE)", x = data_set_mini$EVTYPE,
                           perl = TRUE)] <- "HEAVY SNOW"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*STRONG)(?=.*WIND)(?!.*MARINE)", 
                           x = data_set_mini$EVTYPE, perl = TRUE)] <- "STRONG WIND"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*TSTM)(?=.*WIND)(?!.*MARINE)|
                           ^(?=.*THUNDERSTORM)(?=.*WIND)(?!.*MARINE)",
                       x = data_set_mini$EVTYPE, perl = TRUE)] <- "THUNDERSTORM WIND"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*TSTM)(?=.*WIND)(?=.*MARINE)", 
                           x = data_set_mini$EVTYPE,perl = TRUE)] <- 
  "MARINE THUNDERSTORM WIND"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*THUNDERSTORM)(?=.*WIND)(?!.*MARINE)", 
                           x = data_set_mini$EVTYPE,perl = TRUE)] <- "THUNDERSTORM WIND"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*EXTREME)(?=.*WIND)(?=.*CHILL)", 
                           x = data_set_mini$EVTYPE,
                       perl = TRUE)] <- "EXTREME COLD/WIND CHILL"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*WIND)(?=.*CHILL)(?!.*EXTREME)", 
                           x = data_set_mini$EVTYPE, perl = TRUE)] <- "COLD/WIND CHILL"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*WIND)(?!.*MARINE)
                           (?!.*CHILL)(?!.*STRONG)(?!.*THUNDERSTORM)",
                           x = data_set_mini$EVTYPE, perl = TRUE)] <- "HIGH WIND"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*TSTM)(?=.*WIND)(?=.*MARINE)", 
                           x = data_set_mini$EVTYPE,perl = TRUE)] <- 
  "MARINE THUNDERSTORM WIND"
data_set_mini$EVTYPE[grepl(pattern = "CURRENT", 
                           x = data_set_mini$EVTYPE)] <- "RIP CURRENT"
data_set_mini$EVTYPE[grepl(pattern = "SLEET", x = data_set_mini$EVTYPE)] <- "SLEET"
data_set_mini$EVTYPE[grepl(pattern = "STORM SURGE", x = data_set_mini$EVTYPE)] <- 
  "STORM SURGE/TIDE"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*THUNDERSTORM)(?!.*MARINE)", 
                           x = data_set_mini$EVTYPE, perl = TRUE)] <- "THUNDERSTORM WIND"
data_set_mini$EVTYPE[grepl(pattern = "TORNADO", x = data_set_mini$EVTYPE)] <- "TORNADO"
data_set_mini$EVTYPE[grepl(pattern = "VOLCANIC", x = data_set_mini$EVTYPE)] <- 
  "VOLCANIC ASH"
data_set_mini$EVTYPE[grepl(pattern = "WATERSPOUT", x = data_set_mini$EVTYPE)] <-
  "WATERSPOUT"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*WINTER)(?!.*STORM)", 
                           x = data_set_mini$EVTYPE, perl = TRUE)] <- "WINTER WEATHER"

Human damage : injuries + fatalities


Important note: without regexp, we see the same event return several times. It’s “TSTM WIND”, “THUNDERSTORM WIND” and “THUNDERSTROM WINDS” (with an “s”). This is proof of the interest of regexp in this analysis. Therefore, there is two approach below to show it clearly. The first, is the best.

  1. First approach
harms <- aggregate(FATALITIES + INJURIES ~ EVTYPE, data = data_set_mini, sum)
names(harms) <- c('EVTYPE', 'TOTAL.AMOUNT')
# Arrange the result
harms <- arrange(harms, desc(TOTAL.AMOUNT), EVTYPE)
head(harms, 10)
##               EVTYPE TOTAL.AMOUNT
## 1            TORNADO        97043
## 2  THUNDERSTORM WIND        10119
## 3     EXCESSIVE HEAT         8497
## 4              FLOOD         7279
## 5          LIGHTNING         6046
## 6               HEAT         3037
## 7        FLASH FLOOD         2837
## 8          ICE STORM         2064
## 9       WINTER STORM         1527
## 10              HAIL         1512
# Subset from 1996
data_set_mini.sub <- data_set_mini %>% filter(year(data_set_mini$BGN_DATE) > 1996)

harms_sub <- aggregate(FATALITIES + INJURIES ~ EVTYPE, data = data_set_mini.sub, sum)
names(harms_sub) <- c('EVTYPE', 'TOTAL.AMOUNT')
# Arrange the result
harms_sub <- arrange(harms_sub, desc(TOTAL.AMOUNT), EVTYPE)
head(harms_sub, 10)
##                 EVTYPE TOTAL.AMOUNT
## 1              TORNADO        21447
## 2       EXCESSIVE HEAT         8095
## 3                FLOOD         7123
## 4    THUNDERSTORM WIND         5063
## 5            LIGHTNING         4424
## 6          FLASH FLOOD         2425
## 7                 HEAT         1459
## 8  HURRICANE (TYPHONE)         1393
## 9         WINTER STORM         1209
## 10           HIGH WIND         1181
my_plot_fun <- function(my_data) {
  my_data %>% ggplot(aes(x=EVTYPE, y=TOTAL.AMOUNT, fill=EVTYPE)) +
  geom_bar(stat="identity",color="black",alpha=0.6) +
  theme_linedraw() +
  theme(
 plot.title = element_text(color="red", size=18, face="bold.italic",hjust=0.5),
 axis.title.x = element_text(color="#993333", size=18, face="bold"),
 axis.text.x = element_text(color="#993333", size=14, angle = 90,vjust = 0.5, hjust=1),
 axis.title.y = element_text(color="darkgreen", size=18, face="bold"),
 axis.text.y = element_text(face="bold", color="darkgreen", size=14),
 legend.text = element_text(size=12)
)
} 
harms$EVTYPE <- factor(harms$EVTYPE, levels = harms$EVTYPE)
harms_plot <- my_plot_fun(harms[1:10,]) +
 xlab("Event Type") + ylab("Total amount of injuries and fatalities") +
  ggtitle("The 10 worst weather event fatalities, all years")

harms_sub$EVTYPE <- factor(harms_sub$EVTYPE, levels = harms_sub$EVTYPE)
harms_sub_plot <- my_plot_fun(harms_sub[1:10,]) +
 xlab("Event Type") + ylab("Total amount of injuries and fatalities") +
  labs(
    title = "The 10 worst weather event fatalities, since 1996",
  caption = "This plot contain the most harmful meteorologic events type when regexp is
       applied and since 1996. All are arranged in this barplot.") +
  theme(plot.caption = element_text(color = "darkblue", face = "italic", size = 12))
print(harms_sub_plot)
  1. Second approach
data_set_mini_2 <- data_set[,my_cols]
harms_2 <- aggregate(FATALITIES + INJURIES ~ EVTYPE, data = data_set_mini_2, sum)
names(harms_2) <- c('EVTYPE', 'TOTAL.AMOUNT')
# Arrange the result
harms_2 <- arrange(harms_2, desc(TOTAL.AMOUNT), EVTYPE)
head(harms_2)
##           EVTYPE TOTAL.AMOUNT
## 1        TORNADO        96979
## 2 EXCESSIVE HEAT         8428
## 3      TSTM WIND         7461
## 4          FLOOD         7259
## 5      LIGHTNING         6046
## 6           HEAT         3037
data_set_mini_2.sub <- data_set_mini_2 %>% filter(year(data_set_mini_2$BGN_DATE) > 1996)

harms_2_sub <- aggregate(FATALITIES + INJURIES ~ EVTYPE, data = data_set_mini_2.sub, sum)
names(harms_2_sub) <- c('EVTYPE', 'TOTAL.AMOUNT')
# Arrange the result
harms_2_sub <- arrange(harms_2_sub, desc(TOTAL.AMOUNT), EVTYPE)
head(harms_2_sub, 10)
##               EVTYPE TOTAL.AMOUNT
## 1            TORNADO        21447
## 2     EXCESSIVE HEAT         8093
## 3              FLOOD         7121
## 4          LIGHTNING         4424
## 5          TSTM WIND         3524
## 6        FLASH FLOOD         2425
## 7  THUNDERSTORM WIND         1530
## 8               HEAT         1459
## 9  HURRICANE/TYPHOON         1339
## 10      WINTER STORM         1209
harms_2$EVTYPE <- factor(harms_2$EVTYPE, levels = harms_2$EVTYPE)
harms_2_plot <- my_plot_fun(harms_2[1:10,]) +
 xlab("Event Type") + ylab("Total amount of injuries and fatalities") +
  ggtitle("The 10 worst weather event fatalities, all years, without regexp")

harms_2_sub$EVTYPE <- factor(harms_2_sub$EVTYPE, levels = harms_2_sub$EVTYPE)
harms_2_sub_plot <- my_plot_fun(harms_2_sub[1:10,]) +
 xlab("Event Type") + ylab("Total amount of injuries and fatalities") +
    labs(
    title = "The 10 worst weather events type fatalities, since 1996, without regexp",
  caption = "This plot contain the most harmful meteorologic events type when
    no (any) regexp is applied and since 1996. All are arranged in this barplot.") +
  theme(plot.caption = element_text(color = "darkblue", face = "italic", size = 12))
print(harms_2_sub_plot)
  1. Summarizing table
my_df_1<-merge (harms[1:10,],harms_sub[1:10,], by="EVTYPE")
my_df_2<-merge (harms_2[1:10,],harms_2_sub[1:10,], by="EVTYPE")
my_merge <- merge(my_df_1,my_df_2, by="EVTYPE")
names(my_merge) <- c("EVTYPE","TOTAL.regexp.ALL.YEARS","TOTAL.regexp.SINCE.1996",
                     "TOTAL.NOregexp.ALL.YEARS","TOTAL.NOregexp.SINCE.1996")
my_merge <- arrange(my_merge, desc(TOTAL.regexp.ALL.YEARS))
head(my_merge)
##              EVTYPE TOTAL.regexp.ALL.YEARS TOTAL.regexp.SINCE.1996
## 1           TORNADO                  97043                   21447
## 2 THUNDERSTORM WIND                  10119                    5063
## 3    EXCESSIVE HEAT                   8497                    8095
## 4             FLOOD                   7279                    7123
## 5         LIGHTNING                   6046                    4424
## 6              HEAT                   3037                    1459
##   TOTAL.NOregexp.ALL.YEARS TOTAL.NOregexp.SINCE.1996
## 1                    96979                     21447
## 2                     1621                      1530
## 3                     8428                      8093
## 4                     7259                      7121
## 5                     6046                      4424
## 6                     3037                      1459


Economic impact : property and crop damage


Important note: without regexp, we see the same event return several times. It’s “TSTM WIND”, “THUNDERSTORM WIND” and “THUNDERSTROM WINDS” (with an “s”). This is proof of the interest of regexp in this analysis.


In the section I. 4., the first function is construct to convert the character-exponents into numeric format and keep numeric exponent as-is. Then the second function converts empty/space value to zero and any other strings to numeric value.

  1. First with regexp.
damages <- aggregate(CROPDMG + PROPDMG ~ EVTYPE, data = data_set_mini, sum)
names(damages) <- c('EVTYPE', 'ECONOMIC.DAMAGES')
damages <- arrange(damages, desc(ECONOMIC.DAMAGES), EVTYPE)
damages_sub <- aggregate(CROPDMG + PROPDMG ~ EVTYPE, data = data_set_mini.sub, sum)
names(damages_sub) <- c('EVTYPE', 'ECONOMIC.DAMAGES')
damages_sub <- arrange(damages_sub, desc(ECONOMIC.DAMAGES), EVTYPE)
  1. Second, without regexp.
damages_2 <- aggregate(CROPDMG + PROPDMG ~ EVTYPE, data = data_set_mini_2, sum)
names(damages_2) <- c('EVTYPE', 'ECONOMIC.DAMAGES')
damages_2 <- arrange(damages_2, desc(ECONOMIC.DAMAGES), EVTYPE)
damages_2_sub <- aggregate(CROPDMG + PROPDMG ~ EVTYPE, data = data_set_mini_2.sub, sum)
names(damages_2_sub) <- c('EVTYPE', 'ECONOMIC.DAMAGES')
damages_2_sub <- arrange(damages_2_sub, desc(ECONOMIC.DAMAGES), EVTYPE)
  1. Plot of each of the four cases.
my_plot_fun_2 <- function(my_data) {
  my_data %>% ggplot(aes(x=EVTYPE, y=ECONOMIC.DAMAGES, fill=EVTYPE)) +
  geom_bar(stat="identity",color="black",alpha=0.6) +
  theme_linedraw() +
  theme(
 plot.title = element_text(color="red", size=18, face="bold.italic",hjust=0.5),
 axis.title.x = element_text(color="#993333", size=18, face="bold"),
 axis.text.x = element_text(color="#993333", size=14, angle = 90,vjust = 0.5, hjust=1),
 axis.title.y = element_text(color="darkgreen", size=18, face="bold"),
 axis.text.y = element_text(face="bold", color="darkgreen", size=14),
 legend.text = element_text(size=14)
)
} 
damages$EVTYPE <- factor(damages$EVTYPE, levels = damages$EVTYPE)
damages_plot <- my_plot_fun_2(damages[1:10,]) +
 xlab("Event Type") + ylab("Total crop and proprety damages") +
  ggtitle("The 10 most destructive events, all yeras and with regexp") +
    scale_y_continuous(
    breaks = c(0, 50000000000, 100000000000, 150000000000, 200000000000, 
               250000000000, 300000000000, 350000000000),
    labels = c("0","50 000 000 000", "100 000 000 000", "150 000 000 000", 
               "200 000 000 000", "250 000 000 000", "300 000 000 000", 
               "350 000 000 000"))
  

damages_sub$EVTYPE <- factor(damages_sub$EVTYPE, levels = damages_sub$EVTYPE)
damages_sub_plot <- my_plot_fun_2(damages_sub[1:10,]) +
 xlab("Event Type") + ylab("Total crop and proprety damages") +
      scale_y_continuous(
    breaks = c(0, 50000000000, 100000000000, 150000000000, 200000000000,
               250000000000, 300000000000, 350000000000),
    labels = c("0","50 000 000 000", "100 000 000 000", "150 000 000 000", 
               "200 000 000 000", "250 000 000 000", "300 000 000 000", 
               "350 000 000 000")) +
      labs(
    title = "The 10 most destructive events, since 1996 and with regexp",
  caption = "This plot contain the most destructer meteorologic events type 
  when regexp is applied and since 1996. All are arranged in this barplot.") +
  theme(plot.caption = element_text(color = "darkblue", face = "italic", size = 12))

damages_2$EVTYPE <- factor(damages_2$EVTYPE, levels = damages_2$EVTYPE)
damages_2_plot <- my_plot_fun_2(damages_2[1:10,]) +
 xlab("Event Type") + ylab("Total crop and proprety damages") +
  ggtitle("The 10 most destructive events, all years and without regexp") + 
  scale_y_continuous(
    breaks = c(0, 500000, 1000000, 1500000, 2000000, 2500000, 3000000, 3500000),
    labels = c("0","500 000", "1 000 000", "1 500 000", "2 000 000", "2 500 000", 
               "3 000 000", "3 500 000"))
  

damages_2_sub$EVTYPE <- factor(damages_2_sub$EVTYPE, levels = damages_2_sub$EVTYPE)
damages_2_sub_plot <- my_plot_fun_2(damages_2_sub[1:10,]) +
 xlab("Event Type") + ylab("Total crop and proprety damages") +
    scale_y_continuous(
    breaks = c(0, 250000, 500000, 750000, 1000000, 1250000),
    labels = c("0", "250 000", "500 000", "750 000", "1 000 000", "1 250 000"))
  ggtitle("The 10 most destructive events, since 1996 and without regexp")
## $title
## [1] "The 10 most destructive events, since 1996 and without regexp"
## 
## attr(,"class")
## [1] "labels"
print(damages_sub_plot)


Third and last section : Results


Important note: without regexp, we see the same event return several times. It’s “TSTM WIND”, “THUNDERSTORM WIND” and “THUNDERSTROM WINDS” (with an “s”). This is proof of the interest of regexp in this analysis.


Answer to the first question Across the United States, which types of events are most harmful with respect to population health? is :

Answer to the second question Across the United States, which types of events have the greatest economic consequences? is :