Reproducible Research

Course Project 2 - NOAA Disaster Data

The objective of this analysis is quantify the most harmful event type of NOAA with respect to population health and the events types has the greatest economic consequences

The first step of the analysis is get the data i de NOAA DB, in the link: “https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2”.

if(!file.exists("./repdata_2Fdata_2FStormData.csv.bz2")){
      download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile = "./repdata_2Fdata_2FStormData.csv.bz2")
      Sys.time()
      NOAAData <- read.csv("./repdata_2Fdata_2FStormData.csv.bz2")
      Sys.time()
} else{
      Sys.time()
      NOAAData <- read.csv("./repdata_2Fdata_2FStormData.csv.bz2")
      Sys.time()
}
## [1] "2022-12-11 16:08:15 -05"

Once obtained the data, the libraries needed, must be loaded

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(purrr)
library(ggplot2)

The data loaded bellow has the structure

head(NOAAData)
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE  EVTYPE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL TORNADO
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL TORNADO
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL TORNADO
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL TORNADO
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL TORNADO
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL TORNADO
##   BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1         0                                               0         NA
## 2         0                                               0         NA
## 3         0                                               0         NA
## 4         0                                               0         NA
## 5         0                                               0         NA
## 6         0                                               0         NA
##   END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1         0                      14.0   100 3   0          0       15    25.0
## 2         0                       2.0   150 2   0          0        0     2.5
## 3         0                       0.1   123 2   0          0        2    25.0
## 4         0                       0.0   100 2   0          0        2     2.5
## 5         0                       0.0   150 2   0          0        2     2.5
## 6         0                       1.5   177 2   0          0        6     2.5
##   PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1          K       0                                         3040      8812
## 2          K       0                                         3042      8755
## 3          K       0                                         3340      8742
## 4          K       0                                         3458      8626
## 5          K       0                                         3412      8642
## 6          K       0                                         3450      8748
##   LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1       3051       8806              1
## 2          0          0              2
## 3          0          0              3
## 4          0          0              4
## 5          0          0              5
## 6          0          0              6

The Economic damage has the prefix on the ‘EXP’ columns, the convertion to values need a function

power <- function(value){
      if(value=="") 1 else switch(value,"0" = 1,"K" = 10^3,"M" = 10^6,"B" = 10^9,)
}

Tidy data is need to Exploratory Data Analysis. The case of study only need the follow columns: ‘EVTYPE’, ‘BGN_DATE’, ‘INJURIES’, ‘FATALITIES’, ‘PROPDMG’, ‘CROPDMG’. The util data is all the data with values, for this reason de 0 value es deprecated. The data between 1933 and 1995, is deprecated because the format contains many errors and de number of rows is low comparated to the global data. The work flow to clean and organize data is

NOAA.ResumedData <- NOAAData[,c(2,8,23:28)] %>% mutate(BGN_DATE = as.Date(BGN_DATE, format = "%m/%d/%Y")) %>% filter(!(year(BGN_DATE) >= 1993 & year(BGN_DATE) < 1996)) %>% mutate(PROPDMG = PROPDMG * unlist(map(PROPDMGEXP, .f = power))) %>% mutate(CROPDMG = CROPDMG * unlist(map(CROPDMGEXP, .f = power))) %>% select(-c("PROPDMGEXP", "CROPDMGEXP")) %>% filter(FATALITIES > 0 | INJURIES > 0 | CROPDMG > 0 | PROPDMG > 0) %>% mutate(EVTYPE = toupper(EVTYPE))

The Even Type has many typos and the unique value shows thousands of events, the NOAA have only 48 grouped values, we find in the documentation. Using Regex in the Even type, the number of different type in the data is reduced.

EVTYPE.real <- toupper(readLines("./EVTYPE_list"))
NOAA.ResumedData$EVTYPEFIX <- ""
for(type in EVTYPE.real){
      NOAA.ResumedData[grepl(type, NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- type
}
rm(type)
unique(NOAA.ResumedData[NOAA.ResumedData$EVTYPEFIX == "","EVTYPE"])
##   [1] "TSTM WIND"               "FREEZING RAIN"          
##   [3] "EXTREME COLD"            "OTHER"                  
##   [5] "WILD/FOREST FIRE"        "STORM SURGE"            
##   [7] "URBAN/SML STREAM FLD"    "FOG"                    
##   [9] "ROUGH SURF"              "HEAVY SURF"             
##  [11] "MARINE ACCIDENT"         "FREEZE"                 
##  [13] "DRY MICROBURST"          "WINDS"                  
##  [15] "COASTAL STORM"           "DAMAGING FREEZE"        
##  [17] "HURRICANE"               "BEACH EROSION"          
##  [19] "UNSEASONABLE COLD"       "EARLY FROST"            
##  [21] "WINTRY MIX"              "TORRENTIAL RAINFALL"    
##  [23] "LANDSLUMP"               "HURRICANE EDOUARD"      
##  [25] "EXTREME WINDCHILL"       "GLAZE"                  
##  [27] "EXTENDED COLD"           "WHIRLWIND"              
##  [29] "LIGHT SNOW"              "MIXED PRECIP"           
##  [31] "COLD"                    "FREEZING SPRAY"         
##  [33] "DOWNBURST"               "MUDSLIDES"              
##  [35] "MICROBURST"              "MUDSLIDE"               
##  [37] "SNOW"                    "SNOW SQUALLS"           
##  [39] "WIND DAMAGE"             "LIGHT SNOWFALL"         
##  [41] "FREEZING DRIZZLE"        "GUSTY WIND/RAIN"        
##  [43] "GUSTY WIND/HVY RAIN"     "WIND"                   
##  [45] "COLD TEMPERATURE"        "COLD AND SNOW"          
##  [47] "RAIN/SNOW"               "TSTM WIND (G45)"        
##  [49] "GUSTY WINDS"             "GUSTY WIND"             
##  [51] "TSTM WIND 40"            "TSTM WIND 45"           
##  [53] "HARD FREEZE"             "TSTM WIND (41)"         
##  [55] "TSTM WIND (G40)"         "MUD SLIDE"              
##  [57] "SNOW AND ICE"            "AGRICULTURAL FREEZE"    
##  [59] "SNOW SQUALL"             "ICY ROADS"              
##  [61] "THUNDERSTORM"            "HYPOTHERMIA/EXPOSURE"   
##  [63] "LAKE EFFECT SNOW"        "MIXED PRECIPITATION"    
##  [65] "BLACK ICE"               "COASTALSTORM"           
##  [67] "DAM BREAK"               "BLOWING SNOW"           
##  [69] "FROST"                   "GRADIENT WIND"          
##  [71] "UNSEASONABLY COLD"       "WET MICROBURST"         
##  [73] "HEAVY SURF AND WIND"     "FUNNEL CLOUD"           
##  [75] "TYPHOON"                 "LANDSLIDES"             
##  [77] "HIGH SWELLS"             "UNSEASONAL RAIN"        
##  [79] " TSTM WIND (G45)"        "TSTM WIND  (G45)"       
##  [81] "TSTM WIND (G35)"         "COASTAL EROSION"        
##  [83] "UNSEASONABLY WARM"       "HYPERTHERMIA/EXPOSURE"  
##  [85] "ROCK SLIDE"              "HEAVY SEAS"             
##  [87] " TSTM WIND"              "LANDSPOUT"              
##  [89] "EXCESSIVE SNOW"          "WIND AND WAVE"          
##  [91] "LIGHT FREEZING RAIN"     "ICE ROADS"              
##  [93] "HIGH SEAS"               "RAIN"                   
##  [95] "ROUGH SEAS"              "TSTM WIND G45"          
##  [97] "NON-SEVERE WIND DAMAGE"  "WARM WEATHER"           
##  [99] "LANDSLIDE"               "HIGH WATER"             
## [101] "LATE SEASON SNOW"        "ROGUE WAVE"             
## [103] "FALLING SNOW/ICE"        "NON-TSTM WIND"          
## [105] "NON TSTM WIND"           "BRUSH FIRE"             
## [107] "BLOWING DUST"            "HAZARDOUS SURF"         
## [109] "COLD WEATHER"            "ICE ON ROAD"            
## [111] "DROWNING"                "EXTREME COLD/WIND CHILL"
## [113] "MARINE TSTM WIND"        "HURRICANE/TYPHOON"      
## [115] "ASTRONOMICAL HIGH TIDE"  "COLD/WIND CHILL"

Now is necessary reduce the data in the EVTYPE to analyse.

NOAA.ResumedData[grepl("TSTM", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Thunderstorm Wind")
NOAA.ResumedData[grepl("HURRICANE|TYPHOON|BURST|funnel", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Hurricane (Typhoon)")
NOAA.ResumedData[grepl("STORM", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Storm Surge/Tide")
NOAA.ResumedData[grepl("SURF", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("High Surf")
NOAA.ResumedData[grepl("Wind", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("High Wind")
NOAA.ResumedData[grepl("Cold|Windhill", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Cold/Windhill")
NOAA.ResumedData[grepl("Ice|Storm|freezing|freeze|glaze|WINTRY", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Ice Storm")
NOAA.ResumedData[grepl("frost|icy|snow", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Heavy Snow")
NOAA.ResumedData[grepl("coastal", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Coastal Flood")
NOAA.ResumedData[grepl("ASTRONOMICAL", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Astronomical Low Tide")
NOAA.ResumedData[grepl("PRECIP|RAIN", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Heavy Rain")
NOAA.ResumedData[grepl("slide|LANDSPOUT|LANDSLUMP|Erosion", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Avalanche")
NOAA.ResumedData[grepl("seas|wave|marine|swell|water", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Marine Strong Wind")
NOAA.ResumedData[grepl("fog", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Dense Fog")
NOAA.ResumedData[grepl("HYPOTHERMIA", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Frost/Freeze")
NOAA.ResumedData[grepl("HYPERTHERMIA|WARM WEATHER", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Heat")
NOAA.ResumedData[grepl("Fire", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Wildfire")
NOAA.ResumedData[grepl("dust", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Dust Storm")
NOAA.ResumedData[grepl("dam|fld", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Lakeshore Flood")
NOAA.ResumedData[NOAA.ResumedData$EVTYPEFIX == "","EVTYPEFIX"] <- "OTHER"
NOAA.ResumedData$EVTYPE <- NOAA.ResumedData$EVTYPEFIX
NOAA.ResumedData <- select(NOAA.ResumedData,-c("EVTYPEFIX"))
is.factor(NOAA.ResumedData$EVTYPE)
## [1] FALSE
NOAA.ResumedData$EVTYPE <- factor(NOAA.ResumedData$EVTYPE)

Once the data is organized in the EVTYPE and the values of the Economic losses, proceed to show the data analysis

Injuries Number by Eventype

NOAA.ResumedData %>% group_by(EVTYPE) %>%
      summarise(Injuries = sum(INJURIES)) %>%
      arrange(desc(Injuries))
## # A tibble: 31 × 2
##    EVTYPE              Injuries
##    <fct>                  <dbl>
##  1 TORNADO                88703
##  2 HIGH WIND               8538
##  3 FLOOD                   8434
##  4 HEAT                    7615
##  5 LIGHTNING               4141
##  6 ICE STORM               3718
##  7 WILDFIRE                1458
##  8 HURRICANE (TYPHOON)     1354
##  9 HAIL                    1124
## 10 DENSE FOG                855
## # … with 21 more rows
NOAA.ResumedData %>% group_by(EVTYPE) %>%
      summarise(Injuries = sum(INJURIES)) %>%
      arrange(desc(Injuries)) %>%
      ggplot(aes(x=EVTYPE, y=Injuries)) + geom_bar(position="dodge", stat="identity", fill = "orange") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Injuries by Event")

Fatalities Number by Eventype

NOAA.ResumedData %>% group_by(EVTYPE) %>%
      summarise(Fatalities = sum(FATALITIES)) %>%
      arrange(desc(Fatalities))
## # A tibble: 31 × 2
##    EVTYPE        Fatalities
##    <fct>              <dbl>
##  1 TORNADO             5523
##  2 HEAT                2037
##  3 FLOOD               1303
##  4 HIGH WIND            899
##  5 LIGHTNING            651
##  6 RIP CURRENT          542
##  7 ICE STORM            482
##  8 COLD/WINDHILL        358
##  9 AVALANCHE            266
## 10 HIGH SURF            142
## # … with 21 more rows
NOAA.ResumedData %>% group_by(EVTYPE) %>%
      summarise(Fatalities = sum(FATALITIES)) %>%
      arrange(desc(Fatalities)) %>%
      ggplot(aes(x=EVTYPE, y=Fatalities)) + geom_bar(position="dodge", stat="identity", fill = "red") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Fatalities by Event")

Economic Losses by Eventype

NOAA.ResumedData %>% group_by(EVTYPE) %>%
      summarise(Economic_Losses = sum((CROPDMG + PROPDMG)/10^6))%>% arrange(desc(Economic_Losses))
## # A tibble: 31 × 2
##    EVTYPE              Economic_Losses
##    <fct>                         <dbl>
##  1 FLOOD                       165633.
##  2 HURRICANE (TYPHOON)          87071.
##  3 ICE STORM                    65338.
##  4 TORNADO                      55499.
##  5 HAIL                         17092.
##  6 DROUGHT                      14414.
##  7 HIGH WIND                    11296.
##  8 WILDFIRE                      8163.
##  9 HEAVY SNOW                    1900.
## 10 COLD/WINDHILL                 1341.
## # … with 21 more rows
NOAA.ResumedData %>% group_by(EVTYPE) %>%
      summarise(Economic_Losses = sum((CROPDMG + PROPDMG)/10^6))%>%
      ggplot(aes(x=EVTYPE, y=Economic_Losses)) + geom_bar(position="dodge", stat="identity", fill = "green") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Economic Losses in MILLION USD")