The objective of this analysis is quantify the most harmful event type of NOAA with respect to population health and the events types has the greatest economic consequences
The first step of the analysis is get the data i de NOAA DB, in the link: “https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2”.
if(!file.exists("./repdata_2Fdata_2FStormData.csv.bz2")){
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile = "./repdata_2Fdata_2FStormData.csv.bz2")
Sys.time()
NOAAData <- read.csv("./repdata_2Fdata_2FStormData.csv.bz2")
Sys.time()
} else{
Sys.time()
NOAAData <- read.csv("./repdata_2Fdata_2FStormData.csv.bz2")
Sys.time()
}
## [1] "2022-12-11 16:08:15 -05"
Once obtained the data, the libraries needed, must be loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(purrr)
library(ggplot2)
The data loaded bellow has the structure
head(NOAAData)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1 0 0 NA
## 2 0 0 NA
## 3 0 0 NA
## 4 0 0 NA
## 5 0 0 NA
## 6 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1 0 14.0 100 3 0 0 15 25.0
## 2 0 2.0 150 2 0 0 0 2.5
## 3 0 0.1 123 2 0 0 2 25.0
## 4 0 0.0 100 2 0 0 2 2.5
## 5 0 0.0 150 2 0 0 2 2.5
## 6 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1 K 0 3040 8812
## 2 K 0 3042 8755
## 3 K 0 3340 8742
## 4 K 0 3458 8626
## 5 K 0 3412 8642
## 6 K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3051 8806 1
## 2 0 0 2
## 3 0 0 3
## 4 0 0 4
## 5 0 0 5
## 6 0 0 6
The Economic damage has the prefix on the ‘EXP’ columns, the convertion to values need a function
power <- function(value){
if(value=="") 1 else switch(value,"0" = 1,"K" = 10^3,"M" = 10^6,"B" = 10^9,)
}
Tidy data is need to Exploratory Data Analysis. The case of study only need the follow columns: ‘EVTYPE’, ‘BGN_DATE’, ‘INJURIES’, ‘FATALITIES’, ‘PROPDMG’, ‘CROPDMG’. The util data is all the data with values, for this reason de 0 value es deprecated. The data between 1933 and 1995, is deprecated because the format contains many errors and de number of rows is low comparated to the global data. The work flow to clean and organize data is
NOAA.ResumedData <- NOAAData[,c(2,8,23:28)] %>% mutate(BGN_DATE = as.Date(BGN_DATE, format = "%m/%d/%Y")) %>% filter(!(year(BGN_DATE) >= 1993 & year(BGN_DATE) < 1996)) %>% mutate(PROPDMG = PROPDMG * unlist(map(PROPDMGEXP, .f = power))) %>% mutate(CROPDMG = CROPDMG * unlist(map(CROPDMGEXP, .f = power))) %>% select(-c("PROPDMGEXP", "CROPDMGEXP")) %>% filter(FATALITIES > 0 | INJURIES > 0 | CROPDMG > 0 | PROPDMG > 0) %>% mutate(EVTYPE = toupper(EVTYPE))
The Even Type has many typos and the unique value shows thousands of events, the NOAA have only 48 grouped values, we find in the documentation. Using Regex in the Even type, the number of different type in the data is reduced.
EVTYPE.real <- toupper(readLines("./EVTYPE_list"))
NOAA.ResumedData$EVTYPEFIX <- ""
for(type in EVTYPE.real){
NOAA.ResumedData[grepl(type, NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- type
}
rm(type)
unique(NOAA.ResumedData[NOAA.ResumedData$EVTYPEFIX == "","EVTYPE"])
## [1] "TSTM WIND" "FREEZING RAIN"
## [3] "EXTREME COLD" "OTHER"
## [5] "WILD/FOREST FIRE" "STORM SURGE"
## [7] "URBAN/SML STREAM FLD" "FOG"
## [9] "ROUGH SURF" "HEAVY SURF"
## [11] "MARINE ACCIDENT" "FREEZE"
## [13] "DRY MICROBURST" "WINDS"
## [15] "COASTAL STORM" "DAMAGING FREEZE"
## [17] "HURRICANE" "BEACH EROSION"
## [19] "UNSEASONABLE COLD" "EARLY FROST"
## [21] "WINTRY MIX" "TORRENTIAL RAINFALL"
## [23] "LANDSLUMP" "HURRICANE EDOUARD"
## [25] "EXTREME WINDCHILL" "GLAZE"
## [27] "EXTENDED COLD" "WHIRLWIND"
## [29] "LIGHT SNOW" "MIXED PRECIP"
## [31] "COLD" "FREEZING SPRAY"
## [33] "DOWNBURST" "MUDSLIDES"
## [35] "MICROBURST" "MUDSLIDE"
## [37] "SNOW" "SNOW SQUALLS"
## [39] "WIND DAMAGE" "LIGHT SNOWFALL"
## [41] "FREEZING DRIZZLE" "GUSTY WIND/RAIN"
## [43] "GUSTY WIND/HVY RAIN" "WIND"
## [45] "COLD TEMPERATURE" "COLD AND SNOW"
## [47] "RAIN/SNOW" "TSTM WIND (G45)"
## [49] "GUSTY WINDS" "GUSTY WIND"
## [51] "TSTM WIND 40" "TSTM WIND 45"
## [53] "HARD FREEZE" "TSTM WIND (41)"
## [55] "TSTM WIND (G40)" "MUD SLIDE"
## [57] "SNOW AND ICE" "AGRICULTURAL FREEZE"
## [59] "SNOW SQUALL" "ICY ROADS"
## [61] "THUNDERSTORM" "HYPOTHERMIA/EXPOSURE"
## [63] "LAKE EFFECT SNOW" "MIXED PRECIPITATION"
## [65] "BLACK ICE" "COASTALSTORM"
## [67] "DAM BREAK" "BLOWING SNOW"
## [69] "FROST" "GRADIENT WIND"
## [71] "UNSEASONABLY COLD" "WET MICROBURST"
## [73] "HEAVY SURF AND WIND" "FUNNEL CLOUD"
## [75] "TYPHOON" "LANDSLIDES"
## [77] "HIGH SWELLS" "UNSEASONAL RAIN"
## [79] " TSTM WIND (G45)" "TSTM WIND (G45)"
## [81] "TSTM WIND (G35)" "COASTAL EROSION"
## [83] "UNSEASONABLY WARM" "HYPERTHERMIA/EXPOSURE"
## [85] "ROCK SLIDE" "HEAVY SEAS"
## [87] " TSTM WIND" "LANDSPOUT"
## [89] "EXCESSIVE SNOW" "WIND AND WAVE"
## [91] "LIGHT FREEZING RAIN" "ICE ROADS"
## [93] "HIGH SEAS" "RAIN"
## [95] "ROUGH SEAS" "TSTM WIND G45"
## [97] "NON-SEVERE WIND DAMAGE" "WARM WEATHER"
## [99] "LANDSLIDE" "HIGH WATER"
## [101] "LATE SEASON SNOW" "ROGUE WAVE"
## [103] "FALLING SNOW/ICE" "NON-TSTM WIND"
## [105] "NON TSTM WIND" "BRUSH FIRE"
## [107] "BLOWING DUST" "HAZARDOUS SURF"
## [109] "COLD WEATHER" "ICE ON ROAD"
## [111] "DROWNING" "EXTREME COLD/WIND CHILL"
## [113] "MARINE TSTM WIND" "HURRICANE/TYPHOON"
## [115] "ASTRONOMICAL HIGH TIDE" "COLD/WIND CHILL"
Now is necessary reduce the data in the EVTYPE to analyse.
NOAA.ResumedData[grepl("TSTM", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Thunderstorm Wind")
NOAA.ResumedData[grepl("HURRICANE|TYPHOON|BURST|funnel", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Hurricane (Typhoon)")
NOAA.ResumedData[grepl("STORM", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Storm Surge/Tide")
NOAA.ResumedData[grepl("SURF", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("High Surf")
NOAA.ResumedData[grepl("Wind", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("High Wind")
NOAA.ResumedData[grepl("Cold|Windhill", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Cold/Windhill")
NOAA.ResumedData[grepl("Ice|Storm|freezing|freeze|glaze|WINTRY", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Ice Storm")
NOAA.ResumedData[grepl("frost|icy|snow", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Heavy Snow")
NOAA.ResumedData[grepl("coastal", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Coastal Flood")
NOAA.ResumedData[grepl("ASTRONOMICAL", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Astronomical Low Tide")
NOAA.ResumedData[grepl("PRECIP|RAIN", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Heavy Rain")
NOAA.ResumedData[grepl("slide|LANDSPOUT|LANDSLUMP|Erosion", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Avalanche")
NOAA.ResumedData[grepl("seas|wave|marine|swell|water", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Marine Strong Wind")
NOAA.ResumedData[grepl("fog", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Dense Fog")
NOAA.ResumedData[grepl("HYPOTHERMIA", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Frost/Freeze")
NOAA.ResumedData[grepl("HYPERTHERMIA|WARM WEATHER", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Heat")
NOAA.ResumedData[grepl("Fire", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Wildfire")
NOAA.ResumedData[grepl("dust", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Dust Storm")
NOAA.ResumedData[grepl("dam|fld", NOAA.ResumedData$EVTYPE, ignore.case = T),"EVTYPEFIX"] <- toupper("Lakeshore Flood")
NOAA.ResumedData[NOAA.ResumedData$EVTYPEFIX == "","EVTYPEFIX"] <- "OTHER"
NOAA.ResumedData$EVTYPE <- NOAA.ResumedData$EVTYPEFIX
NOAA.ResumedData <- select(NOAA.ResumedData,-c("EVTYPEFIX"))
is.factor(NOAA.ResumedData$EVTYPE)
## [1] FALSE
NOAA.ResumedData$EVTYPE <- factor(NOAA.ResumedData$EVTYPE)
Once the data is organized in the EVTYPE and the values of the Economic losses, proceed to show the data analysis
NOAA.ResumedData %>% group_by(EVTYPE) %>%
summarise(Injuries = sum(INJURIES)) %>%
arrange(desc(Injuries))
## # A tibble: 31 × 2
## EVTYPE Injuries
## <fct> <dbl>
## 1 TORNADO 88703
## 2 HIGH WIND 8538
## 3 FLOOD 8434
## 4 HEAT 7615
## 5 LIGHTNING 4141
## 6 ICE STORM 3718
## 7 WILDFIRE 1458
## 8 HURRICANE (TYPHOON) 1354
## 9 HAIL 1124
## 10 DENSE FOG 855
## # … with 21 more rows
NOAA.ResumedData %>% group_by(EVTYPE) %>%
summarise(Injuries = sum(INJURIES)) %>%
arrange(desc(Injuries)) %>%
ggplot(aes(x=EVTYPE, y=Injuries)) + geom_bar(position="dodge", stat="identity", fill = "orange") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Injuries by Event")
NOAA.ResumedData %>% group_by(EVTYPE) %>%
summarise(Fatalities = sum(FATALITIES)) %>%
arrange(desc(Fatalities))
## # A tibble: 31 × 2
## EVTYPE Fatalities
## <fct> <dbl>
## 1 TORNADO 5523
## 2 HEAT 2037
## 3 FLOOD 1303
## 4 HIGH WIND 899
## 5 LIGHTNING 651
## 6 RIP CURRENT 542
## 7 ICE STORM 482
## 8 COLD/WINDHILL 358
## 9 AVALANCHE 266
## 10 HIGH SURF 142
## # … with 21 more rows
NOAA.ResumedData %>% group_by(EVTYPE) %>%
summarise(Fatalities = sum(FATALITIES)) %>%
arrange(desc(Fatalities)) %>%
ggplot(aes(x=EVTYPE, y=Fatalities)) + geom_bar(position="dodge", stat="identity", fill = "red") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Fatalities by Event")
NOAA.ResumedData %>% group_by(EVTYPE) %>%
summarise(Economic_Losses = sum((CROPDMG + PROPDMG)/10^6))%>% arrange(desc(Economic_Losses))
## # A tibble: 31 × 2
## EVTYPE Economic_Losses
## <fct> <dbl>
## 1 FLOOD 165633.
## 2 HURRICANE (TYPHOON) 87071.
## 3 ICE STORM 65338.
## 4 TORNADO 55499.
## 5 HAIL 17092.
## 6 DROUGHT 14414.
## 7 HIGH WIND 11296.
## 8 WILDFIRE 8163.
## 9 HEAVY SNOW 1900.
## 10 COLD/WINDHILL 1341.
## # … with 21 more rows
NOAA.ResumedData %>% group_by(EVTYPE) %>%
summarise(Economic_Losses = sum((CROPDMG + PROPDMG)/10^6))%>%
ggplot(aes(x=EVTYPE, y=Economic_Losses)) + geom_bar(position="dodge", stat="identity", fill = "green") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle("Economic Losses in MILLION USD")