Synopsis

The NOAA storm database stores a huge amount of data; we are interested in comparing the effects of different weather events. This report identifies the most harmful event types with respect to fatalities, injuries and economic damage from 1996 to 2011. Because the data are not homogeneous, it was necessary to apply some transformations to the original data. Every step is documented with words and code. Results are displayed as simple graphs.

#Load libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2

Data Processing

      # Load the raw storm records from the bzip2-compressed CSV and show the
      # structure of the resulting data frame.
      df <- read.csv(file = "repdata_data_StormData.csv.bz2")
      str(object = df)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

We are interested only in the variables that describe the events and the damage they caused to people and property.

# Keep only the event descriptors and the casualty / damage columns.
keep_cols <- c(
  "EVTYPE", "BGN_DATE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)
df <- df[, keep_cols]

All event types have been recorded only since 1996; therefore we keep only the observations from 1996 onwards.

# Replace the begin date string with its calendar year (the time-of-day part of
# the string is not covered by the format and is ignored), then keep the
# events that began in 1996 or later.
df[["BGN_DATE"]] <- year(strptime(df[["BGN_DATE"]], format = "%m/%d/%Y"))
ndf <- filter(df, BGN_DATE >= 1996)

The variable EVTYPE is affected by typos, and some abbreviations make the data inconsistent with the codebook; therefore we have to clean it.

# Normalise EVTYPE so that spelling variants collapse into a single label.
#
# Step 1: upper-case the raw text and strip leading/trailing whitespace, so
# that values differing only by case or padding end up in the same group.
ndf$EVTYPE <- trimws(toupper(ndf$EVTYPE))

# Step 2: expand the two abbreviations / plural forms that the rules below
# rely on.
ndf$EVTYPE <- gsub("TSTM", "THUNDERSTORM", ndf$EVTYPE)
ndf$EVTYPE <- gsub("WINDS", "WIND", ndf$EVTYPE)

# Step 3: ordered relabelling rules, each c(<regex>, <label>).
# Patterns are upper-case and matched case-sensitively, while labels are
# mixed-case, so a record that has been relabelled cannot match any later
# pattern: the FIRST matching rule wins and the order below is significant.
# The "FLASH" rule must therefore precede the generic flood rule; otherwise
# values such as "FLASH FLOODING" are labelled "Flood" instead of
# "Flash Flood".
evtype_rules <- list(
  c("SNOW", "Heavy Snow"),
  c("RAIN|PRECIP", "Heavy Rain"),
  c("HAIL", "Hail"),
  c("HURRICANE|TYPHOON", "Hurricane (Typhoon)"),
  c("FLASH", "Flash Flood"),
  c("FLOODIN|SML STREAM|SMALL STREAM|STREAM FLOOD|URBAN", "Flood"),
  c("FOG", "Dense Fog"),
  c("FUNNEL", "Funnel Cloud"),
  c("FROST|FREEZE", "Frost/Freeze"),
  c("LOW TEMPERATURE|COLD|CHILL|COOL", "Extreme Cold"),
  c("HEAT|HOT|HIGH TEMPERATURE|TEMPERATURE RECORD|WARM", "Excessive Heat"),
  c("SURF", "High Surf"),
  c(
    paste(
      c(
        "THUNDERSTORM WIND", "THUNDERSTROM", "THUNERSTORM", "THUNDEERSTORM",
        "THUNDERSTORM WINS", "THUNDERSTORM WND", "THUNDERSTORMW",
        "THUNDERSTORMS", "THUNDERSTORM  WIND", "THUNDERTORM",
        "SEVERE THUNDERSTORM", "MICROBURST"
      ),
      collapse = "|"
    ),
    "Thunderstorm Wind"
  ),
  c("TORNADO", "Tornado"),
  c("HIGH WIND", "High Wind"),
  c("TROPICAL STORM", "Tropical Storm"),
  c("SMOKE", "Dense Smoke"),
  c("ICE", "Ice Storm")
)
for (rule in evtype_rules) {
  ndf$EVTYPE[grepl(rule[1], ndf$EVTYPE)] <- rule[2]
}

# Step 4: return every label to upper case so relabelled and untouched values
# share one convention.
ndf$EVTYPE <- toupper(ndf$EVTYPE)

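To estimate economic damages we have to transform the variables PROPDMGEXP and CROPDMGEXP into numeric exponents of ten. According to the codebook we change h to 2, k to 3, m to 6 and b to 9; the symbols -, + and ? and empty values are set to zero, while digits already present are kept as they are.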

# Convert a damage-exponent column into a numeric power of ten.
# Letter codes are replaced by digits (h/H -> 2, k/K -> 3, m/M -> 6,
# b/B -> 9), the symbols -, + and ? become 0, and anything that still fails
# numeric conversion (for example the empty string) is set to 0.
decode_exponent <- function(codes) {
  replacements <- list(
    c("[hH]", "2"),
    c("[kK]", "3"),
    c("[mM]", "6"),
    c("[bB]", "9"),
    c("[-+?]", "0")
  )
  for (pair in replacements) {
    codes <- gsub(pair[1], pair[2], codes)
  }
  power <- as.numeric(codes)
  power[is.na(power)] <- 0
  power
}

ndf$PROPDMGEXP <- decode_exponent(ndf$PROPDMGEXP)
ndf$CROPDMGEXP <- decode_exponent(ndf$CROPDMGEXP)

Results

Across the United States, which types of events are most harmful with respect to population health?

We focus on the variables FATALITIES and INJURIES. First of all, we group the data by event type and sum each variable.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Total fatalities per event type, then the same table sorted from the
# deadliest event type downwards.
fat <- summarise(group_by(ndf, EVTYPE), Fatalities = sum(FATALITIES))
## `summarise()` ungrouping output (override with `.groups` argument)
ordfat <- arrange(fat, desc(Fatalities))
# Same two steps for injuries.
inj <- summarise(group_by(ndf, EVTYPE), Injuries = sum(INJURIES))
## `summarise()` ungrouping output (override with `.groups` argument)
ordinj <- arrange(inj, desc(Injuries))

Now we can plot the data to get a visual idea of the ten most harmful event types.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
# First ten rows of each sorted table, i.e. the ten highest totals.
topfat <- ordfat[seq_len(10), ]
topinj <- ordinj[seq_len(10), ]

# Bar chart of fatalities; event types are ordered along the x axis from the
# highest to the lowest count.
g1 <- ggplot(
  data = topfat,
  mapping = aes(x = reorder(EVTYPE, desc(Fatalities)), y = Fatalities)
) +
  geom_col(fill = "red", colour = "black") +
  labs(title = "Fatalities by Event Type (1996-2011)", x = "", y = "") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(
      face = "bold", colour = "black", angle = 90, size = 8
    ),
    axis.text.y = element_text(face = "bold", colour = "black", size = 8)
  )
print(g1)

# Bar chart of injuries; event types are ordered along the x axis from the
# highest to the lowest count.
g2 <- ggplot(
  data = topinj,
  mapping = aes(x = reorder(EVTYPE, desc(Injuries)), y = Injuries)
) +
  geom_col(fill = "purple", colour = "black") +
  labs(title = "Injuries by Event Type (1996-2011)", x = "", y = "") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(
      face = "bold", colour = "black", angle = 90, size = 8
    ),
    axis.text.y = element_text(face = "bold", colour = "black", size = 8)
  )

print(g2)

Across the United States, which types of events have the greatest economic consequences?
To address this question, for each event we have to calculate the economic damage to property and to crops, and then sum the two.

# Damage value = recorded amount * 10^exponent, computed separately for
# property and for crops.
ndf[["propvalue"]] <- with(ndf, PROPDMG * 10^PROPDMGEXP)
ndf[["cropvalue"]] <- with(ndf, CROPDMG * 10^CROPDMGEXP)
# Combined property + crop damage per event type.
prop <- summarise(
  group_by(ndf, EVTYPE),
  tot = sum(propvalue) + sum(cropvalue)
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Sort from the costliest event type downwards and keep the first ten rows.
ordprop <- arrange(prop, desc(tot))
topdam <- ordprop[seq_len(10), ]

Now we can plot

# Bar chart of total damage, with totals divided by 10^9 to match the title;
# event types are ordered from the highest to the lowest total.
g3 <- ggplot(
  data = topdam,
  mapping = aes(x = reorder(EVTYPE, desc(tot)), y = tot / 10^9)
) +
  geom_col(fill = "green", colour = "black") +
  labs(
    title = "Economic damages in Billion of $ by Event Type (1996-2011)",
    x = "",
    y = ""
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(
      face = "bold", colour = "black", angle = 90, size = 8
    ),
    axis.text.y = element_text(face = "bold", colour = "black", size = 8)
  )
print(g3)