Synopsis

Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.

This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

The basic goal of this assignment is to explore the NOAA Storm Database and answer some basic questions about severe weather events.

Data Processing

The data for this assignment come in the form of a comma-separated-value file compressed via the bzip2 algorithm to reduce its size. You can download the file from the course web site.

library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)

fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

if (!file.exists("./data")) {
        dir.create("./data")
        }

download.file(url = fileUrl, destfile = "./data/storm.bz2")

data <-  read.csv(bzfile("./data/storm.bz2"))

# let's look at the data
glimpse(data)
## Rows: 902,297
## Columns: 37
## $ STATE__    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ BGN_DATE   <chr> "4/18/1950 0:00:00", "4/18/1950 0:00:00", "2/20/1951 0:0...
## $ BGN_TIME   <chr> "0130", "0145", "1600", "0900", "1500", "2000", "0100", ...
## $ TIME_ZONE  <chr> "CST", "CST", "CST", "CST", "CST", "CST", "CST", "CST", ...
## $ COUNTY     <dbl> 97, 3, 57, 89, 43, 77, 9, 123, 125, 57, 43, 9, 73, 49, 1...
## $ COUNTYNAME <chr> "MOBILE", "BALDWIN", "FAYETTE", "MADISON", "CULLMAN", "L...
## $ STATE      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ EVTYPE     <chr> "TORNADO", "TORNADO", "TORNADO", "TORNADO", "TORNADO", "...
## $ BGN_RANGE  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ BGN_AZI    <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", ...
## $ BGN_LOCATI <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", ...
## $ END_DATE   <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", ...
## $ END_TIME   <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", ...
## $ COUNTY_END <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ COUNTYENDN <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ END_RANGE  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ END_AZI    <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", ...
## $ END_LOCATI <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", ...
## $ LENGTH     <dbl> 14.0, 2.0, 0.1, 0.0, 0.0, 1.5, 1.5, 0.0, 3.3, 2.3, 1.3, ...
## $ WIDTH      <dbl> 100, 150, 123, 100, 150, 177, 33, 33, 100, 100, 400, 400...
## $ F          <int> 3, 2, 2, 2, 2, 2, 2, 1, 3, 3, 1, 1, 3, 3, 3, 4, 1, 1, 1,...
## $ MAG        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FATALITIES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0,...
## $ INJURIES   <dbl> 15, 0, 2, 2, 2, 6, 1, 0, 14, 0, 3, 3, 26, 12, 6, 50, 2, ...
## $ PROPDMG    <dbl> 25.0, 2.5, 25.0, 2.5, 2.5, 2.5, 2.5, 2.5, 25.0, 25.0, 2....
## $ PROPDMGEXP <chr> "K", "K", "K", "K", "K", "K", "K", "K", "K", "K", "M", "...
## $ CROPDMG    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CROPDMGEXP <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", ...
## $ WFO        <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", ...
## $ STATEOFFIC <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", ...
## $ ZONENAMES  <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", ...
## $ LATITUDE   <dbl> 3040, 3042, 3340, 3458, 3412, 3450, 3405, 3255, 3334, 33...
## $ LONGITUDE  <dbl> 8812, 8755, 8742, 8626, 8642, 8748, 8631, 8558, 8740, 87...
## $ LATITUDE_E <dbl> 3051, 0, 0, 0, 0, 0, 0, 0, 3336, 3337, 3402, 3404, 0, 34...
## $ LONGITUDE_ <dbl> 8806, 0, 0, 0, 0, 0, 0, 0, 8738, 8737, 8644, 8640, 0, 85...
## $ REMARKS    <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", ...
## $ REFNUM     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1...

The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.

# select the data required for analysis
sdata <- data %>% 
        select(EVTYPE, FATALITIES, INJURIES, 
               PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)

# preparation of data for analysis: property damage
# let's look at the unique values
unique(sdata$PROPDMGEXP)
##  [1] "K" "M" ""  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-" "1" "8"
# replacement of letters with numbers
sdata$PROPDMGEXP <- toupper(sdata$PROPDMGEXP)
sdata$PROPDMGEXP[sdata$PROPDMGEXP %in% c("", "+", "-", "?")] <- 0

pn <- c(9, 6, 3, 2)
pw <- c("B", "M", "K", "H")

for ( i in seq(1, length(pw)) ) {
        sdata$PROPDMGEXP[sdata$PROPDMGEXP  == pw[i]] <- pn[i]
        }

sdata$PROPDMGEXP <- 10^(as.numeric(sdata$PROPDMGEXP))
sdata$fullPROPDMG <-  sdata$PROPDMG * sdata$PROPDMGEXP

# similarly for crop damage
unique(sdata$CROPDMGEXP)
## [1] ""  "M" "K" "m" "B" "?" "0" "k" "2"
sdata$CROPDMGEXP <- toupper(sdata$CROPDMGEXP)
sdata$CROPDMGEXP[sdata$CROPDMGEXP %in% c("", "?")] <- 0        
cn <- c(9, 6, 3)
cw <- c("B", "M", "K")
for ( i in seq(1, length(cw)) ) {
        sdata$CROPDMGEXP[sdata$CROPDMGEXP  == cw[i]] <- cn[i]
        }        
sdata$CROPDMGEXP <- 10^(as.numeric(sdata$CROPDMGEXP))
sdata$fullCROPDMG <-  sdata$CROPDMG * sdata$CROPDMGEXP

# calculate and add the total damage
fdata <- sdata %>% mutate(fullDMG = fullPROPDMG + fullCROPDMG)

Results

First analyzes the most traumatic and fatal events.

# the required columns are selected, summed by the number of injuries and sorted
inj <- fdata %>% select(EVTYPE, INJURIES) %>% 
        group_by(EVTYPE) %>% 
        summarise(INJURIES = sum(INJURIES, na.rm = TRUE)) %>%
        arrange(desc(INJURIES))

# transformation of char variable into factor       
inj$EVTYPE <- factor(inj$EVTYPE , levels = inj$EVTYPE)

# creating a plot
inj_plot <- ggplot(data = inj[1:5, ], 
                   mapping = aes(x = EVTYPE, y = INJURIES)) +
        geom_col() + 
        xlab("Event Type") + 
        ylab("Number of Injuries") + 
        ggtitle("Injuries by Event type") +
        geom_text(aes(label = INJURIES), 
                  vjust = 1.6, 
                  color = "white",
                  size = 3.5)

# similar steps are performed for fatalities 
fat <- fdata %>% select(EVTYPE, FATALITIES) %>% 
        group_by(EVTYPE) %>% 
        summarise(FATALITIES = sum(FATALITIES, na.rm = TRUE)) %>%
        arrange(desc(FATALITIES))

fat$EVTYPE <- factor(fat$EVTYPE , levels = fat$EVTYPE)

fat_plot <- ggplot(data = fat[1:5, ], 
                   mapping = aes(x = EVTYPE, y = FATALITIES)) +
        geom_col() + 
        xlab("Event Type") + 
        ylab("Number of Fatalities") + 
        ggtitle("Fatalities by Event type") +
        geom_text(aes(label = FATALITIES), 
                  vjust = 1.6, 
                  color = "white",
                  size = 3.5)

# combining graphs
grid.arrange(inj_plot, fat_plot, ncol = 2)

Now it is necessary to find out which events cause the most prop and crop damages.

# similar calculations for both injuries and fatalities
prop <- fdata %>% select(EVTYPE, fullPROPDMG) %>% 
        group_by(EVTYPE) %>% 
        summarise(fullPROPDMG = sum(fullPROPDMG, na.rm = TRUE)) %>%
        arrange(desc(fullPROPDMG))

prop$EVTYPE <- factor(prop$EVTYPE , levels = prop$EVTYPE)

prop_plot <- ggplot(data = prop[1:5, ], 
                   mapping = aes(x = EVTYPE, y = fullPROPDMG)) +
        geom_col() + 
        xlab("Event Type") + 
        ylab("Damage in Dollars") + 
        ggtitle("Property damage by Event type") +
        geom_text(aes(label = fullPROPDMG), 
                  vjust = 1.6, 
                  color = "white",
                  size = 3.5)


crop <- fdata %>% select(EVTYPE, fullCROPDMG) %>% 
        group_by(EVTYPE) %>% 
        summarise(fullCROPDMG = sum(fullCROPDMG, na.rm = TRUE)) %>%
        arrange(desc(fullCROPDMG))

crop$EVTYPE <- factor(crop$EVTYPE , levels = crop$EVTYPE)

crop_plot <- ggplot(data = crop[1:5, ], 
                    mapping = aes(x = EVTYPE, y = fullCROPDMG)) +
        geom_col() + 
        xlab("Event Type") + 
        ylab("Damage in Dollars") + 
        ggtitle("Crop damage by Event type") +
        geom_text(aes(label = fullCROPDMG), 
                  vjust = 1.6, 
                  color = "white",
                  size = 3.5)

grid.arrange(prop_plot, crop_plot, ncol = 2)

And finally, the top 10 events with the most total damage.

tot <- fdata %>% select(EVTYPE, fullDMG) %>% 
        group_by(EVTYPE) %>% 
        summarise(fullDMG = sum(fullDMG, na.rm = TRUE)) %>%
        arrange(desc(fullDMG))

tot$EVTYPE <- factor(tot$EVTYPE , levels = tot$EVTYPE)

ggplot(data = tot[1:10, ], mapping = aes(x = EVTYPE, y = fullDMG)) +
        geom_col() + 
        xlab("Event Type") + 
        ylab("Damage in Dollars") + 
        ggtitle("Total damage by Event type") +
        geom_text(aes(label = fullDMG), 
                  vjust = 1.6, 
                  color = "white",
                  size = 3.5)