Analysis of National Weather Service Storm Data

Synopsis

The National Weather service gathers information regarding significant storm events in the United States. Data is taken from a variety of sources, such as governmental agencies, law enforcement agencies and media organizations, etc. This database includes counts of fatalities, injuries, property damage and crop damage on significant US weather events. The following report contains the r code necessary to process and plot the data, ultimately answering which types of events cause the most fatalities, injuries, property damage and crop damage.

Data Processing

Load Packages

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

Load Data

url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if(!file.exists("repdata-data-StormData.csv.bz2")){
        download.file(url, destfile = "repdata-data-StormData.csv.bz2")
}
data <- read.csv("repdata-data-StormData.csv.bz2")
names(data) <- tolower(names(data))
data$bgn_date <- as.Date(data$bgn_date, "%d/%m/%Y")

Create Property and Crop Damage Estimates

Property and crop damage estimates are recorded as a numeric value and character value???K, M, B, that means thousands, million and billion. Let’s combine this two columns for a more interpretable economic cost.

At first we create function that helps us to build a new columns with numerical values of property and crop damages. After, we summarize these columns and get the result in the form of overall economic losses from each event type.

#At first we create function that helps us to build a new column with numerical values of property and crop damages. 
data_dmg <- function(x, y) {
        if(x == "K") {
                y * 1e3
                 
        } else if(x == "M") {
                y * 1e6
                
        } else if(y == "B") {
                y * 1e9
                 
        }else {
                y = 0
                 }
}

#receiving a numerical value of property and crop damage from two columns(propdmgexp and propdmg), and get overall economic losses from each event type.

data$propdmgexp <- as.character(data$propdmgexp)
data$property_dmg <- mapply(data_dmg, data$propdmgexp, data$propdmg)

data$cropdmgexp <- as.character(data$cropdmgexp)
data$crop_dmg <- mapply(data_dmg, data$cropdmgexp, data$cropdmg)

data$economic_dmg <- data$property_dmg + data$crop_dmg

Now we specify the columns that we need for the analysis

data <- data%>%
        select(evtype, fatalities, injuries, economic_dmg)

Event Type Preprocessing

In this dataset we have a huge problem associated with the amount of event types. And most of them are simply a repetition. The first thing we do is try to reduce the number of types.

# Consolidate event types. Change variant names into common ones.
data[grepl("THUND", data$evtype, ignore.case=T), c("evtype")] <- "THUNDERSTORM"
data[grepl("TSTM.*", data$evtype, ignore.case=T), c("evtype")] <- "THUNDERSTORM"
data[grepl("COLD|COOL", data$evtype, ignore.case=T), c("evtype")] <- "COLD"
data[grepl("HURRICANE", data$evtype, ignore.case=T), c("evtype")] <- "HURRICANE"
data[grepl("TORNADO", data$evtype, ignore.case=T), c("evtype")] <- "TORNADO"
data[grepl("NADO", data$evtype, ignore.case=T), c("evtype")] <- "TORNADO"
data[grepl("WA.*ER.*SPOUT", data$evtype, ignore.case=T), c("evtype")] <- "WATERSPOUT"
data[grepl("HAIL", data$evtype, ignore.case=T), c("evtype")] <- "HAIL"
data[grepl("DRY|DRI", data$evtype, ignore.case=T), c("evtype")] <- "DRY WEATHER"
data[grepl("WARM", data$evtype, ignore.case=T), c("evtype")] <- "WARM WEATHER"
data[grepl("ICE|ICY|SLEET|FREEZ|FROST", data$evtype, ignore.case=T), c("evtype")] <- "ICY WEATHER"
data[grepl("TROPICAL STORM", data$evtype, ignore.case=T), c("evtype")] <- "TROPICAL STORM"
data[grepl("WET", data$evtype, ignore.case=T), c("evtype")] <- "WET WEATHER"
data[grepl("TIDE|SURF", data$evtype, ignore.case=T), c("evtype")] <- "TIDES OR SURF"
data[grepl("RAIN|PRECIP", data$evtype, ignore.case=T), c("evtype")] <- "RAINY WEATHER"
data[grepl("COAST.*FLOOD", data$evtype, ignore.case=T), c("evtype")] <- "COASTAL FLOODING"
data[grepl("FLOOD|FLDG", data$evtype, ignore.case=T), c("evtype")] <- "FLOODING"
data[grepl("(RAPI|HIG).*WATER", data$evtype, ignore.case=T), c("evtype")] <- "FLOODING"
data[grepl("BLIZZARD", data$evtype, ignore.case=T), c("evtype")] <- "BLIZZARD"
data[grepl("WIND.*CHILL", data$evtype, ignore.case=T), c("evtype")] <- "WIND CHILL"
data[grepl("WIND", data$evtype, ignore.case=T), c("evtype")] <- "STRONG WINDS"
data[grepl("SNOW", data$evtype, ignore.case=T), c("evtype")] <- "SNOW"
data[grepl("FIRE", data$evtype, ignore.case=T), c("evtype")] <- "WILDFIRE"
data[grepl("FOG|VOG", data$evtype, ignore.case=T), c("evtype")] <- "FOG"
data[grepl("VOLCAN", data$evtype, ignore.case=T), c("evtype")] <- "VOLCANIC ERUPTION OR ASH"
data[grepl("CLOUD", data$evtype, ignore.case=T), c("evtype")] <- "CLOUDS"
data[grepl("HEAT|HOT", data$evtype, ignore.case=T), c("evtype")] <- "HEAT"
data[grepl("LIG.*ING", data$evtype, ignore.case=T), c("evtype")] <- "LIGHTNING"
data[grepl("DUST", data$evtype, ignore.case=T), c("evtype")] <- "DUST STORM"
data[grepl("SURGE", data$evtype, ignore.case=T), c("evtype")] <- "COASTAL FLOODING"
data[grepl("WINT", data$evtype, ignore.case=T), c("evtype")] <- "WINTER WEATHER"
data[grepl("AVALAN", data$evtype, ignore.case=T), c("evtype")] <- "AVALANCHE"

# Clean event types. Summarize all the data on the type of event, and delete all rows with no data 
cleandata <- data %>%
        group_by(evtype)%>%
        summarise(count = n(), deaths = sum(fatalities), injuries = sum(injuries), 
                  economic_dmg = sum(economic_dmg)) %>%
        arrange(desc(deaths)) %>%
        filter(deaths > 0 & injuries > 0 & economic_dmg > 0)

The result of Data Processing

# The event types
cleandata$evtype

##  [1] TORNADO              HEAT                 FLOODING            
##  [4] LIGHTNING            THUNDERSTORM         STRONG WINDS        
##  [7] COLD                 RIP CURRENT          WINTER WEATHER      
## [10] AVALANCHE            RIP CURRENTS         TIDES OR SURF       
## [13] SNOW                 HURRICANE            ICY WEATHER         
## [16] RAINY WEATHER        BLIZZARD             WILDFIRE            
## [19] FOG                  TROPICAL STORM       LANDSLIDE           
## [22] DRY WEATHER          TSUNAMI              URBAN/SML STREAM FLD
## [25] DUST STORM           HAIL                 COASTAL FLOODING    
## [28] WARM WEATHER         GLAZE                HIGH SEAS           
## [31] WATERSPOUT           LANDSLIDES           Marine Accident     
## 192 Levels: ? APACHE COUNTY AVALANCHE BEACH EROSIN ... WND

# The summary of new clean dataset
summary(cleandata)

##               evtype       count            deaths          injuries    
##  AVALANCHE       : 1   Min.   :     1   Min.   :   1.0   Min.   :    1  
##  BLIZZARD        : 1   1st Qu.:   313   1st Qu.:  24.0   1st Qu.:   79  
##  COASTAL FLOODING: 1   Median :  1835   Median : 101.0   Median :  320  
##  COLD            : 1   Mean   : 27038   Mean   : 457.5   Mean   : 4257  
##  DRY WEATHER     : 1   3rd Qu.: 11991   3rd Qu.: 278.0   3rd Qu.: 1608  
##  DUST STORM      : 1   Max.   :336822   Max.   :5636.0   Max.   :91407  
##  (Other)         :27                                                    
##   economic_dmg      
##  Min.   :1.000e+03  
##  1st Qu.:6.893e+06  
##  Median :7.640e+08  
##  Mean   :5.232e+09  
##  3rd Qu.:3.259e+09  
##  Max.   :5.210e+10  
##

# First 10 rows
head(cleandata,10)

## # A tibble: 10 <U+00D7> 5
##            evtype  count deaths injuries economic_dmg
##            <fctr>  <int>  <dbl>    <dbl>        <dbl>
## 1         TORNADO  60705   5636    91407  52096662590
## 2            HEAT   2662   3138     9224    524795030
## 3        FLOODING  82692   1528     8602  46270347420
## 4       LIGHTNING  15763    817     5231    940791370
## 5    THUNDERSTORM 336822    756     9545  11048485930
## 6    STRONG WINDS  26570    470     1953   5608409390
## 7            COLD   2484    451      320   1668345000
## 8     RIP CURRENT    470    368      232         1000
## 9  WINTER WEATHER  19690    278     1953   1758751750
## 10      AVALANCHE    387    225      170      3721800

The Main results of Research

The most harmful types of events for population lives

As we can see from the last table, after data cleaning, we can build a plot of the most dangerous events to human health. We plot a top 15 Storm Event, from which it is clear that the leader of this ranking is the tornado that during the reporting period took more than 5,500 lives. Only the first three participants this ranking have brought more deaths than 1000 people each. The rest are below of this mark.

cleandata$evtype <- tolower(cleandata$evtype)
top15 <- cleandata[1:15,]

ggplot(data = top15, aes(x = reorder(evtype, deaths), y = deaths))+
        geom_point(aes(colour=evtype, size=deaths))+
        scale_y_continuous(breaks = seq(0, 6000, by = 1000))+
        coord_flip()+
        
        theme_bw()+
        xlab("Storm Event")+
        ylab("Deaths")+
        ggtitle("Deaths By Different Weather Event")

The most harmful types of events for population health

Tornadoes also is the undisputed leader in number of injury to more than 9,000 peple during the reporting period. All other participants in ranking are located below the 1000 injury.

cleandata <- cleandata %>% arrange(desc(injuries)) 
top15 <- cleandata[1:15,]

ggplot(data = top15, aes(x = reorder(evtype, injuries), y = injuries))+
        geom_point(aes(colour=evtype, size=injuries))+
        scale_y_continuous(breaks = seq(0, 95000, by = 10000))+
        coord_flip()+
        
        theme_bw()+
        xlab("Storm Event")+
        ylab("Injuries")+
        ggtitle("Injuries By Different Weather Event")

Economic Effects

Next plot shows us the top 15 economic damage events by the reporting period.

cleandata <- cleandata %>% arrange(desc(economic_dmg)) 
top15 <- cleandata[1:15,]

ggplot(data = top15, aes(x = reorder(evtype, economic_dmg), y = economic_dmg/1000000000))+
        geom_point(aes(colour=evtype, size=economic_dmg))+
        coord_flip()+
        theme_bw()+
        xlab("Storm Event")+
        ylab("Economic (Property & Crop) Damage in Billions")+
        ggtitle("Economic Damage By Different Weather Event")

top15

## # A tibble: 15 <U+00D7> 5
##            evtype  count deaths injuries economic_dmg
##             <chr>  <int>  <dbl>    <dbl>        <dbl>
## 1         tornado  60705   5636    91407  52096662590
## 2        flooding  82692   1528     8602  46270347420
## 3            hail 289283     15     1371  17216134320
## 4       hurricane    288    135     1328  14331472810
## 5    thunderstorm 336822    756     9545  11048485930
## 6        wildfire   4240     90     1608   6364910130
## 7     icy weather   4210    120     2223   5837637360
## 8    strong winds  26570    470     1953   5608409390
## 9  tropical storm    697     66      383   3259286550
## 10 winter weather  19690    278     1953   1758751750
## 11           cold   2484    451      320   1668345000
## 12  rainy weather  11991    108      308   1659508490
## 13           snow  17512    138     1107   1136942790
## 14      lightning  15763    817     5231    940791370
## 15       blizzard   2742    101      806    776973950