Synopsis:

Storm events are not rare in the US. They occur frequently across the states, causing damage to both property and people. We analyzed the NOAA storm data recorded between 1950 and 2011. The population health damage was measured as a weighted sum of fatalities and injuries, while the economic consequences were estimated from crop and property damage. Overall, tornadoes appear to be the most damaging natural event.

Data Processing

# Download the compressed NOAA storm data only when no local copy exists,
# so repeated runs do not fetch the archive again.
if (!file.exists("Storm_data.csv.bz2")) {
    url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
    download.file(url = url, destfile = "Storm_data.csv.bz2")
}
# Read the archive straight into a data frame and preview the first rows.
data <- read.csv(file = "Storm_data.csv.bz2")
head(data)
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE  EVTYPE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL TORNADO
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL TORNADO
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL TORNADO
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL TORNADO
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL TORNADO
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL TORNADO
##   BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1         0                                               0         NA
## 2         0                                               0         NA
## 3         0                                               0         NA
## 4         0                                               0         NA
## 5         0                                               0         NA
## 6         0                                               0         NA
##   END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1         0                      14.0   100 3   0          0       15    25.0
## 2         0                       2.0   150 2   0          0        0     2.5
## 3         0                       0.1   123 2   0          0        2    25.0
## 4         0                       0.0   100 2   0          0        2     2.5
## 5         0                       0.0   150 2   0          0        2     2.5
## 6         0                       1.5   177 2   0          0        6     2.5
##   PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1          K       0                                         3040      8812
## 2          K       0                                         3042      8755
## 3          K       0                                         3340      8742
## 4          K       0                                         3458      8626
## 5          K       0                                         3412      8642
## 6          K       0                                         3450      8748
##   LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1       3051       8806              1
## 2          0          0              2
## 3          0          0              3
## 4          0          0              4
## 5          0          0              5
## 6          0          0              6
  1. Population health: Injuries and fatalities
  2. Economic consequences: Property and crop damage
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Convert the raw data frame to a tibble. tbl_df() is deprecated since
# dplyr 1.0.0; as_tibble() (re-exported by dplyr) is its replacement.
data_tbl <- as_tibble(data)
# Population-health subset: event type plus fatality and injury counts.
data_tbl_health <- select(data_tbl, EVTYPE, FATALITIES, INJURIES)
# Economic subset: event type plus property/crop damage amounts and their
# exponent codes. The column order is kept as EVTYPE, PROPDMG, PROPDMGEXP,
# CROPDMG, CROPDMGEXP because later code indexes these columns by position.
data_tbl_economic <- select(data_tbl, EVTYPE, PROPDMG, PROPDMGEXP,
                            CROPDMG, CROPDMGEXP)

Results

Population Health

  • For further processing of the data, we need a metric that quantifies the health damage based on both injuries and fatalities.
  • A simplifying assumption is to add them together while assigning a higher weight to fatalities (i.e., x10).
# Health damage score per record: injuries plus fatalities weighted by 10.
data_tbl_health <- mutate(data_tbl_health,
                          Damage = INJURIES + 10 * FATALITIES)
library(ggplot2)
# Average the score within each event type and keep the ten event types
# with the highest mean.
data_tbl_health_summary <- data_tbl_health %>%
    group_by(EVTYPE) %>%
    summarise(mean_damage = mean(Damage)) %>%
    arrange(desc(mean_damage)) %>%
    slice(1:10)
  • Let’s plot the data
# Bar chart of the top event types, ordered from highest to lowest mean
# health damage; x labels are rotated so long event names stay readable.
ggplot(data_tbl_health_summary,
       aes(x = reorder(EVTYPE, -mean_damage), y = mean_damage)) +
    geom_col() +
    labs(x = "Event Type",
         y = "Mean Health Damage",
         title = "Top Events by Mean Health Damage") +
    theme(axis.text.x = element_text(angle = 60, face = "bold",
                                     size = 6, hjust = 1))

We can conclude that tornadoes are the most significant event in terms of mean health damage, followed by Cold & Snow.

Economic Consequences

  • Similar to population health, we need a metric to evaluate the economic effect.
  • The damage is mainly to properties and crops, and each amount comes with an exponent code (K, M, B, +, -, ?, etc.).
  • We follow the approach described at this URL.
  • We create a function to convert the DMG value and its EXP code to a single value.
  • We can then add the PROP and CROP damages.
  • Let’s look at the common EXP codes.
# Count how often each property-damage exponent code occurs.
print(table(data_tbl_economic$PROPDMGEXP))
## 
##             -      ?      +      0      1      2      3      4      5      6 
## 465934      1      8      5    216     25     13      4      4     28      4 
##      7      8      B      h      H      K      m      M 
##      5      1     40      1      6 424665      7  11330
# Count how often each crop-damage exponent code occurs.
print(table(data_tbl_economic$CROPDMGEXP))
## 
##             ?      0      2      B      k      K      m      M 
## 618413      7     19      1      9     21 281832      1   1994

There is a huge number of empty cells. The “k” exponent is also very common.

  • Let’s construct the function and estimate the damage.
library(stringr)
# Fold both exponent columns to lower case so that codes differing only in
# case (e.g. "K" and "k") are treated as the same code.
data_tbl_economic <- mutate(data_tbl_economic,
                            CROPDMGEXP = tolower(CROPDMGEXP),
                            PROPDMGEXP = tolower(PROPDMGEXP))

# Translate damage exponent codes into numeric multipliers.
#
# Arguments:
#   x  A two-column data frame or tibble; the second column holds the
#      exponent code of each record (the first column is not used here).
#
# Returns:
#   An unnamed numeric vector with one multiplier per row of x:
#     "k" -> 1e3, "m" -> 1e6, "b" -> 1e9, "h" -> 100, "+" -> 1,
#     a single digit "0".."8" -> 10,
#     anything else (empty, "-", "?", NA, unknown) -> 0.
#
# Codes are compared case-insensitively, so callers need not lower-case
# the column first (doing so beforehand is harmless).
DMG_EXP <- function(x){
    # [[2]] extracts a plain vector from both data frames and tibbles;
    # as.character() also covers factor columns.
    codes <- tolower(as.character(x[[2]]))

    # Default multiplier of 0 for empty and unrecognised codes.
    EXP_num <- rep(0, length(codes))

    # Letter and "+" codes: vectorised lookup in a named table.
    multiplier <- c("+" = 1, "h" = 100, "k" = 1e3, "m" = 1e6, "b" = 1e9)
    known <- codes %in% names(multiplier)
    EXP_num[known] <- unname(multiplier[codes[known]])

    # Digit codes all map to 10. "0" is included: the previous digit set was
    # 1:8, which silently gave "0" a multiplier of 0.
    EXP_num[codes %in% as.character(0:8)] <- 10

    EXP_num
}
# Calculate the damage: scale each reported amount by the multiplier that
# DMG_EXP derives from its exponent code, then add property and crop damage.
data_tbl_economic$PROP <- data_tbl_economic$PROPDMG *
    DMG_EXP(data_tbl_economic[c("PROPDMG", "PROPDMGEXP")])
data_tbl_economic$CROP <- data_tbl_economic$CROPDMG *
    DMG_EXP(data_tbl_economic[c("CROPDMG", "CROPDMGEXP")])
data_tbl_economic$Damage <- with(data_tbl_economic, PROP + CROP)

# Mean economic damage per event type, expressed in millions of USD;
# keep the ten event types with the highest mean.
data_tbl_economic_summary <- data_tbl_economic %>%
    group_by(EVTYPE) %>%
    summarise(mean_damage = mean(Damage) / 1e6) %>%
    arrange(desc(mean_damage)) %>%
    slice(1:10)
# Bar chart ordered from highest to lowest mean damage. The y-axis label
# previously lacked its closing bracket ("[million USD").
ggplot(data_tbl_economic_summary,
       aes(x = reorder(EVTYPE, -mean_damage), y = mean_damage)) +
    geom_col() +
    labs(x = "Event Type",
         y = "Mean Economic Damage [million USD]",
         title = "Top Events by Mean Economic Damage") +
    theme(axis.text.x = element_text(angle = 60, face = "bold",
                                     size = 8, hjust = 1))

We can conclude that tornadoes are the most significant event in terms of mean economic damage, followed by heavy rain and severe weather.