What Is Most Important In Preparation for Severe Weather Events

(Based on U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database)

Synopsis: This data analysis is aiming to find the types of events which are the most harmful to the population health, and the types of events which have the greatest economic consequences, across the United States. We are going to find the meteorological events which averagely have caused the most death and injuries, the events which are the most likely to cause a large number of casualty, and the events which had caused the most damage in properties and corps.

Frist, we are providing the system environment to ensure the reproducibility.

sessionInfo()

## R version 3.3.0 (2016-05-03)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.11.6 (El Capitan)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] markdown_0.7.7 ggplot2_2.1.0  knitr_1.14     dplyr_0.5.0   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.5      digest_0.6.9     assertthat_0.1   plyr_1.8.4      
##  [5] grid_3.3.0       R6_2.1.2         gtable_0.2.0     DBI_0.4-1       
##  [9] formatR_1.4      magrittr_1.5     scales_0.4.0     evaluate_0.9    
## [13] stringi_1.1.1    rmarkdown_1.0    tools_3.3.0      stringr_1.0.0   
## [17] munsell_0.4.3    colorspace_1.2-6 htmltools_0.3.5  tibble_1.0

Data Processing

library(dplyr)
library(knitr)
library(ggplot2)
library(markdown)
dir.create("~/code/RR_week4")

## Warning in dir.create("~/code/RR_week4"): '/Users/YanYang/code/RR_week4'
## already exists

setwd("~/code/RR_week4")
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile = "./StromData.csv.bz2")
dt <- read.csv("./StromData.csv.bz2", sep = ",", header = TRUE)

Modifing Data for Question 1:

Group the data by event type for further use of package dplyr, which will not change the data itself.

dt <- group_by(dt, EVTYPE)

Modifing Data for Question 2:

dtt <- dt %>% filter(PROPDMG != 0 | CROPDMG != 0) %>% select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)

Since the exponents of the dollar damages are different, we need to transform them into the same unit before doing analysis. The column PROPDMGEXP and CROPDMGEXP are indicating the exponents. We are going to select the related columns first to save some processing time. Crops Damage Unit Change

# Lets check whether there are ones with unit smaller then 1 million dollors but actually larger then the data with unit of 1 million dollors.
dttByExpC <- group_by(dtt, CROPDMGEXP)
summarize(dttByExpC, n())

## Source: local data frame [8 x 2]
## 
##   CROPDMGEXP    n()
##       <fctr>  <int>
## 1            145037
## 2          ?      6
## 3          0     17
## 4          B      7
## 5          k     21
## 6          K  97960
## 7          m      1
## 8          M   1982

minMC <- min(min(filter(dtt, CROPDMGEXP == "M" & CROPDMG != 0)$CROPDMG), min(filter(dtt, CROPDMGEXP == "m" & CROPDMG != 0)$CROPDMG))
max(filter(dtt, CROPDMGEXP == "")$CROPDMG) >= minMC*1000000

## [1] FALSE

max(filter(dtt, CROPDMGEXP == "0")$CROPDMG) >= minMC*1000000

## [1] FALSE

max(filter(dtt, CROPDMGEXP == "K")$CROPDMG)*1000 >= minMC*100000

## [1] TRUE

max(filter(dtt, CROPDMGEXP == "k")$CROPDMG)*1000 >= minMC*100000

## [1] TRUE

# so we only need to check the rows where CROPDMGEXP is equal to "K/k", "M/m", "B".
# since there are the most data with exponent "K/k/3", if we set the unit to thousand dollars, there will be the least changes we need to make.
for (i in 1:nrow(dtt)){
        if (dtt$CROPDMGEXP[i] == "B") {
                dtt$CROPDMG[i] <- dtt$CROPDMG[i]*1000000
        }else if (dtt$CROPDMGEXP[i] %in% c("M", "m")) {
                dtt$CROPDMG[i] <- dtt$CROPDMG[i]*1000
        }
        
}

Property Damage Unit Change

# Lets check whether there are ones with unit smaller then 1 million dollors but actually larger then the data with unit of 1 million dollors.
dttByExpP <- group_by(dtt, PROPDMGEXP)
summarize(dttByExpP, n())

## Source: local data frame [16 x 2]
## 
##    PROPDMGEXP    n()
##        <fctr>  <int>
## 1               4357
## 2           -      1
## 3           +      5
## 4           0    209
## 5           2      1
## 6           3      1
## 7           4      4
## 8           5     18
## 9           6      3
## 10          7      2
## 11          B     40
## 12          h      1
## 13          H      6
## 14          K 229057
## 15          m      7
## 16          M  11319

minMP <- min(min(filter(dtt, PROPDMGEXP == "M" & PROPDMG != 0)$PROPDMG), min(filter(dtt, PROPDMGEXP == "m" & PROPDMG != 0)$PROPDMG), min(filter(dtt, PROPDMGEXP == "6" & PROPDMG != 0)$PROPDMG))
max(filter(dtt, PROPDMGEXP == "")$PROPDMG) >= minMP*1000000

## [1] FALSE

max(filter(dtt, PROPDMGEXP == "0")$PROPDMG)*1000 >= minMP*100000

## [1] TRUE

max(filter(dtt, PROPDMGEXP == "2")$PROPDMG) >= minMP*1000000

## [1] FALSE

max(filter(dtt, PROPDMGEXP == "3")$PROPDMG)*1000 >= minMP*100000

## [1] TRUE

max(filter(dtt, PROPDMGEXP == "4")$PROPDMG)*1000 >= minMP*100000

## [1] TRUE

max(filter(dtt, PROPDMGEXP == "5")$PROPDMG)*1000 >= minMP*100000

## [1] TRUE

# so we only need to check the rows where PROPDMGEXP is equal to "K", "M/m", "B", "3", "4", "5", "6", "7".
# since there are the most data with exponent "K/k/3", if we set the unit to thousand dollars, there will be the least changes we need to make.
for (i in 1:nrow(dtt)) {
        if (dtt$PROPDMGEXP[i] == "B"){
                dtt$PROPDMG[i] <- dtt$PROPDMG[i]*1000000
        }else if (dtt$PROPDMGEXP[i] %in% c("M", "m")){
                dtt$PROPDMG[i] <- dtt$PROPDMG[i]*1000
        }else if (dtt$PROPDMGEXP[i] == "4"){
                dtt$PROPDMG[i] <- dtt$PROPDMG[i]*10
        }else if (dtt$PROPDMGEXP[i] == "5"){
                dtt$PROPDMG[i] <- dtt$PROPDMG[i]*100
        }else if (dtt$PROPDMGEXP[i] == "6"){
                dtt$PROPDMG[i] <- dtt$PROPDMG[i]*1000
        }else if (dtt$PROPDMGEXP[i] == "7"){
                dtt$PROPDMG[i] <- dtt$PROPDMG[i]*10000
        }
}

Group the data by event type for further use of package dplyr, which will not change the data itself.

dtt <- group_by(dtt, EVTYPE)

Results

Which types of events are most harmful to population health?

We are going to calculate the weighted mean of number of people who were killed or injured in each kind of meteorological event.

weight <- sum(dt$INJURIES)/sum(dt$FATALITIES)
health <- summarize(dt, mean(FATALITIES*weight + INJURIES), sum(FATALITIES + INJURIES))
names(health) <- c("type", "mean", "sum")

By checking the total number of injuries and death caused by each type of events, we see there is 78% of events which have never caused any. To save the resource, we decide to ignore these events in the health analysis.

sum(health$sum == 0)/nrow(health)

## [1] 0.7766497

hNonZ <- filter(health, sum != 0)

Let’s see the spread of the weighted mean of bodily harm caused by each kind of events. The black line indicates the 3rd quantile of the data. From the plot below we see that the vase majority of the weighted means are equal or below 10.

f <- ggplot(hNonZ, aes(x = hNonZ$mean))
f + geom_histogram(binwidth = 1, col = topo.colors(233)) + labs(title = "Weighted Mean of Bodily Harm by Type", x = "Mean") + geom_vline(xintercept = quantile(hNonZ$mean, probs = 0.75))

Based on the shape of the plot, we believe, to save the most people by the least resource, if we find the 5% and 1% of events which has the highest average number of people harmed, it can be a clue of finding the events which impose the most bodily danger to the population.

hDanger <- filter(hNonZ, mean >= quantile(hNonZ$mean, probs = 0.95))
hVeryD <- filter(hNonZ, mean >= quantile(hNonZ$mean, probs = 0.99))
print(paste("The top 1% events which are the most dangerous to public health are", as.character(hVeryD$type[1]), ",", as.character(hVeryD$type[2]), "and", as.character(hVeryD$type[3]), "."))

## [1] "The top 1% events which are the most dangerous to public health are COLD AND SNOW , TORNADOES, TSTM WIND, HAIL and TROPICAL STORM GORDON ."

print("The top 5% most dangerous events are in the following list:")

## [1] "The top 5% most dangerous events are in the following list:"

as.character(hDanger$type)

##  [1] "COLD AND SNOW"              "EXTREME HEAT"              
##  [3] "Heat Wave"                  "HEAT WAVE DROUGHT"         
##  [5] "HIGH WIND AND SEAS"         "HIGH WIND/SEAS"            
##  [7] "RECORD/EXCESSIVE HEAT"      "TORNADOES, TSTM WIND, HAIL"
##  [9] "TROPICAL STORM GORDON"      "WILD FIRES"                
## [11] "WINTER STORMS"

Life is priceless. So we need to do more analysis based on the fatalities in the meterological events. Those events are also the most harmful to population health.

casualty <- data.frame(dt$EVTYPE, dt$FATALITIES)
names(casualty) <- c("type", "fatality")
sum(casualty$fatality == 0)/nrow(casualty)

## [1] 0.9922708

From the result above, we can see more than 99% percent of the event had never caused any death. Thus it will be better if we focuse on the non-zero values only.

casualty <- filter(casualty, fatality != 0)
summary(casualty$fatality)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   2.172   2.000 583.000

Since the max of the fatality is 583 while the 3rd quantile is 2, there will be outliers in the plot. To plot them, we use logarithm to shorten the range of the data. The blue line is y = 2, which indicates the casualty below 100. The red lind is y = 1, which indicates the casualty below 10. Let’s see the spread of the Number of Fatality caused by each event to find if there is any pattern.

g <- ggplot(casualty, aes(x = casualty$type, y = log(casualty$fatality)), height = 800, width = 1000)
g + geom_boxplot(col = rainbow(168)) + 
        labs(title = "Logged Number of Fatality by Event Type", x = "Event Type", y = "Logged Number of Fatiliets") + 
        geom_abline(slope = 0, intercept = 1, color = "red") + 
        geom_abline(slope = 0, intercept = 2, color = "blue")

From the spread of the fatality, we can see that most of events caused less then 10 people’s life, while the vase majority of the events caused less than 100 life. However, some of the events has caused way more than 100 life. Thus we believe if we find types of events which are the top 1% killer, we will save the most life using the least resource.

hcasu <- filter(casualty, fatality >= quantile(casualty$fatality, probs = 0.99))
casuType <- as.character(unique(hcasu$type))
print("The top 1% killer events are in the following list:")

## [1] "The top 1% killer events are in the following list:"

as.character(casuType)

##  [1] "TORNADO"                    "TORNADOES, TSTM WIND, HAIL"
##  [3] "HEAT"                       "EXCESSIVE HEAT"            
##  [5] "UNSEASONABLY WARM AND DRY"  "HEAT WAVE"                 
##  [7] "EXTREME HEAT"               "HEAVY RAIN"                
##  [9] "TROPICAL STORM"             "TSUNAMI"                   
## [11] "FLASH FLOOD"

Every time they occurs, we shall make our best effort to maintain the public safety.

Which types of events have the greatest economic consequences

We calculated the average total damages (including property damage and damaged crops) caused by each event. Lets see the spread of mean of total damages caused by each type of events. The blue line is the 3rd quantile of the data.

economy <- summarize(dtt, mean(PROPDMG + CROPDMG), sum(PROPDMG + CROPDMG))
names(economy) <- c("type", "mean", "sum")
h <- ggplot(economy, aes(x = log(economy$mean)))
h + geom_histogram(fill = heat.colors(30)) + geom_vline(xintercept = log(quantile(economy$mean, probs = 0.75)), col = "blue") + labs(title = "Average Total Damage by Type", x = "Average Total Damage (in thousand dollars)")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

We are going to find the top 5% and top 1% of the events which cause the most economic harm.

eDanger <- filter(economy, mean >= quantile(economy$mean, probs = 0.95))
eVeryD <- filter(economy, mean >= quantile(economy$mean, probs = 0.99))
print(paste("The top 1% events which have the greatest economic consequences are", eVeryD$type[1], ",", eVeryD$type[2], ",", eVeryD$type[3], ",", eVeryD$type[4], "and", eVeryD$type[5], "."))

## [1] "The top 1% events which have the greatest economic consequences are HEAVY RAIN/SEVERE WEATHER , HURRICANE OPAL , HURRICANE/TYPHOON , STORM SURGE and TORNADOES, TSTM WIND, HAIL ."

print("The top 5% events which have the greatest economic consequences are in the following list:")

## [1] "The top 5% events which have the greatest economic consequences are in the following list:"

as.character(eDanger$type)

##  [1] "COLD AND WET CONDITIONS"    "DAMAGING FREEZE"           
##  [3] "DROUGHT"                    "Early Frost"               
##  [5] "EXCESSIVE WETNESS"          "HAILSTORM"                 
##  [7] "HEAVY RAIN/SEVERE WEATHER"  "HURRICANE"                 
##  [9] "HURRICANE EMILY"            "HURRICANE ERIN"            
## [11] "HURRICANE OPAL"             "HURRICANE OPAL/HIGH WINDS" 
## [13] "HURRICANE/TYPHOON"          "MAJOR FLOOD"               
## [15] "RIVER FLOOD"                "SEVERE THUNDERSTORM"       
## [17] "STORM SURGE"                "STORM SURGE/TIDE"          
## [19] "TORNADOES, TSTM WIND, HAIL" "TYPHOON"                   
## [21] "WILD FIRES"                 "WINTER STORM HIGH WINDS"