Synopsis:

This report analyzes United States storm data and reaches the following conclusions:
1. Tornado and excessive heat are most harmful with respect to population health.
2. Flood has the greatest economic consequences.

Data Processing

# Download, decompress, and load the storm data, then keep only the columns
# needed for the health and economic analyses.

# One definition per path so the download and decompression steps always refer
# to the same file, including on case-sensitive file systems.
archive_path <- "data/StormData.csv.bz2"
csv_path <- "data/stormdata.csv"

# Reuse `url` if an earlier part of the document defined it as a string.
# Without a definition the bare name resolves to base::url() and
# download.file() fails.
# NOTE(review): the fallback is the course-hosted copy of the NOAA storm
# database -- confirm it is the intended source.
if (!exists("url", mode = "character")) {
    url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
}

if (!file.exists("data")) {
    dir.create("data")
}
if (!file.exists(archive_path)) {
    # mode = "wb" keeps the bzip2 archive intact on Windows.
    download.file(url, destfile = archive_path, mode = "wb")
}

if (!file.exists(csv_path)) {
    library(R.utils)
    # Keep the archive so a later run does not need to download it again.
    bunzip2(archive_path, destname = csv_path, remove = FALSE)
}

df <- read.csv(csv_path)
sub_df <- df[, c("EVTYPE", "FATALITIES", "INJURIES",
                 "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]
head(sub_df)
##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO          0       15    25.0          K       0           
## 2 TORNADO          0        0     2.5          K       0           
## 3 TORNADO          0        2    25.0          K       0           
## 4 TORNADO          0        2     2.5          K       0           
## 5 TORNADO          0        2     2.5          K       0           
## 6 TORNADO          0        6     2.5          K       0

The data set contains 37 columns and 902,297 rows, each recording the damage caused by one storm event.

In order to answer the two questions:
1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
2. Across the United States, which types of events have the greatest economic consequences?

we chose 7 columns: “EVTYPE”, “FATALITIES”, “INJURIES”, “PROPDMG”, “PROPDMGEXP”, “CROPDMG” and “CROPDMGEXP”. Combining each damage value with its exponent column gives the property damage and crop damage in dollars.

# Convert a damage figure and its exponent code into a plain number.
#
# Arguments:
#   n      Damage mantissa; numeric, or anything as.numeric() can convert.
#   n_exp  A single exponent code: " ", "-" or "?" (multiplier 1), "+"
#          (multiplier 10), a digit "0"-"9" (power of ten), or h/k/m/b in
#          either case (1e2, 1e3, 1e6, 1e9).
#
# Returns:
#   n times the multiplier. Returns 0 when n_exp is NULL, zero-length, NA,
#   NaN or the empty string, and NA_real_ when the code is not recognised, so
#   the result is always a length-one numeric and mapply() can simplify it to
#   a vector.
getNum <- function(n, n_exp) {
    # A missing exponent is treated as "no usable damage figure". Checking
    # length first avoids a zero-length `if` condition for NULL input.
    if (is.null(n_exp) || length(n_exp) == 0 || is.na(n_exp)) {
        return(0)
    }

    # Go through character so a factor yields its label, not its level code.
    code <- as.character(n_exp)
    if (code == "") {
        return(0)
    }

    if (grepl("^[0-9]$", code)) {
        multiplier <- 10^as.numeric(code)
    } else {
        multiplier <- switch(
            toupper(code),
            " " = ,
            "-" = ,
            "?" = 1,
            "+" = 10,
            "H" = 1e2,
            "K" = 1e3,
            "M" = 1e6,
            "B" = 1e9,
            NA_real_  # unrecognised code: flag it rather than return NULL
        )
    }

    as.numeric(n) * multiplier
}

# Pair each damage mantissa with its exponent code, row by row, and store the
# value getNum() computes for the pair.
sub_df[["prop_dmg"]] <- mapply(getNum, sub_df[["PROPDMG"]], sub_df[["PROPDMGEXP"]])
sub_df[["crop_dmg"]] <- mapply(getNum, sub_df[["CROPDMG"]], sub_df[["CROPDMGEXP"]])

Results

Question 1: Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

“FATALITIES” and “INJURIES” can be used to measure how harmful each event type is to population health.

library(ggplot2)
library(tidyr)


# Rank event types by total fatalities, by total injuries, and by their
# combined total, keeping the 20 highest of each ranking.
fatalities_top <- names(head(
    sort(tapply(sub_df$FATALITIES, sub_df$EVTYPE, sum), decreasing = TRUE),
    n = 20
))
injuries_top <- names(head(
    sort(tapply(sub_df$INJURIES, sub_df$EVTYPE, sum), decreasing = TRUE),
    n = 20
))
both_sum_top <- names(head(
    sort(tapply(sub_df$FATALITIES + sub_df$INJURIES, sub_df$EVTYPE, sum),
         decreasing = TRUE),
    n = 20
))

# Keep only event types present in all three rankings. intersect() preserves
# the order of its first argument, so the result stays ordered by combined
# total.
health_harmful_top <- intersect(intersect(both_sum_top, injuries_top), fatalities_top)

# Long format: one row per (event, measure). Naming the key and value columns
# directly makes a later colnames() fix-up unnecessary.
health_harmful_df <- gather(sub_df[c("EVTYPE", "FATALITIES", "INJURIES")],
                            key = "key", value = "val", -EVTYPE)

# Sum per event type and measure before plotting, so ggplot draws one bar
# segment per total rather than one per raw record.
health_plot_df <- aggregate(
    val ~ EVTYPE + key,
    data = subset(health_harmful_df, EVTYPE %in% health_harmful_top),
    FUN = sum
)

# The title gives 1950 - 2011 because no date filter is applied to the data.
fig1 <- ggplot(data = health_plot_df, aes(x = EVTYPE, y = val, fill = factor(key))) +
    geom_bar(stat = "identity") +
    scale_x_discrete(limits = rev(health_harmful_top)) +
    labs(title = "Top Health Damage Depending on Event Types (1950 - 2011)",
         x = "Types of Events", y = "Number of People") +
    theme(legend.title = element_blank(), legend.position = "top") +
    coord_flip()
print(fig1)

As shown in the figure, tornadoes and excessive heat are the most harmful event types with respect to population health.

Question 2: Across the United States, which types of events have the greatest economic consequences?

“prop_dmg” and “crop_dmg” can be used to measure the economic consequences of each event type.

# Event types with the largest total property and crop damage.
# The variable names say "top10" but only the five highest are kept (n = 5);
# the names are left unchanged in case other code refers to them.
prop_dmg_top10 <- names(head(
    sort(tapply(sub_df$prop_dmg, sub_df$EVTYPE, sum), decreasing = TRUE),
    n = 5
))
crop_dmg_top10 <- names(head(
    sort(tapply(sub_df$crop_dmg, sub_df$EVTYPE, sum), decreasing = TRUE),
    n = 5
))

# Long format: one row per (event, damage kind). Naming the key and value
# columns directly makes a later colnames() fix-up unnecessary.
eco_dmg_df <- gather(sub_df[c("EVTYPE", "prop_dmg", "crop_dmg")],
                     key = "key", value = "val", -EVTYPE)

# Sum per event type before plotting, so each bar is a single total instead of
# a stack of one segment per raw record.
prop_plot_df <- aggregate(
    val ~ EVTYPE,
    data = subset(eco_dmg_df, EVTYPE %in% prop_dmg_top10 & key == "prop_dmg"),
    FUN = sum
)
crop_plot_df <- aggregate(
    val ~ EVTYPE,
    data = subset(eco_dmg_df, EVTYPE %in% crop_dmg_top10 & key == "crop_dmg"),
    FUN = sum
)

# The titles give 1950 - 2011 because no date filter is applied to the data.
fig2 <- ggplot(data = prop_plot_df, aes(x = EVTYPE, y = val)) +
    geom_bar(stat = "identity") +
    scale_x_discrete(limits = rev(prop_dmg_top10)) +
    coord_flip() +
    labs(title = "Top Property Damage Depending on Event Types (1950 - 2011)",
         x = "Types of Events", y = "Damage Cost ($)")
print(fig2)

fig3 <- ggplot(data = crop_plot_df, aes(x = EVTYPE, y = val)) +
    geom_bar(stat = "identity") +
    scale_x_discrete(limits = rev(crop_dmg_top10)) +
    coord_flip() +
    labs(title = "Top Crop Damage Depending on Event Types (1950 - 2011)",
         x = "Types of Events", y = "Damage Cost ($)")
print(fig3)

As shown in the two figures, floods have the greatest economic consequences.