Project 2 - Storm Data

First, so that everyone can see the R code, we set echo = TRUE for the whole document.

Data Processing

# Download the compressed storm data once, then load it.
# The same relative path is used for the existence check, the download target
# and the read, so the script no longer depends on a machine-specific folder.
# NOTE(review): this is the course-provided mirror of the data set; confirm it
# is the intended source.
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_file <- "repdata_data_StormData.csv.bz2"
if (!file.exists(storm_file)) {
  # mode = "wb" keeps the bz2 archive intact on platforms that distinguish
  # text from binary transfers.
  download.file(storm_url, destfile = storm_file, method = "auto", mode = "wb")
}
# read.csv() reads the bz2-compressed file directly.
Stormdata <- read.csv(storm_file)

1. Across the United States, which types of events are most harmful with respect to population health?

Look at the total number of injuries and fatalities for each type of event.

# Total injuries per event type; missing counts are ignored by sum().
DataByEventInjuries <- setNames(
  aggregate(
    x = Stormdata[["INJURIES"]],
    by = list(Stormdata[["EVTYPE"]]),
    FUN = sum,
    na.rm = TRUE
  ),
  c("Event", "INJURIES")
)
# Total fatalities per event type, computed the same way.
DataByEventDeath <- setNames(
  aggregate(
    x = Stormdata[["FATALITIES"]],
    by = list(Stormdata[["EVTYPE"]]),
    FUN = sum,
    na.rm = TRUE
  ),
  c("Event", "FATALITIES")
)

After creating two data sets (injuries by event and fatalities by event), sort each one from highest to lowest to see which events cause the most injuries and deaths.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Rank event types from most to fewest injuries / fatalities.
DataByEventInjuries <- arrange(DataByEventInjuries, desc(INJURIES))
DataByEventDeath <- arrange(DataByEventDeath, desc(FATALITIES))

# Keep only the event types that actually caused harm.
# The previous version took a fixed number of rows (158 and 168), which pads
# the result with NA rows or silently truncates it if the data changes.
# NOTE(review): assumes those counts were the number of event types with a
# non-zero total -- confirm.
DataByEventInjuries2 <- subset(DataByEventInjuries, INJURIES > 0)
DataByEventDeath2 <- subset(DataByEventDeath, FATALITIES > 0)

2. Across the United States, which types of events have the greatest economic consequences?

There are two types of damage: property damage and crop damage. Total damage is the sum of the two. Keep only the records with non-zero property damage or non-zero crop damage. (Events with no damage at all are excluded.)

# Keep only records that report some property damage or some crop damage.
Stormdata2 <- subset(Stormdata, PROPDMG != 0 | CROPDMG != 0)

Convert property damage to dollars according to PROPDMGEXP: records marked B (billions), M (millions), K (thousands), or H (hundreds) have their property damage multiplied by 1,000,000,000, 1,000,000, 1,000, or 100 respectively. The converted records are then combined into one data set.

# List the distinct unit symbols used for property damage.
unique(Stormdata2[["PROPDMGEXP"]])

[1] K M B m + 0 5 6 4 h 2 7 3 H - Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M

# Convert PROPDMG to dollars using the unit symbol in PROPDMGEXP.
# Units handled (case-insensitive): H = hundreds, K = thousands,
# M = millions, B = billions.
prop_units <- c("H", "K", "M", "B")
prop_scale <- c(1e2, 1e3, 1e6, 1e9)
# as.character() guards against the column having been read as a factor.
prop_symbol <- toupper(as.character(Stormdata2$PROPDMGEXP))
prop_factor <- prop_scale[match(prop_symbol, prop_units)]
# Any other symbol (blank, digit, "+", "-", "?") is not mapped to a unit here,
# so that record's property damage is counted as 0. The record itself is kept:
# the previous subset-and-rbind approach dropped such rows entirely and lost
# their crop damage along with them.
prop_factor[is.na(prop_factor)] <- 0
Stormdata3 <- Stormdata2
Stormdata3$PROPDMG <- Stormdata3$PROPDMG * prop_factor

Next, do the same for crop damage, using CROPDMGEXP.

# List the distinct unit symbols used for crop damage.
unique(Stormdata3[["CROPDMGEXP"]])

[1] M B K m ? 0 k Levels: ? 0 2 B k K m M

# Convert CROPDMG to dollars using the unit symbol in CROPDMGEXP.
# Units handled (case-insensitive): K = thousands, M = millions, B = billions.
#
# This replaces a subset-and-rbind version that
#   * filtered Stormdata2 with a logical vector built from Stormdata3
#     (different rows, hence the recycling warning),
#   * tested PROPDMGEXP instead of CROPDMGEXP for the million/thousand cases,
#   * appended all of Stormdata3 again, so damage was counted twice.
crop_units <- c("K", "M", "B")
crop_scale <- c(1e3, 1e6, 1e9)
# as.character() guards against the column having been read as a factor.
crop_symbol <- toupper(as.character(Stormdata3$CROPDMGEXP))
crop_factor <- crop_scale[match(crop_symbol, crop_units)]
# Symbols that are not mapped to a unit here (blank, "?", digits) contribute
# no crop damage; the record is kept so its property damage is still counted.
crop_factor[is.na(crop_factor)] <- 0
# Start from Stormdata3 so the property damage already converted to dollars
# is carried over, one row per original record.
Stormdata4 <- Stormdata3
Stormdata4$CROPDMG <- Stormdata4$CROPDMG * crop_factor

Total damage is the sum of property damage and crop damage.

# Per-record total damage, then the sum of those totals for each event type.
Stormdata4[["totalDMG"]] <- with(Stormdata4, PROPDMG + CROPDMG)
StormdatabyEvent <- aggregate(
  x = Stormdata4[["totalDMG"]],
  by = list(Stormdata4[["EVTYPE"]]),
  FUN = sum
)
# Largest totals first; "x" is the name aggregate() gives the summed column.
StormdatabyEvent <- arrange(StormdatabyEvent, desc(x))
names(StormdatabyEvent) <- c("Event", "TotalDamageInBillions")
# Rescale the totals by a factor of one billion.
StormdatabyEvent[["TotalDamageInBillions"]] <-
  StormdatabyEvent[["TotalDamageInBillions"]] / 1e9

Results

Graph of Total injuries vs Event type.

library(ggplot2)
# Bar chart of injuries for the first five rows of the ranked injury table.
g1 <- ggplot(
  data = DataByEventInjuries2[seq_len(5), ],
  mapping = aes(x = factor(Event), y = INJURIES)
) +
  geom_bar(stat = "identity") +
  labs(
    x = " Event",
    y = "Injuries",
    title = "Total Injuries by Events (Top 5 Events)"
  )
g1

Graph of total death vs Event type

# Bar chart of fatalities for the first five rows of the ranked fatality table.
g2 <- ggplot(
  data = DataByEventDeath2[seq_len(5), ],
  mapping = aes(x = factor(Event), y = FATALITIES)
) +
  geom_bar(stat = "identity") +
  labs(
    x = " Event",
    y = "Fatalities",
    title = "Total Fatalities by Events (Top 5 events)"
  )
g2

From the two graphs above, we conclude that tornadoes are the most harmful events with respect to population health. Floods cause the second most injuries, and excessive heat causes the second most fatalities.

Graph of Total economic damage by event:

# Bar chart of total damage for the first five rows of the ranked damage table.
g3 <- ggplot(
  data = StormdatabyEvent[seq_len(5), ],
  mapping = aes(x = factor(Event), y = TotalDamageInBillions)
) +
  geom_bar(stat = "identity") +
  labs(
    x = "Event",
    y = " Total Damage(Billions $ )",
    title = " Total Economic damage by Event (Top 5 Events)"
  )
g3