This project explores the NOAA Storm Database to answer some basic questions about severe weather events. I will use the database (https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2) to answer the two main questions below:
Across the United States, which types of events are most harmful with respect to population health?
Across the United States, which types of events have the greatest economic consequences?
I will show all the code for my entire analysis.
First, to make sure that everyone can see the R code, we set echo = TRUE for the whole document.
# Global chunk options: show the code of every chunk and cache chunk results.
# `results` is left at its default ("markup") so printed R output is rendered
# as a fenced output block; a global results = "asis" would write ordinary
# printed output as raw markdown and run it into the surrounding text.
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
# Download the compressed NOAA storm data once, then load it into `Stormdata`.
# The same relative path is used for the existence check, the download target
# and the read, so the chunk does not depend on a machine-specific directory.
storm_data_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_data_file <- "repdata_data_StormData.csv.bz2"
if (!file.exists(storm_data_file)) {
  # mode = "wb" keeps the bzip2 archive byte-exact on Windows.
  download.file(storm_data_url, destfile = storm_data_file, method = "auto", mode = "wb")
}
# read.csv() reads the .bz2 file directly; no manual decompression is needed.
Stormdata <- read.csv(storm_data_file)
Look at the total number of injuries and fatalities for each type of event.
# Sum INJURIES and FATALITIES per event type (EVTYPE), ignoring missing values.
# Each result is a two-column data frame: the event type and its total.
event_groups <- list(Stormdata$EVTYPE)
DataByEventInjuries <- setNames(
  aggregate(Stormdata$INJURIES, by = event_groups, FUN = sum, na.rm = TRUE),
  c("Event", "INJURIES")
)
DataByEventDeath <- setNames(
  aggregate(Stormdata$FATALITIES, by = event_groups, FUN = sum, na.rm = TRUE),
  c("Event", "FATALITIES")
)
After creating two data sets (data by injuries and data by deaths), sort each one from highest to lowest to see which events cause the most injuries and deaths.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Order both summaries from the largest total to the smallest, then keep a
# fixed number of leading rows from each.
# NOTE(review): the cutoffs 158 and 168 are hard-coded; presumably they are the
# numbers of event types with non-zero injuries / fatalities -- confirm.
DataByEventInjuries <- DataByEventInjuries %>% arrange(desc(INJURIES))
DataByEventDeath <- DataByEventDeath %>% arrange(desc(FATALITIES))
DataByEventInjuries2 <- DataByEventInjuries[seq_len(158), ]
DataByEventDeath2 <- DataByEventDeath[seq_len(168), ]
There are two types of damage: property damage and crop damage. Total damage is the sum of property damage and crop damage. Subset the data to the events that have non-zero property damage or non-zero crop damage. (Events with 0 damage of both kinds are excluded.)
# Keep only the events that report some damage: a row stays when its property
# damage or its crop damage is non-zero.
Stormdata2 <- subset(Stormdata, PROPDMG != 0 | CROPDMG != 0)
Handle each PROPDMGEXP unit, B (billions), M (millions), K (thousands) and H (hundreds), by multiplying the property damage column by 1000000000, 1000000, 1000 and 100 accordingly. Finally, combine the results back together to make one data set.
# Show the distinct exponent codes that occur in the PROPDMGEXP column.
unique(Stormdata2[["PROPDMGEXP"]])
[1] K M B m + 0 5 6 4 h 2 7 3 H - Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
# Scale PROPDMG by its unit code in PROPDMGEXP.
# Recognised codes (matched case-insensitively):
#   H = hundreds, K = thousands, M = millions, B = billions.
prop_multiplier <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
prop_exp <- toupper(as.character(Stormdata2$PROPDMGEXP))
prop_scale <- unname(prop_multiplier[match(prop_exp, names(prop_multiplier))])

# Rows with any other code (blank, digits, "+", "-", ...) contribute no
# property damage, as before, but the rows themselves are kept: discarding
# them would also discard their crop damage.
prop_scale[is.na(prop_scale)] <- 0

# Stormdata3 has the same rows as Stormdata2, with PROPDMG scaled by its unit.
Stormdata3 <- Stormdata2
Stormdata3$PROPDMG <- Stormdata3$PROPDMG * prop_scale
Next, do the same thing for the crop damage column, using CROPDMGEXP.
# Show the distinct exponent codes that occur in the CROPDMGEXP column.
unique(Stormdata3[["CROPDMGEXP"]])
[1] M B K m ? 0 k Levels: ? 0 2 B k K m M
# Scale CROPDMG by its unit code in CROPDMGEXP.
# Recognised codes (matched case-insensitively):
#   K = thousands, M = millions, B = billions.
# The multiplier is computed from Stormdata3's own CROPDMGEXP column and
# applied to Stormdata3's own rows, so the mask always lines up with the data
# and every event appears exactly once in the result.
crop_multiplier <- c(K = 1e3, M = 1e6, B = 1e9)
crop_exp <- toupper(as.character(Stormdata3$CROPDMGEXP))
crop_scale <- unname(crop_multiplier[match(crop_exp, names(crop_multiplier))])

# Rows with any other code (blank, "?", digits, ...) contribute no crop
# damage, mirroring how unrecognised property-damage codes are treated.
crop_scale[is.na(crop_scale)] <- 0

# Stormdata4 has the same rows as Stormdata3 (PROPDMG already scaled there),
# with CROPDMG scaled by its unit.
Stormdata4 <- Stormdata3
Stormdata4$CROPDMG <- Stormdata4$CROPDMG * crop_scale
Total damage is the sum of property damage and crop damage.
# Per-event total damage = property damage + crop damage.
Stormdata4$totalDMG <- with(Stormdata4, PROPDMG + CROPDMG)

# Sum the totals per event type; aggregate() names the summed column `x`.
damage_by_event <- aggregate(
  Stormdata4$totalDMG,
  by = list(Stormdata4$EVTYPE),
  FUN = sum
)

# Largest total first, then label the columns and express totals in billions.
StormdatabyEvent <- damage_by_event %>% arrange(desc(x))
StormdatabyEvent <- setNames(StormdatabyEvent, c("Event", "TotalDamageInBillions"))
StormdatabyEvent$TotalDamageInBillions <- StormdatabyEvent$TotalDamageInBillions / 1e9
library(ggplot2)
# Bar charts of the five leading rows of each sorted summary table.
top_rows <- seq_len(5)

# Injuries per event type.
g1 <- ggplot(DataByEventInjuries2[top_rows, ], aes(x = factor(Event), y = INJURIES)) +
  geom_bar(stat = "identity") +
  labs(x = " Event", y = "Injuries", title = "Total Injuries by Events (Top 5 Events)")
g1

# Fatalities per event type.
g2 <- ggplot(DataByEventDeath2[top_rows, ], aes(x = factor(Event), y = FATALITIES)) +
  geom_bar(stat = "identity") +
  labs(x = " Event", y = "Fatalities", title = "Total Fatalities by Events (Top 5 events)")
g2

# Combined property and crop damage per event type, in billions of dollars.
g3 <- ggplot(StormdatabyEvent[top_rows, ], aes(x = factor(Event), y = TotalDamageInBillions)) +
  geom_bar(stat = "identity") +
  labs(x = "Event", y = " Total Damage(Billions $ )", title = " Total Economic damage by Event (Top 5 Events)")
g3