Kao Cheng Hsuan Nov 1st, 2018
The main purpose of this analysis is to understand the real impact of historical weather events in United States, between 1950 and November of 2011. For this, we will use the U.S. National Oceanic and Atmospheric Administration“s (NOAA) storm database. The analysis basically measure the impact in human health (considering injuries and fatalities) by type of weather event. Additionally, the analysis measures the economic impact of these weather events, considering damage in properties and crops during the years. At the end of the analysis, we can understand that the weather event with more fatalities and injuries is the tornado, but overall the most important event in terms of the cost is the flood.
Load all the required packages for this anaylsis.
library(dplyr)
library(ggplot2)
library(gridExtra)
library(knitr)
library(R.utils)
Download data, and unzip, then using read.csv to load the data.
# You can use the following code to get the original data from website.
#fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
#Download <- download.file(fileUrl,destfile = "~/Downloads/PeerAssessment2/tmp.zip")
#Unzip <- bunzip2("~/Downloads/PeerAssessment2/tmp.zip",destname="tmp.csv",overwrite=TRUE)
# I downloaded the data, and save it the the following path.
Unzip <- "~/Downloads/PeerAssessment2/StormData.csv"
stormdata <- read.csv(Unzip, header = T)
colnames(stormdata)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
The only columns we need for the analysis are the ones related to the weather events, and their health and economic damages: “EVTYPE”, “FATALITIES”, “INJURIES”, “PROPDMG”, “PROPDMGEXP”,“CROPDMG” and “CROPDMGEXP”.
stormdata1 <- stormdata[,c(8,23:28)]
# Subset the data set for "FATALITIES" analysis
FT <- select(stormdata1,c(EVTYPE,FATALITIES))
# Subset the data set for "INJURIES" analysis
IN <- select(stormdata1,c(EVTYPE,INJURIES))
According to Storm Data Documentation, there are only references about what mean the “K”, “M” and “B” considering the damage costs. Particularly, the document explains: “Alphabetical characters used to signify magnitude include”K" for thousands, “M” for millions, and “B” for billions. So we only considered the cases labeled by “K”, “M” or “B”, basically because they include the main cases.
# Subset the data set for "PROPDMG" analysis
PR <- arrange(count(select(stormdata1,c(EVTYPE,PROPDMG,PROPDMGEXP)),PROPDMGEXP),desc(n))
# For the analysis, we need to delete the cases unrelated, i.e., PROPDMGEXP = " "
PR <- slice(PR,-1)
# To get the numbers without scientific notation
options(scipen=999)
# Calculate the percentage of the different cases of PROPDMGEXP
PR$Percentage <- as.numeric(format((PR$n/sum(PR$n))*100,digits=2))
SPR <- sum(PR[PR$PROPDMGEXP=="K"|PR$PROPDMGEXP=="B"|PR$PROPDMGEXP=="M",]$Percentage)
# Subset the data set for "CROPDMG" analysis
CR <- arrange(count(select(stormdata1,c(EVTYPE,CROPDMG,CROPDMGEXP)),CROPDMGEXP),desc(n))
# For the analysis, we need to delete the cases unrelated, i.e., CROPDMGEXP = " "
CR <- slice(CR,-1)
# Calculate the percentage of the different cases of CROPDMGEXP
CR$Percentage <- as.numeric(format((CR$n/sum(CR$n))*100,digits=2))
SCR <- sum(CR[CR$CROPDMGEXP=="K"|CR$CROPDMGEXP=="B"|CR$CROPDMGEXP=="M",]$Percentage)
Therefore, the PROPDMG cases labeled by “K”, “M”, or “B” are 99.92483% of the cases and for CROPDMG are 99.98274% of the cases. Consequently, we are only going to use these labeled cases.
# Subset the data set for "PROPDMG" analysis
S1 <- filter(select(stormdata1,c(EVTYPE,PROPDMG,PROPDMGEXP)),PROPDMGEXP=="K"|PROPDMGEXP=="M"|PROPDMGEXP=="B")
S1$Cost <- ifelse(S1$PROPDMGEXP=="K",S1$PROPDMG*1000,ifelse(S1$PROPDMGEXP=="M",S1$PROPDMG*1000000,S1$PROPDMG*1000000000))
# Subset the data set for "CROPDMG" analysis
S2 <- filter(select(stormdata1,c(EVTYPE,CROPDMG,CROPDMGEXP)),CROPDMGEXP=="K"|CROPDMGEXP=="M"|CROPDMGEXP=="B")
S2$Cost <- ifelse(S2$CROPDMGEXP=="K",S2$CROPDMG*1000,ifelse(S2$CROPDMGEXP=="M",S2$CROPDMG*1000000,S2$CROPDMG*1000000000))
# Calculate the events which are most harmful
# Considering "FATALITIES"
RFT <- arrange(summarize(group_by(FT,EVTYPE),sum(FATALITIES)),desc(summarize(group_by(FT,EVTYPE),sum(FATALITIES))$`sum(FATALITIES)`))
# For the analysis we considered only the top 25 events
RFFT <- slice(RFT, 1:25)
colnames(RFFT) <- c("EVTYPE","FATALITIES")
# Plot Fatalities
RFFT$EVTYPE <- factor(RFFT$EVTYPE, levels = RFFT$EVTYPE[order(-RFFT$FATALITIES)])
g1 <- ggplot(RFFT, aes(EVTYPE,FATALITIES),fill=FATALITIES)+geom_bar(stat = "identity",color="black",fill="green")+theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust = 0.5))+labs(title="Total Number of Fatalities by Event Type")+labs(x="Event Type",y="Number of Fatalities")+theme(plot.title = element_text(hjust = 0.5,size = 10, face = "bold"),axis.text.x = element_text(size=7),axis.title.x=element_text(size=8),axis.title.y=element_text(size=9))
# Considering "INJURIES"
RIN <- arrange(summarize(group_by(IN,EVTYPE),sum(INJURIES)),desc(summarize(group_by(IN,EVTYPE),sum(INJURIES))$`sum(INJURIES)`))
# For the analysis we considered only the top 25 events
RFIN <- slice(RIN, 1:25)
colnames(RFIN) <- c("EVTYPE","INJURIES")
RFIN$EVTYPE <- factor(RFIN$EVTYPE, levels = RFIN$EVTYPE[order(-RFIN$INJURIES)])
g2 <- ggplot(RFIN, aes(EVTYPE,INJURIES),fill=INJURIES)+geom_bar(stat = "identity",color="black",fill="green")+theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust = 0.5))+labs(title="Total Number of Injuries by Event Type")+labs(x="Event Type",y="Number of Injuries")+theme(plot.title = element_text(hjust = 0.5,size = 10, face = "bold"),axis.text.x = element_text(size=7),axis.title.x=element_text(size=8),axis.title.y=element_text(size=9))
# Ploting together
grid.arrange(g1,g2,ncol=2)
# Considering PROPDMG
S3 <- arrange(summarize(group_by(S1,EVTYPE),sum(Cost)),desc(summarize(group_by(S1,EVTYPE),sum(Cost))$`sum(Cost)`))
# For the analysis we considered only the top 25 events
S4 <- slice(S3, 1:25)
# Transfor the cost column into "Billion dollars"
S4$`sum(Cost)` <- S4$`sum(Cost)`/1000000000
colnames(S4) <- c("EVTYPE", "Cost")
# Plot PROPDMG
S4$EVTYPE <- factor(S4$EVTYPE, levels = S4$EVTYPE[order(-S4$Cost)])
g3 <- ggplot(S4, aes(EVTYPE,Cost),fill=Cost)+geom_bar(stat = "identity",color="black",fill="green")+theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust = 0.5))+labs(title="Total Cost in Prop by Event Type (Billion Dollars)")+labs(x="Event Type",y="Cost in Prop (Billion Dollars)")+theme(plot.title = element_text(hjust = 0.5,size = 8.5, face = "bold"),axis.text.x = element_text(size=7),axis.title.x=element_text(size=8),axis.title.y=element_text(size=9))
# Considering CROPDMG
S5 <- arrange(summarize(group_by(S2,EVTYPE),sum(Cost)),desc(summarize(group_by(S2,EVTYPE),sum(Cost))$`sum(Cost)`))
# For the analysis we considered only the top 25 events
S6 <- slice(S5, 1:25)
# Transfor the cost column into "Billion dollars"
S6$`sum(Cost)` <- S6$`sum(Cost)`/1000000000
colnames(S6) <- c("EVTYPE", "Cost")
S6$EVTYPE <- factor(S6$EVTYPE, levels = S6$EVTYPE[order(-S6$Cost)])
g4 <- ggplot(S6, aes(EVTYPE,Cost),fill=Cost)+geom_bar(stat = "identity",color="black",fill="green")+theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust = 0.5))+labs(title="Total Cost in Crop by Event Type (Billion Dollars)")+labs(x="Event Type",y="Cost in Crop (Billion Dollars)")+theme(plot.title = element_text(hjust = 0.5,size = 8.5, face = "bold"),axis.text.x = element_text(size=7),axis.title.x=element_text(size=8),axis.title.y=element_text(size=9))
grid.arrange(g3,g4,ncol=2)
#Considering PROPDMG and CROPDMG together
S7 <- rbind(S4,S6)
colnames(S7) <- c("EVTYPE", "Cost")
S8 <- arrange(summarize(group_by(S7,EVTYPE), sum(Cost)),desc(summarize(group_by(S7,EVTYPE), sum(Cost))$`sum(Cost)`))
# For the analysis we considered only the top 25 events
S9 <- slice(S8, 1:25)
colnames(S9) <- c("EVTYPE", "Cost")
S9$EVTYPE <- factor(S9$EVTYPE, levels = S9$EVTYPE[order(-S9$Cost)])
g5 <- ggplot(S9, aes(EVTYPE,Cost),fill=Cost)+geom_bar(stat = "identity",color="black",fill="green")+theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust = 0.5))+labs(title="Total Cost in Prop and Crop by Event Type (Billion Dollars)")+labs(x="Event Type",y="Cost in Prop and Crop (Billion Dollars)")+theme(plot.title = element_text(hjust = 0.5,size = 10, face = "bold"),axis.text.x = element_text(size=7),axis.title.x=element_text(size=8),axis.title.y=element_text(size=9))
g5