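As part of the Reproducible Research module from Coursera and the Johns Hopkins Bloomberg School of Public Health, we will analyze the National Weather Service's Storm Data between 1950 and November 2011 to find out which types of weather events cause the most injuries and fatalities, and which cause the greatest property and crop losses.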
The initial dataset can be found at https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2.
We will use:
Environment from which this document is produced:
# Attach knitr with library() so that a missing installation stops the
# document with a clear error; require() would only return FALSE and the
# failure would surface later at opts_chunk.
library(knitr)
# Global chunk options: display the code, cache chunk results, and set the
# default figure size.
opts_chunk$set(echo = TRUE, cache = TRUE,
               fig.width = 18, fig.height = 8)
# Strongly prefer fixed over scientific notation; print 4 significant digits.
options(scipen = 100, digits = 4)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Download the compressed storm data (only if it is not already on disk)
# and load it into q1.
if (!file.exists("data")) {
  dir.create("data")
}
ad <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_archive <- "./data/repdata-data-StormData.csv.bz2"
# Skip the download when the archive is already present, so re-running the
# document does not fetch the file again.
# mode = "wb" is required for a binary archive: on Windows the default text
# mode would corrupt the .bz2 file.
if (!file.exists(storm_archive)) {
  download.file(ad, storm_archive, mode = "wb")
}
# load initial data (read.csv reads the .bz2 archive directly)
q1 <- read.csv(storm_archive)
# Keep only the event type and the two health-impact columns, with
# lower-case names used by the rest of the analysis.
hp <- q1[, c("EVTYPE", "FATALITIES", "INJURIES")]
names(hp) <- c("event_type", "fatalities", "injuries")
# Total injuries per event type; only event types with more than 100
# injuries are kept.
sinj <- hp %>%
  group_by(event_type) %>%
  summarize(total_injuries = sum(injuries))
sinj <- sinj[sinj$total_injuries > 100, ]
# l is the row (or rows, on a tie) with the highest injury total;
# e and n are its event type and injury count.
l <- sinj[sinj$total_injuries == max(sinj$total_injuries), ]
e <- l$event_type
n <- l$total_injuries
# processing data for fatalities analysis
# Total fatalities per event type.
sfat <- group_by(hp, event_type) %>%
  summarize(total_fatalities = sum(fatalities))
# Keep only event types with more than 5 fatalities. The column name must be
# spelled in full: `$` on a tibble does not partial-match, so the former
# `sfat$total_fatalitie` evaluated to NULL and the filter dropped every row.
sfat <- sfat[sfat$total_fatalities > 5, ]
# l1 is the row (or rows, on a tie) with the highest fatality total;
# e1 and n1 are its event type and fatality count.
l1 <- sfat[sfat$total_fatalities == max(sfat$total_fatalities), ]
e1 <- l1$event_type
n1 <- l1$total_fatalities
# Property damage per event type, expressed in thousands (cost_k).
# Only rows whose PROPDMGEXP code is K, M or B (either case) are kept.
damp <- q1[, c("EVTYPE", "PROPDMG", "PROPDMGEXP")]
damp <- damp[damp$PROPDMGEXP %in% c("K", "k", "M", "m", "B", "b"), ]
# Normalise every accepted code to upper case. Previously only "m" was
# converted, so a lower-case "b" passed the filter above but was then costed
# as thousands instead of billions.
damp$PROPDMGEXP <- toupper(as.character(damp$PROPDMGEXP))
# Scale to thousands: K is already in thousands, M is x1000, B is x1000000.
damp$cost_k <- ifelse(damp$PROPDMGEXP == "M", damp$PROPDMG * 1000,
                      damp$PROPDMG)
damp$cost_k <- ifelse(damp$PROPDMGEXP == "B", damp$PROPDMG * 1000000,
                      damp$cost_k)
# Sum the cost per event type and tag the rows as property losses.
damps <- group_by(damp, EVTYPE) %>% summarize(total_cost_k = sum(cost_k))
colnames(damps) <- c("event_type", "total_cost_k")
damps$good <- rep("Property", nrow(damps))
# Crop damage per event type, expressed in thousands (cost_k).
# Only rows whose CROPDMGEXP code is K, M or B (either case) are kept.
damc <- q1[, c("EVTYPE", "CROPDMG", "CROPDMGEXP")]
damc <- damc[damc$CROPDMGEXP %in% c("K", "k", "M", "m", "B", "b"), ]
# Normalise every accepted code to upper case. Previously "m" and "k" were
# converted but "b" was not, so a lower-case "b" passed the filter above
# but was then costed as thousands instead of billions.
damc$CROPDMGEXP <- toupper(as.character(damc$CROPDMGEXP))
# Scale to thousands: K is already in thousands, M is x1000, B is x1000000.
damc$cost_k <- ifelse(damc$CROPDMGEXP == "M", damc$CROPDMG * 1000,
                      damc$CROPDMG)
damc$cost_k <- ifelse(damc$CROPDMGEXP == "B", damc$CROPDMG * 1000000,
                      damc$cost_k)
# Sum the cost per event type and tag the rows as crop losses.
damcs <- group_by(damc, EVTYPE) %>% summarize(total_cost_k = sum(cost_k))
colnames(damcs) <- c("event_type", "total_cost_k")
damcs$good <- rep("Crop", nrow(damcs))
# Stack the property and crop summaries into one long table
# (one row per event type and kind of loss).
dampcs <- rbind(damps, damcs)
# Add the crop and property losses of each event type together.
cp <- dampcs %>%
  group_by(event_type) %>%
  summarize(sum_total_cost_k = sum(total_cost_k))
# v is the row (or rows, on a tie) with the largest combined loss.
v <- cp[cp$sum_total_cost_k == max(cp$sum_total_cost_k), ]
# Bar chart of the total injuries for each event type kept in sinj.
g <- ggplot(sinj, aes(x = event_type, y = total_injuries))
plot10 <- g +
  geom_bar(
    stat = "identity", position = "identity",
    fill = "pink", colour = "white"
  ) +
  labs(
    title = "Total of injuries over event type \n",
    x = "Type of Event",
    y = "Total injuries"
  ) +
  # Rotate the event names so the x-axis labels do not overlap.
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, size = 10,
                               colour = "black")
  )
print(plot10)
# Bar chart of the total fatalities for each event type kept in sfat.
g <- ggplot(sfat, aes(x = event_type, y = total_fatalities))
plot1 <- g +
  geom_bar(
    stat = "identity", position = "identity",
    fill = "pink", colour = "white"
  ) +
  labs(
    title = "Total of fatalities over event type \n",
    x = "Type of Event",
    y = "Total fatalities"
  ) +
  # Rotate the event names so the x-axis labels do not overlap.
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, size = 10,
                               colour = "black")
  )
print(plot1)
To make the figure more readable, the data is restricted to losses over 5 million dollars. This does not affect the search for the event types responsible for the highest losses, because those losses run into the billions. Restricting the display lets us focus on the most critical event types.
# For display, drop every property/crop entry whose cost is not above
# 5 million (total_cost_k is in thousands, hence the 5000 threshold).
dampcs <- dampcs[dampcs$total_cost_k > 5000, ]
# Draw the property and crop bars side by side instead of stacked.
# The y value is log(cost): stacking would add the two logs, which is the
# log of the product of the losses and not the log of their total, so the
# stacked height would misrepresent events that have both kinds of loss.
ggplot(dampcs, aes(x = event_type, y = log(total_cost_k),
                   fill = good)) +
  geom_bar(stat = "identity", position = "dodge", colour = "white") +
  # Rotate the event names so the x-axis labels do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1,
                                   size = 10, colour = "black")) +
  labs(x = "Type of Event",
       y = "Total Cost in Log thousands of dollars") +
  ggtitle("Loss across US due to damage from different type of events related to Weather\n")
# Distribution of the per-event injury totals (event types above 100 injuries).
summary(sinj[["total_injuries"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 129 251 545 3750 1360 91300
# Distribution of the per-event fatality totals (event types above 5 fatalities).
summary(sfat[["total_fatalities"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6 13 33 217 103 5630
# Distribution of the combined crop and property loss per event type.
summary(cp[["sum_total_cost_k"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 16 225 1110000 6320 150000000