This analysis aims at studying a database that tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.The author therefore only has considered events from year 2007 for the analysis. The data analysis tries to answer questions about which types of events are most harmful with respect to population health and which types of events have the greatest economic consequences.
Global options
library(knitr)
## Warning: package 'knitr' was built under R version 3.2.5
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.2.5
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.2.5
## Loading required package: proto
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.2.5
## Loading required package: DBI
## Warning: package 'DBI' was built under R version 3.2.4
opts_chunk$set(echo = TRUE)
opts_chunk$set(cache=TRUE)
options(scipen = 999) ## prevent exponiential notation in output
dest.file = "./data/StormData.csv.bz2"
if (!file.exists("./data")) {dir.create("./data")}
if(!file.exists(dest.file)) {
data.url <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(data.url, dest.file)
}
ptm <- proc.time()
data<- read.csv(dest.file, header=TRUE, sep=",")
etm <- proc.time() - ptm
Elapsed time to load csv file to data df is 829.5 seconds.
As the data is huge and too time consuming, we take subset of the years from 2007 to arrive at our conclusions.
## subset only 2007 and beyond for data
data$Date <- as.Date(strptime(data$BGN_DATE, format = "%m/%d/%Y %H:%M:%S"))
data$Year <- format(data$Date,format="%Y")
subset.data <- data[data$Year > 2006,]
Aggregate and find the top 5 most harmful (healthwise) events since 2007
Harmful.data <- subset.data[,c("EVTYPE","FATALITIES","INJURIES")]
Sum.harmful.data <- aggregate(Harmful.data$FATALITIES + Harmful.data$INJURIES,
by=list(Harmful.data$EVTYPE),sum)
names(Sum.harmful.data) <- c("EVTYPE","TOTALEVENTS")
Top.harmful.data <- head(Sum.harmful.data[order(-Sum.harmful.data$TOTALEVENTS),],5)
Lets understand the various unique multipliers that are involved in the data for property damage.
sqldf("select distinct PROPDMGEXP from data order by PROPDMGEXP")
## Loading required package: tcltk
## PROPDMGEXP
## 1
## 2 +
## 3 -
## 4 0
## 5 1
## 6 2
## 7 3
## 8 4
## 9 5
## 10 6
## 11 7
## 12 8
## 13 ?
## 14 B
## 15 H
## 16 K
## 17 M
## 18 h
## 19 m
## lets see how the multipliers are distributed
sqldf("select distinct PROPDMGEXP, count(*) from data group by PROPDMGEXP")
## PROPDMGEXP count(*)
## 1 465934
## 2 + 5
## 3 - 1
## 4 0 216
## 5 1 25
## 6 2 13
## 7 3 4
## 8 4 4
## 9 5 28
## 10 6 4
## 11 7 5
## 12 8 1
## 13 ? 8
## 14 B 40
## 15 H 6
## 16 K 424665
## 17 M 11330
## 18 h 1
## 19 m 7
Lets understand the various unique multipliers that are involved in the data for crop damage.
sqldf("select distinct CROPDMGEXP from data order by CROPDMGEXP")
## CROPDMGEXP
## 1
## 2 0
## 3 2
## 4 ?
## 5 B
## 6 K
## 7 M
## 8 k
## 9 m
## lets see how the multipliers are distributed
sqldf("select distinct CROPDMGEXP, count(*) from data group by CROPDMGEXP")
## CROPDMGEXP count(*)
## 1 618413
## 2 0 19
## 3 2 1
## 4 ? 7
## 5 B 9
## 6 K 281832
## 7 M 1994
## 8 k 21
## 9 m 1
Based on the data, it seems the multipliers M,K and B make the most impact. Also the processing power of my PC is too limited to process all the rows.
Based on these unique multipliers subset the data further.Also alter the column for damage amount to show actual amount (based on multiplier)
ptm <- proc.time()
property.data <- subset(subset.data, (toupper(data$CROPDMGEXP) == "M" |
toupper(data$CROPDMGEXP) == "K" |
toupper(data$CROPDMGEXP) == "B" |
toupper(data$PROPDMGEXP) =="M" |
toupper(data$PROPDMGEXP) =="K" |
toupper(data$PROPDMGEXP) =="B"))
for(i in 1:length(subset.data$PROPDMGEXP)) {
if (toupper(subset.data$PROPDMGEXP[i]) == "K" )
subset.data$PROPDMG[i] <- subset.data$PROPDMG[i] * 1000
else if ( toupper(subset.data$PROPDMGEXP[i]) == "M" )
subset.data$PROPDMG[i] <- subset.data$PROPDMG[i] * 1000000
else if ( toupper(subset.data$PROPDMGEXP[i]) == "B" )
subset.data$PROPDMG[i] <- subset.data$PROPDMG[i] * 1000000000
}
etm <- proc.time() - ptm
Elapsed time is 208.65 seconds.
ptm <- proc.time()
for(i in 1:length(subset.data$CROPDMGEXP)) {
if ( toupper(subset.data$CROPDMGEXP[i]) == "K" )
subset.data$CROPDMG[i] <- subset.data$CROPDMG[i] * 1000
else if ( toupper(subset.data$CROPDMGEXP[i]) == "M" )
subset.data$CROPDMG[i] <- subset.data$CROPDMG[i] * 1000000
else if ( toupper(subset.data$CROPDMGEXP[i]) == "B" )
subset.data$CROPDMG[i] <- subset.data$CROPDMG[i] * 1000000000
}
etm <- proc.time() - ptm
Elapsed time is 196.28 seconds.
economic.data <- subset.data[,c("EVTYPE","PROPDMG","CROPDMG")]
Sum.economic.data <- aggregate(economic.data$PROPDMG + economic.data$CROPDMG,
by=list(economic.data$EVTYPE),sum)
names(Sum.economic.data) <- c("EVTYPE","TOTALDAMAGE")
Top.economic.data <- head(Sum.economic.data[order(-Sum.economic.data$TOTALDAMAGE),],5)
Lets plot the graph and understand which event is the most catastrophic to life
plot.most.destructive <- ggplot(Top.harmful.data, aes(x=EVTYPE,y=TOTALEVENTS)) + geom_bar(stat="identity") + ylab("Casualities") + xlab("Event") +
ggtitle("Plot to understand the event that casued most impact to human lives ")
print(plot.most.destructive)
As we can see the most life threatening event is TORNADO.
Lets plot the graph and understand which event is the most economically catastrophic
plot.most.economic.destructive <- ggplot(Top.economic.data, aes(x=EVTYPE,y=TOTALDAMAGE)) + geom_bar(stat="identity") + ylab("Monetary losses") + xlab("Event") +
ggtitle("Plot to understand the most economically damaging event ")
print(plot.most.economic.destructive)
As we can see the most economically challenging event is FLOOD.