The goal of this project is to explore the NOAA Storm Database and answer some questions about the severe weather events it records. The events in the database start in the year 1950 and end in November 2011.
This analysis addresses the following questions:
The data for this analysis comes in the form of a comma-separated-value (CSV) file compressed via the bzip2 algorithm to reduce its size.
To learn about the variables in the database, please see the links below.
Download the file from its source location and load it into a data.table.
## Source location of the bzip2-compressed storm data and the local file name
## it is saved under
dataurl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
datafile <- "StormData.csv.bz2"
## Download only when no local copy exists. mode = "wb" requests a binary
## transfer explicitly so the compressed archive is never subjected to
## text-mode line-ending translation (relevant on Windows).
if (!file.exists(datafile)) {
  download.file(dataurl, destfile = datafile, mode = "wb")
}
## The compressed file is handed to read.csv() directly
storm <- read.csv(datafile)
library(data.table)
## Convert the data.frame to a data.table for the steps that follow
storm <- as.data.table(storm)
## List the available columns
names(storm)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
## Columns retained for the analysis: the event type, the two health measures,
## and the property/crop damage amounts with their exponent columns
keepcols <- c(
  "EVTYPE",
  "INJURIES", "FATALITIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)
Subset the data to the rows where a proper event type is recorded and at least one of the variables used in the analysis (injuries, fatalities, property damage, crop damage) is greater than 0.
## Drop every column that is not listed in keepcols
storm <- storm[, keepcols, with = FALSE]
cols <- c("PROPDMGEXP", "CROPDMGEXP")
## Coerce the two exponent columns to character (they may have been read as
## factors) so that string transformations can be applied to them later.
## `:=` updates the columns by reference.
stormDT <- storm[, (cols) := lapply(.SD, as.character), .SDcols = cols]
## Keep rows whose event type is not the "?" placeholder and for which at
## least one of the four measures is positive
stormDT <- stormDT[
  EVTYPE != "?" &
    (FATALITIES > 0 | INJURIES > 0 | CROPDMG > 0 | PROPDMG > 0)
]
Data transformation for the crop damage and property damage exponents. Blank values are converted to 0, all alphabetic codes are converted to upper case, and every code is then mapped to a common numeric multiplier.
## Frequency of each raw crop damage exponent code
table(stormDT[["CROPDMGEXP"]])
##
## ? 0 B k K m M
## 152663 6 17 7 21 99932 1 1985
## Frequency of each raw property damage exponent code
table(stormDT[["PROPDMGEXP"]])
##
## - + 0 2 3 4 5 6 7 B
## 11585 1 5 210 1 1 4 18 3 3 40
## h H K m M
## 1 6 231427 7 11320
## Replace blank exponent codes with "0". The replacement is written as a
## character string because both columns are character; assigning the number 0
## would rely on data.table coercing the value to the column type.
stormDT[CROPDMGEXP == "", CROPDMGEXP := "0"]
stormDT[PROPDMGEXP == "", PROPDMGEXP := "0"]
## Upper-case both exponent columns so that codes differing only in case
## (e.g. "k" and "K") are merged
stormDT[, (cols) := lapply(.SD, toupper), .SDcols = cols]
## Frequency of each crop damage exponent code after cleaning
table(stormDT$CROPDMGEXP)
##
## ? 0 B K M
## 6 152680 7 99953 1986
## Frequency of each property damage exponent code after cleaning
table(stormDT[["PROPDMGEXP"]])
##
## - + 0 2 3 4 5 6 7 B H
## 1 5 11795 1 1 4 18 3 3 40 7
## K M
## 231427 11327
## Lookup from crop damage exponent code to numeric multiplier, written as
## powers of ten: "?" and "0" map to 1, K/M/B to 10^3 / 10^6 / 10^9
cropDmgExpKey <- 10^c("?" = 0, "0" = 0, "K" = 3, "M" = 6, "B" = 9)
## Lookup from property damage exponent code to numeric multiplier:
## "-" and "+" map to 1, a digit d maps to 10^d, and H/K/M/B map to
## 10^2 / 10^3 / 10^6 / 10^9
propDmgExpKey <- 10^c(
  "-" = 0, "+" = 0,
  setNames(0:9, 0:9),
  "H" = 2, "K" = 3, "M" = 6, "B" = 9
)
## Swap every exponent code for its numeric multiplier by indexing the
## matching named key vector with the (character) code
stormDT[, CROPDMGEXP := cropDmgExpKey[as.character(CROPDMGEXP)]]
stormDT[, PROPDMGEXP := propDmgExpKey[as.character(PROPDMGEXP)]]
## Frequency of each crop damage multiplier
table(stormDT$CROPDMGEXP)
##
## 1 1000 1e+06 1e+09
## 152686 99953 1986 7
## Frequency of each property damage multiplier
table(stormDT[["PROPDMGEXP"]])
##
## 1 100 1000 10000 1e+05 1e+06 1e+07 1e+09
## 11801 8 231428 4 18 11330 3 40
The cost of property and crop damage is calculated as the damage amount multiplied by its exponent multiplier (for example, PROPCOST = PROPDMG * PROPDMGEXP).
## Rebuild the table with two derived columns: each cost is the damage amount
## multiplied by its multiplier. The source columns are kept next to the
## computed costs.
stormDT <- stormDT[, list(
  EVTYPE,
  FATALITIES,
  INJURIES,
  PROPDMG,
  PROPDMGEXP,
  PROPCOST = PROPDMG * PROPDMGEXP,
  CROPDMG,
  CROPDMGEXP,
  CROPCOST = CROPDMG * CROPDMGEXP
)]
In this step, total fatalities and injuries are calculated for each event type, and the top 10 event types (ranked by fatalities) are used for the results.
## Sum fatalities and injuries per event type (TOTALS is the two sums added
## together), sort by fatalities in descending order and keep rows 1 to 10
totalhealthDT <- stormDT[,
  list(
    FATALITIES = sum(FATALITIES),
    INJURIES = sum(INJURIES),
    TOTALS = sum(FATALITIES) + sum(INJURIES)
  ),
  by = list(EVTYPE)
][order(-FATALITIES)][1:10]
head(totalhealthDT)
## EVTYPE FATALITIES INJURIES TOTALS
## 1: TORNADO 5633 91346 96979
## 2: EXCESSIVE HEAT 1903 6525 8428
## 3: FLASH FLOOD 978 1777 2755
## 4: HEAT 937 2100 3037
## 5: LIGHTNING 816 5230 6046
## 6: TSTM WIND 504 6957 7461
In this step, total property and crop damage costs are calculated for each event type, and the top 10 event types (ranked by total cost) are used for the results.
## Sum property and crop costs per event type (TOTALCOST is the two sums added
## together), sort by total cost in descending order and keep rows 1 to 10
totalCostDT <- stormDT[,
  list(
    PROPCOST = sum(PROPCOST),
    CROPCOST = sum(CROPCOST),
    TOTALCOST = sum(PROPCOST) + sum(CROPCOST)
  ),
  by = list(EVTYPE)
][order(-TOTALCOST)][1:10]
head(totalCostDT)
## EVTYPE PROPCOST CROPCOST TOTALCOST
## 1: FLOOD 144657709807 5661968450 150319678257
## 2: HURRICANE/TYPHOON 69305840000 2607872800 71913712800
## 3: TORNADO 56947380677 414953270 57362333947
## 4: STORM SURGE 43323536000 5000 43323541000
## 5: HAIL 15735267513 3025954473 18761221986
## 6: FLASH FLOOD 16822673979 1421317100 18243991079
Melting the health data into long format for plotting.
## Reshape to long format: one row per event type and measure, with the
## measure name stored in HEALTHTYPE and its number in `value`
health_consequences <- melt(
  data = totalhealthDT,
  id.vars = "EVTYPE",
  variable.name = "HEALTHTYPE"
)
head(health_consequences)
## EVTYPE HEALTHTYPE value
## 1: TORNADO FATALITIES 5633
## 2: EXCESSIVE HEAT FATALITIES 1903
## 3: FLASH FLOOD FATALITIES 978
## 4: HEAT FATALITIES 937
## 5: LIGHTNING FATALITIES 816
## 6: TSTM WIND FATALITIES 504
library(ggplot2)
## Horizontal dodged bar chart of the health measures per event type.
## Event types are ordered on the axis with reorder(EVTYPE, value); bars are
## filled by HEALTHTYPE. The title spelling is corrected to "Fatalities".
ggplot(health_consequences, aes(x = reorder(EVTYPE, value), y = value)) +
  geom_bar(stat = "identity", aes(fill = HEALTHTYPE), position = "dodge") +
  labs(
    title = "Top 10 Event Types for Fatalities/Injuries",
    x = "Event Type",
    y = "Number of Health Consequences"
  ) +
  coord_flip()
Melting the cost data into long format for plotting.
## Reshape to long format: one row per event type and cost category, with the
## category name stored in DAMAGETYPE and its amount in `value`
econ_consequences <- melt(
  data = totalCostDT,
  id.vars = "EVTYPE",
  variable.name = "DAMAGETYPE"
)
## Turn DAMAGETYPE into an ordered factor with the level order
## CROPCOST < PROPCOST < TOTALCOST
econ_consequences[
  ,
  DAMAGETYPE := factor(
    DAMAGETYPE,
    levels = c("CROPCOST", "PROPCOST", "TOTALCOST"),
    ordered = TRUE
  )
]
head(econ_consequences)
## EVTYPE DAMAGETYPE value
## 1: FLOOD PROPCOST 144657709807
## 2: HURRICANE/TYPHOON PROPCOST 69305840000
## 3: TORNADO PROPCOST 56947380677
## 4: STORM SURGE PROPCOST 43323536000
## 5: HAIL PROPCOST 15735267513
## 6: FLASH FLOOD PROPCOST 16822673979
library(ggplot2)
## Raise scipen so large cost values are printed in fixed rather than
## scientific notation. options() returns the previous values, which are kept
## so the caller's setting can be restored afterwards instead of being
## overwritten with a hard-coded 0.
oldOpts <- options(scipen = 999)
## Horizontal dodged bar chart of the cost categories per event type.
## Event types are ordered on the axis with reorder(EVTYPE, value); bars are
## filled by DAMAGETYPE.
ggplot(econ_consequences, aes(x = reorder(EVTYPE, value), y = value)) +
  geom_bar(stat = "identity", aes(fill = DAMAGETYPE), position = "dodge") +
  labs(
    title = "Top 10 Event Types impacts economy",
    x = "Event Type",
    y = "Cost of Impact"
  ) +
  coord_flip()
## Put back whatever scipen value was in effect before this chunk
options(oldOpts)