library(hashmap)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
The goal of this assignment is to explore the NOAA Storm Database and answer the following basic questions about severe weather events.
1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
2. Across the United States, which types of events have the greatest economic consequences?
The analysis concludes that tornadoes caused the most fatalities and injuries, whereas floods and droughts caused the greatest property and crop damage, respectively.
# Peek at the first two rows only, to obtain the column names without
# loading the whole file.
stormPartial <- read.csv(file = "./StormData.csv", nrows = 2)
cols <- names(stormPartial)
cols
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
The columns relevant to the analysis of population health and economic consequences are EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, and CROPDMGEXP.
# Build the `colClasses` vector for read.csv(): "NULL" skips a column, while NA
# keeps it and lets read.csv() choose the type.
wantedCols <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP",
                "CROPDMG", "CROPDMGEXP")
# Fail early if the header lacks any column the analysis needs.
stopifnot(all(wantedCols %in% cols))
relevantCols <- rep("NULL", length(cols))
# Use %in% (set membership) rather than ==: `cols == wantedCols` recycles the
# shorter vector element by element, so it only matches when positions happen
# to line up, and it warns that the lengths are not multiples of each other.
relevantCols[cols %in% wantedCols] <- NA
stormData <- read.csv("./StormData.csv", colClasses = relevantCols)
dim(stormData)
## [1] 902297 7
# List the distinct exponent codes used for property damage.
unique(stormData[["PROPDMGEXP"]])
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
# List the distinct exponent codes used for crop damage.
unique(stormData[["CROPDMGEXP"]])
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
# Translate the damage-exponent codes into numeric multipliers.
# "-", "?" and "+" are treated as invalid and get a multiplier of 0; the empty
# code maps to 1, the digits 0-8 map to 10^digit, and h/k/m/b (either case) map
# to 10^2 / 10^3 / 10^6 / 10^9.
expCodes <- c("-", "?", "+",
              "", "0", "1",
              "2", "h", "H",
              "3", "k", "K",
              "4", "5",
              "6", "m", "M",
              "7", "8",
              "b", "B")
# Power of ten for each valid code, in the same order as the codes that follow
# the three invalid ones above.
validExponents <- c(0, 0, 1,
                    2, 2, 2,
                    3, 3, 3,
                    4, 5,
                    6, 6, 6,
                    7, 8,
                    9, 9)
multVals <- c(rep(0, 3), 10^validExponents)
exp2MultMap <- hashmap(expCodes, multVals)

# Scale each damage column in place by the multiplier of its exponent column.
propMult <- exp2MultMap[[as.character(stormData$PROPDMGEXP)]]
stormData$PROPDMG <- stormData$PROPDMG * propMult
cropMult <- exp2MultMap[[as.character(stormData$CROPDMGEXP)]]
stormData$CROPDMG <- stormData$CROPDMG * cropMult
str(stormData)
## 'data.frame': 902297 obs. of 7 variables:
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25000 2500 25000 2500 2500 2500 2500 2500 25000 25000 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
# Sum each outcome by event type.
fatal <- aggregate(FATALITIES ~ EVTYPE, data = stormData, FUN = sum)
injury <- aggregate(INJURIES ~ EVTYPE, data = stormData, FUN = sum)
propdmg <- aggregate(PROPDMG ~ EVTYPE, data = stormData, FUN = sum)
cropdmg <- aggregate(CROPDMG ~ EVTYPE, data = stormData, FUN = sum)

# Add each event type's share of all fatalities (in percent), then rank the
# event types from largest to smallest share.
fatalSum <- sum(fatal$FATALITIES)
fatal <- fatal %>%
  mutate(pct = FATALITIES * 100 / fatalSum) %>%
  arrange(desc(pct))
head(fatal, n = 10)
## EVTYPE FATALITIES pct
## 1 TORNADO 5633 37.193793
## 2 EXCESSIVE HEAT 1903 12.565203
## 3 FLASH FLOOD 978 6.457577
## 4 HEAT 937 6.186860
## 5 LIGHTNING 816 5.387917
## 6 TSTM WIND 504 3.327831
## 7 FLOOD 470 3.103334
## 8 RIP CURRENT 368 2.429845
## 9 HIGH WIND 248 1.637504
## 10 AVALANCHE 224 1.479036
# Add each event type's share of all injuries (in percent), then rank the
# event types from largest to smallest share.
injurySum <- sum(injury$INJURIES)
injury <- injury %>%
  mutate(pct = INJURIES * 100 / injurySum) %>%
  arrange(desc(pct))
head(injury, n = 10)
## EVTYPE INJURIES pct
## 1 TORNADO 91346 65.0019925
## 2 TSTM WIND 6957 4.9506148
## 3 FLOOD 6789 4.8310657
## 4 EXCESSIVE HEAT 6525 4.6432028
## 5 LIGHTNING 5230 3.7216782
## 6 HEAT 2100 1.4943641
## 7 ICE STORM 1975 1.4054139
## 8 FLASH FLOOD 1777 1.2645167
## 9 THUNDERSTORM WIND 1488 1.0588637
## 10 HAIL 1361 0.9684903
# Add each event type's share of total property damage (in percent), then rank
# the event types from largest to smallest share.
propdmgSum <- sum(propdmg$PROPDMG)
propdmg <- propdmg %>%
  mutate(pct = PROPDMG * 100 / propdmgSum) %>%
  arrange(desc(pct))
head(propdmg, n = 10)
## EVTYPE PROPDMG pct
## 1 FLOOD 144657709807 33.780782
## 2 HURRICANE/TYPHOON 69305840000 16.184450
## 3 TORNADO 56947380617 13.298476
## 4 STORM SURGE 43323536000 10.117006
## 5 FLASH FLOOD 16822673979 3.928467
## 6 HAIL 15735267513 3.674534
## 7 HURRICANE 11868319010 2.771516
## 8 TROPICAL STORM 7703890550 1.799029
## 9 WINTER STORM 6688497251 1.561912
## 10 HIGH WIND 5270046260 1.230673
# Add each event type's share of total crop damage (in percent), then rank the
# event types from largest to smallest share.
cropdmgSum <- sum(cropdmg$CROPDMG)
cropdmg <- cropdmg %>%
  mutate(pct = CROPDMG * 100 / cropdmgSum) %>%
  arrange(desc(pct))
head(cropdmg, n = 10)
## EVTYPE CROPDMG pct
## 1 DROUGHT 13972566000 28.454935
## 2 FLOOD 5661968450 11.530519
## 3 RIVER FLOOD 5029459000 10.242423
## 4 ICE STORM 5022113500 10.227464
## 5 HAIL 3025954473 6.162314
## 6 HURRICANE 2741910000 5.583861
## 7 HURRICANE/TYPHOON 2607872800 5.310896
## 8 FLASH FLOOD 1421317100 2.894492
## 9 EXTREME COLD 1292973000 2.633121
## 10 FROST/FREEZE 1094086000 2.228091
# Two side-by-side bar charts of the first ten rows of `fatal` and `injury`;
# the large bottom margin leaves room for the vertical event-type labels.
par(mfrow = c(1, 2), mar = c(12, 4, 3, 2), mgp = c(3, 1, 0), cex = 0.8)
topFatal <- fatal[1:10, ]
barplot(
  topFatal$FATALITIES,
  names.arg = topFatal$EVTYPE,
  las = 3,
  main = "Events with Highest Fatalities",
  ylab = "Number of fatalities",
  col = "light blue"
)
topInjury <- injury[1:10, ]
barplot(
  topInjury$INJURIES,
  names.arg = topInjury$EVTYPE,
  las = 3,
  main = "Events with Highest Injuries",
  ylab = "Number of injuries",
  col = "light blue"
)
# Two side-by-side bar charts of the first ten rows of `propdmg` and `cropdmg`;
# the large bottom margin leaves room for the vertical event-type labels.
par(mfrow = c(1, 2), mar = c(12, 4, 3, 2), mgp = c(3, 1, 0), cex = 0.8)
barplot(
  propdmg$PROPDMG[1:10],
  names.arg = propdmg$EVTYPE[1:10],
  las = 3,
  main = "Events with Highest Property Damages",
  ylab = "Amount of damages",
  col = "light blue"
)
# The y axis shows a damage amount, not a count, so it uses the same label as
# the property-damage chart.
barplot(
  cropdmg$CROPDMG[1:10],
  names.arg = cropdmg$EVTYPE[1:10],
  las = 3,
  main = "Events with Highest Crop Damages",
  ylab = "Amount of damages",
  col = "light blue"
)
Tornadoes caused the most fatalities and injuries, whereas floods and droughts caused the greatest property and crop damage, respectively.