The data are first read into a data frame. We examine the structure of the dataset and determine the variables that we require. Examining the event types indicates that most events are classified as being one of a handful of types. We notice that “Thunderstorm Wind” and “Thunderstorm Winds” are classified as different event types. We assume that this is unintentional and merge them; the same is done for “High Wind”/“High Winds” and “Rip Current”/“Rip Currents”. It is possible that there are other such instances, but since there are very few events of those types, we do not make any other modifications. We check that the dataset is complete, i.e. without any missing variable values. The property damage and crop damage values are then converted from the units in the raw data to dollars. We then create a new data frame containing the event type, fatalities, injuries and economic damage.
# Load the raw storm data set from the compressed CSV file
Data <- read.csv(file = "repdata-data-StormData.csv.bz2")
# Show the column names so the variables of interest can be picked out
colnames(Data)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# Columns needed for the analysis: event type, casualties, and the damage
# amounts together with their magnitude codes
sel <- c(
  "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)
RedData <- Data[, sel]

# Merge plural event-type spellings into their singular form.
# Names are the spellings to replace, values are the spellings to keep.
RedData$EVTYPE <- as.factor(RedData$EVTYPE)
plural_to_singular <- c(
  "THUNDERSTORM WINDS" = "THUNDERSTORM WIND",
  "HIGH WINDS" = "HIGH WIND",
  "RIP CURRENTS" = "RIP CURRENT"
)
for (plural in names(plural_to_singular)) {
  RedData$EVTYPE[RedData$EVTYPE == plural] <- plural_to_singular[[plural]]
}
# Count missing values across all selected columns (0 means complete data)
sum(vapply(RedData, function(column) sum(is.na(column)), integer(1)))
## [1] 0
# Convert crop and property damage units to dollars.
# PROPDMGEXP / CROPDMGEXP hold a magnitude code for the matching damage
# figure. The codes are translated to a power of ten and applied to the amount.

# Translate a vector of magnitude codes into numeric powers of ten.
#   code: vector of codes (character or factor); matched case-insensitively.
# Returns a numeric vector of the same length:
#   H = 2 (hundreds), K = 3 (thousands), M = 6 (millions), B = 9 (billions);
#   every other code (and NA) gives 0, i.e. the amount is already in dollars.
exp_to_power <- function(code) {
  powers <- c(H = 2, K = 3, M = 6, B = 9)
  idx <- match(toupper(as.character(code)), names(powers))
  power <- unname(powers[idx])
  # All codes are resolved in one lookup, so "H" is no longer reset to 0
  # before it can be recognised
  power[is.na(idx)] <- 0
  power
}

prop_power <- exp_to_power(RedData$PROPDMGEXP)
RedData$PROPDMG <- RedData$PROPDMG * 10^prop_power
# Store the power back in the exponent column as a character string
RedData$PROPDMGEXP <- as.character(prop_power)

crop_power <- exp_to_power(RedData$CROPDMGEXP)
RedData$CROPDMG <- RedData$CROPDMG * 10^crop_power
RedData$CROPDMGEXP <- as.character(crop_power)
# Build one record per storm event: event type, fatalities, injuries and
# total economic damage (property + crop). The column name "Fatalties" is
# spelled this way on purpose: the ordering and plotting code refers to it.
NewData <- data.frame(
  Event = RedData$EVTYPE,
  Fatalties = RedData$FATALITIES,
  Injuries = RedData$INJURIES,
  Cost = RedData$PROPDMG + RedData$CROPDMG
)

# Total every measure per event type
Result <- aggregate(. ~ Event, data = NewData, FUN = sum)

# Six event types with the most fatalities, highest first
head(Result[order(Result$Fatalties, decreasing = TRUE), ], n = 6L)
## Event Fatalties Injuries Cost
## 831 TORNADO 5633 91346 57352114049
## 130 EXCESSIVE HEAT 1903 6525 500155700
## 153 FLASH FLOOD 978 1777 17562129167
## 275 HEAT 937 2100 403258500
## 463 LIGHTNING 816 5230 940751537
## 584 RIP CURRENT 572 529 163000
# Six event types with the most injuries, highest first
head(Result[order(Result$Injuries, decreasing = TRUE), ], n = 6L)
## Event Fatalties Injuries Cost
## 831 TORNADO 5633 91346 57352114049
## 853 TSTM WIND 504 6957 5038935845
## 170 FLOOD 470 6789 150319678257
## 130 EXCESSIVE HEAT 1903 6525 500155700
## 463 LIGHTNING 816 5230 940751537
## 758 THUNDERSTORM WIND 197 2396 5824578145
# Six event types with the highest economic damage, highest first
head(Result[order(Result$Cost, decreasing = TRUE), ], n = 6L)
## Event Fatalties Injuries Cost
## 170 FLOOD 470 6789 150319678257
## 410 HURRICANE/TYPHOON 64 1275 71913712800
## 831 TORNADO 5633 91346 57352114049
## 668 STORM SURGE 13 38 43323541000
## 244 HAIL 15 1361 18758221521
## 153 FLASH FLOOD 978 1777 17562129167
Tornadoes are the most harmful type of event to human health in terms of both fatalities and injuries. However, floods cause the greatest economic damage. The top 5 types of events with the highest fatalities, injuries and economic cost are shown graphically below:
library(ggplot2)
# Bar chart of the five event types with the most fatalities, tallest bar first
g <- ggplot(
  Result[order(Result$Fatalties, decreasing = TRUE), ][seq_len(5), ],
  aes(x = reorder(Event, -Fatalties), y = Fatalties)
) +
  geom_bar(stat = "identity") +
  # Turn the event names on the x axis to vertical
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(
    title = "Top 5 Event Types with Highest Total Fatalities",
    x = "Event Type",
    y = "Fatalities"
  )
g
# Bar chart of the five event types with the most injuries, tallest bar first
g <- ggplot(
  Result[order(Result$Injuries, decreasing = TRUE), ][seq_len(5), ],
  aes(x = reorder(Event, -Injuries), y = Injuries)
) +
  geom_bar(stat = "identity") +
  # Turn the event names on the x axis to vertical
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(
    title = "Top 5 Event Types with Highest Total Injuries",
    x = "Event Type",
    y = "Injuries"
  )
g
# Bar chart of the five costliest event types, tallest bar first.
# Cost is divided by 1e9 so the y axis reads in billions of dollars.
g <- ggplot(
  Result[order(Result$Cost, decreasing = TRUE), ][seq_len(5), ],
  aes(x = reorder(Event, -Cost), y = Cost / 1e9)
) +
  geom_bar(stat = "identity") +
  # Turn the event names on the x axis to vertical
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(
    title = "Top 5 Event Types with Highest Total Economic Damage",
    x = "Event Type",
    y = "Cost of Economic Damage (Billion $)"
  )
g