This analysis uses the NOAA storm data from 1950 to 2011. From this dataset we look at the number of injuries and fatalities, as well as the financial damage, caused by the various natural events that occurred.
The data comes from the National Weather Service (https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2). It is decompressed with the bunzip2 function and then loaded into R.
library(R.utils)
# Fetch and decompress the raw storm data only when it is not already on disk.
# By default bunzip2() deletes the archive after extraction and refuses to
# overwrite an existing destination file, so an unguarded re-run of this
# script would download the archive again and then stop with an error.
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!file.exists("storm_data.csv")) {
  if (!file.exists("storm_data.csv.bz2")) {
    download.file(storm_url, "storm_data.csv.bz2", method = "curl")
  }
  bunzip2("storm_data.csv.bz2")
}
# Load the decompressed CSV; the first row holds the column names.
storm_data <- read.csv("storm_data.csv", header = TRUE)
We will now explore the data and plot some figures to get an overview of the data as a whole and of the factors governing it. We first look at fatalities and injuries by event type, then at the damage to property and crops by event type, and plot each to show the basic trend.
We will now select the columns which are required for this.
library(dplyr)
# Narrow the raw data to the state, event type, casualty and damage columns.
new_storm_data <- storm_data %>%
  select(
    STATE, EVTYPE, FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
  )
Figuring out all the events in the dataset
# Distinct event labels recorded in the data, followed by how many there are.
unique_events <- unique(new_storm_data[["EVTYPE"]])
length(unique_events)
## [1] 985
As we can see, there are 985 unique event types in the dataset, far too many to compare individually. We will therefore rank the event types by their total fatalities and look at the top 6.
# Total fatalities per event type, largest first. aggregate() names the
# summed column `x`.
fatalities <- aggregate(
  new_storm_data$FATALITIES,
  by = list(EVTYPE = new_storm_data$EVTYPE),
  FUN = sum
) %>%
  arrange(desc(x))
head(fatalities, 6)
## EVTYPE x
## 1 TORNADO 5633
## 2 EXCESSIVE HEAT 1903
## 3 FLASH FLOOD 978
## 4 HEAT 937
## 5 LIGHTNING 816
## 6 TSTM WIND 504
Plotting the six event types with the highest total fatalities
library(ggplot2)
# Bar chart of the first six rows of the sorted fatalities table.
top_fatalities <- fatalities[seq_len(6), ]
ggplot(top_fatalities, aes(x = EVTYPE, y = x)) +
  geom_col() +
  labs(x = "EVENT TYPE", y = "FATALITIES")
For injuries we will follow the same procedure as we did for fatalities
# Total injuries per event type, largest first. aggregate() names the
# summed column `x`.
injuries <- aggregate(
  new_storm_data$INJURIES,
  by = list(EVTYPE = new_storm_data$EVTYPE),
  FUN = sum
) %>%
  arrange(desc(x))
head(injuries, 6)
## EVTYPE x
## 1 TORNADO 91346
## 2 TSTM WIND 6957
## 3 FLOOD 6789
## 4 EXCESSIVE HEAT 6525
## 5 LIGHTNING 5230
## 6 HEAT 2100
Plotting for injuries by events
# Bar chart of the first six rows of the sorted injuries table.
top_injuries <- injuries[seq_len(6), ]
ggplot(top_injuries, aes(x = EVTYPE, y = x)) +
  geom_col() +
  labs(x = "Event type", y = "injuries")
In the dataset, property damage is recorded as a value together with an exponent code, i.e. in the format value × 10^x
# List every distinct exponent code used for property damage.
unique(new_storm_data[["PROPDMGEXP"]])
## [1] "K" "M" "" "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-" "1" "8"
From this we can conclude that the exponent is encoded as a symbol: “B” stands for billion (exponent 9), “M” for million (exponent 6), “K” for thousand (exponent 3) and “H” for hundred (exponent 2).
For the total economic damage we will take the sum of the property damage and the crop damage.
We will first convert all the symbols to upper case and replace the symbols that have no numeric equivalent (the empty string, “+”, “?” and “-”) with “0”
# Upper-case the exponent codes so that lower-case variants are recoded
# together with their upper-case forms.
new_storm_data[c("PROPDMGEXP", "CROPDMGEXP")] <- lapply(
  new_storm_data[c("PROPDMGEXP", "CROPDMGEXP")],
  toupper
)

# Work on a copy that holds only the event type and the damage columns.
damage <- new_storm_data %>%
  select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)

# Translate a vector of exponent codes into power-of-ten digits, kept as
# character strings: "", "+", "?" and "-" become "0"; B, M, K and H become
# "9", "6", "3" and "2". Any other value is returned unchanged.
recode_exponent <- function(codes) {
  letter_codes <- c("B", "M", "K", "H")
  letter_powers <- c("9", "6", "3", "2")
  codes[codes %in% c("", "+", "?", "-")] <- "0"
  hit <- match(codes, letter_codes)
  matched <- !is.na(hit)
  codes[matched] <- letter_powers[hit[matched]]
  codes
}

# Apply the same recoding to the property and the crop exponent columns.
damage$PROPDMGEXP <- recode_exponent(damage$PROPDMGEXP)
damage$CROPDMGEXP <- recode_exponent(damage$CROPDMGEXP)
Now getting the full damage by multiplying with the exponential component
# Turn each exponent digit into its power-of-ten multiplier (overwriting the
# exponent column), then scale the recorded damage by it. mutate() evaluates
# its arguments in order, so PROPERTY and CROPS use the new multipliers.
damage <- damage %>%
  mutate(
    PROPDMGEXP = 10^as.numeric(PROPDMGEXP),
    PROPERTY = PROPDMG * PROPDMGEXP,
    CROPDMGEXP = 10^as.numeric(CROPDMGEXP),
    CROPS = CROPDMG * CROPDMGEXP
  )
Now we will calculate the total sum for property and crop damage respectively based on events
# Sum the computed property and crop damage per event type. aggregate()
# names the summed column `x`.
prop_damage <- aggregate(
  damage$PROPERTY,
  by = list(EVTYPE = damage$EVTYPE),
  FUN = sum
)
crop_damage <- aggregate(
  damage$CROPS,
  by = list(EVTYPE = damage$EVTYPE),
  FUN = sum
)
# Rank property damage from largest to smallest and show the top six.
prop_damage <- prop_damage %>%
  arrange(desc(x))
head(prop_damage, 6)
## EVTYPE x
## 1 FLOOD 144657709807
## 2 HURRICANE/TYPHOON 69305840000
## 3 TORNADO 56947380677
## 4 STORM SURGE 43323536000
## 5 FLASH FLOOD 16822673979
## 6 HAIL 15735267513
# Rank crop damage from largest to smallest and show the top six.
crop_damage <- crop_damage %>%
  arrange(desc(x))
head(crop_damage, 6)
## EVTYPE x
## 1 DROUGHT 13972566000
## 2 FLOOD 5661968450
## 3 RIVER FLOOD 5029459000
## 4 ICE STORM 5022113500
## 5 HAIL 3025954473
## 6 HURRICANE 2741910000
From the two data frames above we can conclude that floods cause the most property damage and drought causes the most crop damage.
Now for the total damage we need to sum the property damage and crop damage
# Per-record total damage: property plus crop, each multiplied by the value
# held in its exponent column.
new_total <- damage %>%
  mutate(TOTALDMG = PROPDMG * PROPDMGEXP + CROPDMG * CROPDMGEXP)
# Sum the totals per event type, sort from largest to smallest and list the
# top five. aggregate() names the summed column `x`.
event_type <- aggregate(
  new_total$TOTALDMG,
  by = list(EVTYPE = new_total$EVTYPE),
  FUN = sum
) %>%
  arrange(desc(x))
head(event_type, 5)
## EVTYPE x
## 1 FLOOD 150319678257
## 2 HURRICANE/TYPHOON 71913712800
## 3 TORNADO 57362333947
## 4 STORM SURGE 43323541000
## 5 HAIL 18761221986
# Bar chart of the first five rows of the sorted total-damage table, with one
# fill colour per event type.
top_events <- event_type[seq_len(5), ]
ggplot(top_events, aes(x = EVTYPE, y = x, fill = EVTYPE)) +
  geom_col() +
  labs(x = "Events", y = "Damage(in dollars)")