#1. Introduction:
#Storms and other severe weather events cause many problems, ranging from fatalities and injuries to damage to crops and property. We will analyse the impact of these natural events on humans, crops and property.
#This project involves exploring the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
#The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.
#2. Data Processing:
#Path of the csv file that we have used for analysis:
#https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2
#Load Data (Set up working directory)
# Directory that holds the storm data CSV. It defaults to the original
# author's location; set the STORM_DATA_DIR environment variable to run the
# analysis on another machine without editing this script.
storm_data_dir <- Sys.getenv("STORM_DATA_DIR", unset = NA)
if (is.na(storm_data_dir) || !nzchar(storm_data_dir)) {
  storm_data_dir <- "/Users/yogesh"
}
# Fail early with an actionable message rather than setwd()'s generic error.
if (!dir.exists(storm_data_dir)) {
  stop(
    "Storm data directory not found: ", storm_data_dir,
    " (set STORM_DATA_DIR to the folder containing the CSV)",
    call. = FALSE
  )
}
setwd(storm_data_dir)
# List the directory contents to confirm the data file is present
dir()
## [1] "Applications" "BrawlhallaReplays"
## [3] "Chinmay Q3 Receipt 2018.pdf" "Desktop"
## [5] "Documents" "Downloads"
## [7] "FreeAgent" "Google Drive"
## [9] "Library" "Movies"
## [11] "Music" "Pictures"
## [13] "Public" "repdata-data-StormData.csv"
#Read Data and assign a variable for easy access
# Load the raw storm CSV (its first row holds the column names)
stormmaster <- read.csv(
  file = "repdata-data-StormData.csv",
  header = TRUE
)
# Preview the first six records to confirm the import worked
head(stormmaster, n = 6L)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
#Check dimensions of data
# Number of rows, then number of columns, of the full data set
c(nrow(stormmaster), ncol(stormmaster))
## [1] 902297 37
#3. Results
#There are chiefly four types of damages that are caused by these events, viz. Fatalities, Injuries, Crop Damage and Property Damage (Header = EVTYPE). Crop and Property damage need to be converted into USD ($) values first, as the data has been recorded in Billions, Millions and Thousands (exponent values). We will total the damage in USD only after the exponent values have been converted.
#We will start aggregating data for Fatalities and Injuries
#Aggregate FATALITIES as per event types (EVTYPE)
# Total fatalities for every distinct event type
Fatal_aggregate <- aggregate(
  FATALITIES ~ EVTYPE,
  data = stormmaster,
  FUN = sum
)
# Preview the first six totals (many event types have zero fatalities)
head(Fatal_aggregate, n = 6L)
## EVTYPE FATALITIES
## 1 HIGH SURF ADVISORY 0
## 2 COASTAL FLOOD 0
## 3 FLASH FLOOD 0
## 4 LIGHTNING 0
## 5 TSTM WIND 0
## 6 TSTM WIND (G45) 0
#Check dimensions of aggregate FATALITIES data
# Size of the per-event fatality totals before filtering (rows, columns)
c(nrow(Fatal_aggregate), ncol(Fatal_aggregate))
## [1] 985 2
# Keep only the event types that caused at least one fatality
Fatal_aggregate_Non0 <- with(
  Fatal_aggregate,
  Fatal_aggregate[FATALITIES > 0, ]
)
# Size of the table once the zero-fatality event types are removed
c(nrow(Fatal_aggregate_Non0), ncol(Fatal_aggregate_Non0))
## [1] 168 2
# Rank the remaining event types from most to fewest fatalities
Fatal_aggregate_Non0_Order <- with(
  Fatal_aggregate_Non0,
  Fatal_aggregate_Non0[order(FATALITIES, decreasing = TRUE), ]
)
# Show the six deadliest event types; Tornado and Excessive Heat lead the list
head(Fatal_aggregate_Non0_Order, n = 6L)
## EVTYPE FATALITIES
## 826 TORNADO 5633
## 124 EXCESSIVE HEAT 1903
## 151 FLASH FLOOD 978
## 271 HEAT 937
## 453 LIGHTNING 816
## 846 TSTM WIND 504
#Aggregate INJURIES as per event types (EVTYPE)
# Total injuries for every distinct event type
Injury_aggregate <- aggregate(
  INJURIES ~ EVTYPE,
  data = stormmaster,
  FUN = sum
)
# Preview the first six totals (many event types have zero injuries)
head(Injury_aggregate, n = 6L)
## EVTYPE INJURIES
## 1 HIGH SURF ADVISORY 0
## 2 COASTAL FLOOD 0
## 3 FLASH FLOOD 0
## 4 LIGHTNING 0
## 5 TSTM WIND 0
## 6 TSTM WIND (G45) 0
#Check dimensions of aggregate INJURIES data
# Size of the per-event injury totals before filtering (rows, columns)
c(nrow(Injury_aggregate), ncol(Injury_aggregate))
## [1] 985 2
# Keep only the event types that caused at least one injury
Injury_aggregate_Non0 <- with(
  Injury_aggregate,
  Injury_aggregate[INJURIES > 0, ]
)
# Size of the table once the zero-injury event types are removed
c(nrow(Injury_aggregate_Non0), ncol(Injury_aggregate_Non0))
## [1] 158 2
# Rank the remaining event types from most to fewest injuries
Injury_aggregate_Non0_Order <- with(
  Injury_aggregate_Non0,
  Injury_aggregate_Non0[order(INJURIES, decreasing = TRUE), ]
)
# Show the six event types with the most injuries; Tornado and Thunderstorm
# Wind (TSTM WIND) lead the list
head(Injury_aggregate_Non0_Order, n = 6L)
## EVTYPE INJURIES
## 826 TORNADO 91346
## 846 TSTM WIND 6957
## 167 FLOOD 6789
## 124 EXCESSIVE HEAT 6525
## 453 LIGHTNING 5230
## 271 HEAT 2100
#We are now going to represent both Fatalities and Injuries through Barplots below
#Create a bar plot of FATALITIES in descending order for better visualization. Notice top 2 events causing fatalities are Tornado and Excessive Heat
# One bar per event type for the ten highest fatality totals; the legend maps
# each bar colour to its event type
barplot(
  height = Fatal_aggregate_Non0_Order$FATALITIES[seq_len(10)],
  col = seq_len(10),
  legend.text = Fatal_aggregate_Non0_Order$EVTYPE[seq_len(10)],
  ylab = "Fatality",
  main = "Ten natural events causing most fatalities"
)

#Create a bar plot of INJURIES in descending order for better visualization. Notice top 2 events causing injuries are Tornado and Thunderstorm Wind
# One bar per event type for the ten highest injury totals; the legend maps
# each bar colour to its event type
barplot(
  height = Injury_aggregate_Non0_Order$INJURIES[seq_len(10)],
  col = seq_len(10),
  legend.text = Injury_aggregate_Non0_Order$EVTYPE[seq_len(10)],
  ylab = "Injury",
  main = "Ten natural events causing most Injuries"
)

#We will go an extra step to find out if there is any correlation between events causing Fatalities and Injuries. This will be done by matching data between both data frames. We will notice that out of top 10 event types for both Fatalities and Injuries, 7 are matching. Top two events most harmful to population health are Tornado and Excessive Heat
# Event types that appear in both the fatality top ten and the injury top ten
intersect(
  Fatal_aggregate_Non0_Order$EVTYPE[seq_len(10)],
  Injury_aggregate_Non0_Order$EVTYPE[seq_len(10)]
)
## [1] "TORNADO" "EXCESSIVE HEAT" "FLASH FLOOD" "HEAT"
## [5] "LIGHTNING" "TSTM WIND" "FLOOD"
#Description and justification for any data transformations is given below:
#Since there are different types of monetary groupings available in table for damage values, we will have to convert all of these exponents into $values. Point to note here: B|b stands for Billion, M|m stands for Million, K|k stands for Thousand
#Property Damages: First we will figure out the unique values in the header PROPDMGEXP. This will help us know which values are in exponent form
# Distinct magnitude codes recorded for property damage
unique(stormmaster[["PROPDMGEXP"]])
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: + - 0 1 2 3 4 5 6 7 8 ? B H K M h m
#Crop Damages: First we will figure out the unique values in the header CROPDMGEXP. This will help us know which values are in exponent form
# Distinct magnitude codes recorded for crop damage
unique(stormmaster[["CROPDMGEXP"]])
## [1] M K m B ? 0 k 2
## Levels: 0 2 ? B K M k m
#Conversion of exponential values (Billion, Million and Thousands) in USD. Note that we had to look out for capital and small letters depicting billions, millions and thousands (k, K, b, B, m, M)
#Property Damage
# Scale damage amounts to plain USD using their magnitude codes.
#
# amount:   numeric vector of recorded damage figures.
# exponent: vector (character or factor) of magnitude codes, same length.
# Returns `amount` multiplied by 1e3 for K, 1e6 for M and 1e9 for B.
#
# Codes are matched case-insensitively, so "k"/"K", "m"/"M" and "b"/"B" are
# treated alike for BOTH damage columns (previously lowercase "k" was only
# converted for crops and lowercase "b" was never converted). Amounts with any
# other code (blank, digits, "+", "-", "?", "h"/"H") are returned unchanged.
# NOTE(review): "h"/"H" presumably means hundreds and the digit codes may be
# powers of ten - confirm against the NOAA documentation before scaling them.
damage_in_usd <- function(amount, exponent) {
  multiplier <- c(K = 1e+03, M = 1e+06, B = 1e+09)
  # Named lookup yields NA for every code that is not K, M or B
  scale <- unname(multiplier[toupper(as.character(exponent))])
  scale[is.na(scale)] <- 1
  amount * scale
}

# Property Damage: one vectorised pass instead of one data-frame copy per code
stormmaster$PROPDMG <- damage_in_usd(
  stormmaster$PROPDMG,
  stormmaster$PROPDMGEXP
)
# Crop Damage
stormmaster$CROPDMG <- damage_in_usd(
  stormmaster$CROPDMG,
  stormmaster$CROPDMGEXP
)
#As we did above with Fatalities and Injuries, we will now aggregate the sum total of damages to property and crops
# Total property damage (USD) for every distinct event type
PROPDMG_aggregate <- aggregate(
  PROPDMG ~ EVTYPE,
  data = stormmaster,
  FUN = sum
)
# Keep only the event types with a positive property damage total
PROPDMG_aggregate_Non0 <- with(
  PROPDMG_aggregate,
  PROPDMG_aggregate[PROPDMG > 0, ]
)
# Rank event types from largest to smallest property damage
PROPDMG_aggregate_Non0_Order <- with(
  PROPDMG_aggregate_Non0,
  PROPDMG_aggregate_Non0[order(PROPDMG, decreasing = TRUE), ]
)
# Total crop damage (USD) for every distinct event type
CROPDMG_aggregate <- aggregate(
  CROPDMG ~ EVTYPE,
  data = stormmaster,
  FUN = sum
)
# Keep only the event types with a positive crop damage total
CROPDMG_aggregate_Non0 <- with(
  CROPDMG_aggregate,
  CROPDMG_aggregate[CROPDMG > 0, ]
)
# Rank event types from largest to smallest crop damage
CROPDMG_aggregate_Non0_Order <- with(
  CROPDMG_aggregate_Non0,
  CROPDMG_aggregate_Non0[order(CROPDMG, decreasing = TRUE), ]
)
#We will prepare a Barplot of top 10 reasons for damage to PROPERTY
# One bar per event type for the ten largest property damage totals; the
# legend maps each bar colour to its event type
barplot(
  height = PROPDMG_aggregate_Non0_Order$PROPDMG[seq_len(10)],
  col = seq_len(10),
  legend.text = PROPDMG_aggregate_Non0_Order$EVTYPE[seq_len(10)],
  ylab = "Damage to Property",
  main = "10 natural events causing most damage to properties"
)

#We will prepare a Barplot of top 10 reasons for damage to CROPS
# One bar per event type for the ten largest crop damage totals; the legend
# maps each bar colour to its event type
barplot(
  height = CROPDMG_aggregate_Non0_Order$CROPDMG[seq_len(10)],
  col = seq_len(10),
  legend.text = CROPDMG_aggregate_Non0_Order$EVTYPE[seq_len(10)],
  ylab = "Damage to Crops",
  main = "10 natural events causing most damage to Crops"
)

#Top events that have the greatest economic consequences are:
#1. Damage to Property - Flood and Hurricane/Typhoon
#2. Damage to Crops - Drought and Flood
#Occurrence of these natural events is beyond human control; however, studying the outcome of these events can help government and respective agencies to plan and allocate funds/resources in advance as per the anticipated damages.
#Credits: https://rpubs.com/matrod/rresearch2