#1. Introduction:
#Storms and other severe weather events cause many problems, ranging from fatalities and injuries to damage to crops and property. We will analyse the impact of these natural events on humans, crops and property.
#This project involves exploring the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
  
#The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.

#2. Data Processing:  
#Path of the csv file that we have used for analysis:
#https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2

#Load Data
#The data file is looked up in the current working directory instead of calling
#setwd() on a machine-specific path, so run this script from the folder that
#holds (or should receive) the storm data.
storm_csv <- "repdata-data-StormData.csv"
storm_bz2 <- paste0(storm_csv, ".bz2")
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

#Download the compressed file only when neither the extracted nor the compressed copy is already present
if (!file.exists(storm_csv) && !file.exists(storm_bz2)) {
  download.file(storm_url, destfile = storm_bz2, mode = "wb")
}

#Read Data and assign a variable for easy access (read.csv can read the .bz2 file directly, no manual extraction needed)
stormmaster <- read.csv(
  if (file.exists(storm_csv)) storm_csv else storm_bz2,
  header = TRUE
)

#We will check first few rows of storm data
head(stormmaster)
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL
##    EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO         0                                               0
## 2 TORNADO         0                                               0
## 3 TORNADO         0                                               0
## 4 TORNADO         0                                               0
## 5 TORNADO         0                                               0
## 6 TORNADO         0                                               0
##   COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1         NA         0                      14.0   100 3   0          0
## 2         NA         0                       2.0   150 2   0          0
## 3         NA         0                       0.1   123 2   0          0
## 4         NA         0                       0.0   100 2   0          0
## 5         NA         0                       0.0   150 2   0          0
## 6         NA         0                       1.5   177 2   0          0
##   INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1       15    25.0          K       0                                    
## 2        0     2.5          K       0                                    
## 3        2    25.0          K       0                                    
## 4        2     2.5          K       0                                    
## 5        2     2.5          K       0                                    
## 6        6     2.5          K       0                                    
##   LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1     3040      8812       3051       8806              1
## 2     3042      8755          0          0              2
## 3     3340      8742          0          0              3
## 4     3458      8626          0          0              4
## 5     3412      8642          0          0              5
## 6     3450      8748          0          0              6
#Check dimensions of data
dim(stormmaster)
## [1] 902297     37
#3. Results
#There are chiefly four types of damage caused by these events, viz. Fatalities, Injuries, Crop Damage and Property Damage, each of which is analysed per event type (column EVTYPE). Crop and Property damage need to be converted into USD ($) values first, as the data has been recorded with Billions, Millions and Thousands exponent codes. We will total damage in USD only after the exponent values have been converted.

#Human impact: we begin by aggregating Fatalities (Injuries are handled the same way)
#Sum FATALITIES for every event type (EVTYPE)
Fatal_aggregate <- aggregate(FATALITIES ~ EVTYPE, data = stormmaster, FUN = sum)

#Peek at the first rows of the per-event totals (note how many event types have 0 fatalities)
head(Fatal_aggregate, n = 6L)
##                  EVTYPE FATALITIES
## 1    HIGH SURF ADVISORY          0
## 2         COASTAL FLOOD          0
## 3           FLASH FLOOD          0
## 4             LIGHTNING          0
## 5             TSTM WIND          0
## 6       TSTM WIND (G45)          0
#Size of the aggregated FATALITIES table
dim(Fatal_aggregate)
## [1] 985   2
#Drop event types whose FATALITIES total is 0 so only relevant events remain
Fatal_aggregate_Non0 <- Fatal_aggregate[with(Fatal_aggregate, FATALITIES > 0), ]

#Size of the FATALITIES table once the 0 totals are gone
dim(Fatal_aggregate_Non0)
## [1] 168   2
#Rank the event types from most to fewest fatalities for easier comparison
Fatal_aggregate_Non0_Order <- Fatal_aggregate_Non0[
  with(Fatal_aggregate_Non0, order(FATALITIES, decreasing = TRUE)),
]

#The head of the ranked table shows the deadliest event types: Tornadoes and Excessive Heat are the top 2
head(Fatal_aggregate_Non0_Order, n = 6L)
##             EVTYPE FATALITIES
## 826        TORNADO       5633
## 124 EXCESSIVE HEAT       1903
## 151    FLASH FLOOD        978
## 271           HEAT        937
## 453      LIGHTNING        816
## 846      TSTM WIND        504
#Sum INJURIES for every event type (EVTYPE)
Injury_aggregate <- aggregate(INJURIES ~ EVTYPE, data = stormmaster, FUN = sum)

#Peek at the first rows of the per-event totals (note how many event types have 0 injuries)
head(Injury_aggregate, n = 6L)
##                  EVTYPE INJURIES
## 1    HIGH SURF ADVISORY        0
## 2         COASTAL FLOOD        0
## 3           FLASH FLOOD        0
## 4             LIGHTNING        0
## 5             TSTM WIND        0
## 6       TSTM WIND (G45)        0
#Size of the aggregated INJURIES table
dim(Injury_aggregate)
## [1] 985   2
#Drop event types whose INJURIES total is 0 so only relevant events remain
Injury_aggregate_Non0 <- Injury_aggregate[with(Injury_aggregate, INJURIES > 0), ]

#Size of the INJURIES table once the 0 totals are gone
dim(Injury_aggregate_Non0)
## [1] 158   2
#Rank the event types from most to fewest injuries for easier comparison
Injury_aggregate_Non0_Order <- Injury_aggregate_Non0[
  with(Injury_aggregate_Non0, order(INJURIES, decreasing = TRUE)),
]

#The head of the ranked table shows the second kind of human impact, injuries: the top 2 event types causing injuries are Tornado and Thunderstorm Wind (TSTM WIND)
head(Injury_aggregate_Non0_Order, n = 6L)
##             EVTYPE INJURIES
## 826        TORNADO    91346
## 846      TSTM WIND     6957
## 167          FLOOD     6789
## 124 EXCESSIVE HEAT     6525
## 453      LIGHTNING     5230
## 271           HEAT     2100
#Both Fatalities and Injuries are now shown as bar plots of the ten highest-ranked event types

#FATALITIES: bars follow the descending ranking, the legend maps each colour to its event type. The two tallest bars are Tornado and Excessive Heat
barplot(
  height = Fatal_aggregate_Non0_Order[["FATALITIES"]][seq_len(10)],
  col = seq_len(10),
  legend.text = Fatal_aggregate_Non0_Order[["EVTYPE"]][seq_len(10)],
  ylab = "Fatality",
  main = "Ten natural events causing most fatalities"
)

#INJURIES: same layout. The two tallest bars are Tornado and Thunderstorm Wind
barplot(
  height = Injury_aggregate_Non0_Order[["INJURIES"]][seq_len(10)],
  col = seq_len(10),
  legend.text = Injury_aggregate_Non0_Order[["EVTYPE"]][seq_len(10)],
  ylab = "Injury",
  main = "Ten natural events causing most Injuries"
)

#As an extra step we check how much the two top-10 rankings overlap, by intersecting the event types of both ranked tables. Of the top 10 event types for Fatalities and for Injuries, 7 appear in both lists. The two events most harmful to population health are Tornado and Excessive Heat
intersect(
  x = Fatal_aggregate_Non0_Order[["EVTYPE"]][seq_len(10)],
  y = Injury_aggregate_Non0_Order[["EVTYPE"]][seq_len(10)]
)
## [1] "TORNADO"        "EXCESSIVE HEAT" "FLASH FLOOD"    "HEAT"          
## [5] "LIGHTNING"      "TSTM WIND"      "FLOOD"
#Description and justification for any data transformations is given below:

#Damage amounts are stored as a number (PROPDMG / CROPDMG) plus a separate exponent code (PROPDMGEXP / CROPDMGEXP), so the numbers must be scaled to plain $ values before they can be summed. Point to note here: B|b stands for Billion, M|m stands for Million, K|k stands for Thousand

#Property Damages: First we will figure out the unique values in the header PROPDMGEXP. This will help us know which values are in exponent form
unique(stormmaster$PROPDMGEXP)
##  [1] K M   B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels:  + - 0 1 2 3 4 5 6 7 8 ? B H K M h m
#Crop Damages: First we will figure out the unique values in the header CROPDMGEXP. This will help us know which values are in exponent form
unique(stormmaster$CROPDMGEXP)
## [1]   M K m B ? 0 k 2
## Levels:  0 2 ? B K M k m
#Conversion of exponent codes (Billion, Million and Thousands) to USD.
#A single case-insensitive lookup is applied to both damage columns, so capital and small letters (k, K, b, B, m, M) are treated the same way for property and for crops.

#damage_multiplier: returns, for each exponent code, the factor that turns the recorded damage number into USD.
#  exponent - vector of exponent codes (factor or character)
#  returns  - numeric vector of the same length: 1e3 for K|k, 1e6 for M|m, 1e9 for B|b and 1 for
#             every other code (blank, digits, symbols, H|h, NA), i.e. those amounts stay as recorded
damage_multiplier <- function(exponent) {
  units <- c(K = 1e+03, M = 1e+06, B = 1e+09)
  #as.character() keeps the lookup by code (not by factor level number) when the column is a factor
  multiplier <- unname(units[toupper(as.character(exponent))])
  #Codes that are not K/M/B come back as NA from the lookup; leave those amounts unscaled
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

#Property Damage
stormmaster$PROPDMG <- stormmaster$PROPDMG * damage_multiplier(stormmaster$PROPDMGEXP)

#Crop Damage
stormmaster$CROPDMG <- stormmaster$CROPDMG * damage_multiplier(stormmaster$CROPDMGEXP)

#Same recipe as for Fatalities and Injuries: total the damage per event type, drop the 0 totals, then rank in descending order

#Sum of PROPERTY damage for every event type (EVTYPE)
PROPDMG_aggregate <- aggregate(PROPDMG ~ EVTYPE, data = stormmaster, FUN = sum)

#Drop event types with a 0 property damage total
PROPDMG_aggregate_Non0 <- PROPDMG_aggregate[with(PROPDMG_aggregate, PROPDMG > 0), ]

#Rank event types from largest to smallest property damage
PROPDMG_aggregate_Non0_Order <- PROPDMG_aggregate_Non0[
  with(PROPDMG_aggregate_Non0, order(PROPDMG, decreasing = TRUE)),
]

#Sum of CROP damage for every event type (EVTYPE)
CROPDMG_aggregate <- aggregate(CROPDMG ~ EVTYPE, data = stormmaster, FUN = sum)

#Drop event types with a 0 crop damage total
CROPDMG_aggregate_Non0 <- CROPDMG_aggregate[with(CROPDMG_aggregate, CROPDMG > 0), ]

#Rank event types from largest to smallest crop damage
CROPDMG_aggregate_Non0_Order <- CROPDMG_aggregate_Non0[
  with(CROPDMG_aggregate_Non0, order(CROPDMG, decreasing = TRUE)),
]

#Bar plot of the 10 event types with the largest PROPERTY damage totals (legend maps colours to event types)
barplot(
  height = PROPDMG_aggregate_Non0_Order[["PROPDMG"]][seq_len(10)],
  col = seq_len(10),
  legend.text = PROPDMG_aggregate_Non0_Order[["EVTYPE"]][seq_len(10)],
  ylab = "Damage to Property",
  main = "10 natural events causing most damage to properties"
)

#Bar plot of the 10 event types with the largest CROP damage totals (legend maps colours to event types)
barplot(
  height = CROPDMG_aggregate_Non0_Order[["CROPDMG"]][seq_len(10)],
  col = seq_len(10),
  legend.text = CROPDMG_aggregate_Non0_Order[["EVTYPE"]][seq_len(10)],
  ylab = "Damage to Crops",
  main = "10 natural events causing most damage to Crops"
)

#Top events that have the greatest economic consequences are:
#1. Damage to Property - Flood and Hurricane/Typhoon
#2. Damage to Crops - Drought and Flood

#The occurrence of these natural events is beyond human control; however, studying their outcomes can help government and the respective agencies plan and allocate funds/resources in advance according to the anticipated damage.

#Credits: https://rpubs.com/matrod/rresearch2