Research

#This article was written as a homework project for the Coursera Reproducible Research course. The project aims to identify the natural events with the greatest impact on US population health and the greatest economic cost. The first aim was operationalized as the top 10 most injurious and most fatal event types in the dataset, while the second aim was operationalized as the top 10 event types with the highest combined property and crop damage.

Loading Packages and Dataset

library(ggplot2)
library(reshape)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.1     v purrr   0.3.2
## v tidyr   0.8.3     v dplyr   0.8.1
## v readr   1.3.1     v stringr 1.4.0
## v tibble  2.1.1     v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x tidyr::expand() masks reshape::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::rename() masks reshape::rename()
# Load the raw storm events table; the CSV is expected in the working directory.
data <- read.csv("repdata_data_StormData.csv")

# One row per distinct event type (EVTYPE), in order of first appearance.
smallerdata <- data.frame(EVS = unique(data$EVTYPE))

# Sum `values` within each event type and return the totals in the same row
# order as `smallerdata`.
#
# The totals are aligned by event NAME via match(). Subsetting the tapply()
# result directly with `smallerdata$EVS` is fragile: when EVS is a factor, `[`
# uses its integer codes rather than its labels, and when it is character an
# empty-string name never matches. as.vector() drops the names/dim attributes
# tapply() attaches, so every column ends up a plain numeric vector.
sum_by_event <- function(values) {
  totals <- tapply(values, data$EVTYPE, sum)
  as.vector(totals[match(as.character(smallerdata$EVS), names(totals))])
}

# Total fatalities recorded for each event type.
smallerdata$FATALITIES <- sum_by_event(data$FATALITIES)

# Total injuries recorded for each event type.
smallerdata$INJ <- sum_by_event(data$INJURIES)

# Combined health impact: injuries plus fatalities.
smallerdata$totalhealthissues <- smallerdata$INJ + smallerdata$FATALITIES

# Sum of the raw PROPDMG figures for each event type, divided by 1000.
# NOTE(review): no exponent/multiplier column is applied here, so these totals
# are presumably not true dollar amounts -- confirm against the dataset
# documentation before quoting them as money.
smallerdata$PROPDMG <- sum_by_event(data$PROPDMG) / 1000

# Sum of the raw CROPDMG figures for each event type, divided by 1000
# (same caveat as PROPDMG: no exponent/multiplier is applied).
smallerdata$CROPDMG <- sum_by_event(data$CROPDMG) / 1000

# Combined economic figure: property damage plus crop damage, on the same
# scale as the two columns above.
smallerdata$totalcosts <- smallerdata$PROPDMG + smallerdata$CROPDMG

Question1

#We order the events by number of fatalities in descending order and report the top 10, then do the same for the most injurious events.


# Return the ten rows of `smallerdata` with the largest values in `column`,
# largest first.
top_ten_by <- function(column) {
  ranking <- order(smallerdata[[column]], decreasing = TRUE)
  head(smallerdata[ranking, ], 10)
}

# Top 10 event types by fatalities, by injuries, and by overall health impact
# (fatalities + injuries).
mostfatal <- top_ten_by("FATALITIES")
mostinj <- top_ten_by("INJ")
mosttotalhealthissue <- top_ten_by("totalhealthissues")

# Pin the factor levels to descending total health impact so that ggplot draws
# the events in that order instead of its default level order.
health_levels <- mosttotalhealthissue$EVS[
  order(mosttotalhealthissue$totalhealthissues, decreasing = TRUE)
]
mosttotalhealthissue$EVS <- factor(
  mosttotalhealthissue$EVS,
  levels = health_levels
)

# Event types that appear in the top 10 most fatal OR the top 10 most injurious.
unimostinjrsorfatal <- union(mostfatal$EVS, mostinj$EVS)

unimostinjrsorfatal
##  [1] "TORNADO"           "EXCESSIVE HEAT"    "FLASH FLOOD"      
##  [4] "HEAT"              "LIGHTNING"         "TSTM WIND"        
##  [7] "FLOOD"             "RIP CURRENT"       "HIGH WIND"        
## [10] "AVALANCHE"         "ICE STORM"         "THUNDERSTORM WIND"
## [13] "HAIL"

Visuals-Question1

# Long-format table for the health bar chart. Only event types present in BOTH
# the top 10 most fatal and the top 10 most injurious lists are kept; the cost
# columns are carried along as id variables so that only the three health
# measures are melted.
intrsctmostinjandfatal <- intersect(mostfatal$EVS, mostinj$EVS)
in_both_lists <- mosttotalhealthissue$EVS %in% intrsctmostinjandfatal
meltedmostfatalinj <- melt(
  mosttotalhealthissue[in_both_lists, ],
  id.vars = c("EVS", "PROPDMG", "CROPDMG", "totalcosts")
)

# Dodged bar plot: one group of bars per event type, one bar per health measure.
ggplot(meltedmostfatalinj, aes(EVS, value)) +
  geom_bar(aes(fill = variable), position = "dodge", stat = "identity") +
  labs(
    title = "Top Most Weather Events Impacting Human Health in the USA",
    x = "Event",
    y = "Affected # of People"
  ) +
  scale_fill_discrete(
    name = "Variable",
    labels = c("Fatalities", "Injuries", "Total(Fatalities + Injuries)")
  )

#The graph shows that tornadoes have the highest impact, and that injuries are much more common than fatalities.

Question2

#To answer question 2, we look for the top 10 most economically costly events (i.e. the events with the highest crop_damage + property_damage costs) and plot them in a bar plot.

# Top 10 event types by combined (property + crop) damage figure, largest first.
cost_ranking <- order(smallerdata$totalcosts, decreasing = TRUE)
mosteconomicalcost <- smallerdata[head(cost_ranking, 10), ]
mosteconomicalcost
##                   EVS FATALITIES   INJ totalhealthissues   PROPDMG
## 1             TORNADO       5633 91346             96979 3212.2582
## 20        FLASH FLOOD        978  1777              2755 1420.1246
## 2           TSTM WIND        504  6957              7461 1335.9656
## 3                HAIL         15  1361              1376  688.6934
## 36              FLOOD        470  6789              7259  899.9385
## 16  THUNDERSTORM WIND        133  1488              1621  876.8442
## 15          LIGHTNING        816  5230              6046  603.3518
## 10 THUNDERSTORM WINDS         64   908               972  446.2932
## 46          HIGH WIND        248  1137              1385  324.7316
## 8        WINTER STORM        206  1321              1527  132.7206
##      CROPDMG totalcosts
## 1  100.01852  3312.2767
## 20 179.20046  1599.3251
## 2  109.20260  1445.1682
## 3  579.59628  1268.2897
## 36 168.03788  1067.9764
## 16  66.79145   943.6356
## 15   3.58061   606.9324
## 10  18.68493   464.9781
## 46  17.28321   342.0148
## 8    1.97899   134.6996

Question2 Visual

# Pin the event factor levels to descending total cost so the bars are drawn
# from the costliest event type downwards.
cost_levels <- mosteconomicalcost$EVS[
  order(mosteconomicalcost$totalcosts, decreasing = TRUE)
]
mosteconomicalcost$EVS <- factor(mosteconomicalcost$EVS, levels = cost_levels)

# Long format for ggplot: the health columns are kept as id variables so that
# only the three cost measures are melted into variable/value pairs.
meltedmostecocost <- melt(
  mosteconomicalcost,
  id.vars = c("EVS", "FATALITIES", "INJ", "totalhealthissues")
)

# Fix the order of the cost measures so it matches the legend labels below.
meltedmostecocost$variable <- factor(
  meltedmostecocost$variable,
  levels = c("CROPDMG", "PROPDMG", "totalcosts")
)

# Dodged bar plot: one group of bars per event type, one bar per cost measure.
ggplot(meltedmostecocost, aes(EVS, value)) +
  geom_bar(aes(fill = variable), position = "dodge", stat = "identity") +
  labs(
    title = "Top 10 Weather Events Impacting Economy in the USA",
    x = "Event",
    y = "Thousand US $ (Ignoring the EXPonential variables)"
  ) +
  scale_fill_discrete(
    name = "Variable",
    labels = c("Crop Losses", "Property Losses", "Total Losses(Crop + Property)")
  )

#We see again that tornadoes are the costliest weather events for the US, and that weather events clearly cause more property damage than crop damage.