Research
#This article was written as a homework project for the Coursera Reproducible Research course. The project aims to identify the natural events that most affect US population health and those that carry the highest economic cost. The first aim was operationalized as the top 10 most injurious and most fatal events in the dataset, while the second aim was operationalized as the top 10 costliest events in terms of monetary damage to property and crops.
Loading Packages and Dataset
library(ggplot2)
library(reshape)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.1 v purrr 0.3.2
## v tidyr 0.8.3 v dplyr 0.8.1
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x tidyr::expand() masks reshape::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::rename() masks reshape::rename()
# Load the raw storm-event records (one row per recorded event).
data <- read.csv("repdata_data_StormData.csv")

# Sum `values` within each event type in `groups` and return the totals as a
# plain numeric vector aligned one-to-one with `events`.
#   values: numeric vector to aggregate (one entry per raw record)
#   groups: event type of each raw record (factor or character)
#   events: the event types, in the order the totals should be returned
# The totals are looked up by event NAME through match(), not by indexing the
# tapply() result with `events` directly: direct indexing is positional when
# `events` is a factor and cannot match an empty-string name when it is
# character. as.vector() drops the 1-d array dim/names that tapply() returns,
# so the data frame columns are ordinary numeric vectors.
sum_by_event <- function(values, groups, events) {
  totals <- tapply(values, groups, sum)
  as.vector(totals[match(as.character(events), names(totals))])
}

# One row per distinct event type, in order of first appearance in the data.
smallerdata <- data.frame(EVS = unique(data$EVTYPE))
# Total fatalities attributed to each event type.
smallerdata$FATALITIES <- sum_by_event(data$FATALITIES, data$EVTYPE, smallerdata$EVS)
# Total injuries attributed to each event type.
smallerdata$INJ <- sum_by_event(data$INJURIES, data$EVTYPE, smallerdata$EVS)
# Combined health impact: injuries + fatalities.
smallerdata$totalhealthissues <- smallerdata$INJ + smallerdata$FATALITIES
# Summed property-damage figures per event type, divided by 1000.
# NOTE(review): the raw PROPDMG/CROPDMG values are summed as-is; any
# magnitude/exponent columns in the dataset are not applied, so the resulting
# unit ("thousand dollars" in the original comments) should be confirmed.
smallerdata$PROPDMG <- sum_by_event(data$PROPDMG, data$EVTYPE, smallerdata$EVS) / 1000
# Summed crop-damage figures per event type, divided by 1000 (same caveat).
smallerdata$CROPDMG <- sum_by_event(data$CROPDMG, data$EVTYPE, smallerdata$EVS) / 1000
# Combined economic impact: property damage + crop damage (same scale as above).
smallerdata$totalcosts <- smallerdata$PROPDMG + smallerdata$CROPDMG
Question1
#We order the events by fatalities in descending order and report the top 10, followed by the most injurious events in the same manner.
# Return the ten rows of `events` with the largest values in `column`,
# sorted from largest to smallest.
top_ten_events <- function(events, column) {
  ranking <- order(events[[column]], decreasing = TRUE)
  head(events[ranking, ], 10)
}

# Ten event types with the most fatalities.
mostfatal <- top_ten_events(smallerdata, "FATALITIES")
# Ten event types with the most injuries.
mostinj <- top_ten_events(smallerdata, "INJ")
# Ten event types with the largest combined toll (fatalities + injuries).
mosttotalhealthissue <- top_ten_events(smallerdata, "totalhealthissues")

# Pin the factor levels of EVS to the descending combined-toll order so that
# ggplot draws the events in that order instead of re-sorting them.
mosttotalhealthissue$EVS <- factor(
  mosttotalhealthissue$EVS,
  levels = with(
    mosttotalhealthissue,
    EVS[order(totalhealthissues, decreasing = TRUE)]
  )
)

# Every event type that appears in the fatal top 10, the injury top 10, or both.
unimostinjrsorfatal <- union(mostfatal[, 1], mostinj[, 1])
unimostinjrsorfatal
## [1] "TORNADO" "EXCESSIVE HEAT" "FLASH FLOOD"
## [4] "HEAT" "LIGHTNING" "TSTM WIND"
## [7] "FLOOD" "RIP CURRENT" "HIGH WIND"
## [10] "AVALANCHE" "ICE STORM" "THUNDERSTORM WIND"
## [13] "HAIL"
Visuals-Question1
# Event types present in BOTH the fatal top 10 and the injury top 10.
intrsctmostinjandfatal <- intersect(mostfatal[, 1], mostinj[, 1])

# Restrict the combined-toll ranking to those shared events and melt it to long
# format: the cost columns ride along as identifiers, leaving the three health
# measures (fatalities, injuries, total) stacked in `variable` / `value`.
meltedmostfatalinj <- melt(
  subset(mosttotalhealthissue, EVS %in% intrsctmostinjandfatal),
  id.vars = c("EVS", "PROPDMG", "CROPDMG", "totalcosts")
)

# Grouped bar chart: one cluster of bars per event, one bar per health measure.
ggplot(meltedmostfatalinj, aes(x = EVS, y = value, fill = variable)) +
  geom_col(position = "dodge") +
  labs(
    title = "Top Most Weather Events Impacting Human Health in the USA",
    x = "Event",
    y = "Affected # of People"
  ) +
  scale_fill_discrete(
    name = "Variable",
    labels = c("Fatalities", "Injuries", "Total(Fatalities + Injuries)")
  )

#The graph shows that tornadoes have the highest impact, and that injuries are much more common than fatalities.
Question2
#To answer question 2, we look for the top 10 most economically costly events (i.e. the events with the highest crop_damage + property_damage costs) and plot them in a clear barplot.
# Pick the ten event types with the largest combined (property + crop) damage:
# rank all rows by totalcosts, keep the first ten positions, then pull those rows.
mosteconomicalcost <- smallerdata[
  head(order(smallerdata$totalcosts, decreasing = TRUE), 10),
]
# Display the resulting table.
mosteconomicalcost
## EVS FATALITIES INJ totalhealthissues PROPDMG
## 1 TORNADO 5633 91346 96979 3212.2582
## 20 FLASH FLOOD 978 1777 2755 1420.1246
## 2 TSTM WIND 504 6957 7461 1335.9656
## 3 HAIL 15 1361 1376 688.6934
## 36 FLOOD 470 6789 7259 899.9385
## 16 THUNDERSTORM WIND 133 1488 1621 876.8442
## 15 LIGHTNING 816 5230 6046 603.3518
## 10 THUNDERSTORM WINDS 64 908 972 446.2932
## 46 HIGH WIND 248 1137 1385 324.7316
## 8 WINTER STORM 206 1321 1527 132.7206
## CROPDMG totalcosts
## 1 100.01852 3312.2767
## 20 179.20046 1599.3251
## 2 109.20260 1445.1682
## 3 579.59628 1268.2897
## 36 168.03788 1067.9764
## 16 66.79145 943.6356
## 15 3.58061 606.9324
## 10 18.68493 464.9781
## 46 17.28321 342.0148
## 8 1.97899 134.6996
Question2 Visual
# Pin the factor levels of EVS to the descending total-cost order so that
# ggplot keeps the events in that order on the x axis.
mosteconomicalcost$EVS <- factor(
  mosteconomicalcost$EVS,
  levels = with(mosteconomicalcost, EVS[order(totalcosts, decreasing = TRUE)])
)

# Melt to long format: the health columns ride along as identifiers, leaving
# the three cost measures stacked in `variable` / `value`.
meltedmostecocost <- melt(
  mosteconomicalcost,
  id.vars = c("EVS", "FATALITIES", "INJ", "totalhealthissues")
)

# Order the cost measures crop -> property -> total so the bars (and the legend
# labels supplied below) appear in that sequence.
meltedmostecocost$variable <- factor(
  meltedmostecocost$variable,
  levels = c("CROPDMG", "PROPDMG", "totalcosts")
)

# Grouped bar chart: one cluster of bars per event, one bar per cost measure.
ggplot(meltedmostecocost, aes(x = EVS, y = value, fill = variable)) +
  geom_col(position = "dodge") +
  labs(
    title = "Top 10 Weather Events Impacting Economy in the USA",
    x = "Event",
    y = "Thousand US $ (Ignoring the EXPonential variables)"
  ) +
  scale_fill_discrete(
    name = "Variable",
    labels = c("Crop Losses", "Property Losses", "Total Losses(Crop + Property)")
  )

#We see again that tornadoes are the costliest weather events in the US, and that weather events clearly cause more property damage than crop damage.