author: "Katharhy"
date: "6/29/2020"
Historical United States weather data were used to assess the harm to population health and the economic damage, in dollars, caused by different types of weather events. The data come from the United States National Oceanic and Atmospheric Administration (NOAA) storm database, which covers 1950 to November 2011. In the analysis below, the impact on population health is estimated from the fatality and injury variables, and the economic impact is measured from the property-damage and crop-damage variables. Losses are then converted to dollar amounts so that events can be compared in financial terms. The evaluation indicates that hurricanes and tornadoes cause significant harm to the population and significant economic damage.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.0 (2020-02-14 07:10:20 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.23.0 successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
##
## throw
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, load, save
## R.utils v2.9.2 successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
## warnings
# Download the NOAA storm dataset only when no local copy exists, so that
# repeated runs reuse the file already on disk.
if (!file.exists("StormData.csv.bz2")) {
  urlfile <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
  download.file(urlfile, destfile = "StormData.csv.bz2", method = "curl")
}
# Read the compressed CSV into `stormdata`, keeping text columns as character
# vectors (spelled FALSE, because the alias `F` can be reassigned), and
# preview the first rows.
stormdata <- read.csv("StormData.csv.bz2", stringsAsFactors = FALSE)
head(stormdata)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1 0 0 NA
## 2 0 0 NA
## 3 0 0 NA
## 4 0 0 NA
## 5 0 0 NA
## 6 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1 0 14.0 100 3 0 0 15 25.0
## 2 0 2.0 150 2 0 0 0 2.5
## 3 0 0.1 123 2 0 0 2 25.0
## 4 0 0.0 100 2 0 0 2 2.5
## 5 0 0.0 150 2 0 0 2 2.5
## 6 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1 K 0 3040 8812
## 2 K 0 3042 8755
## 3 K 0 3340 8742
## 4 K 0 3458 8626
## 5 K 0 3412 8642
## 6 K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3051 8806 1
## 2 0 0 2
## 3 0 0 3
## 4 0 0 4
## 5 0 0 5
## 6 0 0 6
# Keep only the columns needed for the health and economic analyses.
# They are selected by name (they are columns 8 and 23:28 of the raw file)
# so the subset stays correct even if the column order ever changes.
health_impact <- stormdata[, c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]
# Per-record fatalities and injuries by event type; both are aggregated
# later when the plots are built.
fatality <- select(health_impact, EVTYPE, FATALITIES)
injury <- select(health_impact, EVTYPE, INJURIES)
# Count how often each property-damage magnitude code (PROPDMGEXP) occurs,
# most frequent code first.
propdmg1 <- arrange(count(health_impact, PROPDMGEXP), desc(n))
# Drop the records that carry no magnitude code. The blank code is removed
# explicitly instead of assuming it is always the first (most frequent) row.
propdmg1 <- filter(propdmg1, trimws(PROPDMGEXP) != "")
# Turn off scientific notation for the numbers formatted below.
options(scipen = 999)
# Share (in percent) of each remaining magnitude code.
propdmg1$percent <- as.numeric(
  format((propdmg1$n / sum(propdmg1$n)) * 100, digits = 2)
)
# Combined share of the codes used in the cost analysis: K, M and B.
spropdmg1 <- sum(propdmg1$percent[propdmg1$PROPDMGEXP %in% c("K", "M", "B")])
4.1. Processing crop damage (cropd) data.
# Count how often each crop-damage magnitude code (CROPDMGEXP) occurs in the
# subset built during the health impact data processing, most frequent first.
cropd <- arrange(count(health_impact, CROPDMGEXP), desc(n))
# Drop the records that carry no magnitude code. The blank code is removed
# explicitly instead of assuming it is always the first (most frequent) row.
cropd <- filter(cropd, trimws(CROPDMGEXP) != "")
# Share (in percent) of each remaining magnitude code.
cropd$percent <- as.numeric(
  format((cropd$n / sum(cropd$n)) * 100, digits = 2)
)
# Combined share of the codes used in the cost analysis: K, M and B.
scropd <- sum(cropd$percent[cropd$CROPDMGEXP %in% c("K", "M", "B")])
4.2. Property damage (PROPDMG) in dollar (K, M, B) amounts.
# Property-damage cost table: one row per record whose magnitude code is
# K, M or B, with the damage converted to dollars in `Cost`.
# The multipliers live in a lookup table, so an unexpected code can never be
# silently treated as billions.
prop_multiplier <- c(K = 1e3, M = 1e6, B = 1e9)
# NOTE(review): the raw codes are assumed to appear occasionally in lower
# case ("k", "m"); upper-casing first keeps those records. Confirm with
# table(health_impact$PROPDMGEXP).
eco1 <- health_impact %>%
  select(EVTYPE, PROPDMG, PROPDMGEXP) %>%
  mutate(PROPDMGEXP = toupper(PROPDMGEXP)) %>%
  filter(PROPDMGEXP %in% names(prop_multiplier))
eco1$Cost <- eco1$PROPDMG * unname(prop_multiplier[eco1$PROPDMGEXP])
4.3. Crop damage (CROPDMG) in dollar (K, M, B) amounts.
# Crop-damage cost table: one row per record whose magnitude code is
# K, M or B, with the damage converted to dollars in `Cost`.
# The multipliers live in a lookup table, so an unexpected code can never be
# silently treated as billions.
crop_multiplier <- c(K = 1e3, M = 1e6, B = 1e9)
# NOTE(review): the raw codes are assumed to appear occasionally in lower
# case ("k", "m"); upper-casing first keeps those records. Confirm with
# table(health_impact$CROPDMGEXP).
eco2 <- health_impact %>%
  select(EVTYPE, CROPDMG, CROPDMGEXP) %>%
  mutate(CROPDMGEXP = toupper(CROPDMGEXP)) %>%
  filter(CROPDMGEXP %in% names(crop_multiplier))
eco2$Cost <- eco2$CROPDMG * unname(crop_multiplier[eco2$CROPDMGEXP])
# Most harmful events for population health, part 1: fatalities.
# Uses the "fatality" subset created in the health impact data processing.
# Total the fatalities per event type once and sort by that same column,
# largest first. The column keeps the name `sum(FATALITIES)`.
most_impact <- fatality %>%
  group_by(EVTYPE) %>%
  summarise(`sum(FATALITIES)` = sum(FATALITIES)) %>%
  arrange(desc(`sum(FATALITIES)`))
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Keep the 20 event types with the most fatalities (most_impact is already
# sorted in decreasing order).
top_impact <- head(most_impact, 20)
colnames(top_impact) <- c("EVTYPE", "FATALITIES")
# Order the factor levels by fatalities so the bars are drawn largest first
# instead of alphabetically.
top_impact$EVTYPE <- factor(
  top_impact$EVTYPE,
  levels = top_impact$EVTYPE[order(-top_impact$FATALITIES)]
)
# Fatalities bar chart (gft). The bar fill is a fixed colour set in
# geom_bar(); the stray `fill =` argument to ggplot() was ignored and has
# been removed.
gft <- ggplot(top_impact, aes(EVTYPE, FATALITIES)) +
  geom_bar(stat = "identity", color = "green", fill = "darkgray") +
  labs(title = "Fatalities by Event Type", x = "Event Type", y = "Fatalities") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
    axis.text.x = element_text(angle = 70, hjust = 1, vjust = 0.5, size = 8),
    axis.title.x = element_text(size = 7),
    axis.title.y = element_text(size = 9)
  )
# Most harmful events for population health, part 2: injuries (inj).
# Uses the "injury" subset created in the health impact data processing.
# Total the injuries per event type once and sort by that same column,
# largest first. The column keeps the name `sum(INJURIES)`.
most_inj <- injury %>%
  group_by(EVTYPE) %>%
  summarise(`sum(INJURIES)` = sum(INJURIES)) %>%
  arrange(desc(`sum(INJURIES)`))
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Keep the 20 event types with the most injuries (most_inj is already
# sorted in decreasing order).
top_inj <- head(most_inj, 20)
colnames(top_inj) <- c("EVTYPE", "INJURIES")
# Order the factor levels by injuries so the bars are drawn largest first
# instead of alphabetically.
top_inj$EVTYPE <- factor(
  top_inj$EVTYPE,
  levels = top_inj$EVTYPE[order(-top_inj$INJURIES)]
)
# Injuries bar chart (ginj). The bar fill is a fixed colour set in
# geom_bar(); the stray `fill =` argument to ggplot() was ignored and has
# been removed.
ginj <- ggplot(top_inj, aes(EVTYPE, INJURIES)) +
  geom_bar(stat = "identity", color = "green", fill = "darkgray") +
  labs(title = "Injuries by Event Type", x = "Event Type", y = "Injuries") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
    axis.text.x = element_text(angle = 70, hjust = 1, vjust = 0.5, size = 8),
    axis.title.x = element_text(size = 7),
    axis.title.y = element_text(size = 9)
  )
# Show the fatalities and injuries charts side by side.
grid.arrange(gft, ginj, ncol = 2)
4.1. Across the United States, which types of events have the greatest economic consequences?
# Economic consequences: property damage (PROPDMG = pd) and crop damage
# (CROPDMG = cd). Both use the cost tables "eco1" and "eco2" built in the
# economic data processing, each with a dollar-valued "Cost" column.

# Total the dollar cost per event type, largest first.
# `eco` needs the columns EVTYPE and Cost. The result has the columns
# EVTYPE and `sum(Cost)`.
total_cost_by_event <- function(eco) {
  eco %>%
    group_by(EVTYPE) %>%
    summarise(`sum(Cost)` = sum(Cost)) %>%
    arrange(desc(`sum(Cost)`))
}

# Keep the `n` costliest event types from an already sorted totals table and
# express their cost in billions of dollars (1e9, matching the axis labels).
# EVTYPE becomes a factor ordered by cost so bars are drawn largest first.
top_cost_in_billions <- function(totals, n = 20) {
  top <- head(totals, n)
  colnames(top) <- c("EVTYPE", "Cost")
  top$Cost <- top$Cost / 1e9
  top$EVTYPE <- factor(top$EVTYPE, levels = top$EVTYPE[order(-top$Cost)])
  top
}

# Bar chart of cost per event type. The 70-degree rotation is applied to the
# tick labels (axis.text.x) so the event names stay readable.
plot_cost_by_event <- function(top, title, y_label) {
  ggplot(top, aes(EVTYPE, Cost)) +
    geom_bar(stat = "identity", color = "blue", fill = "steelblue") +
    labs(title = title, x = "Event Type", y = y_label) +
    theme(
      plot.title = element_text(hjust = 0.5, size = 8, face = "bold"),
      axis.text.x = element_text(
        angle = 70, hjust = 1, vjust = 0.5, size = 7
      ),
      axis.title.x = element_text(size = 8),
      axis.title.y = element_text(size = 9)
    )
}

# Property damage: totals per event type, then the top 20 in billions.
pd <- total_cost_by_event(eco1)
## `summarise()` ungrouping output (override with `.groups` argument)
pd1 <- top_cost_in_billions(pd)
gpd <- plot_cost_by_event(
  pd1,
  title = "Property damage by Event Type in dollars",
  y_label = "Property Cost in Billion Dollars"
)

# Crop damage: totals per event type, then the top 20 in billions.
cd <- total_cost_by_event(eco2)
## `summarise()` ungrouping output (override with `.groups` argument)
cd1 <- top_cost_in_billions(cd)
# The crop chart is built from cd1, the crop table, not from pd1.
gcd <- plot_cost_by_event(
  cd1,
  title = "Crop damage by Event Type in dollars",
  y_label = "Crop Cost in Billion Dollars"
)

# Show the property and crop charts side by side.
grid.arrange(gpd, gcd, ncol = 2)
Here, property and crop damage are considered together to determine which events are most harmful to the economy.
# Stack the complete per-event totals for property (pd) and crop (cd) damage.
# The full tables are used rather than the two top-20 extracts, so an event
# that ranks highly for only one kind of damage still contributes both parts
# of its cost.
pd_cd <- rbind(pd, cd)
colnames(pd_cd) <- c("EVTYPE", "Cost")
# Total cost per event type across both kinds of damage, largest first.
pd_cd1 <- pd_cd %>%
  group_by(EVTYPE) %>%
  summarise(`sum(Cost)` = sum(Cost)) %>%
  arrange(desc(`sum(Cost)`))
## `summarise()` ungrouping output (override with `.groups` argument)
# Keep the top 20 events and express their cost in billions of dollars.
pd_cd2 <- head(pd_cd1, 20)
colnames(pd_cd2) <- c("EVTYPE", "Cost")
pd_cd2$Cost <- pd_cd2$Cost / 1e9
# Order the factor levels by cost so the bars are drawn largest first.
pd_cd2$EVTYPE <- factor(
  pd_cd2$EVTYPE,
  levels = pd_cd2$EVTYPE[order(-pd_cd2$Cost)]
)
# Combined graph. The y-axis label states the unit actually plotted.
gpd_cd <- ggplot(pd_cd2, aes(EVTYPE, Cost)) +
  geom_bar(stat = "identity", col = "gray", fill = "white") +
  labs(
    title = "Combine Property and Crop Damage by Event",
    x = "Event Type",
    y = "Damage Cost in Billion Dollars"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
    axis.text.x = element_text(angle = 75, hjust = 1, vjust = 0.5, size = 7),
    axis.title.x = element_text(size = 8),
    axis.title.y = element_text(size = 9)
  )
print(gpd_cd)