Storms, as severe natural events, have many consequences for economies and human health. It is interesting to quantify these consequences to get a better idea of how such events affect societies. In this analysis we use the NOAA (U.S. National Oceanic and Atmospheric Administration) storm database, which contains data collected on a regular basis from 1950 to 2011, with the specific goal of finding the effects of storms on the economy of the affected regions and on human health. The analysis is reproducible and is available in R Markdown below.
For this study, we read the data from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database.
library(knitr)
# Fetch the compressed NOAA storm data into the current working directory.
wd <- getwd()
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_archive <- file.path(wd, "noaa.csv.bz2")
# `zipfile` holds the status code of the download step (0 means success),
# not the data itself. The archive is only fetched when it is not already on
# disk, so re-running the analysis does not download it again; delete
# noaa.csv.bz2 to force a fresh download (e.g. after an interrupted transfer).
# mode = "wb" requests a binary transfer explicitly because the file is a
# bzip2 archive.
zipfile <- if (file.exists(storm_archive)) {
  0L
} else {
  download.file(storm_url, destfile = storm_archive, mode = "wb")
}
require(R.utils)
## Loading required package: R.utils
## Loading required package: R.oo
## Warning: package 'R.oo' was built under R version 3.4.4
## Loading required package: R.methodsS3
## R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.22.0 (2018-04-21) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, gc, load, save
## R.utils v2.6.0 (2017-11-04) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, parse, warnings
# Decompress the archive next to it; keep the .bz2 file and do nothing if
# noaa.csv already exists.
bunzip2(
  filename = "noaa.csv.bz2",
  destname = "noaa.csv",
  skip = TRUE,
  remove = FALSE
)
## [1] "noaa.csv"
## attr(,"temporary")
## [1] FALSE
# Load the decompressed CSV from the working directory and preview it.
maindataset <- read.csv(
  file = paste0(wd, "/noaa.csv"),
  sep = ","
)
head(maindataset)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
# List the column names of the raw dataset.
names(maindataset)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# For each field used later, print a summary and count the missing values.
### Check the FATALITIES field
summary(maindataset$FATALITIES)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0168 0.0000 583.0000
sum(is.na(maindataset$FATALITIES))
## [1] 0
### Check the INJURIES field
summary(maindataset$INJURIES)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1557 0.0000 1700.0000
sum(is.na(maindataset$INJURIES))
## [1] 0
### Check the PROPDMG field
summary(maindataset$PROPDMG)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 12.06 0.50 5000.00
sum(is.na(maindataset$PROPDMG))
## [1] 0
### Check the PROPDMGEXP field
summary(maindataset$PROPDMGEXP)
## - ? + 0 1 2 3 4 5
## 465934 1 8 5 216 25 13 4 4 28
## 6 7 8 B h H K m M
## 4 5 1 40 1 6 424665 7 11330
sum(is.na(maindataset$PROPDMGEXP))
## [1] 0
### Check the CROPDMG field
summary(maindataset$CROPDMG)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.527 0.000 990.000
sum(is.na(maindataset$CROPDMG))
## [1] 0
### Check the CROPDMGEXP field
summary(maindataset$CROPDMGEXP)
## ? 0 2 B k K m M
## 618413 7 19 1 9 21 281832 1 1994
sum(is.na(maindataset$CROPDMGEXP))
## [1] 0
Based on the field summaries above, the CROPDMGEXP and PROPDMGEXP fields need to be transformed. We use the mapvalues function to convert the codes in CROPDMGEXP and PROPDMGEXP to numeric values. These fields hold the power-of-ten magnitude of the values stored in the CROPDMG and PROPDMG fields.
require(plyr)
## Loading required package: plyr
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Convert a vector of damage-exponent codes (PROPDMGEXP / CROPDMGEXP) into
# numeric multipliers.
#
# One shared, case-insensitive table is used for both property and crop
# damage, so the two columns cannot drift apart:
#   "", "+", "?"      -> 1
#   "-"               -> 0
#   "H", "K", "M", "B" -> 10^2, 10^3, 10^6, 10^9
#   digit d ("0".."9") -> 10^d
# Every code present in the data keeps the multiplier it had before.
#
# @param exp_code Factor or character vector of exponent codes.
# @return Numeric vector of multipliers, same length as `exp_code`. A code
#   that is not in the table passes through mapvalues() unchanged and becomes
#   NA (with a coercion warning) in as.numeric().
damage_exp_multiplier <- function(exp_code) {
  codes <- c("", "+", "?", "-", "H", "K", "M", "B", as.character(0:9))
  multipliers <- c(1, 1, 1, 0, 10^2, 10^3, 10^6, 10^9, 10^(0:9))
  mapped <- mapvalues(
    toupper(as.character(exp_code)),
    from = codes,
    to = multipliers,
    # The shared table lists codes that a given column may not contain;
    # that is expected, so do not report them.
    warn_missing = FALSE
  )
  as.numeric(mapped)
}

tidystorm <- maindataset

# Property damage in dollars: mantissa times decoded multiplier.
tidystorm$PROPDMGEXP <- damage_exp_multiplier(tidystorm$PROPDMGEXP)
tidystorm$PROPDMGQUANT <- tidystorm$PROPDMG * tidystorm$PROPDMGEXP

# Crop damage in dollars, decoded the same way.
tidystorm$CROPDMGEXP <- damage_exp_multiplier(tidystorm$CROPDMGEXP)
tidystorm$CROPDMGQUANT <- tidystorm$CROPDMG * tidystorm$CROPDMGEXP

# Total economic damage per record.
tidystorm$DAMAGE <- tidystorm$PROPDMGQUANT + tidystorm$CROPDMGQUANT
# Ordered classification rules: names are the event categories, values are
# the case-sensitive regular expressions tested against EVTYPE. The FIRST
# matching rule wins, so the order below is significant.
#
# Fixes relative to the previous nested ifelse() chain:
#   * "LIGHTNING | LIGNTNING" and "FREEZING | Freezing" contained literal
#     spaces around "|", so e.g. the plain event type "LIGHTNING" never
#     matched and was counted as "OTHER". The spaces are removed.
#   * The FLOOD ("FLOOD | FLD "), SURGE ("SURGE") and TROPICAL STORM
#     ("TROPICAL +STORM") rules could never match because the earlier RAIN,
#     WAVE and TORNADO rules already capture every string they describe.
#     They are omitted here, which does not change any result.
#     NOTE(review): to report floods, surges or tropical storms separately,
#     add their rules ABOVE the RAIN / WAVE / TORNADO rules.
event_type_rules <- c(
  "AVALANCHE" = "AVALAN",
  "BLIZZARD" = "BLIZZARD",
  "ABNORMAL TEMP/HUMIDITY" = "ABNORMAL",
  "COLD/SNOW/ICE" =
    "COLD|Cold|Snow|Ice|COOL|SNOW|WINTER|WINTRY|SLEET|ICE|FREEZE|ICY",
  "FOG" = "FOG|VISIBILITY|DARK|DUST",
  "HEAT" = "HEAT|WARM|HOT|HIGH +TEMP|RECORD +TEMP|DRY",
  "RAIN" = "RAIN|FLOOD|WET|FLD|HURRICANE",
  "WIND" = "WIND",
  "VOLCANIC ACTIVITY" = "VOLC",
  "FREEZING" = "FREEZING|Freezing",
  "HAIL" = "HAIL",
  "DROUGHT" = "DROUGHT",
  "WAVE" = "WAVE|SURF|SURGE|TIDE|TSUNAMI|SWELL",
  "LIGHTNING" = "LIGHTNING|LIGNTNING",
  "MUD" = "MUD",
  "CURRENT" = "CURRENT",
  "THUNDER" = "THUNDER",
  "TORNADO" = "STORM|TORNADO|FUNNEL",
  "TSTM" = "TSTM",
  "FIRE" = "FIRE"
)

# Assign each event type string to the first category whose pattern matches.
#
# @param evtype Factor or character vector of raw event type names.
# @param rules Named character vector: names are categories, values are
#   regular expressions, in priority order.
# @param default Category used when no rule matches (also for NA input).
# @return Character vector of categories, same length as `evtype`.
classify_event_type <- function(evtype, rules = event_type_rules,
                                default = "OTHER") {
  evtype <- as.character(evtype)
  category <- rep(default, length(evtype))
  unassigned <- rep(TRUE, length(evtype))
  for (i in seq_along(rules)) {
    # Only rows not claimed by an earlier rule may be claimed by this one.
    hit <- unassigned & grepl(rules[[i]], evtype)
    category[hit] <- names(rules)[i]
    unassigned <- unassigned & !hit
  }
  category
}

storm_summary_type <- tidystorm %>%
  mutate(event_type = classify_event_type(EVTYPE))
## Warning: package 'bindrcpp' was built under R version 3.4.4
# Treat the event category as a factor, then total each outcome measure
# (fatalities, injuries, dollar damage) per category.
storm_summary_type[["event_type"]] <-
  as.factor(storm_summary_type[["event_type"]])
sum_fatalities <- aggregate(
  FATALITIES ~ event_type,
  data = storm_summary_type,
  FUN = sum,
  na.rm = TRUE
)
sum_injuries <- aggregate(
  INJURIES ~ event_type,
  data = storm_summary_type,
  FUN = sum,
  na.rm = TRUE
)
sum_damage <- aggregate(
  DAMAGE ~ event_type,
  data = storm_summary_type,
  FUN = sum,
  na.rm = TRUE
)
In this study we present three plots. The first plot shows the number of fatalities by type of disastrous event, the second shows the number of injuries per event type, and the third shows the total property and crop damage per event type across the United States.
library(ggplot2)
# Bar chart of total fatalities per event category; bars are also shaded by
# the fatality count, and the x-axis labels are rotated to stay readable.
fatalities <- ggplot(
  data = sum_fatalities,
  mapping = aes(x = event_type, y = FATALITIES, fill = FATALITIES)
)
fatalities +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 0)) +
  labs(
    title = "Effect of disatrous event on fatalities across United States",
    x = "Disaster Event Type",
    y = "Number of Fatalities Across US"
  )
library(ggplot2)
# Bar chart of total injuries per event category; bars are also shaded by
# the injury count, and the x-axis labels are rotated to stay readable.
injuries <- ggplot(
  data = sum_injuries,
  mapping = aes(x = event_type, y = INJURIES, fill = INJURIES)
)
injuries +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 0)) +
  labs(
    title = "Effect of disatrous event on Injuries across United States",
    x = "Disaster Event Type",
    y = "Number of Injuries Caused Across US"
  )
library(ggplot2)
# Bar chart of total property + crop damage per event category. Bar heights
# are DAMAGE divided by 1e9 (billions); the fill scale uses the undivided
# DAMAGE value.
damage <- ggplot(
  data = sum_damage,
  mapping = aes(x = event_type, y = DAMAGE / 1000000000, fill = DAMAGE)
)
damage +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 0)) +
  labs(
    title = "Damage Caused by each event in Billion Dollars",
    x = "Disaster Event Type",
    y = "Amount of damage per Billion Dollars"
  )
In this research we looked at the data collected by the U.S. National Oceanic and Atmospheric Administration from 1950 to 2011 on different disastrous natural events that occurred in the United States. The dataset provides three measures of the harm caused by each event: the number of fatalities, the number of injuries, and the amount of damage to crops and property. Since fatalities and injuries are two different measures, we looked at them separately in different graphs, while damage is taken as the total of property and crop damage. Based on the first graph, the three event types that caused the most fatalities are tornadoes, heat and rain. The event types that caused the most injuries are tornadoes, wind and rain. Interestingly, the rain category, followed by tornadoes and waves, caused the largest amount of damage to property and crops. Tornadoes are the most dangerous events for human health, and rain is the most destructive in terms of damage.