In this report, we analyse the NOAA storm database to determine which types of events are most harmful to population health and which have the greatest economic consequences in the U.S. between 1950 and November 2011.
Load the packages required for the data analysis.
library(dplyr)
library(tidyr)
library(ggplot2)
We obtained the U.S. storm database, covering 1950 to November 2011, from the NOAA Satellite and Information Service.
Documentation for the dataset can be found here.
# Local file name for the compressed storm data and the URL it comes from.
dataFile <- "repdata_data_StormData.csv.bz2"
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

# Download only when no local copy exists yet, so re-running the report
# does not fetch the file again.
if (!file.exists(dataFile)) {
  download.file(url = fileUrl, destfile = dataFile)
}

# Read the compressed CSV directly; empty fields become NA and text columns
# are kept as character vectors.
df <- read.csv(
  file = dataFile,
  na.strings = "",
  stringsAsFactors = FALSE
)
The NOAA dataset contains 37 variables, so we subset it to the variables relevant to this analysis.
# Show every column available in the raw data.
names(df)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# Keep only the event type, the casualty counts, and the damage
# amounts together with their exponent codes.
df <- df %>%
  select(
    EVTYPE,
    FATALITIES,
    INJURIES,
    PROPDMG,
    PROPDMGEXP,
    CROPDMG,
    CROPDMGEXP
  )
EVTYPE contains multiple ‘names’ representing the same event type. For example, Ice, Snow, Frost and Sleet all represent “Snow” in general, so we regroup EVTYPE into broader categories for better data analysis.
# Collapse the many raw EVTYPE spellings into broad categories.
#
# `evtype` is a character vector of raw event names. The result is a
# character vector of the same length in which every name matching a rule
# is replaced by that rule's label; names matching no rule are returned
# unchanged.
#
# Rules are case-insensitive regular expressions, tried in the order listed,
# and the FIRST matching rule wins. Every rule is matched against the
# original name rather than against values already rewritten by an earlier
# rule. This matters because the label "THUNDERSTORM" itself contains
# "STORM": re-scanning rewritten values would fold every thunderstorm event
# into the generic "STORM" group.
regroupEvtype <- function(evtype) {
  # names = category label, values = pattern selecting that category
  rules <- c(
    FLOOD        = "FLOOD",
    TORNADO      = "TORNADO",
    THUNDERSTORM = "TSTM|THUNDERSTORM",
    STORM        = "TROPICAL|STORM",
    HURRICANE    = "HURRICANE",
    SNOW         = "SLEET|ICE|FROST|SNOW",
    FOG          = "FOG",
    COLD         = "COLD|WINDCHILL|FREEZE|WINTER",
    HEAT         = "HEAT|WARM|HOT",
    CLOUD        = "CLOUD|FUNNEL",
    HAIL         = "HAIL",
    DROUGHT      = "DROUGHT|DRY",
    LIGHTNING    = "LIGHT",
    FIRE         = "FIRE",
    RAIN         = "RAIN|SHOWER",
    WATERSPOUT   = "WATERSPOUT",
    SURF         = "SURF",
    CURRENT      = "CURRENT",
    WIND         = "WIND|MICROBURST",
    BLIZZARD     = "BLIZZARD",
    LANDSLIDE    = "SLIDE",
    DUST         = "DUST"
  )

  regrouped <- evtype
  unassigned <- rep(TRUE, length(evtype))
  for (label in names(rules)) {
    # grepl() returns FALSE for NA, so missing event names stay NA.
    hits <- unassigned & grepl(rules[[label]], evtype, ignore.case = TRUE)
    regrouped[hits] <- label
    unassigned[hits] <- FALSE
  }
  regrouped
}

# Store the regrouped event type as a factor for grouping and plotting.
df$EVTYPE <- factor(regroupEvtype(df$EVTYPE))
Convert crop and property damage to full amounts by multiplying each value by the multiplier encoded in its _EXP column: K for thousand, M for million, B for billion.
# Translate damage exponent codes into numeric multipliers.
#
# `expCode` is a vector of exponent codes. "K", "M" and "B" (in either
# case) map to 1e3, 1e6 and 1e9. Every other code, including NA, maps to 1
# so the damage figure is kept as recorded.
#
# NA must be handled explicitly: blank codes are read in as NA, and an NA
# multiplier would turn the damage value into NA, which the later
# sum(..., na.rm = TRUE) would silently drop from the totals.
expMultiplier <- function(expCode) {
  multipliers <- c(K = 1e3, M = 1e6, B = 1e9)
  # Name lookup yields NA for unknown or missing codes.
  mult <- unname(multipliers[toupper(as.character(expCode))])
  mult[is.na(mult)] <- 1
  mult
}

# Replace the codes with their numeric multipliers, then scale the amounts.
df$PROPDMGEXP <- expMultiplier(df$PROPDMGEXP)
df$CROPDMGEXP <- expMultiplier(df$CROPDMGEXP)
df$PROPDMGVALUE <- df$PROPDMG * df$PROPDMGEXP
df$CROPDMGVALUE <- df$CROPDMG * df$CROPDMGEXP
Our data is ready and prepared for analysis.
Harm to population health is measured as the combined number of fatalities and injuries caused by each event type.
# Total fatalities and injuries per event type, ordered so that the event
# type with the most combined casualties comes first.
healthDamage <- df %>%
  group_by(EVTYPE) %>%
  summarise(
    FATALITIES = sum(FATALITIES, na.rm = TRUE),
    INJURIES = sum(INJURIES, na.rm = TRUE)
  ) %>%
  arrange(desc(FATALITIES + INJURIES))

# Take the ten worst event types and reshape to long form: one row per
# (event type, casualty type) pair, so the bars can be stacked by type.
mostHealthDamage <- healthDamage[seq_len(10), ]
mostHealthDamage <- gather(
  mostHealthDamage,
  key = "TYPE",
  value = "VALUE",
  FATALITIES:INJURIES
)

# Stacked bar chart, event types ordered from most to least harmful.
ggplot(
  mostHealthDamage,
  aes(x = reorder(EVTYPE, -VALUE), y = VALUE, fill = TYPE)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Top 10 Harmful Events to Population Health",
    x = "Event Type",
    y = "Count"
  ) +
  guides(fill = guide_legend(title = "Type"))
Economic impact is measured as the combined crop and property damage caused by each event type.
# Total property and crop damage per event type, ordered so that the event
# type with the largest combined damage comes first.
economicDamage <- df %>%
  group_by(EVTYPE) %>%
  summarise(
    PROPDMGVALUE = sum(PROPDMGVALUE, na.rm = TRUE),
    CROPDMGVALUE = sum(CROPDMGVALUE, na.rm = TRUE)
  ) %>%
  arrange(desc(PROPDMGVALUE + CROPDMGVALUE))

# Take the ten costliest event types and reshape to long form: one row per
# (event type, damage type) pair, so the bars can be stacked by type.
mostEconomicDamage <- economicDamage[seq_len(10), ]
mostEconomicDamage <- gather(
  mostEconomicDamage,
  key = "TYPE",
  value = "VALUE",
  PROPDMGVALUE:CROPDMGVALUE
)

# Pair each column name with its legend label explicitly instead of relying
# on the alphabetical order factor() would otherwise give the levels.
damageType <- factor(
  mostEconomicDamage$TYPE,
  levels = c("CROPDMGVALUE", "PROPDMGVALUE"),
  labels = c("crop Damage", "Property Damage")
)

# Stacked bar chart, event types ordered from most to least costly. The
# y-axis shows summed damage amounts, so it is labelled as money, not a count.
ggplot(
  mostEconomicDamage,
  aes(x = reorder(EVTYPE, -VALUE), y = VALUE, fill = damageType)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Economically Harmful Events",
    x = "Event Type",
    y = "Damage (USD)"
  ) +
  guides(fill = guide_legend(title = "Type"))