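This report analyzes the U.S. National Oceanic and Atmospheric Administration (NOAA) storm database to determine which types of weather events are most harmful to population health and which have the greatest economic consequences.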
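Key findings: tornadoes are the most harmful events for population health (5,661 fatalities and 91,410 injuries), while floods are the costliest events (about 180 billion USD in combined property and crop damage).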
The analysis uses data from 1950 to 2011. It was carried out entirely in R, written up as an R Markdown file, and published on RPubs.
First, we need to load the data into RStudio. Since a link to the data is provided, I use R to download and load the data rather than downloading it manually. This makes the analysis easier to reproduce later on.
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
# Load the NOAA storm data into `storm_data`.
#
# An already-extracted CSV in the working directory is used if present.
# Otherwise the bzip2 archive is downloaded (once) and read directly:
# read.csv() opens compressed files transparently, so no external `bzip2`
# binary is required and the step also works on systems without one.
storm_url <- paste0(
  "https://d396qusza40orc.cloudfront.net/",
  "repdata%2Fdata%2FStormData.csv.bz2"
)
storm_csv <- "repdata_data_StormData.csv"
storm_bz2 <- "repdata_data_StormData.csv.bz2"

if (file.exists(storm_csv)) {
  storm_path <- storm_csv
} else {
  if (!file.exists(storm_bz2)) {
    # mode = "wb" keeps the archive byte-exact on every platform.
    download.file(storm_url, storm_bz2, mode = "wb")
  }
  storm_path <- storm_bz2
}

storm_data <- read.csv(storm_path)
# read.csv() errors on a missing file; also guard against an empty table.
stopifnot(nrow(storm_data) > 0)
We first loaded the libraries needed for this project. We then downloaded the data from the website and read it in as “storm_data”. Finally, we examined the structure of the data to guide the next steps.
# Standardise event labels now that storm_data is loaded.
# Labels are lower-cased first, then collapsed into broad categories.
# Patterns are tried top to bottom and the first match wins; labels that
# match none of them are kept as they are (in lower case).
storm_data <- storm_data %>%
  mutate(
    EVTYPE = tolower(EVTYPE),
    EVTYPE = case_when(
      grepl("tornado|funnel", EVTYPE) ~ "tornado",
      grepl("heat|hot", EVTYPE) ~ "heat",
      grepl("flood|flash", EVTYPE) ~ "flood",
      grepl("hurricane|typhoon", EVTYPE) ~ "hurricane",
      grepl("thunderstorm|tstm", EVTYPE) ~ "thunderstorm wind",
      grepl("winter|snow|ice|blizzard", EVTYPE) ~ "winter weather",
      TRUE ~ EVTYPE
    )
  )
# Scale a damage amount by its magnitude code.
#
# amount:    numeric vector of damage figures.
# magnitude: vector of magnitude codes; a code containing K, M or B (in
#            either case) multiplies the amount by 1e3, 1e6 or 1e9.
# Returns a numeric vector the same length as `amount`. Amounts with any
# other code (or none) are returned unscaled.
scale_damage <- function(amount, magnitude) {
  case_when(
    grepl("K", magnitude, ignore.case = TRUE) ~ amount * 1e3,
    grepl("M", magnitude, ignore.case = TRUE) ~ amount * 1e6,
    grepl("B", magnitude, ignore.case = TRUE) ~ amount * 1e9,
    TRUE ~ amount
  )
}

# Add numeric property and crop damage columns using the same rule for both.
storm_data <- storm_data %>%
  mutate(
    PROPDMG_NUM = scale_damage(PROPDMG, PROPDMGEXP),
    CROPDMG_NUM = scale_damage(CROPDMG, CROPDMGEXP)
  )
After looking at the structure of “storm_data”, we cleaned and formatted the data so that the names of the event types were standardized and the damage values from the events were converted to numeric amounts.
# Total fatalities and injuries per event type, sorted so that the event
# types with the most combined casualties come first.
health_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    fatalities = sum(FATALITIES, na.rm = TRUE),
    injuries = sum(INJURIES, na.rm = TRUE)
  ) %>%
  arrange(desc(fatalities + injuries))

# Show the five highest-ranked event types.
head(health_impact, 5)
## # A tibble: 5 × 3
## EVTYPE fatalities injuries
## <chr> <dbl> <dbl>
## 1 tornado 5661 91410
## 2 heat 3138 9224
## 3 thunderstorm wind 729 9544
## 4 flood 1525 8604
## 5 winter weather 645 6014
These are the five event types most harmful to population health.
# Keep the five highest-ranked event types and reshape to long form:
# one row per (event type, impact type) pair, ready for a grouped bar chart.
health_impact_top5 <- health_impact %>%
  head(5) %>%
  pivot_longer(
    cols = c(fatalities, injuries),
    names_to = "impact_type",
    values_to = "count"
  )
# Grouped bar chart of fatalities and injuries for the top five event types.
# Bars are ordered by descending count along the x axis.
ggplot(
  health_impact_top5,
  aes(x = reorder(EVTYPE, -count), y = count, fill = impact_type)
) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Top 5 Most Harmful Weather Events (1950 - 2011)",
    x = "Event Types",
    y = "Total Count",
    fill = "Impact Type"
  ) +
  # The theme is a plot layer; it must be added outside labs(), not to the
  # legend-title string.
  theme_minimal()
# Total property and crop damage per event type, with their sum, sorted so
# that the costliest event types come first.
economic_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    property_damage = sum(PROPDMG_NUM, na.rm = TRUE),
    crop_damage = sum(CROPDMG_NUM, na.rm = TRUE)
  ) %>%
  mutate(total_damage = property_damage + crop_damage) %>%
  arrange(desc(total_damage))

# Show the five costliest event types.
head(economic_impact, 5)
## # A tibble: 5 × 4
## EVTYPE property_damage crop_damage total_damage
## <chr> <dbl> <dbl> <dbl>
## 1 flood 167529740932. 12380109100 179909850032.
## 2 hurricane 85356410010 5516117800 90872527810
## 3 tornado 58593297629. 417461520 59010759149.
## 4 storm surge 43323536000 5000 43323541000
## 5 hail 15732267048. 3025954473 18758221521.
These are the five costliest event types.
# Keep the five costliest event types and reshape to long form:
# one row per (event type, damage type) pair, ready for a stacked bar chart.
economic_impact_top5 <- economic_impact %>%
  head(5) %>%
  pivot_longer(
    cols = c(property_damage, crop_damage),
    names_to = "damage_type",
    values_to = "cost"
  )
# Stacked bar chart of property and crop damage for the five costliest event
# types. Costs are divided by 1e9 so the y axis reads in billions of USD.
ggplot(
  economic_impact_top5,
  aes(x = reorder(EVTYPE, -cost), y = cost / 1e9, fill = damage_type)
) +
  # `position` was previously misspelled, so it was ignored with a warning.
  geom_bar(stat = "identity", position = "stack") +
  labs(
    title = "Top 5 Costliest Weather Events (1950-2011)",
    x = "Event Type",
    y = "Total Damage (Billions USD)",
    fill = "Damage Type"
  ) +
  theme_minimal()