# Global knitr chunk options: echo = TRUE and cache = TRUE are applied to
# every chunk of this document unless a chunk overrides them.
knitr::opts_chunk$set(echo = TRUE, cache= TRUE)
# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Adjuntando el paquete: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(gghighlight)
library(gt)
library(gtExtras)
library(cowplot)
##
## Adjuntando el paquete: 'cowplot'
##
## The following object is masked from 'package:gt':
##
## as_gtable
##
## The following object is masked from 'package:lubridate':
##
## stamp
This analysis explores the impact of severe weather events in the United States from 1951 to 2011 using the provided dataset. The primary goal is to determine which types of events have caused the most harm to public health (injuries and fatalities) and which have led to the greatest economic losses (property and crop damage). Additionally, this study identifies the states that have suffered the highest combined impact in both economic and health-related terms.
First, I downloaded the document directly from the URL provided on Coursera, and saved it in a file called “StormData.csv”.
I then loaded the file into R using the read_csv function and applied the “clean_names” function from the janitor package to make the variable names easier to work with.
Finally, I made some transformations to the data frame: converting some columns to factors and changing the type of the “bgn_date” and “end_date” columns from character to date.
# Download and save the file.
# The download is skipped when the file is already present, so re-running the
# document does not fetch the archive again. The local name is kept as
# "StormData.csv" because the loading step reads that exact path, even though
# the remote file is a .bz2 archive.
if (!file.exists("StormData.csv")) {
  download.file(
    url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    destfile = "StormData.csv",
    mode = "wb" # binary transfer: the payload is a compressed archive
  )
}
# Read the downloaded file into a data frame and clean its column names
storm <- clean_names(read_csv("StormData.csv"))
## Rows: 902297 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): BGN_DATE, BGN_TIME, TIME_ZONE, COUNTYNAME, STATE, EVTYPE, BGN_AZI,...
## dbl (18): STATE__, COUNTY, BGN_RANGE, COUNTY_END, END_RANGE, LENGTH, WIDTH, ...
## lgl (1): COUNTYENDN
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Some transformations in the data frame:
# - convert the selected columns to factors;
# - keep only the leading month/day/year part of the begin and end
#   timestamps and parse it as a Date (missing values stay NA).
storm <- storm %>%
  mutate(
    across(c(state, countyname, state_2, evtype), as.factor),
    across(
      c(bgn_date, end_date),
      function(x) {
        # Extract the "m/d/yyyy" prefix instead of slicing a fixed width, so
        # one- and two-digit months/days are handled the same way.
        parse_date(str_extract(x, "^\\d{1,2}/\\d{1,2}/\\d{4}"), "%m/%d/%Y")
      }
    )
  )
A: Tornadoes are undoubtedly the events that have affected the greatest number of people in terms of health, that is, injuries and deaths.
# Bar chart of the ten event types with the most injuries plus fatalities.
# gghighlight keeps only the bars whose total exceeds 75,000 highlighted.
storm %>%
  summarise(
    affected = sum(injuries + fatalities, na.rm = TRUE),
    .by = evtype
  ) %>%
  arrange(desc(affected)) %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = fct_inorder(evtype), y = affected, fill = evtype)) +
  geom_col(show.legend = FALSE) +
  gghighlight(max(affected) > 75000) +
  labs(
    x = "Types Of Events",
    y = "# People Affected",
    title = "Top 10 Harmful Events",
    subtitle = "to population health"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle = element_text(size = 10, hjust = 0.5),
    axis.text.x = element_text(size = 5.5)
  )
A: In total terms, floods had the greatest economic impact. However, if we split the impact into property and crop damage, we can see that, in terms of crops, river flooding and ice storms had the second and third greatest impact. On the other hand, tornadoes and hurricanes caused considerable property damage.
# First, convert damage multiplier variables

# Translate damage exponent codes into numeric multipliers.
#
# exp: vector of exponent codes (character or anything coercible to
#      character); matching is case-insensitive.
# Returns a numeric vector of the same length: 1e9 for "B", 1e6 for "M",
# 1e3 for "K" and 1 for every other value. Missing codes (NA) also map to 1
# so that multiplying a damage figure by the result never turns it into NA.
multiplit <- function(exp) {
  codes <- toupper(as.character(exp))
  multipliers <- c(B = 1e9, M = 1e6, K = 1e3)
  out <- unname(multipliers[codes])
  # Unknown, empty and missing codes all look up as NA; treat them as "x1".
  out[is.na(out)] <- 1
  out
}
# Replace both exponent-code columns with their numeric multipliers, then
# scale the raw property and crop damage figures by those multipliers.
storm <- storm %>%
  mutate(across(c(propdmgexp, cropdmgexp), multiplit)) %>%
  mutate(
    propdmg = propdmg * propdmgexp,
    cropdmg = cropdmg * cropdmgexp
  )
# Then, create a character vector with the 10 types of events that have
# produced the greatest total economic impact (property + crop damage).
# The two damage columns are summed separately: adding them row by row first
# would discard a row's property damage whenever its crop damage is NA
# (and vice versa).
econ_events <- storm %>%
  mutate(evtype = as.character(evtype)) %>%
  summarise(
    .by = evtype,
    damage = sum(propdmg, na.rm = TRUE) + sum(cropdmg, na.rm = TRUE)
  ) %>%
  arrange(desc(damage)) %>%
  slice_head(n = 10) %>%
  pull(evtype)
# Then use this vector to keep only those event types and build a gt table
# that splits the damage into Crop and Property and also shows the Total.
# All figures are divided by 1e6, i.e. expressed in millions.
storm %>%
  filter(evtype %in% econ_events) %>%
  summarise(
    .by = evtype,
    Crop = sum(cropdmg, na.rm = TRUE) / 1e6,
    Property = sum(propdmg, na.rm = TRUE) / 1e6
  ) %>%
  # Both sums use na.rm = TRUE, so the total needs no extra NA guard.
  mutate(Total = Crop + Property) %>%
  arrange(desc(Total)) %>%
  rename(`Event Type` = evtype) %>%
  gt() %>%
  gt_theme_538() %>%
  tab_header(title = "Top 10 Events With Greatest Economic Consequences") %>%
  # No fixed `domain`: the colour scale is derived from the values actually
  # in each column, so no cell falls outside the scale if the top 10 changes.
  data_color(columns = Crop, palette = "viridis", reverse = TRUE) %>%
  data_color(columns = Property, palette = "viridis", reverse = TRUE) %>%
  fmt_currency() %>%
  tab_caption(caption = "*figures in millions of dollars")
Top 10 Events With Greatest Economic Consequences | |||
Event Type | crop | Property | Total |
---|---|---|---|
FLOOD | $5,661.97 | $144,657.71 | $150,319.68 |
HURRICANE/TYPHOON | $2,607.87 | $69,305.84 | $71,913.71 |
TORNADO | $414.95 | $56,937.16 | $57,352.11 |
HAIL | $3,025.95 | $15,732.27 | $18,758.22 |
FLASH FLOOD | $1,421.32 | $16,140.86 | $17,562.18 |
HURRICANE | $2,741.91 | $11,868.32 | $14,610.23 |
RIVER FLOOD | $5,029.46 | $5,118.95 | $10,148.40 |
ICE STORM | $5,022.11 | $3,944.93 | $8,967.04 |
STORM SURGE/TIDE | $0.85 | $4,641.19 | $4,642.04 |
THUNDERSTORM WIND | $414.84 | $3,483.12 | $3,897.96 |
In this additional section, considering that the data set allows for a very detailed analysis, I only wanted to answer the following question:
Are the states most affected in terms of health the most affected economically?
A: Overall, the states most impacted economically also experience significant health impacts; however, only two states, Texas and Mississippi, appear in the top 5 for both categories.
# Health impact by state: horizontal bars of injuries plus fatalities,
# ordered by size, with states above 7,000 kept highlighted.
h_plot <- storm %>%
  summarise(
    affected = sum(injuries + fatalities, na.rm = TRUE),
    .by = state_2
  ) %>%
  filter(affected > 0) %>%
  arrange(affected) %>%
  ggplot(aes(x = fct_inorder(state_2), y = affected)) +
  geom_col(fill = "red") +
  coord_flip() +
  gghighlight(max(affected) > 7000) +
  labs(
    x = "State",
    y = "Injuries + Fatalities",
    title = "Health Impact by State"
  ) +
  theme_bw() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5))
# Economic impact by state: horizontal bars of property plus crop damage
# (divided by 1e6), ordered by size, with states above 11,000 highlighted.
# Property and crop damage are summed separately so that an NA in one column
# does not discard the other column's value for that row.
e_plot <- storm %>%
  summarise(
    .by = state_2,
    damage = (sum(propdmg, na.rm = TRUE) + sum(cropdmg, na.rm = TRUE)) / 1e6
  ) %>%
  filter(damage > 0) %>%
  arrange(damage) %>%
  ggplot(mapping = aes(x = fct_inorder(state_2), y = damage)) +
  geom_col(fill = "red") +
  coord_flip() +
  gghighlight(max(damage) > 11000) +
  ylab("Property and Crop Damage ($M)") +
  xlab("State") +
  ggtitle(label = "Economic Impact by State") +
  theme_bw() +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
# Show the economic and health charts side by side in a two-column grid
plot_grid(plotlist = list(e_plot, h_plot), ncol = 2)