# Global chunk options: echo the code in the report and cache chunk results
knitr::opts_chunk$set(
  cache = TRUE,
  echo = TRUE
)
# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Adjuntando el paquete: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(gghighlight)
library(gt)
library(gtExtras)
library(cowplot)
## 
## Adjuntando el paquete: 'cowplot'
## 
## The following object is masked from 'package:gt':
## 
##     as_gtable
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp

Synopsis

This analysis explores the impact of severe weather events in the United States from 1950 to 2011 using the provided dataset. The primary goal is to determine which types of events have caused the most harm to public health (injuries and fatalities) and which have led to the greatest economic losses (property and crop damage). Additionally, this study identifies the states that have suffered the highest combined impact in both economic and health-related terms.

Data Processing

First, I downloaded the document directly from the URL provided on Coursera, and saved it in a file called “StormData.csv”.

I then loaded the file into R Studio using the read_csv function and also used the “clean_names” function from the janitor library to better manage the variables.

Finally, I made some transformations to the data frame: converting some columns to factors, and changing the type of the “bgn_date” and “end_date” columns from character to date.

# Download and save the file.
# The download is skipped when a local copy already exists, so re-running the
# report does not fetch the archive again. The remote file is
# bzip2-compressed, hence mode = "wb" to keep the binary payload intact on
# every platform.
if (!file.exists("StormData.csv")) {
  download.file(
    url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    destfile = "StormData.csv",
    mode = "wb"
  )
}

# Read the downloaded file and normalise its column names to snake_case
storm <- clean_names(read_csv("StormData.csv"))
## Rows: 902297 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): BGN_DATE, BGN_TIME, TIME_ZONE, COUNTYNAME, STATE, EVTYPE, BGN_AZI,...
## dbl (18): STATE__, COUNTY, BGN_RANGE, COUNTY_END, END_RANGE, LENGTH, WIDTH, ...
## lgl  (1): COUNTYENDN
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Some transformations in the dataframe:
#   * the identifier-like columns become factors;
#   * bgn_date / end_date keep only the date part (everything from the first
#     space onwards, i.e. the time, is removed) and are parsed as
#     month/day/year dates. Cutting at the first space works whatever the
#     length of the date or the value of the time part.
storm <- storm %>%
  mutate(
    across(c(state, countyname, state_2, evtype), as.factor),
    across(
      c(bgn_date, end_date),
      function(x) parse_date(str_remove(x, " .*"), "%m/%d/%Y")
    )
  )

Results

  1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

A: Tornadoes are undoubtedly the events that have affected the greatest number of people in terms of health, that is, injuries and deaths.

# Bar chart of the 10 event types with the most injuries + fatalities.
# Bars are ordered from most to least harmful (fct_inorder keeps the sorted
# row order) and gghighlight emphasises the event types above 75000 people.
storm %>%
  summarise(
    affected = sum(injuries + fatalities, na.rm = TRUE),
    .by = evtype
  ) %>%
  arrange(desc(affected)) %>%
  head(10) %>%
  ggplot(mapping = aes(x = fct_inorder(evtype), y = affected, fill = evtype)) +
  geom_col(show.legend = FALSE) +
  gghighlight(max(affected) > 75000) +
  xlab("Types Of Events") +
  ylab("# People Affected") +
  ggtitle(
    label = "Top 10 Harmful Events",
    subtitle = "to population health"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 5.5),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 10, hjust = 0.5)
  )

  2. Across the United States, which types of events have the greatest economic consequences?

A: In total terms, floods had the greatest economic impact. However, if we split the impact into property and crop damage, river floods and ice storms had the second- and third-greatest impact on crops, while tornadoes and hurricanes caused substantial property damage.

# First, convert damage multiplier variables

# Translate damage exponent codes into numeric multipliers.
#
# exp: vector of exponent codes (case-insensitive): "K" = thousands,
#      "M" = millions, "B" = billions.
# Returns a numeric vector the same length as `exp`. Any other code,
# including a missing one (NA), maps to 1 so the damage figure is kept as
# recorded. Returning NA for a missing code would turn the damage itself into
# NA and silently drop it from later sums.
multiplit <- function(exp) {
  known <- c(K = 1e3, M = 1e6, B = 1e9)
  mult <- unname(known[toupper(exp)])
  # Unknown or missing codes are not names of `known`, so they come back NA
  mult[is.na(mult)] <- 1
  mult
}

# Replace both exponent codes with their numeric multiplier, then scale the
# recorded property and crop damage by it
storm <- mutate(
  storm,
  across(c(propdmgexp, cropdmgexp), multiplit),
  propdmg = propdmg * propdmgexp,
  cropdmg = cropdmg * cropdmgexp
)
# Then, create a character vector with the 10 types of events that have
# produced the greatest total economic impact (property + crop damage).
econ_events <- storm %>%
  summarise(
    # Each component is summed on its own: sum(propdmg + cropdmg, na.rm = TRUE)
    # would discard the valid half of every row where the other half is NA.
    damage = sum(propdmg, na.rm = TRUE) + sum(cropdmg, na.rm = TRUE),
    .by = evtype
  ) %>%
  arrange(desc(damage)) %>%
  head(10) %>%
  pull(evtype) %>%
  as.character()

# Then use this vector to keep only those event types and build a gt table
# with the damage split into Crop and Property plus the Total, all expressed
# in millions of dollars.
storm %>%
  filter(evtype %in% econ_events) %>%
  summarise(
    crop = sum(cropdmg, na.rm = TRUE) / 1e6,
    prop = sum(propdmg, na.rm = TRUE) / 1e6,
    .by = evtype
  ) %>%
  # Both sums already ignore NAs, so the total needs no NA guard
  mutate(Total = crop + prop) %>%
  arrange(desc(Total)) %>%
  rename(
    `Event Type` = evtype,
    Crop = crop,
    Property = prop
  ) %>%
  gt() %>%
  gt_theme_538() %>%
  tab_header(title = "Top 10 Events With Greatest Economic Consequences") %>%
  # No fixed `domain`: the colour scale follows the values actually shown, so
  # a change in the top 10 cannot leave cells outside the scale
  data_color(columns = Crop, palette = "viridis", reverse = TRUE) %>%
  data_color(columns = Property, palette = "viridis", reverse = TRUE) %>%
  gt::fmt_currency() %>%
  gt::tab_caption(caption = "*figures in millions of dollars")
*figures in millions of dollars
Top 10 Events With Greatest Economic Consequences
Event Type crop Property Total
FLOOD $5,661.97 $144,657.71 $150,319.68
HURRICANE/TYPHOON $2,607.87 $69,305.84 $71,913.71
TORNADO $414.95 $56,937.16 $57,352.11
HAIL $3,025.95 $15,732.27 $18,758.22
FLASH FLOOD $1,421.32 $16,140.86 $17,562.18
HURRICANE $2,741.91 $11,868.32 $14,610.23
RIVER FLOOD $5,029.46 $5,118.95 $10,148.40
ICE STORM $5,022.11 $3,944.93 $8,967.04
STORM SURGE/TIDE $0.85 $4,641.19 $4,642.04
THUNDERSTORM WIND $414.84 $3,483.12 $3,897.96

Extra Section

Although the data set allows for a much more detailed analysis, in this additional section I only wanted to answer the following question:

Are the states most affected in terms of health the most affected economically?

A: Overall, the states most impacted economically also experience significant health impacts; however, only two states, Texas and Mississippi, appear in the top 5 for both categories.

# Health Impact by State
# Horizontal bar chart of injuries + fatalities per state, for states with at
# least one affected person. Rows are sorted ascending so that, after
# coord_flip(), the most affected state is drawn at the top; gghighlight
# emphasises the states above 7000.
h_plot <- storm %>%
  summarise(
    affected = sum(injuries + fatalities, na.rm = TRUE),
    .by = state_2
  ) %>%
  filter(affected > 0) %>%
  arrange(affected) %>%
  ggplot(mapping = aes(x = fct_inorder(state_2), y = affected)) +
  geom_col(fill = "red") +
  coord_flip() +
  gghighlight(max(affected) > 7000) +
  ylab("Injuries + Fatalities") +
  xlab("State") +
  ggtitle(label = "Health Impact by State") +
  theme_bw() +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))


# Economic Impact by State
# Horizontal bar chart of property + crop damage per state, in millions of
# dollars, for states with non-zero damage. Rows are sorted ascending so
# that, after coord_flip(), the most affected state is drawn at the top.
e_plot <- storm %>%
  summarise(
    # Each component is summed on its own: sum(propdmg + cropdmg, na.rm = TRUE)
    # would discard the valid half of every row where the other half is NA.
    damage = (sum(propdmg, na.rm = TRUE) + sum(cropdmg, na.rm = TRUE)) / 1e6,
    .by = state_2
  ) %>%
  filter(damage > 0) %>%
  arrange(damage) %>%
  ggplot(mapping = aes(x = fct_inorder(state_2), y = damage)) +
  geom_col(fill = "red") +
  coord_flip() +
  # NOTE(review): the 11000 ($M) cut-off is a hand-picked constant; re-check
  # it against the plotted totals whenever the damage figures change.
  gghighlight(max(damage) > 11000) +
  ylab("Property and Crop Damage ($M)") +
  xlab("State") +
  ggtitle(label = "Economic Impact by State") +
  theme_bw() +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))

# Show the economic and health charts side by side (economic on the left)
plot_grid(
  plotlist = list(e_plot, h_plot),
  ncol = 2
)