This project looks at the NOAA Storm Database (1950 - 2011) to identify the types of severe weather events that are most harmful to population health and have the greatest economic consequences. The analysis includes data processing steps, results with visualizations, and a discussion of the findings.
First, load packages.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Download file
# Remote archive and the local path it is cached under.
url_data <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
data_path <- file.path("data", "StormData.csv.bz2")

# Create the target directory on the first run.
if (!dir.exists(dirname(data_path))) {
  dir.create(dirname(data_path), recursive = TRUE)
}

# Download only when the archive is not already cached, so re-running the
# analysis does not fetch the file again. mode = "wb" requests a binary
# transfer, which is what a compressed archive needs.
if (!file.exists(data_path)) {
  download.file(url_data, data_path, mode = "wb")
}
Import data to R
# read_csv() decompresses .bz2 archives automatically, so the download can be
# read directly without unzipping it first.
raw_data <- readr::read_csv(file = data_path)
## Rows: 902297 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): BGN_DATE, BGN_TIME, TIME_ZONE, COUNTYNAME, STATE, EVTYPE, BGN_AZI,...
## dbl (18): STATE__, COUNTY, BGN_RANGE, COUNTY_END, END_RANGE, LENGTH, WIDTH, ...
## lgl (1): COUNTYENDN
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Transform and clean data
# Keep only the columns the analysis uses. BGN_DATE is not one of them, so the
# date-parsing step that used to run before this selection was dead work and
# has been removed.
data <- raw_data |>
  select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)

# Handle missing values.
# read_csv() imports blank fields as NA, so a blank damage exponent arrives as
# NA. Dropping those rows would discard every event without a recorded
# exponent, together with its fatalities and injuries. Blank exponents are
# therefore restored to "" and only rows with NA in the remaining columns are
# removed.
data <- data |>
  mutate(
    across(c(PROPDMGEXP, CROPDMGEXP), \(x) replace_na(as.character(x), ""))
  ) |>
  drop_na()
# Convert damage exponent symbols to numeric multipliers, element by element.
#
# exp: vector of exponent codes (character, or coercible to character).
#
# Returns a numeric vector of the same length as `exp`:
#   "H"/"h" -> 1e2, "K"/"k" -> 1e3, "M"/"m" -> 1e6, "B"/"b" -> 1e9,
#   "", "+", "-", "?", "0" and NA -> 1,
#   any other code -> as.numeric(code), which is NA (with a coercion warning)
#   when the code is not a number.
#
# The previous version tested the whole vector at once in `if`, so a column
# containing a single "K" made every row use 1e3. The mapping is now applied
# per element, so it can be used directly inside mutate().
#
# NOTE(review): digit codes other than "0" are used as the literal multiplier
# (e.g. "5" -> 5), exactly as before; confirm whether 10^digit was intended.
convert_exponent <- function(exp) {
  code <- toupper(as.character(exp))
  # A missing exponent is treated like a blank one: no scaling.
  code[is.na(code)] <- ""

  letter_codes <- c("H", "K", "M", "B")
  letter_values <- c(1e2, 1e3, 1e6, 1e9)
  unit_codes <- c("", "+", "-", "?", "0")

  # match() gives NA for codes that are not letters; those stay NA here.
  multiplier <- letter_values[match(code, letter_codes)]
  multiplier[code %in% unit_codes] <- 1

  # Whatever is still unresolved falls back to numeric coercion.
  unresolved <- is.na(multiplier)
  if (any(unresolved)) {
    multiplier[unresolved] <- as.numeric(code[unresolved])
  }

  multiplier
}
# Derive dollar amounts in three steps: look up the exponent multipliers with
# convert_exponent(), scale the raw property and crop figures, then add the
# two together.
data <- data |>
  mutate(
    numPROPDMGEXP = convert_exponent(PROPDMGEXP),
    numCROPDMGEXP = convert_exponent(CROPDMGEXP)
  ) |>
  mutate(
    totalPROPDMGE = PROPDMG * numPROPDMGEXP,
    totalCROPDMGE = CROPDMG * numCROPDMGEXP
  ) |>
  mutate(totalEconomicDmg = totalCROPDMGE + totalPROPDMGE)
Most Harmful Events to Population Health (Fatalities)
# Aggregate casualties by event type, most fatalities first.
health_impact <- data |>
  group_by(EVTYPE) |>
  summarize(
    Total_Fatalities = sum(FATALITIES),
    Total_Injuries = sum(INJURIES)
  ) |>
  arrange(desc(Total_Fatalities))

# Plot the ten deadliest event types. head() returns at most ten rows, so no
# NA rows are produced if fewer event types exist; geom_col() draws bars whose
# heights are the y values.
health_impact |>
  head(10) |>
  ggplot(aes(x = reorder(EVTYPE, -Total_Fatalities), y = Total_Fatalities)) +
  geom_col() +
  labs(
    title = "Top 10 Most Harmful Events to Population Health (Fatalities)",
    x = "Event Type",
    y = "Total Fatalities"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Most Harmful Events to Population Health (Injuries)
# Aggregate casualties by event type, most injuries first.
health_impact <- data |>
  group_by(EVTYPE) |>
  summarize(
    Total_Fatalities = sum(FATALITIES),
    Total_Injuries = sum(INJURIES)
  ) |>
  arrange(desc(Total_Injuries))

# Plot the ten event types with the most injuries. head() returns at most ten
# rows, so no NA rows are produced if fewer event types exist; geom_col()
# draws bars whose heights are the y values.
health_impact |>
  head(10) |>
  ggplot(aes(x = reorder(EVTYPE, -Total_Injuries), y = Total_Injuries)) +
  geom_col() +
  labs(
    title = "Top 10 Most Harmful Events to Population Health (Injuries)",
    x = "Event Type",
    y = "Total Injuries"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Events with Greatest Economic Consequences
# Aggregate total economic damage (property + crop) by event type, costliest
# first. Uses the native pipe like the rest of the analysis.
economic_impact <- data |>
  group_by(EVTYPE) |>
  summarize(Total_Economic_Damage = sum(totalEconomicDmg)) |>
  arrange(desc(Total_Economic_Damage))

# Plot the ten costliest event types. head() returns at most ten rows, so no
# NA rows are produced if fewer event types exist; geom_col() draws bars whose
# heights are the y values.
economic_impact |>
  head(10) |>
  ggplot(
    aes(x = reorder(EVTYPE, -Total_Economic_Damage), y = Total_Economic_Damage)
  ) +
  geom_col() +
  labs(
    title = "Top 10 Events with Greatest Economic Consequences",
    x = "Event Type",
    y = "Total Economic Damage (in dollars)"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
As the plots above show, tornadoes are the most harmful weather event. It is important that authorities take measures to reduce their impact.