This analysis explores the NOAA Storm Database to determine which event types have the greatest impact in terms of deaths, injuries, and economic damage. The dataset covers 1950 to 2011. ## Data Processing The data comes from the U.S. National Oceanic and Atmospheric Administration (NOAA) Storm Database, which records weather events in the United States together with the number of deaths, injuries, property damage, and crop damage they caused.
The data is stored as a CSV file and is loaded with the read_csv() function from the readr package. The variables we focus on in our analysis are: EVTYPE (Event Type), FATALITIES (Number of deaths), INJURIES (Number of injuries), PROPDMG (Property damage amount), PROPDMGEXP (Exponent for property damage), CROPDMG (Crop damage amount), and CROPDMGEXP (Exponent for crop damage). Because the dataset is so large and the plots took a long time to render, the data was filtered to exclude any events that did not result in deaths, injuries, or damage to property or crops.
Also, the property and crop damage values (PROPDMG and CROPDMG) are converted into actual numerical values using their exponent codes (“K” = thousands, “M” = millions, “B” = billions). ### Loading Data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
# Load dataset
# The compressed CSV is downloaded only when it is not already on disk, so
# re-running the analysis does not fetch the file again.
file_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_file <- "StormData.csv.bz2"
if (!file.exists(storm_file)) {
  download.file(file_url, destfile = storm_file, mode = "wb")
}
# read_csv() reads the .bz2 archive directly
storm_data <- read_csv(storm_file)
## Rows: 902297 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): BGN_DATE, BGN_TIME, TIME_ZONE, COUNTYNAME, STATE, EVTYPE, BGN_AZI,...
## dbl (18): STATE__, COUNTY, BGN_RANGE, COUNTY_END, END_RANGE, LENGTH, WIDTH, ...
## lgl (1): COUNTYENDN
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the columns used in the analysis and drop events that caused no
# deaths, injuries, property damage, or crop damage.
# Previously the select() was run a second time without the filter(), which
# overwrote the filtered table with the full, unfiltered one. A single
# pipeline keeps the filter in effect.
# NOTE(review): any printed output captured from the earlier unfiltered run
# needs to be regenerated; the row count will be smaller after filtering.
data <- storm_data %>%
  select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) %>%
  filter(FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0)
# Inspect the structure of the reduced table
str(data)
## tibble [902,297 × 7] (S3: tbl_df/tbl/data.frame)
## $ EVTYPE : chr [1:902297] "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ FATALITIES: num [1:902297] 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num [1:902297] 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num [1:902297] 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr [1:902297] "K" "K" "K" "K" ...
## $ CROPDMG : num [1:902297] 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr [1:902297] NA NA NA NA ...
# Print summary statistics for every column of the working table
summary(data)
## EVTYPE FATALITIES INJURIES PROPDMG
## Length:902297 Min. : 0.0000 Min. : 0.0000 Min. : 0.00
## Class :character 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00
## Mode :character Median : 0.0000 Median : 0.0000 Median : 0.00
## Mean : 0.0168 Mean : 0.1557 Mean : 12.06
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.50
## Max. :583.0000 Max. :1700.0000 Max. :5000.00
## PROPDMGEXP CROPDMG CROPDMGEXP
## Length:902297 Min. : 0.000 Length:902297
## Class :character 1st Qu.: 0.000 Class :character
## Mode :character Median : 0.000 Mode :character
## Mean : 1.527
## 3rd Qu.: 0.000
## Max. :990.000
# List the column names of the working table
colnames(data)
## [1] "EVTYPE" "FATALITIES" "INJURIES" "PROPDMG" "PROPDMGEXP"
## [6] "CROPDMG" "CROPDMGEXP"
# Lookup table of damage multipliers, keyed by exponent code
exp_map <- c("K" = 1e3, "M" = 1e6, "B" = 1e9, " " = 1, "0" = 1)

# Turn a vector of exponent codes into numeric multipliers.
# Codes are upper-cased, missing or empty codes are treated as "0", and any
# code that is not a key of exp_map gets a multiplier of 1.
exp_multiplier <- function(code) {
  code <- toupper(as.character(code))
  code[is.na(code) | code == ""] <- "0"
  multiplier <- unname(exp_map[code])
  # Codes absent from exp_map come back as NA from the lookup
  multiplier[is.na(multiplier)] <- 1
  as.numeric(multiplier)
}

# Replace the exponent codes with their multipliers, then derive the
# property, crop, and combined damage amounts for every event
data <- data %>%
  mutate(
    PROPDMGEXP = exp_multiplier(PROPDMGEXP),
    CROPDMGEXP = exp_multiplier(CROPDMGEXP),
    TOTAL_PROP_DMG = PROPDMG * PROPDMGEXP,
    TOTAL_CROP_DMG = CROPDMG * CROPDMGEXP,
    TOTAL_ECONOMIC_DMG = TOTAL_PROP_DMG + TOTAL_CROP_DMG
  )
# Fatalities and injuries summed per event type, sorted by their combined
# count in descending order; only event types ranked in the top ten by that
# combined count are kept (ties at the cutoff are retained)
health_impact <- data %>%
  group_by(EVTYPE) %>%
  summarise(
    TOTAL_FATALITIES = sum(FATALITIES, na.rm = TRUE),
    TOTAL_INJURIES = sum(INJURIES, na.rm = TRUE)
  ) %>%
  arrange(desc(TOTAL_FATALITIES + TOTAL_INJURIES)) %>%
  filter(min_rank(desc(TOTAL_FATALITIES + TOTAL_INJURIES)) <= 10)
# Horizontal bar chart of combined fatalities and injuries per event type;
# event types are ordered by that combined count and coloured individually
ggplot(
  health_impact,
  aes(
    x = reorder(EVTYPE, -(TOTAL_FATALITIES + TOTAL_INJURIES)),
    y = TOTAL_FATALITIES + TOTAL_INJURIES,
    fill = EVTYPE
  )
) +
  geom_col() +
  coord_flip() +
  ggtitle("Top 10 Weather Events Causing Most Harm to Population Health") +
  xlab("Event Type") +
  ylab("Total Fatalities + Injuries") +
  theme_minimal()
### Greatest Economic Consequences
# Combined property and crop damage summed per event type, sorted in
# descending order; only event types ranked in the top ten by total damage
# are kept (ties at the cutoff are retained)
economic_impact <- data %>%
  group_by(EVTYPE) %>%
  summarise(
    TOTAL_ECONOMIC_DMG = sum(TOTAL_ECONOMIC_DMG, na.rm = TRUE)
  ) %>%
  arrange(desc(TOTAL_ECONOMIC_DMG)) %>%
  filter(min_rank(desc(TOTAL_ECONOMIC_DMG)) <= 10)
# Horizontal bar chart of total economic damage per event type; the damage
# is divided by 1e9 so the axis reads in billions
ggplot(
  economic_impact,
  aes(
    x = reorder(EVTYPE, -TOTAL_ECONOMIC_DMG),
    y = TOTAL_ECONOMIC_DMG / 1e9,
    fill = EVTYPE
  )
) +
  geom_col() +
  coord_flip() +
  ggtitle("Top 10 Weather Events with Greatest Economic Impact") +
  xlab("Event Type") +
  ylab("Total Economic Damage (Billion USD)") +
  theme_minimal()
This analysis shows that tornadoes have the most significant impact on public health, causing the highest number of injuries and deaths. On the other hand, floods have the greatest economic impact.