This report explores the NOAA Storm Database (1950–2011) to identify which event types are most harmful to population health and which have the greatest economic consequences in the United States. Health impact is measured as the sum of fatalities and injuries. Economic impact is measured as the sum of property and crop damages after converting NOAA damage exponent codes (K/M/B) into numeric multipliers.
The dataset was loaded from the original raw CSV file provided for the assignment. To improve performance, only the variables required for the analysis were imported.
library(data.table)
# Location of the raw Storm Data CSV used for this analysis
file <- "/home/rstudio/Reproducible Research/week2/repdata_data_StormData1.csv"

# Only these columns are read: event type, casualty counts, and the
# damage amounts together with their exponent codes
cols <- c(
  "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)

dat <- fread(input = file, select = cols, showProgress = TRUE)

# Number of rows and columns actually loaded
dim(dat)
## [1] 902297 7
# Inspect the column types of the imported table
str(dat)
## Classes 'data.table' and 'data.frame': 902297 obs. of 7 variables:
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## - attr(*, ".internal.selfref")=<externalptr>
NOAA stores each damage figure as a number (e.g., 2.5) plus an exponent code. We convert the codes into multipliers; the main ones are K = 1,000; M = 1,000,000; B = 1,000,000,000, and a single-digit code d is read as 10^d. Blank or unrecognised codes are treated as a multiplier of 1 (a conservative choice).
# Convert NOAA damage exponent codes into numeric multipliers.
#
# Codes are compared case-insensitively after trimming whitespace:
#   H = 1e2, K = 1e3, M = 1e6, B = 1e9, single digit d = 10^d.
# Any other value (blank, NA, "+", "-", "?", ...) maps to 1.
#
# x:       vector of exponent codes (e.g. the PROPDMGEXP column)
# returns: numeric vector of multipliers, same length as x
exp_to_mult <- function(x) {
  code <- toupper(trimws(x))

  # Letter codes; H (hundreds) is handled alongside K/M/B so that such
  # records are not understated by the default multiplier of 1
  letter_mult <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  mult <- unname(letter_mult[match(code, names(letter_mult))])

  # A single digit (0-9) is read as a power of ten
  is_digit <- grepl("^[0-9]$", code)
  mult[is_digit] <- 10^as.numeric(code[is_digit])

  # Everything not recognised above (including blank and NA) defaults to 1
  mult[is.na(mult)] <- 1
  mult
}
# Numeric multiplier for each record's property and crop exponent code
dat$PROP_MULT <- exp_to_mult(dat[["PROPDMGEXP"]])
dat$CROP_MULT <- exp_to_mult(dat[["CROPDMGEXP"]])

# Damage amounts scaled by their multipliers
dat$PROP_USD <- with(dat, PROPDMG * PROP_MULT)
dat$CROP_USD <- with(dat, CROPDMG * CROP_MULT)

# Per-record totals: economic = property + crop, health = deaths + injuries
dat$ECON_USD <- with(dat, PROP_USD + CROP_USD)
dat$HEALTH <- with(dat, FATALITIES + INJURIES)

# Distribution of the per-record economic damage
summary(dat$ECON_USD)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00e+00 0.00e+00 0.00e+00 5.29e+05 1.00e+03 1.15e+11
# Distribution of the per-record casualties (fatalities + injuries)
summary(dat$HEALTH)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1725 0.0000 1742.0000
library(dplyr)
# dplyr works more smoothly on a plain data.frame
df <- as.data.frame(dat)

# Casualty totals per event type, largest combined count first
health_by_event <- summarise(
  group_by(df, EVTYPE),
  fatalities = sum(FATALITIES, na.rm = TRUE),
  injuries = sum(INJURIES, na.rm = TRUE),
  health = sum(HEALTH, na.rm = TRUE),
  .groups = "drop"
)
health_by_event <- arrange(health_by_event, desc(health))

# Damage totals per event type, largest combined damage first
econ_by_event <- summarise(
  group_by(df, EVTYPE),
  prop_usd = sum(PROP_USD, na.rm = TRUE),
  crop_usd = sum(CROP_USD, na.rm = TRUE),
  econ_usd = sum(ECON_USD, na.rm = TRUE),
  .groups = "drop"
)
econ_by_event <- arrange(econ_by_event, desc(econ_usd))

# Ten event types with the highest combined fatalities and injuries
head(health_by_event, n = 10)
## # A tibble: 10 x 4
## EVTYPE fatalities injuries health
## <chr> <dbl> <dbl> <dbl>
## 1 TORNADO 5633 91346 96979
## 2 EXCESSIVE HEAT 1903 6525 8428
## 3 TSTM WIND 504 6957 7461
## 4 FLOOD 470 6789 7259
## 5 LIGHTNING 816 5230 6046
## 6 HEAT 937 2100 3037
## 7 FLASH FLOOD 978 1777 2755
## 8 ICE STORM 89 1975 2064
## 9 THUNDERSTORM WIND 133 1488 1621
## 10 WINTER STORM 206 1321 1527
# Ten event types with the largest total economic damage
head(econ_by_event, 10)
## # A tibble: 10 x 4
## EVTYPE prop_usd crop_usd econ_usd
## <chr> <dbl> <dbl> <dbl>
## 1 FLOOD 144657709807 5661968450 150319678257
## 2 HURRICANE/TYPHOON 69305840000 2607872800 71913712800
## 3 TORNADO 56947380676. 414953270 57362333946.
## 4 STORM SURGE 43323536000 5000 43323541000
## 5 HAIL 15735267018. 3025954473 18761221491.
## 6 FLASH FLOOD 16822673978. 1421317100 18243991078.
## 7 DROUGHT 1046106000 13972566000 15018672000
## 8 HURRICANE 11868319010 2741910000 14610229010
## 9 RIVER FLOOD 5118945500 5029459000 10148404500
## 10 ICE STORM 3944927860 5022113500 8967041360
We rank event types by total casualties (fatalities + injuries) and show the top 10.
library(dplyr)
library(ggplot2)
# Take the first ten rows of the ranked table and order the factor levels
# by health impact so the bars are drawn in sorted order
topH <- slice_head(health_by_event, n = 10)
topH$EVTYPE <- reorder(as.character(topH$EVTYPE), topH$health)

ggplot(data = topH, mapping = aes(x = EVTYPE, y = health)) +
  geom_col(fill = "#2b8cbe") +
  coord_flip() + # draw the bars horizontally
  labs(
    title = "Top 10 event types by population health impact",
    x = "Event type",
    y = "Fatalities + Injuries"
  )
Figure 1. Bar chart of the 10 event types with the largest combined number of fatalities and injuries.
We rank event types by total damage (property + crop) and show the top 10.
library(dplyr)
library(ggplot2)
# Take the first ten rows of the ranked table, order the factor levels by
# total damage, and express the total in billions for readability
topE <- slice_head(econ_by_event, n = 10)
topE$EVTYPE <- reorder(as.character(topE$EVTYPE), topE$econ_usd)
topE$econ_bil <- topE$econ_usd / 1e9

ggplot(data = topE, mapping = aes(x = EVTYPE, y = econ_bil)) +
  geom_col(fill = "#31a354") +
  coord_flip() + # draw the bars horizontally
  labs(
    title = "Top 10 event types by economic impact",
    x = "Event type",
    y = "Total damage (billion USD)"
  )
Figure 2. Bar chart of the 10 event types with the largest total economic damage (property + crop).
library(dplyr)
# First ten rows of the health ranking, printed as a plain data.frame
slice_head(as.data.frame(health_by_event), n = 10)
## EVTYPE fatalities injuries health
## 1 TORNADO 5633 91346 96979
## 2 EXCESSIVE HEAT 1903 6525 8428
## 3 TSTM WIND 504 6957 7461
## 4 FLOOD 470 6789 7259
## 5 LIGHTNING 816 5230 6046
## 6 HEAT 937 2100 3037
## 7 FLASH FLOOD 978 1777 2755
## 8 ICE STORM 89 1975 2064
## 9 THUNDERSTORM WIND 133 1488 1621
## 10 WINTER STORM 206 1321 1527
# First ten rows of the economic ranking, printed as a plain data.frame
slice_head(as.data.frame(econ_by_event), n = 10)
## EVTYPE prop_usd crop_usd econ_usd
## 1 FLOOD 144657709807 5661968450 150319678257
## 2 HURRICANE/TYPHOON 69305840000 2607872800 71913712800
## 3 TORNADO 56947380676 414953270 57362333946
## 4 STORM SURGE 43323536000 5000 43323541000
## 5 HAIL 15735267018 3025954473 18761221491
## 6 FLASH FLOOD 16822673978 1421317100 18243991078
## 7 DROUGHT 1046106000 13972566000 15018672000
## 8 HURRICANE 11868319010 2741910000 14610229010
## 9 RIVER FLOOD 5118945500 5029459000 10148404500
## 10 ICE STORM 3944927860 5022113500 8967041360