This report analyzes the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to identify storm event types that are most harmful to population health and those that have the greatest economic consequences across the United States. The data spans from 1950 to November 2011.
This section details the steps taken to clean and prepare the data for analysis. This includes selecting relevant columns, cleaning the event type (EVTYPE) variable, and calculating numerical values for economic damages.
file_path <- "repdata_data_StormData.csv.bz2"
if (file.exists(file_path)) {
storm_data_raw <- read.csv(bzfile(file_path), stringsAsFactors = FALSE)
print(paste("Successfully loaded", nrow(storm_data_raw), "rows and", ncol(storm_data_raw), "columns."))
} else {
stop("Data file not found. Please check the file_path variable.")
}
## [1] "Successfully loaded 902297 rows and 37 columns."
storm_data <- storm_data_raw %>%
select(BGN_DATE, EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP,
CROPDMG, CROPDMGEXP, STATE, REFNUM)
storm_data$BGN_DATE_PARSED <- as.Date(storm_data$BGN_DATE, format="%m/%d/%Y %H:%M:%S")
storm_data$YEAR <- as.numeric(format(storm_data$BGN_DATE_PARSED, "%Y"))
storm_data$EVTYPE_CLEAN <- toupper(trimws(storm_data$EVTYPE))
storm_data$EVTYPE_CLEAN <- str_replace_all(storm_data$EVTYPE_CLEAN,
c("^TSTM WIND.*|^THUNDERSTORM WIND.*S$" = "THUNDERSTORM WIND",
"^THUNDERSTORM$" = "THUNDERSTORM WIND",
"^FLASH FLOOD.*" = "FLASH FLOOD",
"^FLOOD$" = "FLOOD",
"^(COASTAL|CSTL) FLOOD.*" = "COASTAL FLOOD",
"^RIP CURRENT.*" = "RIP CURRENT",
"^TORNADO.*|^TORNDAO$" = "TORNADO",
"^HAIL.*" = "HAIL",
"^HURRICANE.*|^TYPHOON$" = "HURRICANE/TYPHOON",
"^STORM SURGE.*" = "STORM SURGE/TIDE",
"^HEAVY SNOW.*" = "HEAVY SNOW",
"^BLIZZARD.*" = "BLIZZARD",
"^HEAVY RAIN.*" = "HEAVY RAIN",
"^WINTER STORM.*" = "WINTER STORM",
"^WINTER WEATHER.*" = "WINTER WEATHER",
"^(EXTREME )?COLD.*|^(EXTREME )?WIND CHILL.*" = "EXTREME COLD/WIND CHILL",
"^(EXCESSIVE )?HEAT.*" = "HEAT",
"^DROUGHT.*" = "DROUGHT",
"^WILD.*FIRE.*|^FOREST FIRE.*" = "WILDFIRE",
"^LIGHTNING.*" = "LIGHTNING",
"^DENSE FOG.*" = "DENSE FOG",
"^ICE STORM.*" = "ICE STORM",
"^AVALANCHE.*" = "AVALANCHE",
"^(HIGH|STRONG) WIND.*" = "HIGH WIND",
"^WATERSPOUT.*" = "WATERSPOUT",
"^DUST DEVIL.*" = "DUST DEVIL",
"^VOLCANIC ASH.*" = "VOLCANIC ASH",
"^MARINE TSTM WIND|^MARINE THUNDERSTORM WIND" = "MARINE THUNDERSTORM WIND",
"^MARINE HAIL" = "MARINE HAIL",
"^MARINE HIGH WIND" = "MARINE HIGH WIND",
"^MARINE STRONG WIND" = "MARINE STRONG WIND",
"SUMMARY.*" = "SUMMARY"))
storm_data <- storm_data %>% filter(!grepl("SUMMARY", EVTYPE_CLEAN))
get_multiplier <- function(exp_char) {
exp_char <- toupper(as.character(exp_char))
if (is.na(exp_char) || exp_char %in% c("", "+", "-", "?")) return(1)
else if (exp_char == "H") return(100)
else if (exp_char == "K") return(1000)
else if (exp_char == "M") return(1000000)
else if (exp_char == "B") return(1000000000)
else if (grepl("^[0-9]$", exp_char)) return(10^as.numeric(exp_char))
else return(1)
}
storm_data$PROPDMG_VALUE <- storm_data$PROPDMG * sapply(storm_data$PROPDMGEXP, get_multiplier)
storm_data$CROPDMG_VALUE <- storm_data$CROPDMG * sapply(storm_data$CROPDMGEXP, get_multiplier)
storm_data$TOTAL_ECONOMIC_DMG <- storm_data$PROPDMG_VALUE + storm_data$CROPDMG_VALUE
health_impact <- storm_data %>%
group_by(EVTYPE_CLEAN) %>%
summarise(TotalFatalities = sum(FATALITIES, na.rm = TRUE),
TotalInjuries = sum(INJURIES, na.rm = TRUE),
.groups = 'drop') %>%
mutate(TotalCasualties = TotalFatalities + TotalInjuries) %>%
filter(TotalCasualties > 0) %>%
arrange(desc(TotalCasualties))
kable(head(health_impact, 10), caption = "Top 10 Event Types by Total Casualties")
| EVTYPE_CLEAN | TotalFatalities | TotalInjuries | TotalCasualties |
|---|---|---|---|
| TORNADO | 5658 | 91364 | 97022 |
| HEAT | 3021 | 9019 | 12040 |
| THUNDERSTORM WIND | 708 | 9468 | 10176 |
| FLOOD | 470 | 6789 | 7259 |
| LIGHTNING | 817 | 5232 | 6049 |
| FLASH FLOOD | 1018 | 1785 | 2803 |
| HIGH WIND | 404 | 1772 | 2176 |
| ICE STORM | 89 | 1977 | 2066 |
| WILDFIRE | 90 | 1606 | 1696 |
| WINTER STORM | 217 | 1353 | 1570 |
top_n_health_plot <- head(health_impact, 10)
ggplot(top_n_health_plot, aes(x = reorder(EVTYPE_CLEAN, TotalCasualties), y = TotalCasualties, fill = EVTYPE_CLEAN)) +
geom_bar(stat = "identity", show.legend = FALSE) +
coord_flip() +
labs(title = "Top 10 Most Harmful Event Types to Population Health",
x = "Event Type",
y = "Total Casualties") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
economic_impact <- storm_data %>%
group_by(EVTYPE_CLEAN) %>%
summarise(TotalPropertyDamage_USD = sum(PROPDMG_VALUE, na.rm = TRUE),
TotalCropDamage_USD = sum(CROPDMG_VALUE, na.rm = TRUE),
TotalCombinedDamage_USD = sum(TOTAL_ECONOMIC_DMG, na.rm = TRUE),
.groups = 'drop') %>%
filter(TotalCombinedDamage_USD > 0) %>%
arrange(desc(TotalCombinedDamage_USD))
kable(head(economic_impact, 10), caption = "Top 10 Event Types by Total Economic Damage (USD)")
| EVTYPE_CLEAN | TotalPropertyDamage_USD | TotalCropDamage_USD | TotalCombinedDamage_USD |
|---|---|---|---|
| FLOOD | 144657709807 | 5661968450 | 150319678257 |
| HURRICANE/TYPHOON | 85356410010 | 5516117800 | 90872527810 |
| TORNADO | 58552152377 | 417461470 | 58969613847 |
| STORM SURGE/TIDE | 47964724000 | 855000 | 47965579000 |
| HAIL | 15977470013 | 3026094623 | 19003564636 |
| FLASH FLOOD | 17414731089 | 1437163150 | 18851894239 |
| DROUGHT | 1046106000 | 13972571780 | 15018677780 |
| THUNDERSTORM WIND | 9968511026 | 1225260988 | 11193772014 |
| RIVER FLOOD | 5118945500 | 5029459000 | 10148404500 |
| ICE STORM | 3944927860 | 5022113500 | 8967041360 |
top_n_economic_plot <- head(economic_impact, 10)
ggplot(top_n_economic_plot, aes(x = reorder(EVTYPE_CLEAN, TotalCombinedDamage_USD),
y = TotalCombinedDamage_USD / 1e9, fill = EVTYPE_CLEAN)) +
geom_bar(stat = "identity", show.legend = FALSE) +
coord_flip() +
labs(title = "Top 10 Event Types by Economic Consequences",
x = "Event Type",
y = "Total Economic Damage (Billions USD)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))