Synopsis

This analysis uses the NOAA Storm Events database (1950 through November 2011) to identify which event types cause the greatest harm to population health (fatalities + injuries) and which cause the largest economic damage (property + crop losses). The R Markdown document downloads the raw compressed CSV, performs reproducible processing (including interpreting damage exponent codes), summarizes the data by event type, and presents bar charts of the top contributors for both public-health and economic impact. Code is shown for every step so the analysis can be re-run and published to RPubs.

Data Processing

This section shows the R code used to download, read, clean, and prepare the NOAA storm data for analysis. All code chunks are shown (echo = TRUE).

# Packages used in the analysis. Four are attached below; tidyr, forcats,
# scales and knitr are only called through `pkg::fun()` further down, but they
# still have to be installed for the document to run.
required_pkgs <- c("dplyr", "readr", "ggplot2", "stringr",
                   "tidyr", "forcats", "scales", "knitr")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  # A non-interactive session has no CRAN mirror selected ("@CRAN@"), which
  # makes install.packages() fail; fall back to the cloud mirror in that case
  # and otherwise respect whatever the user has configured.
  pkg_repos <- getOption("repos")
  if (is.null(pkg_repos) || identical(unname(pkg_repos["CRAN"]), "@CRAN@")) {
    pkg_repos <- c(CRAN = "https://cloud.r-project.org")
  }
  install.packages(missing_pkgs, repos = pkg_repos)
}

library(dplyr)
library(readr)
library(ggplot2)
library(stringr)

# URL for the course dataset (bzip2-compressed CSV)
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

# Download to a temporary file and read it directly (no manual preprocessing).
# The default 60-second timeout is often too short for a file of this size, so
# it is raised for the duration of the download and restored afterwards.
tempfile_path <- tempfile(fileext = ".bz2")
old_timeout <- getOption("timeout")
options(timeout = max(300, old_timeout))
downloader <- tryCatch({
  # download.file() returns 0 on success; some methods report failure through
  # a non-zero status (plus a warning) rather than an error.
  status <- download.file(url, destfile = tempfile_path, mode = "wb")
  identical(as.integer(status), 0L)
}, error = function(e) {
  message("Download failed: ", e$message)
  FALSE
}, finally = options(timeout = old_timeout))

# Stop here instead of letting read_csv() fail on a missing or partial file
if (!isTRUE(downloader) || !file.exists(tempfile_path)) {
  stop("Could not download the storm data from ", url, call. = FALSE)
}

# Read the CSV; readr decompresses .bz2 files transparently. A large guess_max
# lets column types be inferred from many rows rather than only the first few.
storm_raw <- read_csv(tempfile_path, guess_max = 100000)

# Inspect the structure
glimpse(storm_raw)

## Rows: 902,297
## Columns: 37
## $ STATE__    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ BGN_DATE   <chr> "4/18/1950 0:00:00", "4/18/1950 0:00:00", "2/20/1951 0:00:0…
## $ BGN_TIME   <chr> "0130", "0145", "1600", "0900", "1500", "2000", "0100", "09…
## $ TIME_ZONE  <chr> "CST", "CST", "CST", "CST", "CST", "CST", "CST", "CST", "CS…
## $ COUNTY     <dbl> 97, 3, 57, 89, 43, 77, 9, 123, 125, 57, 43, 9, 73, 49, 107,…
## $ COUNTYNAME <chr> "MOBILE", "BALDWIN", "FAYETTE", "MADISON", "CULLMAN", "LAUD…
## $ STATE      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",…
## $ EVTYPE     <chr> "TORNADO", "TORNADO", "TORNADO", "TORNADO", "TORNADO", "TOR…
## $ BGN_RANGE  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ BGN_AZI    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ BGN_LOCATI <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ END_DATE   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ END_TIME   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ COUNTY_END <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ COUNTYENDN <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ END_RANGE  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ END_AZI    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ END_LOCATI <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ LENGTH     <dbl> 14.0, 2.0, 0.1, 0.0, 0.0, 1.5, 1.5, 0.0, 3.3, 2.3, 1.3, 4.7…
## $ WIDTH      <dbl> 100, 150, 123, 100, 150, 177, 33, 33, 100, 100, 400, 400, 2…
## $ F          <dbl> 3, 2, 2, 2, 2, 2, 2, 1, 3, 3, 1, 1, 3, 3, 3, 4, 1, 1, 1, 1,…
## $ MAG        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FATALITIES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 0,…
## $ INJURIES   <dbl> 15, 0, 2, 2, 2, 6, 1, 0, 14, 0, 3, 3, 26, 12, 6, 50, 2, 0, …
## $ PROPDMG    <dbl> 25.0, 2.5, 25.0, 2.5, 2.5, 2.5, 2.5, 2.5, 25.0, 25.0, 2.5, …
## $ PROPDMGEXP <chr> "K", "K", "K", "K", "K", "K", "K", "K", "K", "K", "M", "M",…
## $ CROPDMG    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ CROPDMGEXP <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ WFO        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ STATEOFFIC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ZONENAMES  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ LATITUDE   <dbl> 3040, 3042, 3340, 3458, 3412, 3450, 3405, 3255, 3334, 3336,…
## $ LONGITUDE  <dbl> 8812, 8755, 8742, 8626, 8642, 8748, 8631, 8558, 8740, 8738,…
## $ LATITUDE_E <dbl> 3051, 0, 0, 0, 0, 0, 0, 0, 3336, 3337, 3402, 3404, 0, 3432,…
## $ LONGITUDE_ <dbl> 8806, 0, 0, 0, 0, 0, 0, 0, 8738, 8737, 8644, 8640, 0, 8540,…
## $ REMARKS    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ REFNUM     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …

# Keep only the columns needed for this analysis
storm <- storm_raw %>%
  select(BGN_DATE, STATE, EVTYPE,
         FATALITIES, INJURIES,
         PROPDMG, PROPDMGEXP,
         CROPDMG, CROPDMGEXP)


# Clean EVTYPE: upper-case, trim both ends and collapse internal runs of
# whitespace to a single space, so labels that differ only in case or spacing
# are aggregated together. str_squish() performs both the trim and the collapse
# (str_trim() alone leaves internal double spaces in place).
storm <- storm %>%
  mutate(EVTYPE = str_to_upper(str_squish(EVTYPE)))

#' Convert damage exponent codes to numeric multipliers
#'
#' Codes are matched case-insensitively after trimming whitespace:
#' "H" = 1e2, "K" = 1e3, "M" = 1e6, "B" = 1e9; a string of digits `n` is read
#' as 10^n; missing or blank codes give a multiplier of 1 (the damage figure is
#' taken at face value).
#'
#' @param exp Vector of exponent codes (character, factor, or anything
#'   `as.character()` can convert). `NA` is treated like a blank.
#' @param unknown Multiplier assigned to codes that match none of the rules
#'   above (e.g. "-", "?", "+"). The default of 1 keeps such records at face
#'   value; pass `NA_real_` or 0 to flag or exclude them instead.
#' @return A numeric vector of multipliers, the same length as `exp`.
exp_to_mult <- function(exp, unknown = 1) {
  stopifnot(length(unknown) == 1L)
  unknown <- as.numeric(unknown)

  code <- as.character(exp)
  code[is.na(code)] <- ""
  # "[\\h\\v]" trims any horizontal/vertical whitespace, not just " \t\r\n"
  code <- toupper(trimws(code, whitespace = "[\\h\\v]"))

  # Letter codes: a name lookup yields NA for anything not in the table, and
  # NA then serves as the "not yet resolved" marker.
  letter_mult <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  mult <- unname(letter_mult[code])

  # Blank / missing code: no scaling
  mult[code == ""] <- 1

  # Numeric exponents (e.g. "2" => 10^2) occasionally appear
  is_num <- grepl("^[0-9]+$", code)
  mult[is_num] <- 10^as.numeric(code[is_num])

  # Anything still unresolved is an unrecognized code
  mult[is.na(mult)] <- unknown
  mult
}

# Scale the raw damage figures to US dollars, then derive the per-event
# economic total (property + crop) and health total (fatalities + injuries).
storm <- storm %>%
  mutate(
    PROP_MULT = exp_to_mult(PROPDMGEXP),
    CROP_MULT = exp_to_mult(CROPDMGEXP)
  ) %>%
  mutate(
    PROP_DMG_USD = PROPDMG * PROP_MULT,
    CROP_DMG_USD = CROPDMG * CROP_MULT
  ) %>%
  mutate(
    TOTAL_ECON_DMG = PROP_DMG_USD + CROP_DMG_USD,
    TOTAL_HEALTH_IMPACT = FATALITIES + INJURIES
  )

# Sanity check: distribution of per-event property damage in USD
summary(storm[["PROP_DMG_USD"]])

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 4.746e+05 5.000e+02 1.150e+11

summary(storm[["CROP_DMG_USD"]])

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 5.442e+04 0.000e+00 5.000e+09

summary(storm[["TOTAL_HEALTH_IMPACT"]])

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    0.0000    0.0000    0.0000    0.1725    0.0000 1742.0000

Results

This section computes summaries by event type and produces figures showing the top event types by health impact and by economic damage.

# Casualty totals per event type, ranked from most to least harmful
health_by_event <- storm %>%
  group_by(EVTYPE) %>%
  summarise(
    total_fatalities = sum(FATALITIES, na.rm = TRUE),
    total_injuries = sum(INJURIES, na.rm = TRUE),
    total_health = sum(TOTAL_HEALTH_IMPACT, na.rm = TRUE),
    n_events = n()
  ) %>%
  arrange(desc(total_health))

# The ten event types with the highest combined fatalities + injuries
top10_health <- head(health_by_event, 10)
top10_health %>%
  knitr::kable(
    digits = 0,
    caption = "Top 10 EVTYPE by total (fatalities + injuries)"
  )

Top 10 EVTYPE by total (fatalities + injuries)
EVTYPE	total_fatalities	total_injuries	total_health	n_events
TORNADO	5633	91346	96979	60652
EXCESSIVE HEAT	1903	6525	8428	1678
TSTM WIND	504	6957	7461	219946
FLOOD	470	6789	7259	25327
LIGHTNING	816	5230	6046	15755
HEAT	937	2100	3037	767
FLASH FLOOD	978	1777	2755	54278
ICE STORM	89	1975	2064	2006
THUNDERSTORM WIND	133	1488	1621	82564
WINTER STORM	206	1321	1527	11433

# Bar plot for health impact: fatalities + injuries (stacked).
# Reshape to one row per (event type, measure) so the two measures can be
# stacked. Levels are ordered by ascending total because coord_flip() draws the
# first factor level at the bottom; ascending order therefore puts the most
# harmful event type at the top of the chart.
plot_health <- top10_health %>%
  tidyr::pivot_longer(cols = c(total_fatalities, total_injuries),
                      names_to = "type", values_to = "count") %>%
  mutate(EVTYPE = forcats::fct_reorder(EVTYPE, count, .fun = sum))

ggplot(plot_health, aes(x = EVTYPE, y = count, fill = type)) +
  geom_col(position = "stack") +
  coord_flip() +
  labs(title = "Top 10 Event Types by Total Health Impact (Fatalities + Injuries)",
       x = "Event Type", y = "Count", fill = "Measure") +
  theme_minimal()

# Damage totals (USD) per event type, ranked from most to least costly
econ_by_event <- storm %>%
  group_by(EVTYPE) %>%
  summarise(
    total_prop = sum(PROP_DMG_USD, na.rm = TRUE),
    total_crop = sum(CROP_DMG_USD, na.rm = TRUE),
    total_econ = sum(TOTAL_ECON_DMG, na.rm = TRUE),
    n_events = n()
  ) %>%
  arrange(desc(total_econ))

# The ten event types with the highest combined property + crop damage;
# dollar amounts are rounded to whole dollars for display only.
top10_econ <- head(econ_by_event, 10)
top10_econ %>%
  mutate(across(c(total_prop, total_crop, total_econ), ~ round(.x, 0))) %>%
  knitr::kable(caption = "Top 10 EVTYPE by total economic damage (USD)")

Top 10 EVTYPE by total economic damage (USD)
EVTYPE	total_prop	total_crop	total_econ	n_events
FLOOD	144657709807	5661968450	150319678257	25327
HURRICANE/TYPHOON	69305840000	2607872800	71913712800	88
TORNADO	56947380676	414953270	57362333946	60652
STORM SURGE	43323536000	5000	43323541000	261
HAIL	15735267513	3025954473	18761221986	288661
FLASH FLOOD	16822723978	1421317100	18244041078	54278
DROUGHT	1046106000	13972566000	15018672000	2488
HURRICANE	11868319010	2741910000	14610229010	174
RIVER FLOOD	5118945500	5029459000	10148404500	173
ICE STORM	3944927860	5022113500	8967041360	2006

# Bar plot for economic damage (property + crop stacked).
# Reshape to one row per (event type, damage type) so the two components can be
# stacked. Levels are ordered by ascending total because coord_flip() draws the
# first factor level at the bottom; ascending order therefore puts the
# costliest event type at the top of the chart.
plot_econ <- top10_econ %>%
  tidyr::pivot_longer(cols = c(total_prop, total_crop),
                      names_to = "type", values_to = "amount") %>%
  mutate(EVTYPE = forcats::fct_reorder(EVTYPE, amount, .fun = sum))

ggplot(plot_econ, aes(x = EVTYPE, y = amount, fill = type)) +
  geom_col(position = "stack") +
  coord_flip() +
  scale_y_continuous(labels = scales::dollar_format(prefix = "$", scale = 1)) +
  labs(title = "Top 10 Event Types by Total Economic Damage (Property + Crop)",
       x = "Event Type", y = "Damage (USD)", fill = "Damage Type") +
  theme_minimal()

Short Discussion

The tables and figures above identify which event types are most harmful to population health and which cause the largest economic losses across the period covered. Tornadoes are by far the most harmful to population health (roughly 97,000 combined fatalities and injuries), while floods cause the largest economic losses (roughly $150 billion in combined property and crop damage). The code and plots are suitable for inclusion in a short briefing to municipal managers. When allocating and prioritizing resources, managers should consider both frequency (number of events) and severity (per-event damage). The code can be modified to restrict the analysis to more recent years (e.g., 1990 onward) if that is preferred.

Reproducibility notes

The analysis starts from the raw compressed CSV and performs all processing in this document.
The damage exponent conversion is conservative: missing, blank, or unrecognized exponent codes (such as "-", "?", and "+") are treated as a multiplier of 1 so that no records are discarded, and numeric codes are read as powers of ten (e.g., "2" means 10^2). To check which uncommon codes occur, see the appendix below, which lists the unique PROPDMGEXP and CROPDMGEXP values.

Appendix: Quick audit of exponent codes (optional)

# Distinct exponent codes in the raw data, sorted; sort() drops NA
unique_props <- storm_raw %>%
  pull(PROPDMGEXP) %>%
  unique() %>%
  sort()
unique_crops <- storm_raw %>%
  pull(CROPDMGEXP) %>%
  unique() %>%
  sort()
print(unique_props)

##  [1] "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K" "m" "M"

print(unique_crops)

## [1] "?" "0" "2" "B" "k" "K" "m" "M"

# Record the R version, platform and package versions used for this run
sessionInfo()

## R version 4.5.0 (2025-04-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Africa/Nairobi
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] stringr_1.5.1 ggplot2_3.5.2 readr_2.1.5   dplyr_1.1.4  
## 
## loaded via a namespace (and not attached):
##  [1] bit_4.6.0         gtable_0.3.6      jsonlite_2.0.0    crayon_1.5.3     
##  [5] compiler_4.5.0    tidyselect_1.2.1  parallel_4.5.0    tidyr_1.3.1      
##  [9] jquerylib_0.1.4   scales_1.3.0      yaml_2.3.10       fastmap_1.2.0    
## [13] R6_2.6.1          labeling_0.4.3    generics_0.1.3    knitr_1.50       
## [17] forcats_1.0.0     tibble_3.2.1      munsell_0.5.1     bslib_0.9.0      
## [21] pillar_1.10.2     tzdb_0.5.0        rlang_1.1.6       cachem_1.1.0     
## [25] stringi_1.8.7     xfun_0.52         sass_0.4.10       bit64_4.6.0-1    
## [29] cli_3.6.4         withr_3.0.2       magrittr_2.0.3    digest_0.6.37    
## [33] grid_4.5.0        vroom_1.6.5       rstudioapi_0.17.1 hms_1.1.3        
## [37] lifecycle_1.0.4   vctrs_0.6.5       evaluate_1.0.3    glue_1.8.0       
## [41] farver_2.1.2      colorspace_2.1-1  purrr_1.0.4       rmarkdown_2.29   
## [45] tools_4.5.0       pkgconfig_2.0.3   htmltools_0.5.8.1

NOAA Storm Data Analysis

Edwin R. Mutandi

2025-11-05