Introduction

This analysis explores the NOAA Storm Database to determine which event types have the greatest impact in terms of deaths, injuries, and economic damage. The dataset covers the years 1950 to 2011.

## Data Processing

The data comes from the U.S. National Oceanic and Atmospheric Administration (NOAA) Storm Database, which records weather events in the United States along with the number of deaths, injuries, property damage, and crop damage attributed to them.

The data is provided as a CSV file and is loaded with the read_csv() function from the readr package. The variables we focus on in our analysis are:

- EVTYPE (Event Type)
- FATALITIES (Number of deaths)
- INJURIES (Number of injuries)
- PROPDMG (Property damage amount)
- PROPDMGEXP (Exponent for property damage)
- CROPDMG (Crop damage amount)
- CROPDMGEXP (Exponent for crop damage)

Because the dataset is so large and the plots took a long time to evaluate, the data was filtered to exclude any records that did not result in deaths, injuries, or damage to property/crops.

Also, the property and crop damage values (PROPDMG and CROPDMG) are converted into actual numerical values using their exponent codes (“K” = thousands, “M” = millions, “B” = billions).

### Loading Data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)

# Load dataset
file_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Download the archive only when no local copy exists, so re-running the
# analysis does not fetch the file again. mode = "wb" writes the compressed
# file in binary mode.
# NOTE(review): a partial file left behind by an interrupted download would be
# reused as-is; delete StormData.csv.bz2 to force a fresh download.
if (!file.exists("StormData.csv.bz2")) {
  download.file(file_url, destfile = "StormData.csv.bz2", mode = "wb")
}
# read_csv() is given the .bz2 archive directly.
storm_data <- read_csv("StormData.csv.bz2")
## Rows: 902297 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): BGN_DATE, BGN_TIME, TIME_ZONE, COUNTYNAME, STATE, EVTYPE, BGN_AZI,...
## dbl (18): STATE__, COUNTY, BGN_RANGE, COUNTY_END, END_RANGE, LENGTH, WIDTH, ...
## lgl  (1): COUNTYENDN
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep the seven analysis variables, then retain only the events that recorded
# at least one fatality, one injury, or a non-zero property/crop damage amount.
data <- filter(
  select(
    storm_data,
    EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
  ),
  FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0
)

Data Preparation

# Select relevant columns
# NOTE(review): this rebuilds `data` from the full `storm_data`, so it replaces
# the filtered table assigned to `data` earlier in the report. Confirm that
# dropping the filter here is intended.
data <- select(
  storm_data,
  EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
)
# Show the type and first values of each selected column.
str(data)
## tibble [902,297 × 7] (S3: tbl_df/tbl/data.frame)
##  $ EVTYPE    : chr [1:902297] "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ FATALITIES: num [1:902297] 0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num [1:902297] 15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num [1:902297] 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr [1:902297] "K" "K" "K" "K" ...
##  $ CROPDMG   : num [1:902297] 0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr [1:902297] NA NA NA NA ...
# Print per-column summary statistics for the selected variables.
summary(data)
##     EVTYPE            FATALITIES          INJURIES            PROPDMG       
##  Length:902297      Min.   :  0.0000   Min.   :   0.0000   Min.   :   0.00  
##  Class :character   1st Qu.:  0.0000   1st Qu.:   0.0000   1st Qu.:   0.00  
##  Mode  :character   Median :  0.0000   Median :   0.0000   Median :   0.00  
##                     Mean   :  0.0168   Mean   :   0.1557   Mean   :  12.06  
##                     3rd Qu.:  0.0000   3rd Qu.:   0.0000   3rd Qu.:   0.50  
##                     Max.   :583.0000   Max.   :1700.0000   Max.   :5000.00  
##   PROPDMGEXP           CROPDMG         CROPDMGEXP       
##  Length:902297      Min.   :  0.000   Length:902297     
##  Class :character   1st Qu.:  0.000   Class :character  
##  Mode  :character   Median :  0.000   Mode  :character  
##                     Mean   :  1.527                     
##                     3rd Qu.:  0.000                     
##                     Max.   :990.000
# List the column names of the working table.
colnames(data)
## [1] "EVTYPE"     "FATALITIES" "INJURIES"   "PROPDMG"    "PROPDMGEXP"
## [6] "CROPDMG"    "CROPDMGEXP"
# Lookup table from exponent code to numeric multiplier. It must exist before
# the mutate() below, which reads it.
exp_map <- c("K" = 1e3, "M" = 1e6, "B" = 1e9, " " = 1, "0" = 1)

data <- data %>%
  mutate(
    # Turn both exponent columns into numeric multipliers in a single pass.
    across(
      c(PROPDMGEXP, CROPDMGEXP),
      function(codes) {
        codes <- toupper(as.character(codes))
        # Missing or empty codes are treated as "0", which maps to 1.
        codes[is.na(codes) | codes == ""] <- "0"
        multiplier <- unname(exp_map[codes])
        # Codes absent from exp_map look up as NA; they fall back to 1.
        # NOTE(review): any other code (for example an "H" for hundreds, if
        # the data contains one) therefore gets multiplier 1 - confirm that
        # this is intended.
        multiplier[is.na(multiplier)] <- 1
        as.numeric(multiplier)
      }
    ),

    # Damage in absolute units: reported amount times its multiplier.
    TOTAL_PROP_DMG = PROPDMG * PROPDMGEXP,
    TOTAL_CROP_DMG = CROPDMG * CROPDMGEXP,
    TOTAL_ECONOMIC_DMG = TOTAL_PROP_DMG + TOTAL_CROP_DMG
  )

Results

Most Harmful Events to Population Health

# Total fatalities and injuries per event type, keeping the ten event types
# with the largest combined count (ties at the cut-off are kept).
health_impact <- data %>%
  group_by(EVTYPE) %>%
  summarise(
    TOTAL_FATALITIES = sum(FATALITIES, na.rm = TRUE),
    TOTAL_INJURIES = sum(INJURIES, na.rm = TRUE)
  ) %>%
  # slice_max() replaces the superseded top_n(). It returns the rows sorted
  # from largest to smallest, so no separate arrange() step is needed.
  slice_max(TOTAL_FATALITIES + TOTAL_INJURIES, n = 10)

# Horizontal bar chart of combined fatalities and injuries for the event types
# in health_impact, with bars ordered by that combined total.
ggplot(health_impact) +
  aes(
    x = reorder(EVTYPE, -(TOTAL_FATALITIES + TOTAL_INJURIES)),
    y = TOTAL_FATALITIES + TOTAL_INJURIES,
    fill = EVTYPE
  ) +
  # geom_col() draws bar heights straight from the y values.
  geom_col() +
  # Flip the axes so the event-type labels are on the vertical axis.
  coord_flip() +
  ggtitle("Top 10 Weather Events Causing Most Harm to Population Health") +
  xlab("Event Type") +
  ylab("Total Fatalities + Injuries") +
  theme_minimal()

### Greatest Economic Consequences

# Total economic damage (property + crop) per event type, keeping the ten
# event types with the largest totals (ties at the cut-off are kept).
economic_impact <- data %>%
  group_by(EVTYPE) %>%
  summarise(TOTAL_ECONOMIC_DMG = sum(TOTAL_ECONOMIC_DMG, na.rm = TRUE)) %>%
  # slice_max() replaces the superseded top_n(). It returns the rows sorted
  # from largest to smallest, so no separate arrange() step is needed.
  slice_max(TOTAL_ECONOMIC_DMG, n = 10)

# Horizontal bar chart of total economic damage for the event types in
# economic_impact, with bars ordered by total damage.
ggplot(economic_impact) +
  aes(
    x = reorder(EVTYPE, -TOTAL_ECONOMIC_DMG),
    # Dividing by 1e9 matches the "Billion USD" axis label below.
    y = TOTAL_ECONOMIC_DMG / 1e9,
    fill = EVTYPE
  ) +
  # geom_col() draws bar heights straight from the y values.
  geom_col() +
  # Flip the axes so the event-type labels are on the vertical axis.
  coord_flip() +
  ggtitle("Top 10 Weather Events with Greatest Economic Impact") +
  xlab("Event Type") +
  ylab("Total Economic Damage (Billion USD)") +
  theme_minimal()

This analysis shows that tornadoes have the most significant impact on public health, causing the highest number of injuries and deaths. Floods, on the other hand, have the greatest economic impact.