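This analysis explores the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to identify which types of weather events are most harmful to population health (fatalities and injuries) and which have the greatest economic impact (property and crop damage).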
The study covers events recorded from 1950 to 2011 and derives its findings through data processing, aggregation, and visualization in R.
library(dplyr) # For data manipulation
## Warning: package 'dplyr' was built under R version 4.2.3
library(ggplot2) # For data visualization
## Warning: package 'ggplot2' was built under R version 4.2.3
library(readr) # For reading CSV files
## Warning: package 'readr' was built under R version 4.2.3
# Session-wide display settings and knitr chunk defaults
options(scipen = 999) # Strongly prefer fixed notation when printing large numbers
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.path = "figure/"
)
# Fetch the compressed storm dataset once; later runs reuse the local copy.
file_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
file_name <- "StormData.csv.bz2"
if (!file.exists(file_name)) {
  # Use R's default download method rather than requiring an external `curl`
  # binary on the PATH, and write in binary mode so the bz2 archive is stored
  # byte-for-byte on platforms that distinguish text from binary output.
  download.file(file_url, destfile = file_name, mode = "wb")
}
# Load the downloaded file into a tibble and take a first look at it
storm_data <- read_csv(file = file_name)
dim(storm_data) # Row and column counts of the raw data
## [1] 902297 37
head(storm_data, n = 6) # Preview the first six records
## # A tibble: 6 × 37
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE BGN_RANGE
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 1 4/18/1950… 0130 CST 97 MOBILE AL TORNA… 0
## 2 1 4/18/1950… 0145 CST 3 BALDWIN AL TORNA… 0
## 3 1 2/20/1951… 1600 CST 57 FAYETTE AL TORNA… 0
## 4 1 6/8/1951 … 0900 CST 89 MADISON AL TORNA… 0
## 5 1 11/15/195… 1500 CST 43 CULLMAN AL TORNA… 0
## 6 1 11/15/195… 2000 CST 77 LAUDERDALE AL TORNA… 0
## # ℹ 28 more variables: BGN_AZI <chr>, BGN_LOCATI <chr>, END_DATE <chr>,
## # END_TIME <chr>, COUNTY_END <dbl>, COUNTYENDN <lgl>, END_RANGE <dbl>,
## # END_AZI <chr>, END_LOCATI <chr>, LENGTH <dbl>, WIDTH <dbl>, F <dbl>,
## # MAG <dbl>, FATALITIES <dbl>, INJURIES <dbl>, PROPDMG <dbl>,
## # PROPDMGEXP <chr>, CROPDMG <dbl>, CROPDMGEXP <chr>, WFO <chr>,
## # STATEOFFIC <chr>, ZONENAMES <chr>, LATITUDE <dbl>, LONGITUDE <dbl>,
## # LATITUDE_E <dbl>, LONGITUDE_ <dbl>, REMARKS <chr>, REFNUM <dbl>
# Keep only the columns used by the health and economic analyses below
storm_data <- select(
  storm_data,
  all_of(c(
    "EVTYPE", "FATALITIES", "INJURIES",
    "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
  ))
)
# Translate damage exponent codes into numeric multipliers.
#
# exp: vector of exponent codes (case-insensitive), e.g. "K", "m", "B".
#
# Returns a numeric vector the same length as `exp`:
#   "K" -> 1e3, "M" -> 1e6, "B" -> 1e9, anything else -> 1.
#
# Missing codes (NA) also map to 1. Previously they produced NA, which made
# the damage products NA and caused `sum(..., na.rm = TRUE)` to silently drop
# whole records, including ones where only one of the two exponent codes
# was missing.
convert_multiplier <- function(exp) {
  code <- toupper(as.character(exp))
  known <- c(K = 1e3, M = 1e6, B = 1e9)
  # Name lookup yields NA for unrecognised, empty, and missing codes alike
  multiplier <- unname(known[code])
  multiplier[is.na(multiplier)] <- 1
  multiplier
}
# Replace the exponent codes with numeric multipliers, then derive the
# dollar value of property, crop, and combined damage for every record
storm_data <- storm_data %>%
  mutate(across(c(PROPDMGEXP, CROPDMGEXP), convert_multiplier)) %>%
  mutate(
    PropertyDamage = PROPDMG * PROPDMGEXP,
    CropDamage = CROPDMG * CROPDMGEXP,
    TotalDamage = PropertyDamage + CropDamage
  )
# Total fatalities and injuries per event type, keeping the ten event types
# with the largest combined casualty count
health_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    TotalFatalities = sum(FATALITIES, na.rm = TRUE),
    TotalInjuries = sum(INJURIES, na.rm = TRUE)
  ) %>%
  arrange(-(TotalFatalities + TotalInjuries)) %>%
  slice_head(n = 10)
print(health_impact)
## # A tibble: 10 × 3
## EVTYPE TotalFatalities TotalInjuries
## <chr> <dbl> <dbl>
## 1 TORNADO 5633 91346
## 2 EXCESSIVE HEAT 1903 6525
## 3 TSTM WIND 504 6957
## 4 FLOOD 470 6789
## 5 LIGHTNING 816 5230
## 6 HEAT 937 2100
## 7 FLASH FLOOD 978 1777
## 8 ICE STORM 89 1975
## 9 THUNDERSTORM WIND 133 1488
## 10 WINTER STORM 206 1321
# Plot: horizontal bars of combined fatalities and injuries for the top 10
# event types; bar fill encodes the fatality count
health_impact %>%
  ggplot(aes(
    x = reorder(EVTYPE, TotalFatalities + TotalInjuries),
    y = TotalFatalities + TotalInjuries,
    fill = TotalFatalities
  )) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Event Type",
    y = "Total Fatalities & Injuries",
    title = "Top 10 Most Harmful Weather Events (Health Impact)"
  ) +
  theme_minimal()
# Combined property and crop damage per event type, keeping the ten
# costliest event types
economic_impact <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(TotalDamage = sum(TotalDamage, na.rm = TRUE)) %>%
  arrange(-TotalDamage) %>%
  slice_head(n = 10)
print(economic_impact)
## # A tibble: 10 × 2
## EVTYPE TotalDamage
## <chr> <dbl>
## 1 FLOOD 138007444500
## 2 HURRICANE/TYPHOON 29348167800
## 3 TORNADO 16570326363
## 4 HURRICANE 12405268000
## 5 RIVER FLOOD 10108369000
## 6 HAIL 10045596620
## 7 FLASH FLOOD 8715885183.
## 8 ICE STORM 5925150850
## 9 STORM SURGE/TIDE 4641493000
## 10 THUNDERSTORM WIND 3813647990
# Plot: dot plot of total damage for the ten costliest event types,
# ordered by damage and drawn horizontally
economic_impact %>%
  ggplot(aes(x = reorder(EVTYPE, TotalDamage), y = TotalDamage)) +
  geom_point(size = 4, colour = "blue") +
  coord_flip() +
  labs(
    x = "Event Type",
    y = "Total Damage (USD)",
    title = "Top 10 Weather Events with Highest Economic Impact"
  ) +
  theme_minimal()