Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern. This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The analysis addresses two questions: 1. Which types of events are most harmful with respect to population health across the United States? 2. Which types of events have the greatest economic consequences across the United States?
Analysis Starts
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Reading raw data from original file
# Remote location of the compressed storm data and the local path it is
# cached under. Naming them once keeps the existence check, the download
# and the read pointing at the same file.
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_file <- "stormData.csv.bz2"

# Download only when no local copy exists. mode = "wb" requests a binary
# transfer explicitly so the bzip2 archive is never written as text.
if (!file.exists(storm_file)) {
  download.file(storm_url, destfile = storm_file, mode = "wb")
}

# Read the compressed CSV through a bzfile() connection; text columns are
# kept as character vectors rather than factors.
st <- read.csv(bzfile(storm_file), stringsAsFactors = FALSE)
Data Description
# Preview the first six rows of the raw data (six is head()'s default).
head(st, n = 6L)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1 0 0 NA
## 2 0 0 NA
## 3 0 0 NA
## 4 0 0 NA
## 5 0 0 NA
## 6 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1 0 14.0 100 3 0 0 15 25.0
## 2 0 2.0 150 2 0 0 0 2.5
## 3 0 0.1 123 2 0 0 2 25.0
## 4 0 0.0 100 2 0 0 2 2.5
## 5 0 0.0 150 2 0 0 2 2.5
## 6 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1 K 0 3040 8812
## 2 K 0 3042 8755
## 3 K 0 3340 8742
## 4 K 0 3458 8626
## 5 K 0 3412 8642
## 6 K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3051 8806 1
## 2 0 0 2
## 3 0 0 3
## 4 0 0 4
## 5 0 0 5
## 6 0 0 6
Data Processing and Transformation
# --- Helper: coerce a column to a clean numeric vector ---
# x: a column holding counts or amounts, numeric or text.
# Returns a numeric vector of the same length with missing values set to 0.
# Columns that are already numeric are passed through unchanged: pushing
# them through gsub() would first convert them to text, so a value such as
# 1e+05 would be rendered "1e+05", stripped to "105", and a minus sign would
# be lost. Only non-numeric input has characters other than digits and "."
# removed before conversion.
clean_amount <- function(x) {
  if (!is.numeric(x)) {
    x <- as.numeric(gsub("[^0-9.]", "", as.character(x)))
  }
  x[is.na(x)] <- 0
  x
}

# --- Clean the casualty and damage amount columns ---
amount_cols <- c("FATALITIES", "INJURIES", "PROPDMG", "CROPDMG")
st[amount_cols] <- lapply(st[amount_cols], clean_amount)

# --- Normalize and trim exponent columns ---
# Upper-case and strip surrounding whitespace so that, for example, "k" and
# " K" are both looked up as "K".
st$PROPDMGEXP <- toupper(trimws(st$PROPDMGEXP))
st$CROPDMGEXP <- toupper(trimws(st$CROPDMGEXP))
Results
# Multiplier for each damage-exponent code. Letters are magnitudes (hundred,
# thousand, million, billion), digits are read as powers of ten, and the
# symbols "+", "-" and "?" leave the amount unscaled.
# Only upper-case letters are listed because codes are upper-cased before
# the lookup.
# NOTE(review): the digit and symbol handling is this analysis's own
# convention -- confirm it against the storm data documentation.
exp_map <- c(
  H = 1e2, K = 1e3, M = 1e6, B = 1e9,
  `0` = 1, `1` = 10, `2` = 1e2, `3` = 1e3, `4` = 1e4,
  `5` = 1e5, `6` = 1e6, `7` = 1e7, `8` = 1e8,
  `+` = 1, `-` = 1, `?` = 1
)

# Translate a vector of exponent codes into numeric multipliers.
# codes: character vector of exponent codes (any case, optional padding).
# Returns an unnamed numeric vector of the same length; blank, missing or
# unrecognised codes get a multiplier of 1. unname() keeps the lookup from
# attaching one name per row to the resulting data-frame columns.
exp_multiplier <- function(codes) {
  multiplier <- unname(exp_map[toupper(trimws(codes))])
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

st$prop_multiplier <- exp_multiplier(st$PROPDMGEXP)
st$crop_multiplier <- exp_multiplier(st$CROPDMGEXP)

# Scale the reported amounts, then combine property and crop damage.
st$Property_Damage <- st$PROPDMG * st$prop_multiplier
st$Crop_Damage <- st$CROPDMG * st$crop_multiplier
st$Total_Economic_Damage <- st$Property_Damage + st$Crop_Damage
Summary of Population Health Impact
library(dplyr)
# Total the fatalities and injuries for every event type, add their sum as
# the overall harm figure, and rank event types from most to least harmful.
health_impact <- st %>%
  group_by(EVTYPE) %>%
  summarise(
    Total_Fatalities = sum(FATALITIES),
    Total_Injuries = sum(INJURIES)
  ) %>%
  mutate(Total_Harm = Total_Fatalities + Total_Injuries) %>%
  arrange(desc(Total_Harm))

# Report the five event types with the highest combined harm.
cat("Question 1: Event types most harmful to population health in the US:\n")
## Question 1: Event types most harmful to population health in the US:
print(head(health_impact, n = 5L))
## # A tibble: 5 × 4
## EVTYPE Total_Fatalities Total_Injuries Total_Harm
## <chr> <dbl> <dbl> <dbl>
## 1 TORNADO 5633 91346 96979
## 2 EXCESSIVE HEAT 1903 6525 8428
## 3 TSTM WIND 504 6957 7461
## 4 FLOOD 470 6789 7259
## 5 LIGHTNING 816 5230 6046
Summary of Economic Impact
# Sum the combined property and crop damage for every event type and rank
# the event types from most to least costly.
econ_impact <- st %>%
  group_by(EVTYPE) %>%
  summarise(
    Total_Economic_Damage = sum(Total_Economic_Damage)
  ) %>%
  arrange(desc(Total_Economic_Damage))

# Report the five costliest event types. The table is printed explicitly,
# matching the Question 1 report, so it still appears when the script is
# run through source(), which does not auto-print top-level values.
cat("\nQuestion 2: Event types with greatest economic consequences in the US:\n")
##
## Question 2: Event types with greatest economic consequences in the US:
print(head(econ_impact, 5))
## # A tibble: 5 × 2
## EVTYPE Total_Economic_Damage
## <chr> <dbl>
## 1 FLOOD 150319678257
## 2 HURRICANE/TYPHOON 71913712800
## 3 TORNADO 57362333946.
## 4 STORM SURGE 43323541000
## 5 HAIL 18761221986.
Plot of the top 10 events by population health impact
library(ggplot2)
# Keep the ten highest-ranked event types from the health summary.
top10_health <- head(health_impact, n = 10L)

# Horizontal bar chart of total harm; reorder() sorts the bars by
# Total_Harm and coord_flip() puts the event types on the vertical axis.
ggplot(
  top10_health,
  aes(x = reorder(EVTYPE, Total_Harm), y = Total_Harm)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  ggtitle("Top 10 Most Harmful Weather Events (Population Health)") +
  xlab("Event Type") +
  ylab("Total Harm (Fatalities + Injuries)") +
  theme_minimal()
Plot of the top 10 events by economic impact
# Keep the ten highest-ranked event types from the economic summary.
top10_econ <- head(econ_impact, n = 10L)

# Horizontal bar chart of total damage, rescaled to billions of dollars;
# reorder() sorts the bars by the unscaled damage total.
ggplot(
  top10_econ,
  aes(
    x = reorder(EVTYPE, Total_Economic_Damage),
    y = Total_Economic_Damage / 1e9
  )
) +
  geom_col(fill = "darkred") +
  coord_flip() +
  ggtitle("Top 10 Weather Events by Economic Damage") +
  xlab("Event Type") +
  ylab("Total Damage (in Billions USD)") +
  theme_minimal()