library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
# Load the data
storm_data <- read.csv("C:/Users/Benjamin/Desktop/Project/repdata_data_StormData.csv.bz2")
# Quick checks
dim(storm_data) # number of rows and columns
## [1] 902297 37
head(storm_data) # first 6 rows
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1 0 0 NA
## 2 0 0 NA
## 3 0 0 NA
## 4 0 0 NA
## 5 0 0 NA
## 6 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1 0 14.0 100 3 0 0 15 25.0
## 2 0 2.0 150 2 0 0 0 2.5
## 3 0 0.1 123 2 0 0 2 25.0
## 4 0 0.0 100 2 0 0 2 2.5
## 5 0 0.0 150 2 0 0 2 2.5
## 6 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1 K 0 3040 8812
## 2 K 0 3042 8755
## 3 K 0 3340 8742
## 4 K 0 3458 8626
## 5 K 0 3412 8642
## 6 K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3051 8806 1
## 2 0 0 2
## 3 0 0 3
## 4 0 0 4
## 5 0 0 5
## 6 0 0 6
str(storm_data) # structure of the dataset
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
# Select key columns for analysis
storm_subset <- storm_data %>%
select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
# Preview
head(storm_subset)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 K 0
## 2 TORNADO 0 0 2.5 K 0
## 3 TORNADO 0 2 25.0 K 0
## 4 TORNADO 0 2 2.5 K 0
## 5 TORNADO 0 2 2.5 K 0
## 6 TORNADO 0 6 2.5 K 0
# Step 5: Map damage exponents to numeric multipliers and calculate numeric damage
# Define mapping for exponents
exp_map <- c("0" = 1, "K" = 1e3, "k" = 1e3,
"M" = 1e6, "m" = 1e6,
"B" = 1e9, "b" = 1e9)
# Clean property damage
storm_subset$PROPDMGEXP <- toupper(storm_subset$PROPDMGEXP)
storm_subset$PROPDMGEXP[storm_subset$PROPDMGEXP == ""] <- "0" # replace blanks with "0"
storm_subset$PROPDMGNUM <- exp_map[storm_subset$PROPDMGEXP] * storm_subset$PROPDMG
# Clean crop damage
storm_subset$CROPDMGEXP <- toupper(storm_subset$CROPDMGEXP)
storm_subset$CROPDMG[storm_subset$CROPDMGEXP == ""] <- 0 # replace blanks with 0
storm_subset$CROPDMGEXP[storm_subset$CROPDMGEXP == ""] <- "0" # replace blanks with "0"
storm_subset$CROPDMGNUM <- exp_map[storm_subset$CROPDMGEXP] * storm_subset$CROPDMG
# Check cleaned data
head(storm_subset)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP PROPDMGNUM
## 1 TORNADO 0 15 25.0 K 0 0 25000
## 2 TORNADO 0 0 2.5 K 0 0 2500
## 3 TORNADO 0 2 25.0 K 0 0 25000
## 4 TORNADO 0 2 2.5 K 0 0 2500
## 5 TORNADO 0 2 2.5 K 0 0 2500
## 6 TORNADO 0 6 2.5 K 0 0 2500
## CROPDMGNUM
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Step 6: Top 10 events by population harm (fatalities + injuries)
library(dplyr)
library(ggplot2)
# Group by event type and sum fatalities and injuries
health_impact <- storm_subset %>%
group_by(EVTYPE) %>%
summarise(Total_Harm = sum(FATALITIES + INJURIES, na.rm = TRUE)) %>%
arrange(desc(Total_Harm))
# Take top 10
top_health <- head(health_impact, 10)
top_health
## # A tibble: 10 × 2
## EVTYPE Total_Harm
## <chr> <dbl>
## 1 TORNADO 96979
## 2 EXCESSIVE HEAT 8428
## 3 TSTM WIND 7461
## 4 FLOOD 7259
## 5 LIGHTNING 6046
## 6 HEAT 3037
## 7 FLASH FLOOD 2755
## 8 ICE STORM 2064
## 9 THUNDERSTORM WIND 1621
## 10 WINTER STORM 1527
# Plot top 10 events by population harm
ggplot(top_health, aes(x = reorder(EVTYPE, Total_Harm), y = Total_Harm)) +
geom_bar(stat = "identity", fill = "firebrick") +
coord_flip() +
labs(title = "Top 10 Storm Events by Population Harm",
x = "Event Type", y = "Total Fatalities + Injuries")

# Step 7: Top 10 events by economic damage (property + crop damage)
# Group by event type and sum property + crop damage
econ_impact <- storm_subset %>%
group_by(EVTYPE) %>%
summarise(Total_Cost = sum(PROPDMGNUM + CROPDMGNUM, na.rm = TRUE)) %>%
arrange(desc(Total_Cost))
# Take top 10
top_econ <- head(econ_impact, 10)
top_econ
## # A tibble: 10 × 2
## EVTYPE Total_Cost
## <chr> <dbl>
## 1 FLOOD 150319678257
## 2 HURRICANE/TYPHOON 71913712800
## 3 TORNADO 57352113886.
## 4 STORM SURGE 43323541000
## 5 HAIL 18733221483.
## 6 FLASH FLOOD 17561699078.
## 7 DROUGHT 15018672000
## 8 HURRICANE 14610229010
## 9 RIVER FLOOD 10148404500
## 10 ICE STORM 8967041360
# Plot top 10 events by economic damage
ggplot(top_econ, aes(x = reorder(EVTYPE, Total_Cost), y = Total_Cost/1e9)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Top 10 Storm Events by Economic Damage",
x = "Event Type", y = "Total Damage (Billion $)")
