library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
# Load the data
storm_data <- read.csv("C:/Users/Benjamin/Desktop/Project/repdata_data_StormData.csv.bz2")

# Quick checks
dim(storm_data)      # number of rows and columns
## [1] 902297     37
head(storm_data)     # first 6 rows
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE  EVTYPE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL TORNADO
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL TORNADO
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL TORNADO
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL TORNADO
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL TORNADO
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL TORNADO
##   BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1         0                                               0         NA
## 2         0                                               0         NA
## 3         0                                               0         NA
## 4         0                                               0         NA
## 5         0                                               0         NA
## 6         0                                               0         NA
##   END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1         0                      14.0   100 3   0          0       15    25.0
## 2         0                       2.0   150 2   0          0        0     2.5
## 3         0                       0.1   123 2   0          0        2    25.0
## 4         0                       0.0   100 2   0          0        2     2.5
## 5         0                       0.0   150 2   0          0        2     2.5
## 6         0                       1.5   177 2   0          0        6     2.5
##   PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1          K       0                                         3040      8812
## 2          K       0                                         3042      8755
## 3          K       0                                         3340      8742
## 4          K       0                                         3458      8626
## 5          K       0                                         3412      8642
## 6          K       0                                         3450      8748
##   LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1       3051       8806              1
## 2          0          0              2
## 3          0          0              3
## 4          0          0              4
## 5          0          0              5
## 6          0          0              6
str(storm_data)      # structure of the dataset
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...
# Select key columns for analysis
storm_subset <- storm_data %>%
  select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)

# Preview
head(storm_subset)
##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO          0       15    25.0          K       0           
## 2 TORNADO          0        0     2.5          K       0           
## 3 TORNADO          0        2    25.0          K       0           
## 4 TORNADO          0        2     2.5          K       0           
## 5 TORNADO          0        2     2.5          K       0           
## 6 TORNADO          0        6     2.5          K       0
# Step 5: Map damage exponents to numeric multipliers and calculate numeric damage

# Define mapping for exponents
exp_map <- c("0" = 1, "K" = 1e3, "k" = 1e3,
             "M" = 1e6, "m" = 1e6,
             "B" = 1e9, "b" = 1e9)

# Clean property damage
storm_subset$PROPDMGEXP <- toupper(storm_subset$PROPDMGEXP)
storm_subset$PROPDMGEXP[storm_subset$PROPDMGEXP == ""] <- "0"  # replace blanks with "0"
storm_subset$PROPDMGNUM <- exp_map[storm_subset$PROPDMGEXP] * storm_subset$PROPDMG

# Clean crop damage
storm_subset$CROPDMGEXP <- toupper(storm_subset$CROPDMGEXP)
storm_subset$CROPDMG[storm_subset$CROPDMGEXP == ""] <- 0        # replace blanks with 0
storm_subset$CROPDMGEXP[storm_subset$CROPDMGEXP == ""] <- "0"  # replace blanks with "0"
storm_subset$CROPDMGNUM <- exp_map[storm_subset$CROPDMGEXP] * storm_subset$CROPDMG

# Check cleaned data
head(storm_subset)
##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP PROPDMGNUM
## 1 TORNADO          0       15    25.0          K       0          0      25000
## 2 TORNADO          0        0     2.5          K       0          0       2500
## 3 TORNADO          0        2    25.0          K       0          0      25000
## 4 TORNADO          0        2     2.5          K       0          0       2500
## 5 TORNADO          0        2     2.5          K       0          0       2500
## 6 TORNADO          0        6     2.5          K       0          0       2500
##   CROPDMGNUM
## 1          0
## 2          0
## 3          0
## 4          0
## 5          0
## 6          0
# Step 6: Top 10 events by population harm (fatalities + injuries)

library(dplyr)
library(ggplot2)

# Group by event type and sum fatalities and injuries
health_impact <- storm_subset %>%
  group_by(EVTYPE) %>%
  summarise(Total_Harm = sum(FATALITIES + INJURIES, na.rm = TRUE)) %>%
  arrange(desc(Total_Harm))

# Take top 10
top_health <- head(health_impact, 10)
top_health
## # A tibble: 10 × 2
##    EVTYPE            Total_Harm
##    <chr>                  <dbl>
##  1 TORNADO                96979
##  2 EXCESSIVE HEAT          8428
##  3 TSTM WIND               7461
##  4 FLOOD                   7259
##  5 LIGHTNING               6046
##  6 HEAT                    3037
##  7 FLASH FLOOD             2755
##  8 ICE STORM               2064
##  9 THUNDERSTORM WIND       1621
## 10 WINTER STORM            1527
# Plot top 10 events by population harm
ggplot(top_health, aes(x = reorder(EVTYPE, Total_Harm), y = Total_Harm)) +
  geom_bar(stat = "identity", fill = "firebrick") +
  coord_flip() +
  labs(title = "Top 10 Storm Events by Population Harm",
       x = "Event Type", y = "Total Fatalities + Injuries")

# Step 7: Top 10 events by economic damage (property + crop damage)

# Group by event type and sum property + crop damage
econ_impact <- storm_subset %>%
  group_by(EVTYPE) %>%
  summarise(Total_Cost = sum(PROPDMGNUM + CROPDMGNUM, na.rm = TRUE)) %>%
  arrange(desc(Total_Cost))

# Take top 10
top_econ <- head(econ_impact, 10)
top_econ
## # A tibble: 10 × 2
##    EVTYPE               Total_Cost
##    <chr>                     <dbl>
##  1 FLOOD             150319678257 
##  2 HURRICANE/TYPHOON  71913712800 
##  3 TORNADO            57352113886.
##  4 STORM SURGE        43323541000 
##  5 HAIL               18733221483.
##  6 FLASH FLOOD        17561699078.
##  7 DROUGHT            15018672000 
##  8 HURRICANE          14610229010 
##  9 RIVER FLOOD        10148404500 
## 10 ICE STORM           8967041360
# Plot top 10 events by economic damage
ggplot(top_econ, aes(x = reorder(EVTYPE, Total_Cost), y = Total_Cost/1e9)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Storm Events by Economic Damage",
       x = "Event Type", y = "Total Damage (Billion $)")