1: Synopsis

The goal of this analysis is to explore the NOAA Storm Database and assess the effects of severe weather events on both population and economy. The database covers the time period between 1950 and November 2011.

The following analysis investigates which types of severe weather events are most harmful to:

  1. Health (injuries and fatalities)
  2. Property and crops (economic consequences)

Information on the Data: Documentation

2: Data Processing

2.1: Data Loading

Download the raw data file and read it directly into a data.table.

# Load required libraries
library("data.table")
## Warning: package 'data.table' was built under R version 4.3.2
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 4.3.2
# Source URL of the compressed NOAA storm dataset.
data_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

# Keep the archive in the working directory (relative path) so the analysis
# runs on any machine, and download it only when it is not already present.
# mode = "wb" prevents corruption of the binary archive on Windows.
data_file <- "storm_data.csv.bz2"
if (!file.exists(data_file)) {
  download.file(data_url, destfile = data_file, mode = "wb")
}

# Read the compressed CSV file straight into a data.table.
storm_data <- fread(data_file)
## Warning in
## fread("/Users/mkia/Documents/rcoursera/CourseFive/storm_data.csv.bz2"):
## Discarded single-line footer: <<31.00,6/7/2002 0:00:00,"05:40:00
## PM","MST",101.00,"KEITH","NE","HAIL",6.00,"NW","PAXTON",6/7/2002
## 0:00:00,"05:40:00
## PM",0.00,,6.00,"NW","PAXTON",0.00,0.00,,75.00,0.00,0.00,0.00,,0.00,,"LBF","NEBRASKA,
## C>>

2.2: Examining Column Names

# List every column of the raw table so the fields of interest can be chosen
names(storm_data)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

2.3: Data Subsetting

Subset the dataset on the parameters of interest. Basically, we remove the columns we don’t need for clarity.

# Columns needed for the health and economic analyses; everything else is
# dropped.
columns_to_keep <- c(
  "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)
storm_data <- storm_data[, columns_to_keep, with = FALSE]

# Keep only events with a known type ("?" is discarded) that caused at least
# one fatality or injury, or some property or crop damage.
storm_data <- storm_data[
  EVTYPE != "?" &
    (FATALITIES > 0 | INJURIES > 0 | CROPDMG > 0 | PROPDMG > 0)
]

2.4: Converting Exponent Columns

# Normalise the exponent codes to upper case so that "k"/"K", "m"/"M", ...
# are treated identically.
storm_data[, PROPDMGEXP := toupper(PROPDMGEXP)]
storm_data[, CROPDMGEXP := toupper(CROPDMGEXP)]

# One multiplier table shared by both damage columns, so that a given code
# (e.g. "2" or "H") scales property and crop damage the same way.
# Blank and unrecognised codes are deliberately absent: a lookup by name
# returns NA for them, and NA is replaced by the neutral multiplier 1 below.
exponent_multipliers <- c(
  "-" = 1, "+" = 1, "?" = 1, "0" = 1,
  "1" = 10, "2" = 100, "3" = 1000, "4" = 10000, "5" = 100000,
  "6" = 1000000, "7" = 10000000, "8" = 100000000, "9" = 1000000000,
  "H" = 100, "K" = 1000, "M" = 1000000, "B" = 1000000000
)

# Kept as a list keyed by column name, as before.
damage_exp_mapping <- list(
  "PROPDMGEXP" = exponent_multipliers,
  "CROPDMGEXP" = exponent_multipliers
)

# Replace each code by its numeric multiplier. unname() drops the lookup
# names so the columns hold plain numeric vectors.
storm_data[, PROPDMGEXP := unname(damage_exp_mapping[["PROPDMGEXP"]][PROPDMGEXP])]
storm_data[, CROPDMGEXP := unname(damage_exp_mapping[["CROPDMGEXP"]][CROPDMGEXP])]

# Blank or unmapped codes produced NA above; treat them as "no scaling".
storm_data[is.na(PROPDMGEXP), PROPDMGEXP := 1]
storm_data[is.na(CROPDMGEXP), CROPDMGEXP := 1]

2.5: Calculating Economic Costs

# Defensive pass over both multiplier columns: coerce to numeric and default
# anything that is (or becomes) NA to a multiplier of 1.
exponent_cols <- c("PROPDMGEXP", "CROPDMGEXP")
storm_data[, (exponent_cols) := lapply(.SD, function(multiplier) {
  multiplier <- as.numeric(multiplier)
  multiplier[is.na(multiplier)] <- 1
  multiplier
}), .SDcols = exponent_cols]

# Damage cost = reported amount scaled by its multiplier
storm_data[, Property_Damage := PROPDMG * PROPDMGEXP]
storm_data[, Crop_Damage := CROPDMG * CROPDMGEXP]

2.6: Aggregating Total Costs

# Total property, crop and combined damage for every event type
economic_impact <- storm_data[, .(
  Total_Property_Damage = sum(Property_Damage),
  Total_Crop_Damage = sum(Crop_Damage),
  Total_Economic_Damage = sum(Property_Damage) + sum(Crop_Damage)
), by = EVTYPE]

# Rank by combined damage and keep at most the 10 costliest event types.
# head() is used instead of [1:10] because the latter pads the result with
# all-NA rows when fewer than 10 event types are present.
top_economic_impact <- head(economic_impact[order(-Total_Economic_Damage)], 10)

2.7: Aggregating Health Impacts

# Total fatalities, injuries and their sum for every event type
health_impact <- storm_data[, .(
  Total_Fatalities = sum(FATALITIES),
  Total_Injuries = sum(INJURIES),
  Total_Health_Impact = sum(FATALITIES) + sum(INJURIES)
), by = EVTYPE]

# Rank by fatalities and keep at most the 10 deadliest event types.
# head() is used instead of [1:10] because the latter pads the result with
# all-NA rows when fewer than 10 event types are present.
top_health_impact <- head(health_impact[order(-Total_Fatalities)], 10)

3: Results

3.1: Events Most Harmful to Population Health

# Reshape to long format: one row per (event type, impact measure) pair
health_impact_melted <- melt(
  top_health_impact,
  id.vars = "EVTYPE",
  measure.vars = c("Total_Fatalities", "Total_Injuries", "Total_Health_Impact"),
  variable.name = "Impact_Type",
  value.name = "value"
)

# Grouped bar chart: one cluster per event type, one bar per impact measure
health_plot <- ggplot(
  health_impact_melted,
  aes(x = reorder(EVTYPE, -value), y = value, fill = Impact_Type)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Top 10 Weather Events Most Harmful to Health in the US",
    y = "Count",
    x = "Event Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Render the chart
health_plot

3.2: Events that have the Greatest Economic Consequences

# Reshape to long format: one row per (event type, damage measure) pair
economic_impact_melted <- melt(
  top_economic_impact,
  id.vars = "EVTYPE",
  measure.vars = c(
    "Total_Property_Damage", "Total_Crop_Damage", "Total_Economic_Damage"
  ),
  variable.name = "Damage_Type",
  value.name = "value"
)

# Grouped bar chart: one cluster per event type, one bar per damage measure
economic_plot <- ggplot(
  economic_impact_melted,
  aes(x = reorder(EVTYPE, -value), y = value, fill = Damage_Type)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Top 10 Weather Events with Greatest Economic Consequences",
    y = "Cost (USD)",
    x = "Event Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Render the chart
economic_plot