The goal of this analysis is to explore the NOAA Storm Database and assess the effects of severe weather events on both population and economy. The database covers the time period between 1950 and November 2011.
The following analysis investigates which types of severe weather events are most harmful to population health and which have the greatest economic consequences.
Information on the Data: Documentation
Download the raw data file and read it into a data.table.
# Load required libraries
library("data.table")
## Warning: package 'data.table' was built under R version 4.3.2
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 4.3.2
# Define the URL for the dataset
data_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Prefer the existing local copy of the archive; when it is absent, fall back
# to a file in the working directory so the analysis can run on any machine.
local_copy <- "/Users/mkia/Documents/rcoursera/CourseFive/storm_data.csv.bz2"
storm_file <- if (file.exists(local_copy)) local_copy else "storm_data.csv.bz2"
# Download the archive only when it is not already on disk. mode = "wb" keeps
# the compressed file byte-exact on platforms that distinguish text from
# binary transfers.
if (!file.exists(storm_file)) {
  download.file(data_url, destfile = storm_file, mode = "wb")
}
# Read the compressed CSV file into a data.table
storm_data <- fread(storm_file)
## Warning in
## fread("/Users/mkia/Documents/rcoursera/CourseFive/storm_data.csv.bz2"):
## Discarded single-line footer: <<31.00,6/7/2002 0:00:00,"05:40:00
## PM","MST",101.00,"KEITH","NE","HAIL",6.00,"NW","PAXTON",6/7/2002
## 0:00:00,"05:40:00
## PM",0.00,,6.00,"NW","PAXTON",0.00,0.00,,75.00,0.00,0.00,0.00,,0.00,,"LBF","NEBRASKA,
## C>>
# List the available columns so the fields of interest can be picked out
names(storm_data)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
Subset the dataset to the parameters of interest: we keep only the columns needed for the analysis and drop the rows that record no injuries, fatalities, property damage or crop damage.
# Columns needed for the health and economic analysis; everything else is dropped
columns_to_keep <- c(
  "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)
storm_data <- storm_data[, columns_to_keep, with = FALSE]
# Keep only events with a known type ("?" is discarded) that caused at least
# one fatality or injury, or a non-zero property or crop damage amount
storm_data <- storm_data[
  EVTYPE != "?" &
    (FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0)
]
# Convert the property and crop damage exponent columns to numeric multipliers.
#
# The exponent columns hold single-character codes. Letters stand for
# magnitudes (H = hundred, K = thousand, M = million, B = billion) and a digit
# d stands for 10^d. The symbols "-", "+" and "?" carry no scale information
# and map to 1.
exponent_multipliers <- c(
  "-" = 1, "+" = 1, "?" = 1,
  "0" = 1, "1" = 10, "2" = 100, "3" = 1000, "4" = 10000,
  "5" = 100000, "6" = 1000000, "7" = 10000000, "8" = 100000000,
  "9" = 1000000000,
  "H" = 100, "K" = 1000, "M" = 1000000, "B" = 1000000000
)
# Both columns share one table so that the same code always yields the same
# multiplier, whichever column it appears in.
damage_exp_mapping <- list(
  "PROPDMGEXP" = exponent_multipliers,
  "CROPDMGEXP" = exponent_multipliers
)

# Translate a vector of exponent codes into numeric multipliers.
#   codes  : character vector of exponent codes (matched case-insensitively)
#   lookup : named numeric vector mapping upper-case codes to multipliers
# Returns an unnamed numeric vector the same length as `codes`. Blank codes and
# any code missing from `lookup` get a multiplier of 1, so the reported damage
# amount is then used as-is.
exp_to_multiplier <- function(codes, lookup) {
  # match() is used rather than indexing by name because an empty string can
  # never be looked up by name; unmatched codes come back as NA here.
  multiplier <- unname(lookup[match(toupper(codes), names(lookup))])
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

# Replace the character codes with their numeric multipliers
storm_data[, PROPDMGEXP := exp_to_multiplier(PROPDMGEXP, damage_exp_mapping[["PROPDMGEXP"]])]
storm_data[, CROPDMGEXP := exp_to_multiplier(CROPDMGEXP, damage_exp_mapping[["CROPDMGEXP"]])]
# Derive the cost of each record: reported amount times its numeric multiplier
storm_data[, Property_Damage := PROPDMG * PROPDMGEXP]
storm_data[, Crop_Damage := CROPDMG * CROPDMGEXP]
# Aggregate total economic impact by event type: property damage, crop damage,
# and their sum
economic_impact <- storm_data[, .(
  Total_Property_Damage = sum(Property_Damage),
  Total_Crop_Damage = sum(Crop_Damage),
  Total_Economic_Damage = sum(Property_Damage) + sum(Crop_Damage)
), by = EVTYPE]
# Sort by total economic damage (largest first) and keep at most the top 10.
# head() is used instead of [1:10] because indexing past the last row would
# pad the result with all-NA rows when fewer than 10 event types are present.
top_economic_impact <- head(economic_impact[order(-Total_Economic_Damage)], 10)
# Aggregate total fatalities and injuries by event type, plus their sum
health_impact <- storm_data[, .(
  Total_Fatalities = sum(FATALITIES),
  Total_Injuries = sum(INJURIES),
  Total_Health_Impact = sum(FATALITIES) + sum(INJURIES)
), by = EVTYPE]
# Sort by total fatalities (largest first) and keep at most the top 10.
# head() is used instead of [1:10] because indexing past the last row would
# pad the result with all-NA rows when fewer than 10 event types are present.
top_health_impact <- head(health_impact[order(-Total_Fatalities)], 10)
# Reshape the top-10 health table to long form: one row per event type and
# measure, with the measure name in Impact_Type and its total in value
health_impact_melted <- melt(
  top_health_impact,
  id.vars = "EVTYPE",
  variable.name = "Impact_Type"
)
# Side-by-side bars per event type, one bar for each measure; bar heights are
# the totals themselves. Event types are ordered on the x axis by
# reorder(EVTYPE, -value).
health_plot <- ggplot(
  health_impact_melted,
  aes(x = reorder(EVTYPE, -value), y = value, fill = Impact_Type)
) +
  geom_col(position = "dodge") +
  labs(
    x = "Event Type",
    y = "Count",
    title = "Top 10 Weather Events Most Harmful to Health in the US"
  ) +
  # Slant the event names so long labels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Render the figure
health_plot
# Reshape the top-10 economic table to long form: one row per event type and
# damage measure, with the measure name in Damage_Type and its total in value
economic_impact_melted <- melt(
  top_economic_impact,
  id.vars = "EVTYPE",
  variable.name = "Damage_Type"
)
# Side-by-side bars per event type, one bar for each damage measure; bar
# heights are the totals themselves. Event types are ordered on the x axis by
# reorder(EVTYPE, -value).
economic_plot <- ggplot(
  economic_impact_melted,
  aes(x = reorder(EVTYPE, -value), y = value, fill = Damage_Type)
) +
  geom_col(position = "dodge") +
  labs(
    x = "Event Type",
    y = "Cost (USD)",
    title = "Top 10 Weather Events with Greatest Economic Consequences"
  ) +
  # Slant the event names so long labels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Render the figure
economic_plot