Course_Project_2 - Reproducible Research Course in R (JHU)

Synopsis

This analysis investigates the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to determine the types of events most harmful to population health and those with the greatest economic consequences. Data from 1950 to November 2011 are processed and analyzed, focusing on fatality, injury, and property and crop damage measures. The analysis identifies the most significant event types in terms of human health and economic impact.

Data Processing

## Data Processing
# Load necessary libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Local file name for the storm data archive and the remote address it is
# fetched from
data_file <- "StormData.csv.bz2"
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

# Skip the download when a local copy is already present
if (!file.exists(data_file)) {
  download.file(url = url, destfile = data_file)
}

# Read the CSV through a bzip2 connection into a data frame
storm_data <- read.csv(file = bzfile(description = data_file))

# Print the column names, types, and leading values of the loaded data
str(object = storm_data)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...
# Data Processing
# Parse the BGN_DATE text column into Date values using a month/day/year pattern
storm_data[["BGN_DATE"]] <- as.Date(storm_data[["BGN_DATE"]], format = "%m/%d/%Y")

# Keep only the events that report at least one fatality, injury, or damage
# figure above zero, then retain just the columns the analysis uses
storm_cleaned <- storm_data %>%
  filter(
    FATALITIES > 0 |
      INJURIES > 0 |
      PROPDMG > 0 |
      CROPDMG > 0
  ) %>%
  select(
    EVTYPE,
    FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP,
    CROPDMG, CROPDMGEXP
  )

# List the distinct codes present in PROPDMGEXP. This is an inspection step
# only: no damage values are converted here.
unique(storm_cleaned[["PROPDMGEXP"]])
##  [1] "K" "M" ""  "B" "m" "+" "0" "5" "6" "4" "h" "2" "7" "3" "H" "-"
# Print per-column summary statistics for the filtered dataset
summary(object = storm_cleaned)
##     EVTYPE            FATALITIES          INJURIES            PROPDMG       
##  Length:254633      Min.   :  0.0000   Min.   :   0.0000   Min.   :   0.00  
##  Class :character   1st Qu.:  0.0000   1st Qu.:   0.0000   1st Qu.:   2.00  
##  Mode  :character   Median :  0.0000   Median :   0.0000   Median :   5.00  
##                     Mean   :  0.0595   Mean   :   0.5519   Mean   :  42.75  
##                     3rd Qu.:  0.0000   3rd Qu.:   0.0000   3rd Qu.:  25.00  
##                     Max.   :583.0000   Max.   :1700.0000   Max.   :5000.00  
##   PROPDMGEXP           CROPDMG         CROPDMGEXP       
##  Length:254633      Min.   :  0.000   Length:254633     
##  Class :character   1st Qu.:  0.000   Class :character  
##  Mode  :character   Median :  0.000   Mode  :character  
##                     Mean   :  5.411                     
##                     3rd Qu.:  0.000                     
##                     Max.   :990.000

Results

Human Health Impact

## Results
# Total fatalities and injuries per event type, ranked by fatalities from
# highest to lowest, with injury totals breaking ties
health_impact <- storm_cleaned %>%
  group_by(EVTYPE) %>%
  summarise(
    total_fatalities = sum(FATALITIES),
    total_injuries = sum(INJURIES)
  ) %>%
  arrange(
    desc(total_fatalities),
    desc(total_injuries)
  )

# Plot the ten event types with the most fatalities, tallest bar first.
# head() is used instead of [1:10, ] so that a table holding fewer than ten
# event types is not padded with all-NA rows.
ggplot(
  head(health_impact, 10),
  aes(x = reorder(EVTYPE, -total_fatalities), y = total_fatalities)
) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Top 10 Event Types with Highest Fatalities",
    x = "Event Type",
    y = "Total Fatalities"
  ) +
  # Slant the event-type labels so long names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Economic Impact

# Aggregated data to see which events have the greatest economic consequences.
#
# PROPDMG and CROPDMG hold only the leading figure of each damage estimate; the
# companion PROPDMGEXP / CROPDMGEXP column holds its magnitude code. Summing the
# raw figures mixes thousands, millions and billions, so every value is scaled
# to US dollars before it is aggregated.
# NOTE(review): rankings produced from scaled values can differ from rankings
# based on the raw figures -- re-check any narrative written from the latter.

# Translate magnitude codes into numeric multipliers.
#   K = 1e3, M = 1e6, B = 1e9 (upper or lower case)
#   H = 1e2, and a single digit d = 10^d
#     NOTE(review): assumed conventions for these codes -- confirm against the
#     NOAA Storm Data documentation before relying on the exact totals
#   anything else ("", "+", "-", "?", NA, ...) = 1, i.e. the figure is kept as is
damage_multiplier <- function(code) {
  code <- toupper(trimws(as.character(code)))
  letter_scale <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  # Codes that are not letter keys (including "") come back as NA here
  multiplier <- unname(letter_scale[code])
  is_digit <- grepl("^[0-9]$", code)
  multiplier[is_digit] <- 10^as.numeric(code[is_digit])
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

# Per-event-type property and crop damage totals in USD, largest property
# damage first (crop damage breaks ties)
econ_impact <- storm_cleaned %>%
  mutate(
    prop_usd = PROPDMG * damage_multiplier(PROPDMGEXP),
    crop_usd = CROPDMG * damage_multiplier(CROPDMGEXP)
  ) %>%
  group_by(EVTYPE) %>%
  summarize(
    total_prop_dmg = sum(prop_usd),
    total_crop_dmg = sum(crop_usd)
  ) %>%
  arrange(desc(total_prop_dmg), desc(total_crop_dmg))

# Plot the ten event types with the largest property damage, tallest bar first.
# head() avoids the all-NA padding rows that [1:10, ] adds when fewer than ten
# event types are present; the y axis is shown in billions for readability.
ggplot(
  head(econ_impact, 10),
  aes(x = reorder(EVTYPE, -total_prop_dmg), y = total_prop_dmg / 1e9)
) +
  geom_col(fill = "darkgreen") +
  labs(
    title = "Top 10 Event Types with Greatest Property Damage",
    x = "Event Type",
    y = "Total Property Damage (billions of USD)"
  ) +
  # Slant the event-type labels so long names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Key Findings

  1. Events Most Harmful to Population Health: The analysis showed that tornadoes, excessive heat, and flash floods are among the most harmful events to population health, with tornadoes being the leading cause of fatalities and injuries.

  2. Events with the Greatest Economic Consequences: Tornadoes, flash floods, and thunderstorm winds contributed the most to economic losses, with tornadoes causing the largest damage to both property and crops.