##This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

####The data analysis addressed the following questions: #- Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health? #- Across the United States, which types of events have the greatest economic consequences?

####In conclusion;

Download url Stormdata: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2

More documentation about the database, with some description of how the variables are constructed or defined, is available from: National Weather Service https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf National Climatic Data Center Storm Events https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf

##Data Processing

# Reading data: fetch the compressed NOAA storm data set and load it.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
destfile <- "StormData.Csv.bz2"
# Only download when the archive is not already on disk, so re-running the
# analysis does not fetch the same file again.
if (!file.exists(destfile)) {
  curl::curl_download(url, destfile)
}
Raw_data <- read.csv(file = destfile, header = TRUE, sep = ",")

Extra mentor comments that make the task easier are given on the Coursera discussion platform. (https://www.coursera.org/learn/reproducible-research/discussions/weeks/4/threads/IdtP_JHzEeaePQ71AQUtYw)

####These comments explain that, although data collection started in 1950, all event types have only been recorded since January 1996. We therefore use only the data from 1996 onwards and disregard the earlier records.

# Subsetting by date: keep only events that began on or after 1 January 1996.
Main_data <- Raw_data
# Parse in a fixed time zone so the result does not depend on the session's
# local zone (a local zone with daylight-saving changes can produce NA or
# shifted times, and subset() silently drops rows whose condition is NA).
Main_data$BGN_DATE <- as.POSIXct(Raw_data$BGN_DATE,
                                 format = "%m/%d/%Y %H:%M:%S", tz = "UTC")
# State the cutoff as the first day that is kept, so the filter does not rely
# on every timestamp being exactly midnight.
Main_data <- subset(Main_data,
                    BGN_DATE >= as.POSIXct("1996-01-01", tz = "UTC"))

#To answer the questions we need to look at the event types and at their health and economic consequences. Therefore we need to focus on the following 7 variables: EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP.

# Keep only the event type plus the health and damage columns used below
Main_data <- Main_data[, c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG",
                           "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"),
                       drop = FALSE]
# Count the distinct event types that remain and show the number
num_unique_events <- sum(!duplicated(Main_data[["EVTYPE"]]))
print(num_unique_events)
## [1] 516

##There are in total 516 unique event types in the variable EVTYPE. We only need the ones that are most harmful to the population or that have the greatest economic consequences.

##To answer the first question, let us start with the events most harmful to the population, which involves the variables FATALITIES and INJURIES.

# Sum fatalities and injuries per event type
Health_data <- aggregate(
  cbind(FATALITIES, INJURIES) ~ EVTYPE,
  data = Main_data,
  FUN = sum
)
# Combined casualty count (fatalities + injuries) for each event type
Health_data[["PEOPLE_LOSS"]] <- with(Health_data, FATALITIES + INJURIES)
# Largest combined loss first, then keep the first ten rows
Health_data <- Health_data[
  with(Health_data, order(PEOPLE_LOSS, decreasing = TRUE)),
]
Top10_events_people <- Health_data[seq_len(10), ]
#knitr::kable(Top10_events_people, format = "markdown")

#Now lets look at the economic consequences

##The values in the PROPDMGEXP and CROPDMGEXP columns represent exponents that indicate powers of ten. This means the total damage is calculated by multiplying the PROPDMG or CROPDMG value by 10 raised to the power specified in the exponent column.

###Exponent values are:
Letters, which correspond to specific magnitudes: B or b = Billion (10^9) M or m = Million (10^6) K or k = Thousand (10^3) H or h = Hundred (10^2)

Symbols: “-” = Indicates a value less than the stated amount. “+” = Suggests a value greater than the stated amount. “?” = Represents uncertainty or low confidence in the value. These symbols (-, +, and ?) can be optionally ignored if they do not provide meaningful information.

# Convert NOAA damage-exponent codes to numeric powers of ten.
#
# Args:
#   exp_column: vector of exponent codes (character or factor), e.g. the
#               PROPDMGEXP or CROPDMGEXP column.
#
# Returns:
#   A numeric vector the same length as exp_column holding the exponent e,
#   so that damage = value * 10^e:
#     H/h -> 2, K/k -> 3, M/m -> 6, B/b -> 9, "+" -> 1,
#     a digit string -> that number,
#     "?", "-", blank, NA or any other code -> 0.
#
# Codes are matched as whole values instead of substituting characters inside
# the string, so a malformed code such as "1K" is treated as unknown (0)
# rather than being turned into the exponent 13, and blank codes no longer
# raise "NAs introduced by coercion" warnings.
convert_dmg_exp <- function(exp_column) {
  codes <- toupper(trimws(as.character(exp_column)))

  # Letter and symbol codes with a defined magnitude
  known_codes <- c("H", "K", "M", "B", "+")
  known_exponents <- c(2, 3, 6, 9, 1)
  exponents <- known_exponents[match(codes, known_codes)]

  # Purely numeric codes are already exponents
  is_digit_code <- grepl("^[0-9]+$", codes)
  exponents[is_digit_code] <- as.numeric(codes[is_digit_code])

  # Everything else ("?", "-", blank, NA, unknown) carries no magnitude
  exponents[is.na(exponents)] <- 0
  exponents
}

# Translate both exponent columns (property and crop) to numeric exponents
Main_data[c("PROPDMGEXP", "CROPDMGEXP")] <- lapply(
  Main_data[c("PROPDMGEXP", "CROPDMGEXP")],
  convert_dmg_exp
)

#Create total property, crop damage and total damage economic loss

# Load necessary library
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Scale each damage figure by ten raised to its exponent to get full amounts
Main_data$PROPDMGTOTAL <- Main_data$PROPDMG * 10^Main_data$PROPDMGEXP
Main_data$CROPDMGTOTAL <- Main_data$CROPDMG * 10^Main_data$CROPDMGEXP

# Sum property and crop damage per event type (EVTYPE)
Economic_data <- aggregate(
  cbind(PROPDMGTOTAL, CROPDMGTOTAL) ~ EVTYPE,
  data = Main_data,
  FUN = sum
)

# Total economic loss is property damage plus crop damage
Economic_data[["ECONOMIC_LOSS"]] <- with(
  Economic_data,
  PROPDMGTOTAL + CROPDMGTOTAL
)

# Put the costliest event types first
Economic_data <- Economic_data[
  with(Economic_data, order(ECONOMIC_LOSS, decreasing = TRUE)),
]

# Keep the ten event types with the highest economic loss
Top10_events_economy <- Economic_data[seq_len(10), ]

# Optional: show the top 10 events as a markdown table
#knitr::kable(Top10_events_economy, format = "markdown")

##Only three figures were allowed according to the grading criteria, so to still be able to show all the data, a combined table with all the necessary data was made. It contains ranks for the top 10 events by people loss and by economic loss (variable names: Rank_People_Loss and Rank_Economic_Loss). Events that do not appear in both top 10s are shown as ‘NA’ in either the people or the economic section of the table.

# Load necessary library for data manipulation
library(dplyr)

# Rank the top people-loss events, largest loss first
Top10_events_people <- mutate(
  Top10_events_people,
  Rank_People_Loss = row_number(desc(PEOPLE_LOSS))
)

# Rank the top economic-loss events, largest loss first
Top10_events_economy <- mutate(
  Top10_events_economy,
  Rank_Economic_Loss = row_number(desc(ECONOMIC_LOSS))
)

# Merge both top-10 tables on the event type, keeping events found in either
Combined_events <- Top10_events_people %>%
  full_join(Top10_events_economy, by = "EVTYPE")

# Render the merged table as markdown
knitr::kable(Combined_events, format = "markdown")
EVTYPE FATALITIES INJURIES PEOPLE_LOSS Rank_People_Loss PROPDMGTOTAL CROPDMGTOTAL ECONOMIC_LOSS Rank_Economic_Loss
TORNADO 1511 20667 22178 1 24616945710 283425010 24900370720 4
EXCESSIVE HEAT 1797 6391 8188 2 NA NA NA NA
FLOOD 414 6758 7172 3 143944833550 4974778400 148919611950 1
LIGHTNING 651 4141 4792 4 NA NA NA NA
TSTM WIND 241 3629 3870 5 NA NA NA NA
FLASH FLOOD 887 1674 2561 6 15222203910 1334901700 16557105610 6
THUNDERSTORM WIND 130 1400 1530 7 NA NA NA NA
WINTER STORM 191 1292 1483 8 NA NA NA NA
HEAT 237 1222 1459 9 NA NA NA NA
HURRICANE/TYPHOON 64 1275 1339 10 69305840000 2607872800 71913712800 2
STORM SURGE NA NA NA NA 43193536000 5000 43193541000 3
HAIL NA NA NA NA 14595143420 2476029450 17071172870 5
HURRICANE NA NA NA NA 11812819010 2741410000 14554229010 7
DROUGHT NA NA NA NA 1046101000 13367566000 14413667000 8
TROPICAL STORM NA NA NA NA 7642475550 677711000 8320186550 9
HIGH WIND NA NA NA NA 5247860360 633561300 5881421660 10

##Results

To answer questions 1 and 2, we need to look at the events most harmful to population health and the events with the greatest economic consequences.

#Analyzing harmful events on population health
# Load necessary library for plotting
library(ggplot2)

# Horizontal bar chart of fatalities plus injuries for the top 10 event types,
# with bars ordered by total people loss
ggplot(Top10_events_people,
       aes(y = PEOPLE_LOSS, x = reorder(EVTYPE, PEOPLE_LOSS))) +
  geom_col(colour = "red") +
  coord_flip() +
  ggtitle("Total People Loss in USA by Weather Events (1996-2011)") +
  xlab("Event Type") +
  ylab("Number of Fatalities and Injuries") +
  theme(plot.title = element_text(hjust = 0.5))

#Analyzing the total economic consequences of harmful events
# Horizontal bar chart of property plus crop damage for the top 10 event types,
# with bars ordered by total economic loss
ggplot(Top10_events_economy,
       aes(y = ECONOMIC_LOSS, x = reorder(EVTYPE, ECONOMIC_LOSS))) +
  geom_col(colour = "green") +
  coord_flip() +
  ggtitle("Total Economic Loss in USA by Weather Events (1996-2011)") +
  xlab("Event Type") +
  ylab("Size of Property and Crop Loss") +
  theme(plot.title = element_text(hjust = 0.5))