## Analysis of the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This project explores the NOAA storm database, which tracks major storms and weather events, to address the most severe types of weather events in the USA, which caused greatest damage to human population in terms of fatalities/injuries and economic loss during the years 1950 - 2011.
## - identify the weather events that are most harmful with respect to population health
## - identify the weather events that have the greatest economic consequences.
## We conclude that TORNADOS and FLOODS are most harmful weather events in the USA in terms of the risk to human health and economic impact.
#The data source is in the form of a comma-separated-value file compressed via the bzip2 algorithm to reduce its size. It is possible to download the source file from the course web site: Storm Data
# downloading data needed ----------------------------------------------------------------
library(dplyr)
library(ggplot2)
library(data.table)
library(lubridate)
library(rmarkdown)
Url_data <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
filename <- "repdata_data_StormData.csv.bz2"
download.file(Url_data, filename)
# reading data
My_data <- fread(file = filename, sep = "auto", header = TRUE)
My_data <- data.table(My_data) #transfer back to data.table
# Change date formats and filter data for dates
My_data$BGN_DATE <- mdy_hms(My_data$BGN_DATE)
My_data <- My_data[BGN_DATE > "1995-12-31"]
Namely: EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP.
Therefore, we can limit our data to these variables.
# Select the needed columns
My_data <- My_data[, colnames(My_data) %in%
c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
, with=FALSE]
EVTYPE = type of event
FATALITIES = number of fatalities
INJURIES = number of injuries
PROPDMG = the size of property damage
PROPDMGEXP = the exponent values for ‘PROPDMG’ (property damage)
CROPDMG = the size of crop damage
CROPDMGEXP = the exponent values for ‘CROPDMG’ (crop damage)
#cleaning event types names
My_data$EVTYPE <- toupper(My_data$EVTYPE)
# eliminating zero data
My_data <- My_data[FATALITIES != 0 &
INJURIES != 0 &
PROPDMG != 0 &
CROPDMG != 0]
#pivot table with dplyr
My_data <- data.frame(My_data) #transfer back to data.frame
Health_data <- My_data %>% group_by(EVTYPE) %>%
summarise(FATALITIES = sum(FATALITIES),
INJURIES = sum(INJURIES))
## `summarise()` ungrouping output (override with `.groups` argument)
Health_data <- data.table(Health_data) #transfer back to data.table
Health_data <- Health_data[, PEOPLE_LOSS := FATALITIES + INJURIES, by = "EVTYPE"]
# descending order by PEOPLE_LOSS
Health_data <- Health_data[order(Health_data$PEOPLE_LOSS, decreasing = TRUE), ]
#top 10 by PEOPLE_LOSS
Top10.EVTYPE.People <- top_n(Health_data, 10)
## Selecting by PEOPLE_LOSS
knitr::kable(Top10.EVTYPE.People, format = "markdown")
| EVTYPE | FATALITIES | INJURIES | PEOPLE_LOSS |
|---|---|---|---|
| FLOOD | 37 | 2487 | 2524 |
| TORNADO | 160 | 1332 | 1492 |
| HURRICANE/TYPHOON | 22 | 884 | 906 |
| TROPICAL STORM | 7 | 266 | 273 |
| FLASH FLOOD | 18 | 214 | 232 |
| TSUNAMI | 32 | 129 | 161 |
| WILDFIRE | 31 | 124 | 155 |
| EXCESSIVE HEAT | 46 | 18 | 64 |
| HIGH WIND | 10 | 51 | 61 |
| HEAVY SNOW | 4 | 38 | 42 |
The number/letter in the exponent value columns (PROPDMGEXP and CROPDMGEXP) represents the power of ten (10^The number). It means that the total size of damage is the product of PROPDMG and CROPDMG and figure 10 in the power corresponding to exponent value.
#transform letters and symbols to numbers
My_data$PROPDMGEXP <- gsub("[Hh]", "2", My_data$PROPDMGEXP)
My_data$PROPDMGEXP <- gsub("[Kk]", "3", My_data$PROPDMGEXP)
My_data$PROPDMGEXP <- gsub("[Mm]", "6", My_data$PROPDMGEXP)
My_data$PROPDMGEXP <- gsub("[Bb]", "9", My_data$PROPDMGEXP)
My_data$PROPDMGEXP <- gsub("\\+", "1", My_data$PROPDMGEXP)
My_data$PROPDMGEXP <- gsub("\\?|\\-|\\ ", "0", My_data$PROPDMGEXP)
My_data$PROPDMGEXP <- as.numeric(My_data$PROPDMGEXP)
My_data$CROPDMGEXP <- gsub("[Hh]", "2", My_data$CROPDMGEXP)
My_data$CROPDMGEXP <- gsub("[Kk]", "3", My_data$CROPDMGEXP)
My_data$CROPDMGEXP <- gsub("[Mm]", "6", My_data$CROPDMGEXP)
My_data$CROPDMGEXP <- gsub("[Bb]", "9", My_data$CROPDMGEXP)
My_data$CROPDMGEXP <- gsub("\\+", "1", My_data$CROPDMGEXP)
My_data$CROPDMGEXP <- gsub("\\-|\\?|\\ ", "0", My_data$CROPDMGEXP)
My_data$CROPDMGEXP <- as.numeric(My_data$CROPDMGEXP)
#creating total damage values
My_data$PROPDMGEXP[is.na(My_data$PROPDMGEXP)] <- 0
My_data$CROPDMGEXP[is.na(My_data$CROPDMGEXP)] <- 0
#Total damage values
My_data <- mutate(My_data,
PROPDMGTOTAL = PROPDMG * (10 ^ PROPDMGEXP),
CROPDMGTOTAL = CROPDMG * (10 ^ CROPDMGEXP))
#Economic_data: Let us now analyze the date from above
#pivot table with dplyr
Economic_data <- My_data %>% group_by(EVTYPE) %>%
summarise(PROPDMGTOTAL = sum(PROPDMGTOTAL),
CROPDMGTOTAL = sum(CROPDMGTOTAL))
## `summarise()` ungrouping output (override with `.groups` argument)
Economic_data <- data.table(Economic_data) #transfer back to data.table
Economic_data <- Economic_data[, ECONOMIC_LOSS := PROPDMGTOTAL + CROPDMGTOTAL, by = "EVTYPE"]
# descending order by ECONOMIC_LOSS
Economic_data <- Economic_data[order(Economic_data$ECONOMIC_LOSS, decreasing = TRUE), ]
#top 10 by ECONOMIC_LOSS
Top10.EVTYPE.economy <- top_n(Economic_data, 10)
## Selecting by ECONOMIC_LOSS
knitr::kable(Top10.EVTYPE.economy, format = "markdown")
| EVTYPE | PROPDMGTOTAL | CROPDMGTOTAL | ECONOMIC_LOSS |
|---|---|---|---|
| HURRICANE/TYPHOON | 11300000000 | 1795000000 | 13095000000 |
| WILDFIRE | 1165120000 | 75150000 | 1240270000 |
| HIGH WIND | 948190000 | 222930000 | 1171120000 |
| TORNADO | 1040902000 | 42920000 | 1083822000 |
| TROPICAL STORM | 628470000 | 121690000 | 750160000 |
| EXCESSIVE HEAT | 170000 | 492400000 | 492570000 |
| HURRICANE | 140250000 | 127000000 | 267250000 |
| FLOOD | 210500000 | 12180500 | 222680500 |
| FLASH FLOOD | 94657000 | 2235000 | 96892000 |
| TSUNAMI | 81000000 | 20000 | 81020000 |
Analyzing population health impact on the graph one can conclude that TORNADOS, EXCESSIVE HEAT and FLOOD are the main contributors to deaths and injuries out of all event types of weather events.
#plotting health loss -> HL
HL <- ggplot(data = Top10.EVTYPE.People, aes(x = reorder(EVTYPE, PEOPLE_LOSS), y = PEOPLE_LOSS)) +
geom_bar(stat = "identity", colour = "black") +
labs(title = "USA total people loss by weather events in 1996-2011") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(y = "Number of fatalities and injuries", x = "Event Type") +
coord_flip()
HL
#plotting economic loss -> EL
EL <- ggplot(data = Top10.EVTYPE.economy, aes(x = reorder(EVTYPE, ECONOMIC_LOSS), y = ECONOMIC_LOSS)) +
geom_bar(stat = "identity", colour = "black") +
labs(title = "USA total economic loss by weather events in 1996-2011") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(y = "Size of property and crop loss", x = "Event Type") +
coord_flip()
EL