This analysis explores the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The data was downloaded from: Storm Data. The analysis focuses on discovering which types of events have the biggest economic and health consequences.
#Loading libraries
library(data.table) #to work faster with large dataset
library(plotly) #nice iteractive plots
library(reshape)
library(plyr)
# Download the compressed NOAA storm data into a temporary file, read it
# into `storm_data`, and record the dimensions of the raw data set.
temp <- tempfile()
# setInternet2() exists only on Windows builds of R and is obsolete in
# current releases (a no-op from R 3.3.0 on), so it is called only where it
# is both available and still useful instead of failing everywhere else.
if (.Platform$OS.type == "windows" && getRversion() < "3.3.0" &&
    exists("setInternet2")) {
  setInternet2(use = TRUE)
}
# mode = "wb" keeps the binary .bz2 archive byte-exact on Windows.
download.file(
  "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
  temp,
  mode = "wb"
)
# read.csv() opens the path through file(), which detects the bzip2
# compression, so no explicit decompression step is needed.
storm_data <- read.csv(temp)
# The temporary archive is no longer needed once the data is in memory.
unlink(temp)
storm_data_dim <- dim(storm_data)
So, the storm_data dataset contains 902297 recorded storm events, each described by 37 variables.
# Number of distinct event type labels present in the raw data.
length_EVTYPE <- length(unique(storm_data[["EVTYPE"]]))
There are 985 different storm event types. In order to do the analysis, we should decrease the number of categories (many of the current types refer to the same thing). So, let’s group the most frequent events into a few groups: STORM, THUNDERSTORM, TORNADO, FLOOD/PRECIPITATION, WINTER EVENTS, FIRE, DROUGHT/WARM, FOG, LAND EVENTS.
# Number of event types that occurred more than 30 times.
# The frequency table is built once; counting the entries above the
# threshold does not require sorting it.
event_type_counts <- table(storm_data$EVTYPE)
number_frequent_events <- sum(event_type_counts > 30)

# Grouping of events
storm_data$EVTYPE <- as.character(storm_data$EVTYPE)

# Lookup of group name -> upper-cased event type labels that belong to it.
# Groups are applied in this order, so a label listed in more than one
# group would end up in the last one that lists it.
event_groups <- list(
  "STORM" = c(
    "HAIL", "THUNDERSTORM WINDS", "HEAVY RAIN", "WINTER STORM",
    "FUNNEL CLOUD", "STRONG WIND", "ICE STORM", "HIGH WIND",
    "TSTM WIND/HAIL", "HIGH SURF", "TROPICAL STORM", "MARINE HAIL",
    "DUST STORM", "WIND", "STORM SURGE", "STRONG WINDS", "HURRICANE",
    "STORM SURGE/TIDE", "DUST DEVIL", "MARINE HIGH WIND",
    "HURRICANE/TYPHOON", "HEAVY SURF", "MARINE STRONG WIND", "SMALL HAIL"
  ),
  "THUNDERSTORM" = c(
    "TSTM WIND", "THUNDERSTORM WIND", "LIGHTNING",
    "THUNDERSTORM WINDS HAIL", "THUNDERSTORM"
  ),
  "TORNADO" = c(
    "TORNADO", "WATERSPOUT", "FUNNEL CLOUDS", "FUNNEL", "WATERSPOUTS"
  ),
  "FLOOD/PRECIPITATION" = c(
    "FLASH FLOOD", "FLOOD", "URBAN/SML STREAM FLD", "COASTAL FLOOD",
    "RIP CURRENT", "URBAN FLOOD", "ASTRONOMICAL LOW TIDE", "RIVER FLOOD",
    "COASTAL FLOODING", "FLOODING", "ASTRONOMICAL HIGH TIDE",
    "URBAN FLOODING", "COASTAL FLOOFING", "MONTHLY PRECIPITATION",
    "MIXED PRECIPITATION"
  ),
  "WINTER EVENTS" = c(
    "HEAVY SNOW", "WINTER WEATHER", "BLIZZARD", "FROST/FREEZE",
    "EXTREME COLD/WIND CHILL", "EXTREME COLD", "LAKE-EFFECT SNOW", "SNOW",
    "COLD/WIND CHILL", "AVALANCHE", "FREEZING RAIN", "EXTREME WINDCHILL",
    "LIGHT SNOW", "MODERATE SNOWFALL", "WINTRY MIX", "FREEZE", "COLD",
    "ICE", "SNOW AND ICE", "GLAZE", "HEAVY SNOW SQUALLS"
  ),
  "FIRE" = c("WILDFIRE", "FOREST FIRE"),
  "DROUGHT/WARM" = c(
    "DROUGHT", "EXCESSIVE HEAT", "HEAT", "DRY MICROBURST", "RECORD WARMTH",
    "UNSEASONABLY WARM", "RECORD HEAT", "HEAT WAVE", "TEMPERATURE RECORD"
  ),
  "FOG" = c("DENSE FOG", "FOG", "FREEZING FOG"),
  "LAND EVENTS" = c("LANDSLIDE")
)

# Upper-case the labels once instead of once per group, and fill a plain
# vector before attaching it so the data frame is modified a single time.
evtype_upper <- toupper(storm_data$EVTYPE)
grouped <- rep(NA_character_, length(evtype_upper))
for (group_name in names(event_groups)) {
  grouped[evtype_upper %in% event_groups[[group_name]]] <- group_name
}
# Event types that belong to no group keep NA.
storm_data$EVTYPE_GROUPED <- grouped

# Number of groups actually used and share of observations they cover (%).
is_grouped <- !is.na(grouped)
number_frequent_grouped_events <- length(unique(grouped[is_grouped]))
percentage_covered <- round(mean(is_grouped), digits = 4) * 100
So, there are 104 event types that occurred more than 30 times, and they were grouped into 9 groups. These groups cover 97.67% of all observations.
In order to get correct damage figures, the PROPDMG and CROPDMG columns need to be scaled according to the exponent codes in PROPDMGEXP and CROPDMGEXP.
#' Scale damage amounts by their exponent codes.
#'
#' Works on single values as well as on whole vectors; `data` and
#' `condition` are combined element-wise with the usual recycling rules.
#'
#' @param data Numeric damage amount(s).
#' @param condition Exponent code(s), character or factor. "K", "M" and "B"
#'   (case-insensitive) multiply by 10^3, 10^6 and 10^9. A code that parses
#'   as a number `n` multiplies by 10^n. Any other code, including "" and
#'   NA, leaves the amount unchanged.
#' @return Numeric vector with the scaled amounts.
convert_demage <- function(data, condition) {
  code <- as.character(condition)
  # Codes such as "+" or "?" are expected not to parse as numbers; they are
  # handled below, so the coercion warning would only be noise.
  exponent <- suppressWarnings(as.numeric(code))
  # Unparseable or missing codes mean "no scaling" (10^0 == 1).
  exponent[is.na(exponent)] <- 0
  # %in% never yields NA, so missing codes simply do not match a letter.
  letter <- toupper(code)
  exponent[letter %in% "K"] <- 3
  exponent[letter %in% "M"] <- 6
  exponent[letter %in% "B"] <- 9
  data * 10^exponent
}
# Scale each damage column by the exponent codes stored in its companion
# column (amount column -> exponent column).
damage_columns <- c(PROPDMG = "PROPDMGEXP", CROPDMG = "CROPDMGEXP")
for (amount_column in names(damage_columns)) {
  exponent_column <- damage_columns[[amount_column]]
  storm_data[[amount_column]] <- mapply(
    convert_demage,
    storm_data[[amount_column]],
    storm_data[[exponent_column]]
  )
}

# Keep only the columns used by the rest of the analysis.
columns_to_keep <- c(
  "EVTYPE_GROUPED", "FATALITIES", "INJURIES", "PROPDMG", "CROPDMG", "BGN_DATE"
)
storm_data <- storm_data[, columns_to_keep]
Let’s verify if there are any NAs in columns:
# Count the missing values in every column (one integer per column).
vapply(storm_data, function(column) sum(is.na(column)), integer(1))
EVTYPE_GROUPED FATALITIES INJURIES PROPDMG CROPDMG
21037 0 0 0 0
BGN_DATE
0
NAs occur only in EVTYPE_GROUPED, which is expected (they correspond to infrequent events that were not assigned to any group).
Let’s verify type of variables:
# Report the class of every column.
sapply(X = storm_data, FUN = function(column) class(column))
EVTYPE_GROUPED FATALITIES INJURIES PROPDMG CROPDMG
"character" "numeric" "numeric" "numeric" "numeric"
BGN_DATE
"factor"
EVTYPE and EVTYPE_GROUPED should be converted to factors, and BGN_DATE should be reduced to the year of the event.
# EVTYPE is no longer a column after the column subset above, so
# `storm_data$EVTYPE` only resolved through `$` partial matching to
# EVTYPE_GROUPED. Name the source column explicitly: the result is the same
# factor copy, without depending on partial matching.
storm_data$EVTYPE <- factor(storm_data[["EVTYPE_GROUPED"]])
storm_data$EVTYPE_GROUPED <- factor(storm_data$EVTYPE_GROUPED)
# Reduce BGN_DATE to the year of the event. Parsing in UTC keeps the result
# independent of the local time zone (a local daylight-saving gap at
# midnight would otherwise turn the timestamp into NA).
storm_data$BGN_DATE <- year(
  as.POSIXct(
    as.character(storm_data$BGN_DATE),
    format = "%m/%d/%Y %H:%M:%S",
    tz = "UTC"
  )
)
Transformation of data.frame to data.table to work faster with it:
storm_data <- data.table(storm_data)
# Key on the grouped event type and the year. EVTYPE holds the same values
# as EVTYPE_GROUPED at this point, so the resulting row order is unchanged.
setkey(storm_data, EVTYPE_GROUPED, BGN_DATE)

# Number of recorded events per year and event group (.N is the row count
# of each group); events without a group are excluded.
storm_data_sum <- storm_data[
  !is.na(EVTYPE_GROUPED),
  list(COUNT = .N),
  by = c("BGN_DATE", "EVTYPE_GROUPED")
]
setnames(storm_data_sum, "BGN_DATE", "YEAR")

# Columns are passed as vectors: bare column names are not resolved inside
# `data` by plotly >= 4, while vectors work with old and new versions.
# The y axis title is set explicitly so it does not depend on how the
# plotly version derives a default title.
plot_ly(
  data = storm_data_sum,
  x = storm_data_sum$YEAR,
  y = storm_data_sum$COUNT,
  color = storm_data_sum$EVTYPE_GROUPED
) %>%
  layout(
    xaxis = list(title = ""),
    yaxis = list(title = "COUNT"),
    title = "Number of events per year"
  )
We can conclude that the number of recorded events increases over time. Information about TORNADO, THUNDERSTORM and STORM events is available starting from 1950. WINTER EVENTS, LAND EVENTS, FOG, FLOOD/PRECIPITATION, FIRE and DROUGHT/WARM events have been recorded since 1993.
The next bar plot shows the average number of people affected (fatalities and injuries) by different types of events:
# Total number of people affected per event.
storm_data$ALL_SUFFERED <- storm_data$FATALITIES + storm_data$INJURIES

# Average casualties per event for every event group; events without a
# group are excluded.
storm_data_sum <- storm_data[
  !is.na(EVTYPE_GROUPED),
  list(
    ALL_SUFFERED = mean(ALL_SUFFERED),
    FATALITIES = mean(FATALITIES),
    INJURIES = mean(INJURIES)
  ),
  by = EVTYPE_GROUPED
]

# Long format: one row per (event group, measure). The argument name is
# spelled out instead of relying on partial matching of `id`.
storm_data_sum <- melt(storm_data_sum, id.vars = c("EVTYPE_GROUPED"))
# Fix the order in which the measures appear in the plot.
storm_data_sum$variable <- factor(
  storm_data_sum$variable,
  levels = c("INJURIES", "FATALITIES", "ALL_SUFFERED")
)

# Columns are passed as vectors: bare column names are not resolved inside
# `data` by plotly >= 4, while vectors work with old and new versions.
plot_ly(
  data = storm_data_sum,
  x = storm_data_sum$EVTYPE_GROUPED,
  y = storm_data_sum$value,
  type = "bar",
  color = storm_data_sum$variable
) %>%
  layout(
    autosize = FALSE,
    margin = list(b = 90),
    xaxis = list(title = "Type of events"),
    yaxis = list(title = "Avearge suffered, people"),
    title = "Average fatalities & injuries per Type of events"
  )
We can see that DROUGHT/WARM and TORNADO events are the most harmful with respect to the average number of fatalities and injuries combined: on average there are 2.2 and 1.5 fatalities and injuries per event, respectively.
DROUGHT/WARM events are the most harmful with respect to the average number of fatalities: on average there are 0.5 fatalities per event.
DROUGHT/WARM and TORNADO events are the most harmful with respect to the average number of injuries: on average there are 1.6 and 1.4 injuries per event, respectively.
The total numbers of fatalities and injuries per event type are shown in the table below:
# Total fatalities and injuries per event group; events without a group
# are excluded. The result columns are named directly in the summary.
storm_data_sum <- storm_data[
  !is.na(EVTYPE_GROUPED),
  list(FATALITIES = sum(FATALITIES), INJURIES = sum(INJURIES)),
  by = EVTYPE_GROUPED
]
# Print the table.
storm_data_sum
EVTYPE_GROUPED FATALITIES INJURIES
1: DROUGHT/WARM 3028 9103
2: FIRE 75 911
3: FLOOD/PRECIPITATION 1858 8883
4: FOG 80 1076
5: LAND EVENTS 38 52
6: STORM 1217 9854
7: THUNDERSTORM 1454 13687
8: TORNADO 5636 91375
9: WINTER EVENTS 954 3201
The highest total numbers of fatalities and injuries occur in TORNADO events.
The next bar plot shows the average property/crop damage caused by different types of events:
# Total damage (property + crop) per event.
storm_data$DMG <- storm_data$PROPDMG + storm_data$CROPDMG

# Average damage per event for every event group; events without a group
# are excluded.
storm_data_sum <- storm_data[
  !is.na(EVTYPE_GROUPED),
  list(
    FULL_DEMAGE = mean(DMG),
    PROPERTY_DEMAGE = mean(PROPDMG),
    CROP_DAMAGE = mean(CROPDMG)
  ),
  by = EVTYPE_GROUPED
]

# Long format: one row per (event group, measure). The argument name is
# spelled out instead of relying on partial matching of `id`.
storm_data_sum <- melt(storm_data_sum, id.vars = c("EVTYPE_GROUPED"))
# Fix the order in which the measures appear in the plot.
storm_data_sum$variable <- factor(
  storm_data_sum$variable,
  levels = c("CROP_DAMAGE", "PROPERTY_DEMAGE", "FULL_DEMAGE")
)

# Columns are passed as vectors: bare column names are not resolved inside
# `data` by plotly >= 4, while vectors work with old and new versions.
plot_ly(
  data = storm_data_sum,
  x = storm_data_sum$EVTYPE_GROUPED,
  y = storm_data_sum$value,
  type = "bar",
  color = storm_data_sum$variable
) %>%
  layout(
    autosize = FALSE,
    margin = list(b = 90),
    xaxis = list(title = "Type of events"),
    yaxis = list(title = "Avearge Demage, $"),
    title = "Average Demage per Type of events"
  )
We can see that DROUGHT/WARM, FLOOD/PRECIPITATION and FIRE events are the most harmful with respect to average total damage: on average they cause $2.8M, $2.1M and $1.8M of damage per event, respectively.
FLOOD/PRECIPITATION and FIRE events are the most harmful with respect to average property damage: on average they cause $2M and $1.7M of damage per event, respectively.
DROUGHT/WARM events are the most harmful with respect to average crop damage: on average they cause $2.7M of damage per event.
The total amounts of property and crop damage per event type are shown in the table below:
# Total property and crop damage per event group; events without a group
# are excluded. The result columns are named directly in the summary.
storm_data_sum <- storm_data[
  !is.na(EVTYPE_GROUPED),
  list(PROPERTY_DAMAGE = sum(PROPDMG), CROP_DAMAGE = sum(CROPDMG)),
  by = EVTYPE_GROUPED
]
# Print the table.
storm_data_sum
EVTYPE_GROUPED PROPERTY_DAMAGE CROP_DAMAGE
1: DROUGHT/WARM 1072849350 14872004500
2: FIRE 4765114000 295472800
3: FLOOD/PRECIPITATION 167193450596 12131434150
4: FOG 25011500 0
5: LAND EVENTS 324596000 20017000
6: STORM 171450699763 15825464411
7: THUNDERSTORM 8900061971 981991490
8: TORNADO 56956734377 414953270
9: WINTER EVENTS 1786884900 3143257100
From 1950 to 2011 (for recorded events), the highest total damage in $ was caused by STORM, FLOOD/PRECIPITATION and DROUGHT/WARM events.