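The basic goal of this assignment is to explore the NOAA Storm Database and answer the following questions: (1) which types of events are most harmful to population health (fatalities and injuries), and (2) which types of events cause the greatest property and crop damage?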
First of all, we load the required packages (they must already be installed).
library(data.table)
library(stringdist)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gghighlight)
## Loading required package: ggplot2
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
Next, we load the storm data and lighten it by keeping only the variables we need.
# Read the compressed storm data CSV, then keep only the selected columns so
# that the working table stays small.
storm_data <- fread("repdata_data_StormData.csv.bz2", header = TRUE, sep = ",")
needed_columns <- c(
  "EVTYPE", "BGN_DATE", "BGN_TIME", "TIME_ZONE", "END_DATE", "END_TIME",
  "FATALITIES", "INJURIES", "CROPDMG", "CROPDMGEXP", "PROPDMG", "PROPDMGEXP"
)
# `with = FALSE` makes data.table treat the character vector as column names
# rather than evaluating it as an expression inside the table.
filtered_storm_data <- storm_data[, needed_columns, with = FALSE]
We convert the exponent codes of the property and crop damage values into numeric multipliers.
# Translate a vector of damage exponent codes into numeric multipliers.
#
# Codes are matched case-insensitively, so "k" and "K" both map to 1000.
# Any code that is not H/K/M/B (blank, NA, or anything else) maps to 1.
#
# exp_code: character vector of exponent codes.
# Returns a numeric vector of the same length as `exp_code`.
damage_multiplier <- function(exp_code) {
  known_codes <- c("H", "K", "M", "B")
  known_values <- c(1e2, 1e3, 1e6, 1e9)
  multiplier <- known_values[match(toupper(exp_code), known_codes)]
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

filtered_storm_data$propmultiplication <-
  damage_multiplier(filtered_storm_data$PROPDMGEXP)
filtered_storm_data$cropmultiplication <-
  damage_multiplier(filtered_storm_data$CROPDMGEXP)

# Use the multipliers to compute the property and crop damage amounts.
filtered_storm_data$propTotal <-
  filtered_storm_data$PROPDMG * filtered_storm_data$propmultiplication
filtered_storm_data$cropTotal <-
  filtered_storm_data$CROPDMG * filtered_storm_data$cropmultiplication
The event types are not recorded consistently in the data: many events appear more than once under slightly different names. The function amatch (approximate string matching from the stringdist package) helps map each recorded name onto the closest official event name.
# Map each recorded EVTYPE onto the closest official event name.
official_event_types <- read.csv("event_types.csv")
# amatch compares strings case-sensitively, so a difference in capitalisation
# between the raw data and the official list would be counted as edits and
# distort which name is "closest". Both sides are therefore upper-cased and
# trimmed for the comparison only; the official spelling is what gets stored.
# NOTE(review): maxDist = 50 is very permissive - presumably every record is
# assigned to its nearest official name however dissimilar; consider lowering
# it and inspecting the matches.
i <- amatch(
  toupper(trimws(as.character(filtered_storm_data$EVTYPE))),
  toupper(trimws(as.character(official_event_types$Event.Name))),
  maxDist = 50
)
filtered_storm_data$EVTYPE <- official_event_types$Event.Name[i]
Now we create a year column
# Parse the begin timestamp into a Date (the time-of-day part is discarded),
# then derive the calendar year from it.
begin_dates <- as.Date(
  filtered_storm_data$BGN_DATE,
  format = "%m/%d/%Y %H:%M:%S"
)
filtered_storm_data$BGN_DATE <- begin_dates
filtered_storm_data$year <- year(begin_dates)
We want the mean of each damage measure for each event type and year. After a little exploration, we find that there are too few records before 1992, so we keep only the data recorded from 1992 onward.
# Mean fatalities, injuries, property damage and crop damage for every
# (event type, year) combination.
global_mean <- summarise(
  group_by(filtered_storm_data, EVTYPE, year),
  fatalities.mean = mean(FATALITIES),
  injuries.mean = mean(INJURIES),
  prop.mean = mean(propTotal),
  crop.mean = mean(cropTotal)
)
# Keep only 1992 and later; earlier years have too few records.
global_mean <- global_mean[global_mean$year >= 1992, ]
We plot the mean fatalities and injuries by year and highlight the events with the highest values.
# Yearly mean fatalities, one line per event type; only event types whose
# maximum yearly mean exceeds 1.5 are highlighted.
plot1 <- ggplot(data = global_mean) +
  geom_line(mapping = aes(x = year, y = fatalities.mean, colour = EVTYPE)) +
  labs(
    x = "Year",
    y = "Fatalities",
    title = "Fatalities per event per year"
  ) +
  # Zoom the view without dropping any data points.
  coord_cartesian(xlim = c(1992, 2011), ylim = c(0, 5)) +
  gghighlight(max(fatalities.mean) > 1.5) +
  theme_minimal()
## label_key: EVTYPE
# Yearly mean injuries, one line per event type; only event types whose
# maximum yearly mean exceeds 8 are highlighted.
plot2 <- ggplot(data = global_mean) +
  geom_line(mapping = aes(x = year, y = injuries.mean, colour = EVTYPE)) +
  labs(
    x = "Year",
    y = "Injuries",
    title = "Injuries per event type per year"
  ) +
  # Zoom the view without dropping any data points.
  coord_cartesian(xlim = c(1992, 2011), ylim = c(0, 20)) +
  gghighlight(max(injuries.mean) > 8) +
  theme_minimal()
## label_key: EVTYPE
# Stack the fatalities and injuries plots vertically in a single figure.
grid.arrange(grobs = list(plot1, plot2), nrow = 2)
We do the same for the property and crop damage values.
# Yearly mean property damage, one line per event type; only event types
# whose maximum yearly mean exceeds 0.5e+08 are highlighted.
plot3 <- ggplot(data = global_mean) +
  geom_line(mapping = aes(x = year, y = prop.mean, colour = EVTYPE)) +
  labs(
    x = "Year",
    y = "Prop damages",
    title = "Prop damages per event per year"
  ) +
  # Zoom the view without dropping any data points.
  coord_cartesian(xlim = c(1992, 2011), ylim = c(0, 2e+08)) +
  gghighlight(max(prop.mean) > 0.5e+08) +
  theme_minimal()
## label_key: EVTYPE
# Yearly mean crop damage, one line per event type; only event types whose
# maximum yearly mean exceeds 1e+07 are highlighted.
plot4 <- ggplot(data = global_mean) +
  geom_line(mapping = aes(x = year, y = crop.mean, colour = EVTYPE)) +
  labs(
    x = "Year",
    y = "Crop damages",
    title = "Crop damages per event type per year"
  ) +
  # Zoom the view without dropping any data points.
  coord_cartesian(xlim = c(1992, 2011), ylim = c(0, 2e+07)) +
  gghighlight(max(crop.mean) > 1e+07) +
  theme_minimal()
## label_key: EVTYPE
# Stack the property-damage and crop-damage plots vertically in a single figure.
grid.arrange(grobs = list(plot3, plot4), nrow = 2)
The first figure shows that Excessive Heat and Seiche are the events with the highest fatality and injury numbers; based on this analysis, they are the most harmful to population health.
The second figure shows that Dense Smoke and Avalanche are the events causing the highest property damage. For crop damage, Ice Storm and Dense Fog tend to be the most important causes.