Synopsis

Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern. The analysis aims to investigate which types of severe weather events are most harmful to population health and to the economy. It does so by aggregating the data by storm event type.

Data Processing

The data used for this analysis comes from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

Downloading, unzipping and reading the data

# Download the raw storm data once and load it into `storm_or`.
data_dir <- "data"
storm_file <- file.path(data_dir, "StormData.csv.bz2")

if (!dir.exists(data_dir)) {
  dir.create(data_dir)
}

# Guard on the file that is actually written, so the download is skipped
# when a local copy already exists.
if (!file.exists(storm_file)) {
  fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
  # mode = "wb" keeps the compressed payload intact on every platform.
  download.file(fileUrl, destfile = storm_file, mode = "wb")
}

# read.csv() opens the path through a file connection, which decompresses
# bzip2 input transparently; no separate unzip step is needed (and unzip()
# only handles zip archives). Reading happens outside the download guard so
# `storm_or` is defined whether or not the file had to be fetched.
storm_or <- read.csv(storm_file)
dim(storm_or)
## [1] 902297     37

It’s possible to find some documentation about how some of the variables are constructed/defined:

There are many variables in the data that are not of interest for this analysis, so we keep only those most relevant to our goal.

# Columns retained for the analysis: state code, begin date, event type,
# casualty counts, and the damage amounts with their exponent codes.
keep_cols <- c(
  "STATE__", "BGN_DATE", "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)
storm <- storm_or[, keep_cols]

Processing Data

Below we can find the list of packages used for this analysis.

library(dplyr)
library(ggplot2)
library(gridExtra)
library(lubridate)
library(stringr)

The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete. For this reason we decided to consider only the data from the year 1996 onwards.

# Parse the begin date (month/day/year) and keep its calendar year.
bgn_date <- as.Date(storm$BGN_DATE, format = "%m/%d/%Y")
storm$year <- year(bgn_date)

# Histogram of recorded events by year.
ggplot(storm, aes(x = year)) +
  geom_histogram(bins = 30) +
  labs(title = "Events per year", x = "Year", y = "Number of events")

#knitr::kable(table(storm$year), 
#             col.names = c("Year", "Number of events"), align = 'l')

# Keep only events whose year is after 1995.
storm <- filter(storm, year > 1995)

As we can learn from the documentation of this database, only the 48 event types listed in Table 1 of Section 2.1.1 are permitted in Storm Data. So we have to clean the data of events that are not in this table, but examining the variable EVTYPE we can see that there are many more distinct kinds of events.

# Normalise event labels: strip surrounding whitespace, upper-case, and
# replace each double space with a single space.
evtype <- str_trim(storm$EVTYPE)
evtype <- toupper(evtype)
storm$EVTYPE <- gsub("  ", " ", evtype)

# Number of distinct event labels after normalisation.
length(unique(storm$EVTYPE))
## [1] 426

Cleaning up all this chaos would take a lot of time, so we decided to reduce it by excluding the events that are not relevant to our goal. To do this, we first exclude the events that do not have a valid State FIPS number.

# Keep only rows whose STATE__ code is below 73.
storm <- filter(storm, STATE__ < 73)

Moreover, we can calculate the total crop and property damage and exclude all the events that have no data about public health or economic problems.

# Convert damage exponent codes into numeric powers of ten.
#
#   exp_code: vector of exponent codes; letters H/K/M/B (any case) or digits.
#
# Returns a numeric vector the same length as `exp_code`. Letter codes map to
# H = 2, K = 3, M = 6, B = 9; digit codes are used as the exponent directly;
# blank, missing or unrecognised codes map to 0, leaving the amount unscaled.
damage_exponent <- function(exp_code) {
  # Upper-casing first makes every letter code case-insensitive.
  code <- toupper(trimws(as.character(exp_code)))
  letter_exp <- c(H = 2, K = 3, M = 6, B = 9)
  exponent <- unname(letter_exp[code])
  # Convert only purely numeric codes, so symbols such as "+" or "?" do not
  # trigger coercion warnings.
  is_digit <- grepl("^[0-9]+$", code)
  exponent[is_digit] <- as.numeric(code[is_digit])
  exponent[is.na(exponent)] <- 0
  exponent
}

# Property damage in dollars
storm$PROPDMGEXP <- damage_exponent(storm$PROPDMGEXP)
storm$PROPDMG <- storm$PROPDMG * 10^storm$PROPDMGEXP

# Crop damage in dollars
storm$CROPDMGEXP <- damage_exponent(storm$CROPDMGEXP)
storm$CROPDMG <- storm$CROPDMG * 10^storm$CROPDMGEXP

# Drop the columns that are no longer needed, then discard rows where all
# four impact measures (property, crop, fatalities, injuries) are zero.
storm <- storm %>%
  select(-c("BGN_DATE", "PROPDMGEXP", "CROPDMGEXP")) %>%
  filter(!(PROPDMG == 0 & CROPDMG == 0 & FATALITIES == 0 & INJURIES == 0))
dim(storm)
## [1] 201045      7

As we can see, we have reduced the chaos in the variable EVTYPE.

# Collapse label variants: any label containing the pattern (name) is replaced
# by the canonical label (value). Patterns are applied in order, each one
# matching against the labels as rewritten by the previous steps.
evtype_groups <- c(
  "TSTM WIND" = "TSTM WIND",
  "THUNDERSTORM" = "THUNDERSTORM",
  "TYPHOON" = "HURRICANE"
)
for (pattern in names(evtype_groups)) {
  storm$EVTYPE[grepl(pattern, storm$EVTYPE)] <- evtype_groups[[pattern]]
}

# Number of distinct event labels after consolidation.
length(unique(storm$EVTYPE))
## [1] 160

The cleaning work is not finished but we can be satisfied.

Results

Effects on population health

The following chart shows the top 10 weather events that are most dangerous for public health.

# Per event type: number of harmful events, total injuries and total
# fatalities, ranked by fatalities with injuries as the tie-breaker.
storm_health <- storm %>%
  filter(FATALITIES != 0 | INJURIES != 0) %>%
  group_by(EVTYPE) %>%
  summarize(count = n(),
            tot_injuries = sum(INJURIES),
            tot_fatalities = sum(FATALITIES)) %>%
  # desc() accepts a single vector, so each sort key gets its own desc().
  arrange(desc(tot_fatalities), desc(tot_injuries))

# First ten rows of the health ranking, shared by both panels.
top_health <- head(storm_health, 10)

# Horizontal bars of total injuries, ordered by injuries.
p1 <- ggplot(top_health, aes(x = reorder(EVTYPE, tot_injuries),
                             y = tot_injuries)) +
  geom_col(fill = "gold3") +
  coord_flip() +
  labs(title = "Health impact of weather events in the US",
       x = "Event type",
       y = "Total number of injuries") +
  theme(legend.position = "none")

# Horizontal bars of total fatalities, ordered by fatalities.
p2 <- ggplot(top_health, aes(x = reorder(EVTYPE, tot_fatalities),
                             y = tot_fatalities)) +
  geom_col(fill = "red4") +
  coord_flip() +
  labs(x = "Event type", y = "Total number of fatalities") +
  theme(legend.position = "none")

# Stack the two panels vertically.
grid.arrange(p1, p2, nrow = 2)

Effects on economy

The following chart shows the top 10 weather events causing financial damage to both property and crops.

# Per event type: number of damaging events and total property / crop damage
# (divided by 1e6, i.e. in millions), ranked by property damage with crop
# damage as the tie-breaker.
storm_economy <- storm %>%
  filter(PROPDMG != 0 | CROPDMG != 0) %>%
  group_by(EVTYPE) %>%
  summarize(count = n(),
            tot_PROPDMG = sum(PROPDMG) / 1000000,
            tot_CROPDMG = sum(CROPDMG) / 1000000) %>%
  # desc() accepts a single vector, so each sort key gets its own desc().
  arrange(desc(tot_PROPDMG), desc(tot_CROPDMG))
# First ten rows of the economic ranking, shared by both panels.
top_economy <- head(storm_economy, 10)

# Horizontal bars of total property damage, ordered by property damage.
p3 <- ggplot(top_economy, aes(x = reorder(EVTYPE, tot_PROPDMG),
                              y = tot_PROPDMG)) +
  geom_col(fill = "blue3") +
  coord_flip() +
  labs(title = "Economic impact of weather events in the US in millions of dollars",
       x = "Event type",
       y = "Property damage") +
  theme(legend.position = "none")

# Horizontal bars of total crop damage, ordered by crop damage.
p4 <- ggplot(top_economy, aes(x = reorder(EVTYPE, tot_CROPDMG),
                              y = tot_CROPDMG)) +
  geom_col(fill = "olivedrab") +
  coord_flip() +
  labs(x = "Event type", y = "Crop damage") +
  theme(legend.position = "none")

# Stack the two panels vertically.
grid.arrange(p3, p4, nrow = 2)