Synopsis

The following is a brief data analysis performed for the Reproducible Research course by Johns Hopkins University. It is based on the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database, which can be downloaded directly from their site.

After downloading, only the desired columns were selected (type of event, mortalities, injuries, property damage and crop damage). As different entries of economic damage were recorded in different units (thousands, millions and billions of dollars), they were converted to plain dollars using an additional column that expresses the unit. Names in the event type column were also fixed, because they were messy and not standardized. This was done by text clustering, as you will see soon. Finally, the data was grouped by event type and the sums of mortalities, injuries and economic damages were plotted. Hope you enjoy this.

Data Processing

Loading the required libraries

# Load required libraries
library(ggplot2)
library(ggthemes)
library(dplyr)
library(lubridate)
library(stringdist)
library(RColorBrewer)
library(stringr)
library(ggpubr)

Downloading and reading the data

# Download the raw NOAA storm data and read it into `data`.
#
# The file is stored under its original local name ("data.zip") even though the
# payload is a bzip2-compressed CSV; read.csv() opens it through a file
# connection, which recognises the compression from the file contents.
data_file <- '~/Github/Final-Project-Reproducible-Research/data.zip'

# Only download when no local copy exists, so re-running the analysis does not
# fetch the large archive again. Delete the file to force a fresh download.
if (!file.exists(data_file)) {
  # Make sure the target directory exists, otherwise download.file() fails.
  dir.create(dirname(data_file), recursive = TRUE, showWarnings = FALSE)
  download.file(
    "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    data_file,
    mode = "wb" # binary payload: never let the platform translate line endings
  )
}

data <- read.csv(data_file)

# Keep only the columns used below (each column listed once).
data <- select(
  data,
  STATE, BGN_DATE, COUNTY, EVTYPE,
  FATALITIES, INJURIES,
  PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
)

Fixing the data

Property and crop damage are recorded in different units (thousands, millions and billions), so they have to be converted to a common unit. I learned this from Jeremy Beck.

# Translate a damage-exponent code into a numeric multiplier.
#
# code: vector of exponent codes (character or factor).
# Returns a numeric vector of the same length: 1e3 for "K", 1e6 for "M",
# 1e9 for "B" (matched case-insensitively, so "k"/"m"/"b" are no longer
# dropped) and 0 for every other code, as in the original mapping.
exp_to_multiplier <- function(code) {
  units <- c(K = 1E3, M = 1E6, B = 1E9)
  multiplier <- unname(units[toupper(as.character(code))])
  multiplier[is.na(multiplier)] <- 0 # unknown / empty code -> no damage counted
  multiplier
}

# Replace the exponent codes by their multipliers, then compute the damage in
# dollars. mutate() evaluates its arguments in order, so the *_CASH columns
# already see the numeric multipliers.
data <- data %>%
  mutate(
    PROPDMGEXP = exp_to_multiplier(PROPDMGEXP),
    CROPDMGEXP = exp_to_multiplier(CROPDMGEXP),
    PROPDMG_CASH = PROPDMG * PROPDMGEXP,
    CROPDMG_CASH = CROPDMG * CROPDMGEXP
  )

More Fixing

The names in EVTYPE are very messy, so we will fix them through a clustering function. (This whole part is also inspired by Jeremy Beck.)

# Group near-identical event-type spellings into clusters.
#
# The names are first normalised (upper case, trimmed, inner whitespace
# collapsed). The Jaro-Winkler distance is case-sensitive, so without this step
# the same event written in different casing would land in different clusters
# while still receiving the same title-cased label further down.
data$EVTYPE <- gsub("\\s+", " ", trimws(toupper(as.character(data$EVTYPE))))

# Seed retained from the original analysis.
# NOTE(review): the steps below look deterministic - confirm before removing.
set.seed(42)

# Pairwise Jaro-Winkler distances between all distinct event names.
EVTYPES <- unique(data$EVTYPE)
distance_matrix <- stringdistmatrix(EVTYPES, EVTYPES, method = "jw")
rownames(distance_matrix) <- EVTYPES

# Hierarchical clustering, cut at height 0.14: names closer than that end up
# in the same cluster.
EVTYPES_hc <- hclust(as.dist(distance_matrix))
EV_cuts <- cutree(EVTYPES_hc, h = 0.14)

# Lookup table: one row per distinct event name with its cluster id.
EV_cuts <- as.data.frame(EV_cuts)
EV_cuts$Event_Type <- attr(EV_cuts, "row.names")
colnames(EV_cuts) <- c("Cluster", "Event_Type")

# Attach the cluster id to every record (left join on the event name).
data <- merge(data, EV_cuts,
              by.x = "EVTYPE", by.y = "Event_Type",
              all.x = TRUE, all.y = FALSE)

# Title-case the names for display in the plots.
data$EVTYPE <- str_to_title(data$EVTYPE)

Addressing injuries, fatalities and economic damage by event type.

# One row per cluster: the first event name found in the cluster serves as its
# label, next to the totals of fatalities, injuries, property damage and crop
# damage.
dataxevent <- summarise(
  group_by(data, Cluster),
  EVTYPE = first(EVTYPE),
  fat = sum(FATALITIES),
  inj = sum(INJURIES),
  propdam = sum(PROPDMG_CASH),
  cropdam = sum(CROPDMG_CASH)
)

Results

Injuries and Fatalities

We will plot the 15 event types with the most mortalities and injuries.

# Palette: 25 colours interpolated from the 8-colour "Dark2" Brewer palette.
mycolors <- colorRampPalette(brewer.pal(8, "Dark2"))(25)

# Horizontal bar chart of the 15 clusters with the most injuries.
# The bars are coloured through the `fill` aesthetic, so the palette has to be
# supplied via scale_fill_manual(values = ...); a colour scale is never used.
p1 <- ggplot(
  top_n(dataxevent, 15, inj),
  aes(x = inj, y = reorder(EVTYPE, inj), fill = EVTYPE)
) +
  geom_bar(stat = "identity") +
  ylab("Event Type") +
  xlab("Total Injuries") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    text = element_text(size = 12)
  ) +
  scale_fill_manual(values = mycolors) +
  theme(legend.position = "none") +
  ggtitle("Injuries by Extreme Natural \n Events in the US 1950-2011")

# Horizontal bar chart of the 15 clusters with the most fatalities.
p2 <- ggplot(
  top_n(dataxevent, 15, fat),
  aes(x = fat, y = reorder(EVTYPE, fat), fill = EVTYPE)
) +
  geom_bar(stat = "identity") +
  ylab("Event Type") +
  xlab("Total Fatalities") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    text = element_text(size = 12)
  ) +
  scale_fill_manual(values = mycolors) +
  theme(legend.position = "none") +
  ggtitle("Mortalities by Extreme Natural \n Events in the US 1950-2011")

# Show both charts side by side, labelled A and B.
ggarrange(p1, p2,
          labels = c("A", "B"),
          ncol = 2, nrow = 1)

Economic damages

We will plot the 15 event types with the most economic damage to properties and crops.

# Rescale the dollar totals for plotting. The divisor is 1e9, so the new
# columns hold billions of dollars (the column names are kept as they were).
dataxevent$propdammil <- dataxevent$propdam / 1000000000
dataxevent$cropdammil <- dataxevent$cropdam / 1000000000

# Horizontal bar chart of the 15 clusters with the most property damage.
# The axis label states billions to match the 1e9 divisor above, and the
# palette is applied to the `fill` aesthetic that the bars actually use.
p3 <- ggplot(
  top_n(dataxevent, 15, propdammil),
  aes(x = propdammil, y = reorder(EVTYPE, propdam), fill = EVTYPE)
) +
  geom_bar(stat = "identity") +
  ylab("Event Type") +
  xlab("Total Damages (Billion $)") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    text = element_text(size = 12)
  ) +
  scale_fill_manual(values = mycolors) +
  theme(legend.position = "none") +
  ggtitle("Property Damages by Extreme Natural \n Events in the US 1950-2011")

# Horizontal bar chart of the 15 clusters with the most crop damage.
p4 <- ggplot(
  top_n(dataxevent, 15, cropdammil),
  aes(x = cropdammil, y = reorder(EVTYPE, cropdam), fill = EVTYPE)
) +
  geom_bar(stat = "identity") +
  ylab("Event Type") +
  xlab("Total Damages (Billion $)") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    text = element_text(size = 12)
  ) +
  scale_fill_manual(values = mycolors) +
  theme(legend.position = "none") +
  ggtitle("Crop Damages by Extreme Natural \n Events in the US 1950-2011")

# Show both charts side by side, labelled A and B.
ggarrange(p3, p4,
          labels = c("A", "B"),
          ncol = 2, nrow = 1)