Loading and preprocessing the data

Creating SD dataset from the bz2 file:

# Load the raw Storm Data into SD.
# An already-extracted CSV in the working directory is used as-is. Otherwise
# the bz2 archive is downloaded once (only if it is not present yet) and read
# directly: read.csv() opens the file in text mode, which decompresses bzip2
# transparently, so no manual extraction step is required.
fileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
csvFile <- "repdata_data_StormData.csv"
bz2File <- "repdata_data_StormData.csv.bz2"
if (file.exists(csvFile)) {
  dataFile <- csvFile
} else {
  if (!file.exists(bz2File)) {
    # mode = "wb" keeps the binary archive intact on Windows.
    download.file(fileURL, destfile = bz2File, mode = "wb")
  }
  dataFile <- bz2File
}
SD <- read.csv(file = dataFile)
dim(SD)
## [1] 902297     37

These are the variables selected for the analysis:

* BGN_DATE: date of the event;
* EVTYPE: event type (e.g. tornado, flood, etc.);
* FATALITIES: number of human deaths caused by the event;
* INJURIES: number of human injuries caused by the event;
* PROPDMG: property damage in USD
* PROPDMGEXP: multiplier of property damage (e.g. thousands, millions USD, etc.)
* CROPDMG: crop damage in USD
* CROPDMGEXP: multiplier of crop damage (e.g. thousands, millions USD, etc.)
# Keep only the columns used by the analysis: event date and type, casualty
# counts, and the damage amounts together with their magnitude codes.
SD <- SD[c(
  "BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]
# Preview the first six rows of the reduced data set.
head(SD, n = 6L)
##             BGN_DATE  EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP
## 1  4/18/1950 0:00:00 TORNADO          0       15    25.0          K
## 2  4/18/1950 0:00:00 TORNADO          0        0     2.5          K
## 3  2/20/1951 0:00:00 TORNADO          0        2    25.0          K
## 4   6/8/1951 0:00:00 TORNADO          0        2     2.5          K
## 5 11/15/1951 0:00:00 TORNADO          0        2     2.5          K
## 6 11/15/1951 0:00:00 TORNADO          0        6     2.5          K
##   CROPDMG CROPDMGEXP
## 1       0           
## 2       0           
## 3       0           
## 4       0           
## 5       0           
## 6       0
# Show the type of every retained column before any conversion is applied.
str(object = SD)
## 'data.frame':    902297 obs. of  8 variables:
##  $ BGN_DATE  : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...

The BGN_DATE variable is transformed into Date format, and the year of each event is extracted and plotted.

# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the script fail later at year().
library(lubridate)

# Change Dates
# Drop the literal midnight time suffix, then parse the month/day/year part.
# The as.Date() generic is used so that dispatch is left to R instead of
# calling the character method by its internal name.
SD$BGN_DATE <- sub(" 0:00:00", "", as.character(SD$BGN_DATE), fixed = TRUE)
SD$BGN_DATE <- as.Date(SD$BGN_DATE, format = "%m/%d/%Y")

# Find year
SD$year <- year(SD$BGN_DATE)
# Distribution of recorded events over the years.
hist(SD$year, breaks = 30)

From the histogram above it is clear that the number of recorded events increased significantly around 1995. We therefore use the subset of the data from 1995 to 2011, which contains the most complete records.

# Keep only events from 1995 onwards.
# which() turns the comparison into row positions, so a row whose year is NA
# (a date that failed to parse) is dropped instead of being returned as a
# phantom row filled with NA in every column.
SD <- SD[which(SD$year >= 1995), ]
dim(SD)
## [1] 681500      9

Now we have to normalise the event type names, based on the event types listed in the data documentation.

# library() fails immediately if RecordLinkage is not installed; require()
# would only return FALSE and defer the failure to levenshteinSim().
library(RecordLinkage)

# Change Event Types names
SD$CHAREVENT <- tolower(as.character(SD$EVTYPE))

# List of Event Types as reported in the Storm Data documentation
eventTypes <- c(
  "astronomical low tide", "avalanche", "blizzard", "coastal flood",
  "cold/wind chill", "debris flow", "dense fog", "dense smoke", "drought",
  "dust devil", "dust storm", "excessive heat", "extreme cold/wind chill",
  "flash flood", "flood", "frost freeze", "funnel cloud", "freezing fog",
  "hail", "heat", "heavy rain", "heavy snow", "high surf", "high wind",
  "hurricane typhoon", "ice storm", "lake effect snow", "lakeshore flood",
  "lightning", "marine hail", "marine high wind", "marine strong wind",
  "marine thunderstorm wind", "rip current", "seiche", "sleet",
  "storm surge tide", "strong wind", "thunderstorm wind", "tornado",
  "tropical depression", "tropical storm", "tsunami", "volcanic ash",
  "waterspout", "wildfire", "winter storm", "winter weather"
)

# Function to apply the Levenshtein Similarity Algorithm
#
# x: a single lower-case event label.
# Returns the entry of eventTypes with the highest Levenshtein similarity to
# x (the first one on ties), or NA_character_ when no similarity could be
# computed, so the result is always exactly one string.
# NOTE(review): nearest-string matching may misassign heavily abbreviated
# labels; spot-check the mapping of the most frequent raw labels.
setEventType <- function(x) {
  similarity <- levenshteinSim(x, eventTypes)
  best <- which.max(similarity)
  if (length(best) == 0L) {
    return(NA_character_)
  }
  eventTypes[best]
}

# The mapping depends only on the label text, so each distinct label is
# scored once and the result is looked up per row, instead of recomputing the
# similarity for every one of the data set's rows.
rawTypes <- unique(SD$CHAREVENT)
tidyTypes <- vapply(rawTypes, setEventType, character(1), USE.NAMES = FALSE)
SD$TIDYEVENT <- tidyTypes[match(SD$CHAREVENT, rawTypes)]

Next, we need to clean the variables that describe the effects of each event, replacing missing values with 0 and converting the damage multipliers into powers of ten.

# Treat missing casualty and damage amounts as zero.
# These columns are numeric, so a missing value is NA; comparing a number
# with "" never selects a row, and an NA in that comparison makes the
# data-frame assignment fail. is.na() is the reliable test.
SD$FATALITIES[is.na(SD$FATALITIES)] <- 0
SD$INJURIES[is.na(SD$INJURIES)] <- 0
SD$PROPDMG[is.na(SD$PROPDMG)] <- 0
SD$CROPDMG[is.na(SD$CROPDMG)] <- 0

# correct exponentials
#
# Convert damage magnitude codes into integer powers of ten.
#
# code: vector (character or factor) of magnitude codes.
# Returns an integer vector of the same length:
#   "" or NA      -> 0 (amount taken as-is)
#   "+", "-", "?" -> 1
#   "h"/"H"       -> 2, "k"/"K" -> 3, "m"/"M" -> 6, "b"/"B" -> 9
#   digit strings -> the digit itself
#   anything else -> NA
expToPower <- function(code) {
  code <- toupper(as.character(code))
  symbolPower <- c(
    "+" = 1L, "-" = 1L, "?" = 1L,
    "H" = 2L, "K" = 3L, "M" = 6L, "B" = 9L
  )
  power <- rep(NA_integer_, length(code))
  power[is.na(code) | code == ""] <- 0L
  isDigit <- grepl("^[0-9]+$", code)
  power[isDigit] <- as.integer(code[isDigit])
  isSymbol <- code %in% names(symbolPower)
  power[isSymbol] <- unname(symbolPower[code[isSymbol]])
  power
}

SD$PROPDMGEXP <- expToPower(SD$PROPDMGEXP)
SD$CROPDMGEXP <- expToPower(SD$CROPDMGEXP)

# Calculate the Total Damage for each event
SD$propDamage <- SD$PROPDMG * 10^SD$PROPDMGEXP
SD$cropDamage <- SD$CROPDMG * 10^SD$CROPDMGEXP

Results

Events Human Harm

First we have to summarise FATALITIES and INJURIES by type of Event

# library() stops at once when a package is missing; require() would only
# return FALSE and let the first dplyr/ggplot2 call fail instead.
library(dplyr)
library(ggplot2)
library(RColorBrewer)

# Total fatalities and injuries per normalised event type.
# Both sums ignore NA so that the two totals are computed the same way, and
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
SD_HumanHarm <- summarise(
  group_by(SD, TIDYEVENT),
  totFatalities = sum(FATALITIES, na.rm = TRUE),
  totInjuries = sum(INJURIES, na.rm = TRUE)
)

Event Mortality

# Rank event types by total fatalities (highest first), then keep the first
# ten rows with the event name and its fatality total.
SD_TopFatalities <- arrange(SD_HumanHarm, desc(totFatalities))
SD_TopFatalities <- SD_TopFatalities[1:10, c("TIDYEVENT", "totFatalities")]

Here is the list of the top 10 deadliest events:

# Display the ten event types with the most fatalities.
print(SD_TopFatalities)
## Source: local data frame [10 x 2]
## 
##                  TIDYEVENT totFatalities
## 1           excessive heat          2000
## 2                  tornado          1545
## 3              flash flood           955
## 4                     heat           924
## 5                lightning           732
## 6              rip current           569
## 7                high wind           522
## 8                    flood           513
## 9  extreme cold/wind chill           272
## 10              heavy rain           271

And the corresponding plot:

# Bar chart of total fatalities for the ten deadliest event types.
g <- ggplot(
  data = SD_TopFatalities,
  mapping = aes(x = TIDYEVENT, y = totFatalities)
)
g +
  geom_bar(stat = "identity", fill = brewer.pal(3, "Set1")[1]) +
  labs(
    x = "Event",
    y = "number of Fatalities",
    title = "Top 10 Mortal Event"
  ) +
  # Slanted labels keep the long event names readable.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Based on the plot above, we find that excessive heat and tornado caused most fatalities in the US between 1995 and 2011.

Event Injuries

# Rank event types by total injuries (highest first), then keep the first ten
# rows with the event name and its injury total.
SD_TopInjuries <- arrange(SD_HumanHarm, desc(totInjuries))
SD_TopInjuries <- SD_TopInjuries[1:10, c("TIDYEVENT", "totInjuries")]

Here is the list of the top 10 events by number of injuries:

# Display the ten event types with the most injuries.
print(SD_TopInjuries)
## Source: local data frame [10 x 2]
## 
##            TIDYEVENT totInjuries
## 1            tornado       21783
## 2              flood        7533
## 3     excessive heat        6703
## 4          high wind        4926
## 5          lightning        4634
## 6               heat        2030
## 7  thunderstorm wind        1940
## 8        flash flood        1765
## 9           wildfire        1534
## 10      winter storm        1375

And the corresponding plot:

# Bar chart of total injuries for the ten most injurious event types.
# The title previously read "Injurig"; the spelling is corrected here.
g <- ggplot(SD_TopInjuries, aes(x = TIDYEVENT, y = totInjuries))
g +
  geom_bar(stat = "identity", fill = brewer.pal(3, "Set1")[1]) +
  ylab("number of Injuries") +
  xlab("Event") +
  ggtitle("Top 10 Injuring Event") +
  # Slanted labels keep the long event names readable.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Based on the plot above, we find that tornadoes caused the majority of injuries in the US between 1995 and 2011.

Event Economic Damage

Summarise propDamage and cropDamage by type of event

# Property and crop damage totals per normalised event type, expressed in
# billions and rounded to three decimals.
SD_Damage <- summarise(
  group_by(SD, TIDYEVENT),
  totPropDamage = round(sum(propDamage) / 1e9, 3),
  totCropDamage = round(sum(cropDamage) / 1e9, 3)
)

Event Property Damage

# Rank event types by total property damage (highest first), then keep the
# first ten rows with the event name and its property-damage total.
SD_PropDamage <- arrange(SD_Damage, desc(totPropDamage))
SD_PropDamage <- SD_PropDamage[1:10, c("TIDYEVENT", "totPropDamage")]

Here is the list of the top 10 events by property damage:

# Display the ten event types with the highest property damage.
print(SD_PropDamage)
## Source: local data frame [10 x 2]
## 
##            TIDYEVENT totPropDamage
## 1              flood       144.041
## 2  hurricane typhoon        85.150
## 3   storm surge tide        47.836
## 4            tornado        24.941
## 5        flash flood        16.377
## 6               hail        15.050
## 7          high wind         9.833
## 8           wildfire         8.086
## 9     tropical storm         7.658
## 10 thunderstorm wind         4.387

And the corresponding plot:

# Bar chart of total property damage for the ten costliest event types.
g <- ggplot(
  data = SD_PropDamage,
  mapping = aes(x = TIDYEVENT, y = totPropDamage)
)
g +
  geom_bar(stat = "identity", fill = brewer.pal(3, "Set1")[1]) +
  labs(
    x = "Event",
    y = "$ Billion",
    title = "Top 10 Property Damage Event"
  ) +
  # Slanted labels keep the long event names readable.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Based on the plot above, we find that floods and hurricanes caused the highest costs in terms of property damage in the US between 1995 and 2011.

Event Crop Damage

# Rank event types by total crop damage (highest first), then keep the first
# ten rows with the event name and its crop-damage total.
SD_CropDamage <- arrange(SD_Damage, desc(totCropDamage))
SD_CropDamage <- SD_CropDamage[1:10, c("TIDYEVENT", "totCropDamage")]

Here is the list of the top 10 events by crop damage:

# Display the ten event types with the highest crop damage.
print(SD_CropDamage)
## Source: local data frame [10 x 2]
## 
##                  TIDYEVENT totCropDamage
## 1                  drought        13.927
## 2                    flood         5.536
## 3        hurricane typhoon         5.506
## 4                     hail         2.614
## 5             frost freeze         1.839
## 6              flash flood         1.496
## 7  extreme cold/wind chill         1.330
## 8                high wind         1.222
## 9               heavy rain         0.745
## 10          tropical storm         0.694

And the corresponding plot:

# Bar chart of total crop damage for the ten costliest event types.
# The title previously said "Property Damage" although this chart shows
# totCropDamage; it now names the quantity that is actually plotted.
g <- ggplot(SD_CropDamage, aes(x = TIDYEVENT, y = totCropDamage))
g +
  geom_bar(stat = "identity", fill = brewer.pal(3, "Set1")[1]) +
  ylab("$ Billion") +
  xlab("Event") +
  ggtitle("Top 10 Crop Damage Event") +
  # Slanted labels keep the long event names readable.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Based on the plot above, we find that drought caused the highest costs in terms of crop damage in the US between 1995 and 2011.

Conclusion

From these data, we found that excessive heat and tornado are most harmful with respect to population health, while flood, drought, and hurricanes have the greatest economic consequences.