Creating the SD
dataset from the bz2 file:
# Remote location of the compressed storm data set, and the local file names
# used for the compressed download and for an optional extracted copy.
# NOTE(review): the URL previously ended in "SD.csv.bz2", which does not match
# the destination file name; restored to "StormData" -- confirm it resolves.
fileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
bz2File <- "repdata_data_StormData.csv.bz2"
csvFile <- "repdata_data_StormData.csv"
# Download only when no local copy (extracted or compressed) is available.
if (!file.exists(csvFile) && !file.exists(bz2File)) {
  download.file(fileURL, destfile = bz2File, mode = "wb")
}
# read.csv() can read a bzip2-compressed file directly, so no manual
# extraction step is needed; an extracted CSV is still used when present.
SD <- read.csv(file = if (file.exists(csvFile)) csvFile else bz2File)
dim(SD)
## [1] 902297 37
These are the variables selected for the analysis:
* BGN_DATE: date of the event;
* EVTYPE: event type (e.g. tornado, flood, etc.);
* FATALITIES: number of human deaths caused by the event;
* INJURIES: number of human injuries caused by the event;
* PROPDMG: property damage amount in USD, to be scaled by PROPDMGEXP;
* PROPDMGEXP: multiplier of property damage (e.g. thousands, millions USD, etc.);
* CROPDMG: crop damage amount in USD, to be scaled by CROPDMGEXP;
* CROPDMGEXP: multiplier of crop damage (e.g. thousands, millions USD, etc.).
# Restrict the data frame to the eight variables used by the analysis,
# then preview the first rows.
SD <- SD[c(
  "BGN_DATE", "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)]
print(head(SD))
## BGN_DATE EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP
## 1 4/18/1950 0:00:00 TORNADO 0 15 25.0 K
## 2 4/18/1950 0:00:00 TORNADO 0 0 2.5 K
## 3 2/20/1951 0:00:00 TORNADO 0 2 25.0 K
## 4 6/8/1951 0:00:00 TORNADO 0 2 2.5 K
## 5 11/15/1951 0:00:00 TORNADO 0 2 2.5 K
## 6 11/15/1951 0:00:00 TORNADO 0 6 2.5 K
## CROPDMG CROPDMGEXP
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Show the column types of the reduced data frame.
str(object = SD)
## 'data.frame': 902297 obs. of 8 variables:
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
BGN_DATE
variable is transformed into Date format, and the year is extracted and plotted
# Change Dates
# BGN_DATE values look like "4/18/1950 0:00:00": drop the constant time
# suffix, then parse the remaining month/day/year text via the as.Date()
# generic (rather than calling the character method directly).
SD$BGN_DATE <- sub(" 0:00:00", "", as.character(SD$BGN_DATE), fixed = TRUE)
SD$BGN_DATE <- as.Date(SD$BGN_DATE, format = "%m/%d/%Y")
# Find year
# library() stops with an error if lubridate is missing, whereas require()
# would only return FALSE and let the next line fail less clearly.
library(lubridate)
SD$year <- year(SD$BGN_DATE)
hist(SD$year, breaks = 30)
From the histogram above it is clear that the number of recorded events increased significantly around 1995. So, we use the subset of the data from 1995 to 2011 to get the most out of good records.
# Keep only the events recorded from 1995 onwards and report the new size.
isRecent <- SD$year >= 1995
SD <- SD[isRecent, ]
print(dim(SD))
## [1] 681500 9
Now we have to normalise the event type names based on the data documentation
library(RecordLinkage)
# Change Event Types names
SD$CHAREVENT <- tolower(as.character(SD$EVTYPE))
# List of Event Types as reported in the Storm Data documentation
eventTypes <- c(
  "astronomical low tide", "avalanche", "blizzard", "coastal flood",
  "cold/wind chill", "debris flow", "dense fog", "dense smoke", "drought",
  "dust devil", "dust storm", "excessive heat", "extreme cold/wind chill",
  "flash flood", "flood", "frost freeze", "funnel cloud", "freezing fog",
  "hail", "heat", "heavy rain", "heavy snow", "high surf", "high wind",
  "hurricane typhoon", "ice storm", "lake effect snow", "lakeshore flood",
  "lightning", "marine hail", "marine high wind", "marine strong wind",
  "marine thunderstorm wind", "rip current", "seiche", "sleet",
  "storm surge tide", "strong wind", "thunderstorm wind", "tornado",
  "tropical depression", "tropical storm", "tsunami", "volcanic ash",
  "waterspout", "wildfire", "winter storm", "winter weather"
)
# Function to apply the Levenshtein Similarity Algorithm
# Returns the entry of `eventTypes` with the highest Levenshtein similarity to
# the single event name `x` (the first one on ties), or NA when no similarity
# can be computed.
# NOTE(review): nearest-string matching can assign abbreviations
# (e.g. "tstm wind") to an unintended category -- spot-check the mapping.
setEventType <- function(x) {
  similarity <- levenshteinSim(x, eventTypes)
  best <- which.max(similarity)
  if (length(best) == 0L) {
    return(NA_character_)
  }
  eventTypes[best]
}
# The raw names repeat heavily, so match each distinct name once and spread
# the result back over all rows instead of matching every row separately.
rawEvents <- unique(SD$CHAREVENT)
tidyEvents <- vapply(rawEvents, setEventType, character(1), USE.NAMES = FALSE)
SD$TIDYEVENT <- tidyEvents[match(SD$CHAREVENT, rawEvents)]
Next we need to correct the values of the event-effect variables, replacing empty values with 0
and correcting the multipliers
# Replace missing counts/amounts with 0. These four columns are numeric, so a
# comparison with "" can never match; missing entries appear as NA instead.
SD$FATALITIES[is.na(SD$FATALITIES)] <- 0
SD$INJURIES[is.na(SD$INJURIES)] <- 0
SD$PROPDMG[is.na(SD$PROPDMG)] <- 0
SD$CROPDMG[is.na(SD$CROPDMG)] <- 0
# correct exponentials
# Translate a vector of damage-multiplier codes into integer powers of ten:
#   ""                 -> 0
#   "+", "-", "?"      -> 1
#   "h"/"H" -> 2, "k"/"K" -> 3, "m"/"M" -> 6, "b"/"B" -> 9
# Any other code (e.g. a digit such as "5") is converted with as.integer(),
# so a digit is used as the exponent itself and an unknown symbol becomes NA.
expToPower10 <- function(code) {
  code <- toupper(as.character(code))
  powers <- c(
    "+" = "1", "-" = "1", "?" = "1",
    "H" = "2", "K" = "3", "M" = "6", "B" = "9"
  )
  # %in% never yields NA, so NA codes are left alone here
  code[code %in% ""] <- "0"
  known <- code %in% names(powers)
  code[known] <- unname(powers[code[known]])
  as.integer(code)
}
SD$PROPDMGEXP <- expToPower10(SD$PROPDMGEXP)
SD$CROPDMGEXP <- expToPower10(SD$CROPDMGEXP)
# Calculate the Total Damage for each event
SD$propDamage <- SD$PROPDMG * 10^SD$PROPDMGEXP
SD$cropDamage <- SD$CROPDMG * 10^SD$CROPDMGEXP
First we have to summarise FATALITIES
and INJURIES
by type of Event
# library() fails immediately if a package is missing (require() would only
# return FALSE and let a later call fail).
library(dplyr)
library(ggplot2)
library(RColorBrewer)
# Total fatalities and injuries per tidied event type. Missing values are
# ignored for both totals, so a single NA cannot turn a whole group into NA.
SD_HumanHarm <- summarise(
  group_by(SD, TIDYEVENT),
  totFatalities = sum(FATALITIES, na.rm = TRUE),
  totInjuries = sum(INJURIES, na.rm = TRUE)
)
# Ten event types with the most fatalities (event name and fatality total).
SD_TopFatalities <- arrange(SD_HumanHarm, desc(totFatalities))[1:10, 1:2]
Here is the list of the Top 10 Deadliest Events
# Display the fatalities ranking.
print(SD_TopFatalities)
## Source: local data frame [10 x 2]
##
## TIDYEVENT totFatalities
## 1 excessive heat 2000
## 2 tornado 1545
## 3 flash flood 955
## 4 heat 924
## 5 lightning 732
## 6 rip current 569
## 7 high wind 522
## 8 flood 513
## 9 extreme cold/wind chill 272
## 10 heavy rain 271
And the corresponding plot
# Bar chart of total fatalities for the ten event types in SD_TopFatalities.
g <- ggplot(
  data = SD_TopFatalities,
  mapping = aes(x = TIDYEVENT, y = totFatalities)
)
print(
  g +
    geom_bar(stat = "identity", fill = brewer.pal(3, "Set1")[1]) +
    labs(
      x = "Event",
      y = "number of Fatalities",
      title = "Top 10 Mortal Event"
    ) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
Based on the plot above, we find that excessive heat and tornado caused most fatalities in the US between 1995 and 2011.
# Ten event types with the most injuries: rank the summary by injuries, then
# keep the first ten rows and columns 1 and 3 of the ranked table.
SD_TopInjuries <- local({
  ranked <- arrange(SD_HumanHarm, desc(totInjuries))
  ranked[1:10, c(1, 3)]
})
Here is the list of the Top 10 Injuring Events
# Display the injuries ranking.
print(SD_TopInjuries)
## Source: local data frame [10 x 2]
##
## TIDYEVENT totInjuries
## 1 tornado 21783
## 2 flood 7533
## 3 excessive heat 6703
## 4 high wind 4926
## 5 lightning 4634
## 6 heat 2030
## 7 thunderstorm wind 1940
## 8 flash flood 1765
## 9 wildfire 1534
## 10 winter storm 1375
And the corresponding plot
# Bar chart of total injuries for the ten event types in SD_TopInjuries.
# The plot title previously contained the typo "Injurig".
g <- ggplot(SD_TopInjuries, aes(x = TIDYEVENT, y = totInjuries))
g +
  geom_bar(stat = "identity", fill = brewer.pal(3, "Set1")[1]) +
  ylab("number of Injuries") +
  xlab("Event") +
  ggtitle("Top 10 Injuring Event") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Based on the plot above, we find that tornado caused the majority of injuries in the US between 1995 and 2011.

### Event Economic Damage

Summarise
propDamage
and cropDamage
by type of event
# Property and crop damage totals per tidied event type, expressed in
# billions and rounded to three decimals.
SD_Damage <- SD %>%
  group_by(TIDYEVENT) %>%
  summarise(
    totPropDamage = round(sum(propDamage) / 1e09, 3),
    totCropDamage = round(sum(cropDamage) / 1e09, 3)
  )
# Ten event types with the highest property damage (first two columns).
SD_PropDamage <- local({
  ranked <- arrange(SD_Damage, desc(totPropDamage))
  ranked[1:10, 1:2]
})
Here is the list of the Top 10 Property Damaging Events
# Display the property-damage ranking.
print(SD_PropDamage)
## Source: local data frame [10 x 2]
##
## TIDYEVENT totPropDamage
## 1 flood 144.041
## 2 hurricane typhoon 85.150
## 3 storm surge tide 47.836
## 4 tornado 24.941
## 5 flash flood 16.377
## 6 hail 15.050
## 7 high wind 9.833
## 8 wildfire 8.086
## 9 tropical storm 7.658
## 10 thunderstorm wind 4.387
And the corresponding plot
# Bar chart of property damage (in billions) for the ten event types in
# SD_PropDamage.
g <- ggplot(
  data = SD_PropDamage,
  mapping = aes(x = TIDYEVENT, y = totPropDamage)
)
print(
  g +
    geom_bar(stat = "identity", fill = brewer.pal(3, "Set1")[1]) +
    labs(
      x = "Event",
      y = "$ Billion",
      title = "Top 10 Property Damage Event"
    ) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
Based on the plot above, we find that Flooding and Hurricanes caused the highest costs in terms of property damage in the US between 1995 and 2011.

#### Event Crop Damage
# Ten event types with the highest crop damage: rank the damage summary by
# crop damage, then keep the first ten rows and columns 1 and 3.
SD_CropDamage <- local({
  ranked <- arrange(SD_Damage, desc(totCropDamage))
  ranked[1:10, c(1, 3)]
})
Here is the list of the Top 10 Crop Damaging Events
# Display the crop-damage ranking.
print(SD_CropDamage)
## Source: local data frame [10 x 2]
##
## TIDYEVENT totCropDamage
## 1 drought 13.927
## 2 flood 5.536
## 3 hurricane typhoon 5.506
## 4 hail 2.614
## 5 frost freeze 1.839
## 6 flash flood 1.496
## 7 extreme cold/wind chill 1.330
## 8 high wind 1.222
## 9 heavy rain 0.745
## 10 tropical storm 0.694
And the corresponding plot
# Bar chart of crop damage (in billions) for the ten event types in
# SD_CropDamage. The title previously read "Property Damage", copied from the
# property-damage chart.
g <- ggplot(SD_CropDamage, aes(x = TIDYEVENT, y = totCropDamage))
g +
  geom_bar(stat = "identity", fill = brewer.pal(3, "Set1")[1]) +
  ylab("$ Billion") +
  xlab("Event") +
  ggtitle("Top 10 Crop Damage Event") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Based on the plot above, we find that Drought caused the highest costs in terms of crop damage in the US between 1995 and 2011.
From these data, we found that excessive heat and tornado are most harmful with respect to population health, while flood, drought, and hurricanes have the greatest economic consequences.