The United States National Oceanic and Atmospheric Administration (NOAA) tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The database starts in 1950 and ends in November 2011. The data quality is less good in the earlier years and this is probably due to a lack of recordings.
The first part of the analysis summarizes the number of harmful events (injuries and deaths) by the ten biggest causes of these events. The second part summarizes the size of damage measured in USD on property and crops by the ten biggest causes of those events.
The report addresses two main questions. Firstly which types of events are most harmful with respect to population health and the data shows that Tornados followed by Thunderstorms accounts for the most casualties (Death and Injury).Secondly the events with the greatest economic consequence are Flooding and Hurricanes (Property and Crop Damage).
The data processing is done in Four parts. Firstly preparing setup and environment followed by Data download and data reading and finally data cleaning of actual events data and economic damage estimates.
The data cleaning of the events database was the most time consuming as the data needed quite a bit of reclassification and transformation to be able to have meaningfull aggregated groups.
setwd("~/G-ART/artData/Coursera/ReproducibleResearch")
if(!file.exists("./Assignment2")){dir.create("./Assignment2")}
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.2
library(knitr)
## Warning: package 'knitr' was built under R version 3.2.1
library(data.table)
## Warning: package 'data.table' was built under R version 3.2.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.2
setwd("~/G-ART/artData/Coursera/ReproducibleResearch/Assignment2")
echo = TRUE
if (!file.exists("downloadStormData.csv.bz2")) {
download.file("http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", dest="downloadStormData.csv.bz2")
}
stormData.All = read.csv("downloadStormData.csv.bz2",header = TRUE)
stormData.Subset <- subset(stormData.All, select = c("BGN_DATE", "TIME_ZONE", "STATE", "EVTYPE", "FATALITIES",
"INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"))
# adding seasonality to the data
stormData.Subset$BGN_DATE <- as.POSIXlt(strptime(gsub( " .*$", "", stormData.Subset$BGN_DATE), format = "%m/%d/%Y"))
thisquarter <- quarters.Date(stormData.Subset$BGN_DATE)
stormData.Subset <- cbind(thisquarter, stormData.Subset)
if (!file.exists("stormData.Subset.csv")) {
write.table(stormData.Subset, file="stormData.Subset.csv", sep="|", row.names=FALSE)
} else {
stormData.Subset <- read.table("stormData.Subset.csv", header=TRUE, sep="|")
}
stormData.Subset$EVTYPE <- tolower(stormData.Subset$EVTYPE)
stormData.Subset <- subset(stormData.Subset, !grepl('^summary', stormData.Subset$EVTYPE))
stormData.Subset <- subset(stormData.Subset, !grepl('^record', stormData.Subset$EVTYPE))
stormData.Subset <- subset(stormData.Subset, !grepl('^monthly', stormData.Subset$EVTYPE))
stormData.Subset$EVTYPE <- gsub("\\(|\\)", "", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*blizzard.*", "Blizzard", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*coastal.*", "Coastal Flooding", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*winter.*", "Winter Storms", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*thunderstorm.*", "Thunderstorm", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*urban.*", "Urban Flodding", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*water.*", "Water Sprout", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*cold.*", "Cold Weather", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*heavy rain.*", "Heavy Rain", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*tornado.*", "Tornado", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*drought.*", "Drought", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*flood.*", "Flooding", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*dust.*", "Dust Storm", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*hail.*", "Hail Storm", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*dry.*", "Dry Weather", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*snow.*", "Snowfall", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*heat.*", "HeatWave", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*freezing.*", "Freezing Weather", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*storm.*", "Storm", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*high wind.*", "Storm", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*strong wind.*", "Storm", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*hurricane.*", "Hurricane", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*typhoon.*", "Hurricane", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*mud.*", "Mud Slide", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*thunder.*", "Thunderstorm", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*tstm.*", "Thunderstorm", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*lightning.*", "Thunderstorm", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*wild.*", "Wild Fires", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*volcanic.*", "Volcanic Ash", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*rip current.*", "Rip Current", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*avalanc.*", "Avalanche", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*surf.*", "High/Heavy Surf", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*fog.*", "Fog", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*glace.*", "Glace", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*tsunami.*", "Tsunami", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*frost.*", "Frost", stormData.Subset$EVTYPE)
stormData.Subset$EVTYPE <- gsub(".*freeze.*", "Frost", stormData.Subset$EVTYPE)
write.table(stormData.Subset, file="stormData.Subset2.csv", sep="|", row.names=FALSE)
str(stormData.Subset)
## 'data.frame': 901773 obs. of 11 variables:
## $ thisquarter: Factor w/ 4 levels "Q1","Q2","Q3",..: 2 2 1 2 4 4 4 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1950-01-03","1950-01-13",..: 16 16 97 136 193 193 194 201 205 205 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : chr "Tornado" "Tornado" "Tornado" "Tornado" ...
## $ FATALITIES : int 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : int 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP : Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP : Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
stormData.Subset$PROPDMGEXP <- tolower(stormData.Subset$PROPDMGEXP)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='h',10^2)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='k',10^3)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='m',10^6)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='b',10^9)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='0',0)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='1',10^1)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='2',10^2)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='3',10^3)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='4',10^4)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='5',10^5)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='6',10^6)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='7',10^7)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='8',10^8)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='?',1)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='+',1)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='',1)
stormData.Subset$PROPDMGEXP <- replace(stormData.Subset$PROPDMGEXP,stormData.Subset$PROPDMGEXP=='-',1)
stormData.Subset$PROPDMGEXP <- as.numeric(stormData.Subset$PROPDMGEXP)
stormData.Subset$CROPDMGEXP <- tolower(stormData.Subset$CROPDMGEXP)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='h',10^2)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='k',10^3)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='m',10^6)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='b',10^9)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='0',0)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='1',10^1)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='2',10^2)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='3',10^3)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='4',10^4)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='5',10^5)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='6',10^6)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='7',10^7)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='8',10^8)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='?',1)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='+',1)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='',1)
stormData.Subset$CROPDMGEXP <- replace(stormData.Subset$CROPDMGEXP,stormData.Subset$CROPDMGEXP=='-',1)
stormData.Subset$CROPDMGEXP <- as.numeric(stormData.Subset$CROPDMGEXP)
stormData.Subset$AMTPROPDMG <- stormData.Subset$PROPDMG * stormData.Subset$PROPDMGEXP
stormData.Subset$AMTCROPDMG <- stormData.Subset$CROPDMG * stormData.Subset$CROPDMGEXP
write.table(stormData.Subset, file="stormData.Subset3.csv", sep="|", row.names=FALSE)
The weather has a severe impact both in form of economic damage and human casualties.
The report addresses two main questions. Firstly which types of events are most harmful with respect to population health and the data shows that Tornados followed by Heatwave are the ones with the highest number of fatilities whereas Tornado and Thunderstorms are the are the two factor with the highest Injury rate.
Secondly the events with the greatest economic consequence for properties are Flooding and Hurricanes and for crop damage Drought and Flooding are the biggest factors.
FatalitySum <- tapply(stormData.Subset$FATALITIES, stormData.Subset$EVTYPE, FUN = sum, na.rm = TRUE)
InjuriesSum <- tapply(stormData.Subset$INJURIES, stormData.Subset$EVTYPE, FUN = sum, na.rm = TRUE)
MostFatality <- FatalitySum[order(FatalitySum, decreasing = TRUE)]
MostInjuries <- InjuriesSum[order(InjuriesSum, decreasing = TRUE)]
Top10MostFatality <- MostFatality[1:10]
Top10MostInjuries <- MostInjuries[1:10]
TotalCasualtiesSum <- FatalitySum + InjuriesSum
TotalCasualtiesSorted <- TotalCasualtiesSum[order(TotalCasualtiesSum, decreasing = TRUE)]
Top10MostCasualties <- TotalCasualtiesSorted[1:10]
Top10MostFatality
## Tornado HeatWave Flooding Thunderstorm
## 5658 3113 1517 1332
## Storm Rip Current Cold Weather Winter Storms
## 808 577 450 278
## Avalanche High/Heavy Surf
## 225 161
Top10MostInjuries
## Tornado Thunderstorm HeatWave Flooding Storm
## 91364 12201 9159 8597 6679
## Winter Storms Wild Fires Hail Storm Hurricane Snowfall
## 1891 1606 1466 1333 1165
Top10MostCasualties
## Tornado Thunderstorm HeatWave Flooding Storm
## 97022 13533 12272 10114 7487
## Winter Storms Wild Fires Hail Storm Hurricane Snowfall
## 2169 1696 1486 1466 1319
fatalities <- aggregate(FATALITIES ~ EVTYPE, data=stormData.Subset,FUN=sum)
sorting <- order(fatalities$FATALITIES,decreasing = TRUE)
Top10fatalities <- fatalities[sorting,]
Top10fatalities = Top10fatalities[1:10,]
injuries <- aggregate(INJURIES ~ EVTYPE, data = stormData.Subset,FUN=sum)
sorting <- order(injuries$INJURIES,decreasing = TRUE)
Top10injuries <- injuries[sorting,]
Top10injuries = Top10injuries[1:10,]
p1 <- ggplot(Top10fatalities, aes(x=reorder(EVTYPE, FATALITIES), y=FATALITIES)) +
geom_bar(stat="identity", colour="white", fill = "red") +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1, size=10)) +
xlab("Event Type") +
ylab("Number of Incidents") +
ggtitle("Fatalities by Event Type")
p2 <- ggplot(Top10injuries, aes(x=reorder(EVTYPE, INJURIES), y=INJURIES)) +
geom_bar(stat="identity", colour="white", fill = "red") +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1, size=10)) +
xlab("Event Type") +
ylab("Number of Incidents") +
ggtitle("Injuries by Event Type")
grid.arrange(p1,p2, ncol = 2)
PropertyDamageSum <- tapply(stormData.Subset$AMTPROPDMG, stormData.Subset$EVTYPE, FUN = sum, na.rm = TRUE)
CropDamageSum <- tapply(stormData.Subset$AMTCROPDMG, stormData.Subset$EVTYPE, FUN = sum, na.rm = TRUE)
MostPropertyDamage <- PropertyDamageSum[order(PropertyDamageSum, decreasing = TRUE)]
MostCropDamage <- CropDamageSum[order(CropDamageSum, decreasing = TRUE)]
Top10MostPropertyDamage <- MostPropertyDamage[1:10]
Top10MostMostCropDamage <- MostCropDamage[1:10]
TotalDamageSum <- PropertyDamageSum + CropDamageSum
TotalDamageSorted <- TotalDamageSum[order(TotalDamageSum, decreasing = TRUE)]
Top10MostDamage <- TotalDamageSorted[1:10]
Top10MostPropertyDamage
## Flooding Hurricane Storm Tornado Hail Storm
## 167739651354 85256410010 72429431120 58552151743 16021899777
## Wild Fires Winter Storms Thunderstorm Heavy Rain Drought
## 8491563500 6776795250 5435213223 3248543142 1046306000
Top10MostMostCropDamage
## Drought Flooding Storm Hurricane Hail Storm
## 13972621780 12378380100 7130120208 5506117800 3111583853
## Frost Cold Weather HeatWave Heavy Rain Thunderstorm
## 1997061000 1416765550 904413500 795762800 566099440
Top10MostDamage
## Flooding Hurricane Storm Tornado Hail Storm
## 180118031454 90762527810 79559551328 58969613053 19133483630
## Drought Wild Fires Winter Storms Thunderstorm Heavy Rain
## 15018927780 8894345130 6824239250 6001312663 4044305942
propertydamage <- aggregate(AMTPROPDMG ~ EVTYPE, data=stormData.Subset,FUN=sum)
sorting <- order(propertydamage$AMTPROPDMG,decreasing = TRUE)
Top10propertydamage <- propertydamage[sorting,]
Top10propertydamage = Top10propertydamage[1:10,]
cropdamage <- aggregate(AMTCROPDMG ~ EVTYPE, data = stormData.Subset,FUN=sum)
sorting <- order(cropdamage$AMTCROPDMG,decreasing = TRUE)
Top10cropdamage <- cropdamage[sorting,]
Top10cropdamage = Top10cropdamage[1:10,]
p3 <- ggplot(Top10propertydamage, aes(x=reorder(EVTYPE, AMTPROPDMG), y=AMTPROPDMG)) +
geom_bar(stat="identity", colour="white", fill = "blue") +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1, size=10)) +
xlab("Event Type") +
ylab("USD") +
ggtitle("Property Damage by Event Type")
p4 <- ggplot(Top10cropdamage, aes(x=reorder(EVTYPE, AMTCROPDMG), y=AMTCROPDMG)) +
geom_bar(stat="identity", colour="white", fill = "blue") +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1, size=10)) +
xlab("Event Type") +
ylab("USD") +
ggtitle("Crop Damage by Event Type")
grid.arrange(p3,p4, ncol = 2)