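The goal of this project is to explore the U.S. National Oceanic and Atmospheric Administration’s (NOAA) Storm Database and answer some basic questions about severe weather events, for example: which types of events are most harmful to population health, and which cause the greatest property and crop damage?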
The database tracks characteristics of major storms and weather events in the United States from 1950 to 2011, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
- Columns kept for the analysis: EVTYPE, FATALITIES, INJURIES, PROPDMG, and CROPDMG.
- Sum FATALITIES and INJURIES by event type, and extract the top 10 most harmful events.
- Sum PROPDMG and CROPDMG by event type, then subset the top 10 events with the biggest damage.
- Other cleaning task: leading and trailing white space was removed from the event type names, and upper case was converted to lower case.
Load data
# Fetch the compressed storm data only when no local copy exists, so re-running
# the analysis does not download the archive again. `mode = "wb"` keeps the
# bzip2 archive intact on platforms that distinguish text and binary transfers;
# the download method is left to R's default rather than requiring an external
# `curl` binary.
if (!file.exists("stormData.csv.bz2")) {
  download.file(
    "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    "stormData.csv.bz2",
    mode = "wb"
  )
}

# read.table() decompresses the .bz2 file transparently when given its path.
storm <- read.table(
  "stormData.csv.bz2",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Keep only the event type and the four impact measures, and drop rows that
# report no fatalities, injuries, property damage or crop damage at all.
stormReduce <- subset(
  storm,
  PROPDMG != 0 | CROPDMG != 0 | FATALITIES != 0 | INJURIES != 0,
  select = c(EVTYPE, FATALITIES, INJURIES, PROPDMG, CROPDMG)
)
# Clean the EVTYPE column: strip leading/trailing white space and convert the
# event type names to lower case so differently formatted spellings of the
# same name are grouped together.
library(stringr)
stormReduce$EVTYPE <- tolower(str_trim(stormReduce$EVTYPE))
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Total fatalities and injuries per event type.
storm_HealthDamage <- summarise(
  group_by(stormReduce, EVTYPE),
  FATALITIES = sum(FATALITIES),
  INJURIES = sum(INJURIES)
)

# Combined casualty count (fatalities + injuries) used for ranking.
storm_HealthDamage$DAMAGE <- with(storm_HealthDamage, FATALITIES + INJURIES)

# Keep the ten event types with the largest combined casualty count.
top10HealthDamage <- head(
  storm_HealthDamage[order(storm_HealthDamage$DAMAGE, decreasing = TRUE), ],
  10
)

# Turn the event type into a factor whose levels are ordered by DAMAGE, so
# plots that use EVTYPE as an axis follow the damage ranking.
top10HealthDamage$EVTYPE <- reorder(
  as.factor(top10HealthDamage$EVTYPE),
  top10HealthDamage$DAMAGE,
  sum
)
top10HealthDamage
## Source: local data frame [10 x 4]
##
## EVTYPE FATALITIES INJURIES DAMAGE
## 1 tornado 5633 91346 96979
## 2 excessive heat 1903 6525 8428
## 3 tstm wind 504 6957 7461
## 4 flood 470 6789 7259
## 5 lightning 816 5230 6046
## 6 heat 937 2100 3037
## 7 flash flood 978 1777 2755
## 8 ice storm 89 1975 2064
## 9 thunderstorm wind 133 1488 1621
## 10 winter storm 206 1321 1527
# Total property and crop damage per event type.
# NOTE(review): the NOAA storm data is understood to carry magnitude codes for
# these figures (PROPDMGEXP / CROPDMGEXP columns), which are not selected into
# stormReduce. If so, the raw values summed here mix different units and the
# ranking may be misleading -- confirm against the dataset documentation.
storm_EcoDamage <- summarise(
  group_by(stormReduce, EVTYPE),
  PROPDMG = sum(PROPDMG),
  CROPDMG = sum(CROPDMG)
)

# Combined economic damage (property + crop) used for ranking.
storm_EcoDamage$DAMAGE <- with(storm_EcoDamage, PROPDMG + CROPDMG)

# Keep the ten event types with the largest combined damage.
top10EcoDamage <- head(
  storm_EcoDamage[order(storm_EcoDamage$DAMAGE, decreasing = TRUE), ],
  10
)

# Order the event type factor levels by DAMAGE so plots follow the ranking.
top10EcoDamage$EVTYPE <- reorder(
  factor(top10EcoDamage$EVTYPE),
  top10EcoDamage$DAMAGE,
  sum
)
top10EcoDamage
## Source: local data frame [10 x 4]
##
## EVTYPE PROPDMG CROPDMG DAMAGE
## 1 tornado 3212258.2 100018.52 3312276.7
## 2 flash flood 1420174.6 179200.46 1599375.1
## 3 tstm wind 1336103.6 109202.60 1445306.2
## 4 hail 688693.4 579596.28 1268289.7
## 5 flood 899938.5 168037.88 1067976.4
## 6 thunderstorm wind 876844.2 66791.45 943635.6
## 7 lightning 603351.8 3580.61 606932.4
## 8 thunderstorm winds 446293.2 18684.93 464978.1
## 9 high wind 324731.6 17283.21 342014.8
## 10 winter storm 132720.6 1978.99 134699.6
library(reshape2)
# Reshape both top-10 tables to long format: one row per event type and damage
# measure. Only the first three columns (EVTYPE plus the two measures) are
# kept, which drops the combined DAMAGE total.
top10HealthDamage <- melt(
  top10HealthDamage[, 1:3],
  id.vars = "EVTYPE",
  variable.name = "DAMAGE_TYPE",
  value.name = "DAMAGE_COUNT"
)
top10EcoDamage <- melt(
  top10EcoDamage[, 1:3],
  id.vars = "EVTYPE",
  variable.name = "DAMAGE_TYPE",
  value.name = "DAMAGE_COUNT"
)

# Replace the raw column names with descriptive labels for the plot legend.
top10EcoDamage$DAMAGE_TYPE <- factor(
  top10EcoDamage$DAMAGE_TYPE,
  levels = c("PROPDMG", "CROPDMG"),
  labels = c("PROPERTY_DAMAGE", "CROP_DAMAGE")
)
library(ggplot2)
# Horizontal dodged bar chart of fatalities and injuries for the ten most
# harmful event types.
ggplot(
  top10HealthDamage,
  aes(x = EVTYPE, y = DAMAGE_COUNT, fill = DAMAGE_TYPE)
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip()

# Same chart layout for property and crop damage.
ggplot(
  top10EcoDamage,
  aes(x = EVTYPE, y = DAMAGE_COUNT, fill = DAMAGE_TYPE)
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip()