Weather events have great economic and health consequences to the United States population and economy. With data submited to the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database, we are able to identify the top type of weather events that affect the United States.
The U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm data consists of events submited from 1950 to 2011. The data set is available in a coma delimited (.csv) file, which we will download form Amazon’s cloudfront.net.
We downloaded and read the sample Storm Data into R, ensuring all column names are R friendly. Because of the size of the set, we subset only the columns relevant to this analysis.
library(knitr)
# get data file
if (!file.exists("repdata_data_StormData.csv.bz2")) {
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2")
}
# read the Storm Data compressed bzfile
storm.data <- read.csv(bzfile("repdata_data_StormData.csv.bz2"))
# remove all fields irrelevant to the analysis
storm.data <- storm.data[, c("BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
"PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP", "REFNUM")]
# ensure colmn names are R friendly
names(storm.data) <- make.names(names(storm.data), unique = TRUE)
The columns relevant to this analysis consist of the BGN_DATE, EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP, REFNUM columns.
The event types (EVTYPE) values were entered manually by the record submiters. This approach created inconsistencies across a large percent of records. To address this issue, we cleaned the values by standardizing to a set of general events.
Events that spanned more than a month and were entered as one record (as summary) were omitted because separating costs would be too inacurate.
To preserve the original values we created a copy of the EVTYPE column named EVTYPE.COPY. This new column (EVTYPE.COPY) is where we will perform all cleaning and perform the analysis.
library(stringr)
storm.data$EVTYPE.COPY <- storm.data$EVTYPE # make working copy of EVTYPE
storm.data$EVTYPE.COPY <- str_trim(storm.data$EVTYPE.COPY) # trim the values
storm.data$EVTYPE.COPY <- toupper(storm.data$EVTYPE.COPY) # convert values to upper case
gsub.EVTYPE.COPY <- function( replace.with , patern.list ){
for(i.patern in patern.list){
storm.data[grep(i.patern, storm.data$EVTYPE.COPY),]$EVTYPE.COPY <-
gsub(i.patern , replace.with ,
storm.data[grep(i.patern,storm.data$EVTYPE.COPY),]$EVTYPE.COPY
)
}
assign('storm.data',storm.data,envir=.GlobalEnv)
}
# function to fix special characters in EVTYPE.COPY ####
fix.special.characters <- function(){
gsub.EVTYPE.COPY("",c("[(|)|.|:|&|0-9|?]","MPH","AGRICULTURAL "," AND$"," DAMAGE","EROSION","^NONE$","INJURY","MAJOR","MINOR","RECORD","RURAL","SEVERE ","UNSEASONAL","SPELL","(NO WEATHER)","UNSEASONAL","UNSEASONABLE","UNSEASONABLY","STREET","URBAN","EROSIN","DENSE","UNUSUALLY","(UNUSUAL)","DEPRESSION","TREE[S]","^/","/$"," .$"," -$"))
storm.data$EVTYPE.COPY <- gsub(" {2,}", " ", storm.data$EVTYPE.COPY)
gsub.EVTYPE.COPY("/",c(" AND ","-",","," /","/ ","\\\\","- "))
storm.data$EVTYPE.COPY <- gsub(" {2,}", " ", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- str_trim(storm.data$EVTYPE.COPY)
assign('storm.data',storm.data,envir=.GlobalEnv)
}
fix.special.characters()
storm.data[grep("^SUMMARY ",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "" # delete all summary records
# STANDARDIZE SPELLING OF EVTYPE.COPY ####
storm.data$EVTYPE.COPY <- sub("AVALANCE","AVALANCHE", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("CURRENTS", "CURRENT", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("DEVEL", "DEVIL", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("DUSTSTORM", "DUST STORM", storm.data$EVTYPE.COPY)
gsub.EVTYPE.COPY("FLOOD",c("FLOODS","FLOODING","FLOODIN","FLD","FLDG"))
storm.data$EVTYPE.COPY <- sub("FLOWS", "FLOW", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("LANDSLIDES|LANDSLUMP", "LANDSLIDE", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("LIGHTING", "LIGHTNING", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("RAINS", "RAIN", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("ROADS", "ROAD", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("SNOWFALL", "SNOW", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("SQUALLS", "SQUALL", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("STORMS", "STORM", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("TEMPERATURES", "TEMPERATURE", storm.data$EVTYPE.COPY)
gsub.EVTYPE.COPY("TORNADO",c("TORNDAO","TORNADOES","TORNADOS","TORNDAO"))
gsub.EVTYPE.COPY("THUNDERSTORM", c("THUNDERSTORMS","THUNDERSNOW","THUNDERSNOW","THUNDERSTORMW","THUNDERSTROM","THUNDERSTROM","TUNDERSTORM","TSTMW","TSTM","THUNDESTORM","THUNERSTORM","THUNDERTSORM","THUNDEERSTORM","THUNDERESTORM","THUDERSTORM","THUNDERTORM"))
storm.data$EVTYPE.COPY <- sub("TIDES", "TIDE", storm.data$EVTYPE.COPY)
gsub.EVTYPE.COPY("WILDFIRE",c("WILDFIRES","WILD FIRES","FIRES"))
gsub.EVTYPE.COPY("WIND",c("WINDS","WINS","WINDD","WND","W INDS","WIND G"))
gsub.EVTYPE.COPY("WINTER",c("WINTRY","WINTERY"))
storm.data$EVTYPE.COPY <- sub("FUNNEL CLOUD","TORNADO", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("VOG","FOG", storm.data$EVTYPE.COPY)
storm.data$EVTYPE.COPY <- sub("WAVES","WAVE", storm.data$EVTYPE.COPY)
fix.special.characters()
# STANDARDIZE EVTYPE.COPY ####
storm.data[grep("(TORNADO)|(FUNNEL)|NADO|ROTATING|LANDSPOUT",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "TORNADO"
storm.data[grep("(HURRIC)|(TYPHO)|(FLOYD)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HURRICANE"
storm.data[grep("BLIZZARD",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "BLIZZARD"
storm.data[grep("THUNDERS|(METRO STORM)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "THUNDERSTORM_WIND"
storm.data[grep("TROPICAL",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "TROPICAL_STORM"
storm.data[grep("(WAYTERSPOUT)|(WATER SPOU)|(WATERSPOU)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "WATERSPOUT"
storm.data[grep("FLOOD|DAM|SURGE|(FLASH FLOO)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "FLOOD"
storm.data[grep("HEAVY RAIN|RAIN|SHOWER|WET|PRECIPITATION|PRECIP",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HEAVY_RAIN"
storm.data[grep("SNOW",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HEAVY_SNOW"
storm.data[grep("HAIL|ICE PELLETS",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HAIL"
storm.data[grep("MICROBURST",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "DOWNBURST"
storm.data[grep("DRY|LOW RAIN|DRIEST",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "DROUGHT"
storm.data[grep("HEAT|HOT|WARM|HIGH TEMP",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HEAT"
storm.data[grep("COLD|LOW TEMP|HYPOTHERMIA|COOL|HYPERTHERMIA|(EXTREME WINDCHILL TEMPERATURE)",
storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "COLD"
storm.data[grep("FROST|FREEZE|FREEZ",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "FROST"
storm.data[grep("(MUD)|(DEBR)|(LANDSLIDE)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "DEBRIS_FLOW"
storm.data[grep("VOLCAN",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "VOLCANIC_ERUPTION"
storm.data[grep("FIRE",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "WILDFIRE"
storm.data[grep("WINTER WEAT|WINTER MIX",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "WINTER_WEATHER"
storm.data[grep("WINTER STORM",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "WINTER_STORM"
storm.data[grep("(WIND CHILL)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "COLD_WIND_CHILL"
storm.data[grep("SURF|SEA|SWELL|(HIGH TIDE)",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HIGH_SURF"
storm.data[grep("FOG",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "FOG"
storm.data[grep("SLEET|ROAD|BLACK ICE|PATCHY ICE",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "SLEET"
storm.data[grep("COASTALSTORM|COASTAL",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "COASTAL_STORM"
storm.data[grep("LIGHTNING",storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "LIGHTNING"
storm.data[grep("ICE FLOES|ICE STORM|(ICE/STRONG WIND)|(GLAZE ICE)", storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "ICE_STORM"
storm.data[grep("(WIND STORM)|^WIND$|(WIND ADVISORY)|(WIND ADVISORY)|(HIGH WIND)|(GUSTY WIND)|STRONG WIND|(STORM FORCE WIND)", storm.data$EVTYPE.COPY),]$EVTYPE.COPY <- "HIGH_WIND"
storm.data$EVTYPE.COPY <- gsub("_"," ", storm.data$EVTYPE.COPY) # remove underscore
storm.data <- storm.data[storm.data$EVTYPE.COPY!='',] # remove records without EVTYPE.COPY value
In addition to cleaning the event values (in EVTYPE.COPY), we also
storm.data$BGN_DATE <- as.Date(storm.data$BGN_DATE,"%m/%d/%Y 0:00:00") # fix begining date
storm.data$YEAR <- as.POSIXlt(storm.data$BGN_DATE)$year + 1900 # add year column
Because in the earlier years of the database there were fewer events recorded, most likely due to a lack of good records, we will use more recent years in the analysis. Records from 1995 to 2011 will be used to determine events with the greatest effect on population health and economic cost.
events.by.year <- aggregate(REFNUM ~ YEAR , data=storm.data, FUN=length) # total values by EVTYPE
plot(events.by.year$YEAR, events.by.year$REFNUM, xlab="Year", ylab="Events", type="l", main="Events Submited by Year", las=1, , cex.axis=0.7)
abline(v="1995", col="red") # add 1995 as break point for more complete data
Count of events submited by year
To identify which events are most harmful to the population health, we will use the FATALITIES and INJURIES columns, which are in count of occurences.
We first total annual fatalities and injuries by event type from 1995 to 2011. Then, we will select the top 3 events for fatalities and injuries.
fatality.injuries.total <- aggregate(cbind(FATALITIES,INJURIES) ~ EVTYPE.COPY , data=storm.data[storm.data$YEAR>=2000,], FUN=sum)
The top 3 event types with fatal consequences include
# top event types for fatalities
(top.fatality.events <- head(fatality.injuries.total[order(-fatality.injuries.total$FATALITIES),] ,
n=n.top.events)[,c("EVTYPE.COPY","FATALITIES")])
## EVTYPE.COPY FATALITIES
## 22 HEAT 1244
## 41 TORNADO 1193
## 15 FLOOD 893
The top 3 event types causing injuries to the US pupulation include
# top event types for injuries
(top.injuries.events <- head(fatality.injuries.total[order(-fatality.injuries.total$INJURIES),] ,
n=n.top.events)[,c("EVTYPE.COPY","INJURIES")])
## EVTYPE.COPY INJURIES
## 41 TORNADO 15213
## 22 HEAT 4933
## 40 THUNDERSTORM WIND 3225
To determine events across the United States with the greatest economic consequences we looked at Property and Crop damages in the Us.
To determine the events, which produce the most property damages, we used the following columns
library(stringr)
storm.data$PROPDMGEXP <- toupper(storm.data$PROPDMGEXP) # convert values to upper case
storm.data$PROPDMGEXP <- str_trim(storm.data$PROPDMGEXP) # trim the values
storm.data$PROPDMGEXP.COPY <- NA
storm.data[grep("[B|M|K|H]",storm.data$PROPDMGEXP),]$PROPDMGEXP.COPY <- storm.data[grep("[B|M|K|H]",storm.data$PROPDMGEXP),]$PROPDMGEXP
storm.data[grep("[B]",storm.data$PROPDMGEXP.COPY),]$PROPDMGEXP.COPY <- 1000000000 # Billion
storm.data[grep("[M]",storm.data$PROPDMGEXP.COPY),]$PROPDMGEXP.COPY <- 1000000 # Million
storm.data[grep("[K]",storm.data$PROPDMGEXP.COPY),]$PROPDMGEXP.COPY <- 1000 # Thousand
storm.data[grep("[H]",storm.data$PROPDMGEXP.COPY),]$PROPDMGEXP.COPY <- 100 # Hundred
storm.data[is.na(storm.data$PROPDMGEXP.COPY),]$PROPDMGEXP.COPY <- 1
storm.data$PROPERTY.DAMAGE.COST <- as.numeric(storm.data$PROPDMGEXP.COPY) * storm.data$PROPDMG / 1000000 # in $1,000,000
The top 3 event types contributing to property damages include
property.total.cost <- aggregate(PROPERTY.DAMAGE.COST ~ EVTYPE.COPY, data = storm.data[storm.data$YEAR >=
1995, ], FUN = sum) # total values by EVTYPE
(top.property.damage.events <- head(property.total.cost[order(-property.total.cost$PROPERTY.DAMAGE.COST),
], n = n.top.events)[, c("EVTYPE.COPY", "PROPERTY.DAMAGE.COST")]) # top event types for injuries
## EVTYPE.COPY PROPERTY.DAMAGE.COST
## 20 FLOOD 208054
## 36 HURRICANE 85250
## 63 TORNADO 24931
To determine the events, which produce the most crop damages, we used the following columns
storm.data$CROPDMGEXP <- toupper(storm.data$CROPDMGEXP) # convert values to upper case
storm.data$CROPDMGEXP <- str_trim(storm.data$CROPDMGEXP) # trim the values
storm.data$CROPDMGEXP.COPY <- NA
storm.data[grep("[B|M|K|H]",storm.data$CROPDMGEXP),]$CROPDMGEXP.COPY <- storm.data[grep("[B|M|K|H]",storm.data$CROPDMGEXP),]$CROPDMGEXP
storm.data[grep("[B]",storm.data$CROPDMGEXP.COPY),]$CROPDMGEXP.COPY <- 1000000000 # Billion
storm.data[grep("[M]",storm.data$CROPDMGEXP.COPY),]$CROPDMGEXP.COPY <- 1000000 # Million
storm.data[grep("[K]",storm.data$CROPDMGEXP.COPY),]$CROPDMGEXP.COPY <- 1000 # Thousand
## storm.data[grep("[H]",storm.data$CROPDMGEXP.COPY),]$CROPDMGEXP.COPY <- 100 # Hundred
storm.data[is.na(storm.data$CROPDMGEXP.COPY),]$CROPDMGEXP.COPY <- 1
storm.data$CROP.DAMAGE.COST <- as.numeric(storm.data$CROPDMGEXP.COPY) * storm.data$CROPDMG / 1000000 # in $1,000,000
The top 3 event types contributing to crop damages include
crop.total.cost <- aggregate(CROP.DAMAGE.COST ~ EVTYPE.COPY, data = storm.data[storm.data$YEAR >=
1995, ], FUN = sum) # total values by EVTYPE
(top.crop.damage.events <- head(crop.total.cost[order(-crop.total.cost$CROP.DAMAGE.COST),
], n = n.top.events)[, c("EVTYPE.COPY", "CROP.DAMAGE.COST")]) # top event types for injuries
## EVTYPE.COPY CROP.DAMAGE.COST
## 14 DROUGHT 13922
## 20 FLOOD 7338
## 36 HURRICANE 5516
The top weather events affecting the United States, between 1995 and 2011, were identified as
par(mfrow=c(2,2), mar=c(6,5,2.1,2.1))
barplot(height=top.fatality.events$FATALITIES, names.arg=top.fatality.events$EVTYPE.COPY,
las=1, ylab="Fatalities", xlab="Top Weather Events contributing to Fatalities", cex.axis=0.7)
barplot(height=top.injuries.events$INJURIES, names.arg=top.injuries.events$EVTYPE.COPY,
las=1, ylab="Injuries", xlab="Top Weather Events contributing to Injuries", cex.axis=0.7)
barplot(height=top.property.damage.events$PROPERTY.DAMAGE.COST, names.arg=top.property.damage.events$EVTYPE.COPY,
las=1, ylab="Property Damages ($1M)", xlab="Top Weather Events contributing to Property Damages", cex.axis=0.7)
barplot(height=top.crop.damage.events$CROP.DAMAGE.COST, names.arg=top.crop.damage.events$EVTYPE.COPY,
las=1, ylab="Crop Damages ($1M)", xlab="Top Weather Events contributing to Crop Damaages", cex.axis=0.7)
Top Events affecting to population health and economic impact