Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database.
In this analysis, We will answer the following questions:
1.Which types of events are most harmful with respect to population health?
2.Which types of events have the greatest economic consequences?
As a summary we can say that the Tornado is most harmful event causing 5.633 deaths and 91.346 injuries. In terms of economical consequences, floods has been the responsible of most of the properties loses, while drought has been the reponsible of most of the crops loses.
cache = TRUE
echo = TRUE
options(scipen = 1)
if (!"stormData.csv.bz2" %in% dir("./data/")) {
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile = "data/stormData.csv.bz2")
}
if (!file.exists("data/storm.csv")) {
library(R.utils)
bunzip2("data/stormData.csv.bz2",
"data/storm.csv", remove = FALSE)
}
##Since there are some problems reading the rows 547363 and 547364, I found some way to fix it.
s1data <- read.csv("data/storm.csv",header = T,
nrows = 547362,stringsAsFactors = FALSE)
s2data <- read.csv("data/storm.csv",header = F,
nrows = 144924,skip = 547364,stringsAsFactors = FALSE)
colnames(s2data) <- names(s1data)
sdata <- rbind(s1data, s2data)
remove(s1data)
remove(s2data)
sdatarownum <- nrow(sdata)
sdatacolnum <- ncol(sdata)
sdatanames <- names(sdata)
*The file contains 692286 rows and 37 columns. The names of the columns are:
STATE_, BGN_DATE, BGN_TIME, TIME_ZONE, COUNTY, COUNTYNAME, STATE, EVTYPE, BGN_RANGE, BGN_AZI, BGN_LOCATI, END_DATE, END_TIME, COUNTY_END, COUNTYENDN, END_RANGE, END_AZI, END_LOCATI, LENGTH, WIDTH, F, MAG, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP, WFO, STATEOFFIC, ZONENAMES, LATITUDE, LONGITUDE, LATITUDE_E, LONGITUDE, REMARKS, REFNUM
*We will keep the columns that are relevant for this analysis: “BGN_DATE, EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP”
*BGN_DATE: Date.
*EVTYPE: Type of weather event.
*FATALITIES: Number of fatalities.
*INJURIES: Number of injuries.
*PROPDMG: Amount of property damage.
*PROPDMGEXP: Exponential of property damage.
*CROPDMG: Amount of crop damage.
*CROPDMGEXP:Exponential of crop damage.
selectcol<-c("BGN_DATE","EVTYPE","FATALITIES","INJURIES","PROPDMG","PROPDMGEXP","CROPDMG","CROPDMGEXP")
selsdata<-sdata[selectcol]
remove(sdata)
anasdata <- subset(selsdata, selsdata$FATALITIES > 0 |
selsdata$INJURIES > 0 | selsdata$PROPDMG > 0 |
selsdata$CROPDMG > 0 )
remove(selsdata)
library(plyr)
anasdata$TEST <- sapply(strsplit(anasdata$BGN_DATE, " "), "[[", 1)
library(stringr)
anasdata$YEAR <- str_sub(anasdata$TEST, -4)
anasdata$EVTYPE <- as.factor(anasdata$EVTYPE)
anasdata$FATALITIES <- as.numeric(anasdata$FATALITIES)
anasdata$INJURIES <- as.numeric(anasdata$INJURIES)
anasdata$PROPDMGEXP <- as.factor(anasdata$PROPDMGEXP)
anasdata$CROPDMGEXP <- as.factor(anasdata$CROPDMGEXP)
anasdata$YEAR <- as.integer(anasdata$YEAR)
anasdata <- subset(anasdata, anasdata$YEAR >= 1950)
hist(anasdata$YEAR)
*We subset the data from 1993 as there’s a rapid increase and the data are consider to be more complete.
anasdata <- subset(anasdata, anasdata$YEAR >= 1993)
unique(anasdata$PROPDMGEXP)
## [1] B K M m + 0 5 6 4 h 2 7 3 H -
## Levels: - + 0 2 3 4 5 6 7 B h H K m M
##Recode the property exponent data.
anasdata$PROPEXP[anasdata$PROPDMGEXP == "2"] <- 100
anasdata$PROPEXP[anasdata$PROPDMGEXP == "3"] <- 1000
anasdata$PROPEXP[anasdata$PROPDMGEXP == "4"] <- 10000
anasdata$PROPEXP[anasdata$PROPDMGEXP == "5"] <- 1e+05
anasdata$PROPEXP[anasdata$PROPDMGEXP == "6"] <- 1e+06
anasdata$PROPEXP[anasdata$PROPDMGEXP == "7"] <- 1e+07
anasdata$PROPEXP[anasdata$PROPDMGEXP == "B"] <- 1e+09
anasdata$PROPEXP[anasdata$PROPDMGEXP == "h"] <- 100
anasdata$PROPEXP[anasdata$PROPDMGEXP == "H"] <- 100
anasdata$PROPEXP[anasdata$PROPDMGEXP == "K"] <- 1000
anasdata$PROPEXP[anasdata$PROPDMGEXP == "m"] <- 1e+06
anasdata$PROPEXP[anasdata$PROPDMGEXP == "M"] <- 1e+06
##Give 0 to those that are invalid.
anasdata$PROPEXP[anasdata$PROPDMGEXP == ""] <- 0
anasdata$PROPEXP[anasdata$PROPDMGEXP == "-"] <- 0
anasdata$PROPEXP[anasdata$PROPDMGEXP == "+"] <- 0
##Compute the property damage value.
anasdata$PROPDMGTOTAL <- anasdata$PROPDMG * anasdata$PROPEXP
unique(anasdata$CROPDMGEXP)
## [1] M K m B ? 0 k
## Levels: ? 0 B k K m M
##Recode the crop exponent data.
anasdata$CROPEXP[anasdata$CROPDMGEXP == "B"] <- 1e+09
anasdata$CROPEXP[anasdata$CROPDMGEXP == "k"] <- 1000
anasdata$CROPEXP[anasdata$CROPDMGEXP == "K"] <- 1000
anasdata$CROPEXP[anasdata$CROPDMGEXP == "m"] <- 1e+06
anasdata$CROPEXP[anasdata$CROPDMGEXP == "M"] <- 1e+06
##Give 0 to those that are invalid.
anasdata$CROPEXP[anasdata$CROPDMGEXP == ""] <- 0
anasdata$CROPEXP[anasdata$CROPDMGEXP == "?"] <- 0
anasdata$CROPEXP[anasdata$CROPDMGEXP == "0"] <- 0
##Compute the crop damage value.
anasdata$CROPDMGTOTAL <- anasdata$CROPDMG * anasdata$CROPEXP
fatal <- aggregate(FATALITIES ~ EVTYPE, data = anasdata, FUN = sum)
injury <- aggregate(INJURIES ~ EVTYPE, data = anasdata, FUN = sum)
propdmg <- aggregate(PROPDMGTOTAL ~ EVTYPE, data = anasdata, FUN = sum)
cropdmg <- aggregate(CROPDMGTOTAL ~ EVTYPE, data = anasdata, FUN = sum)
##Get the top 15 events with highest fatalities.
fataltop15 <- fatal[order(-fatal$FATALITIES), ][1:15, ]
##Get the top 15 events with highest injuries.
injurytop15 <- injury[order(-injury$INJURIES), ][1:15, ]
par(mfrow = c(1, 2), mar = c(12, 4, 3, 2), mgp = c(3, 1, 0), cex = 0.8)
barplot(fataltop15$FATALITIES, las = 3, names.arg = fataltop15$EVTYPE, main = "The Top 15 Highest Fatalities Events", ylab = "number of fatalities", col = "steelblue")
barplot(injurytop15$INJURIES, las = 3, names.arg = injurytop15$EVTYPE, main = "The Top 15 Highest Injuries Events", ylab = "number of injuries", col = "steelblue")
##Get the top 15 events with highest property damage.
propdmgtop15 <- propdmg[order(-propdmg$PROPDMGTOTAL), ][1:15, ]
##Get the top 15 events with highest crop damage.
cropdmgtop15 <- cropdmg[order(-cropdmg$CROPDMGTOTAL), ][1:15, ]
par(mfrow = c(1, 2), mar = c(12, 4, 3, 2), mgp = c(3, 1, 0), cex = 0.8)
barplot(propdmgtop15$PROPDMGTOTAL/(10^9), las = 3, names.arg = propdmgtop15$EVTYPE,
main = "Top 15 Events with Greatest Property Damages", ylab = "Cost of damages ($ billions)",
col = "steelblue")
barplot(cropdmgtop15$CROPDMGTOTAL/(10^9), las = 3, names.arg = cropdmgtop15$EVTYPE,
main = "Top 15 Events With Greatest Crop Damages", ylab = "Cost of damages ($ billions)",
col = "steelblue")