The goal of this assignment is to explore the NOAA Storm Database and provide a short report about severe weather events in the US, in order to inform the government or municipal managers who might be responsible for preparing for severe weather events and who will need to prioritize resources for different types of events. The questions to be answered in this report are as follows: (1) which types of events are most harmful to population health, and (2) which types of events have the greatest economic consequences?
Download the dataset from the course website and unzip it to get the CSV file.
# Fetch the NOAA storm data once: the bz2 archive is downloaded and
# decompressed only when the extracted CSV is not already present under data/.
if (!file.exists("data/repdata-data-StormData.csv")) {
  # showWarnings = FALSE keeps this quiet when 'data' already exists
  dir.create("data", showWarnings = FALSE)
  download.file(
    url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    destfile = "data/repdata-data-StormData.csv.bz2"
  )
  # R.utils is needed only for bunzip2(). Check for it without attaching the
  # package, so its exports do not mask base functions for the rest of the
  # report; install it on first use.
  if (!requireNamespace("R.utils", quietly = TRUE)) {
    install.packages("R.utils")
  }
  # Decompress the archive into the CSV file used by the analysis
  R.utils::bunzip2(
    filename = "data/repdata-data-StormData.csv.bz2",
    destname = "data/repdata-data-StormData.csv"
  )
}
# Load the full storm table; every later step works on this data frame.
df <- read.csv("data/repdata-data-StormData.csv")
In the dataset, there are two columns related to population health, i.e. FATALITIES and INJURIES. The required information is the sum of these two columns for each type of event.
First, we sum up these two columns and put the results into a new column called 'health'.
# Casualties per storm record: fatalities and injuries added together.
df[["health"]] <- with(df, FATALITIES + INJURIES)
# df <- df[,c('EVTYPE','health')]
I categorize all the types of weather events into 6 kinds: cold, heat, water, wind, lightning, and earth.
# Keyword tables, one per category. Each is a two-column character matrix:
# the keyword (a fragment of an event name, misspellings included) and the
# category label, which cbind() recycles down the rows.
cold <- cbind(
  cold = c(
    "avalanche", "avalance", "blizzard", "chill", "cold", "cool", "glaze",
    "hypothermia", "hyperthermia", "ice", "icy", "freez", "frost", "low temp",
    "sleet", "snow", "wint"
  ),
  code = "cold"
)
heat <- cbind(
  heat = c(
    "below normal precip", "dry", "drie", "drought", "fire", "heat",
    "high temp", "hot", "warm"
  ),
  code = "heat"
)
water <- cbind(
  water = c(
    "coast", "cstl", "current", "dam fail", "dam break", "drizzle", "drown",
    "erosion", "erosin", "flood", "floood", "fld", "heavy shower", "high water",
    "high waves", "lake", "landslump", "marine", "precip", "rain",
    "rising water", "river", "rogue wave", "slide", "stream", "sea", "seiche",
    "surf", "swell", "tide", "tidal", "torrent", "wet", "fog", "vog"
  ),
  code = "water"
)
wind <- cbind(
  wind = c(
    "burst", "cloud", "depression", "floyd", "funnel", "gust", "hail",
    "hurricane", "landspout", "storm", "southeast", "thunderstorm",
    "thundertsorm", "thundestorm", "tornado", "torndao", "tstm", "turbulence",
    "typhoon", "wall", "waterspout", "water spout", "wayterspout", "wind", "wnd"
  ),
  code = "wind"
)
lightning <- cbind(
  lightning = c("lightning", "ligntning", "lighting"),
  code = "lightning"
)
earth <- cbind(
  earth = c("tsunami", "volcan", "dust"),
  code = "earth"
)

# Stack the six tables in a fixed order into one lookup table with the
# columns EVTYPE (keyword) and code (category, stored as plain character).
events <- data.frame(
  do.call(rbind, list(cold, heat, water, wind, lightning, earth))
)
colnames(events) <- c("EVTYPE", "code")
events[["code"]] <- as.character(events[["code"]])
Convert the EVTYPE column to lower case to make keyword matching easier.
# Lower-case the event names so they line up with the lower-case keywords,
# and start every record in the catch-all category "other".
df[["EVTYPE"]] <- tolower(df[["EVTYPE"]])
df[["code"]] <- rep_len("other", nrow(df))
Create a new column 'string' that wraps each EVTYPE keyword in a regular expression. Then use the grep function to get the indices of the matching rows, and assign the keyword's 'code' to the 'code' column of those rows. Note that 'lack of snow' is mistaken for 'snow' in this for loop, so I look up this EVTYPE afterwards and set its code to 'other'.
# Turn every keyword into the pattern ".*keyword.*" and keep it in the lookup
# table as the column 'string'.
events$string <- paste0(".*", events[, "EVTYPE"], ".*")
# Label each storm record with the category of the keyword its event name
# contains. Keywords are applied in table order, so when a name matches several
# keywords the one listed last decides the category.
# seq_len() keeps the loop from running at all if the keyword table is empty.
for (i in seq_len(nrow(events))) {
  df[grep(events$string[i], df$EVTYPE, ignore.case = TRUE), "code"] <- events$code[i]
}
# "lack of snow" contains the keyword "snow" but is not a snow event, so it is
# moved back to the catch-all category.
df[grep(".*lack of snow.*", df$EVTYPE, ignore.case = TRUE), "code"] <- "other"
# df$code <- factor(df$code)
Have an overview of the newly added column 'code'.
# Number of storm records that ended up in each category.
table(df[["code"]])
##
## cold earth heat lightning other water wind
## 30892 638 9615 15780 240 104126 741006
Next, we calculate the total number of fatalities and injuries for each category (i.e. cold, heat, water, wind, lightning, and earth), and sort the result in descending order of that total.
# Casualty total per category, sorted so the most harmful category comes first.
healthData <- aggregate(health ~ code, data = df, FUN = sum)
healthData <- healthData[order(healthData[["health"]], decreasing = TRUE), ]
Plot the general kinds of events that are harmful to population health.
# One bar per category, in the order of healthData (largest total first).
with(healthData, barplot(health, names.arg = code))
title(
  main = "The Damage to Population Health Caused by Severe Weather in the US",
  xlab = "General Types of Events",
  ylab = "Sum of Fatalities and Injuries"
)
# Comma-separated keyword list of the wind category. It is no longer used in
# the sentence below but stays defined under its original name for any other
# code that refers to it.
allWindEvents <- paste(tolower(events[events$code == "wind", "EVTYPE"]), collapse = ", ")
# The sentence names whichever category tops healthData, so the keyword list it
# quotes is looked up for that same category instead of always being wind.
topHealthCode <- healthData[1, "code"]
topHealthEvents <- paste(tolower(events[events$code == topHealthCode, "EVTYPE"]), collapse = ", ")
paste("The most harmful kinds of events is ", topHealthCode, ", having influenced the health of ",
  healthData[1, "health"], " people, including fatalities and injuries. This kind of events includes ",
  topHealthEvents, ".")
## [1] "The most harmful kinds of events is wind , having influenced the health of 117107 people, including fatalities and injuries. This kind of events includes burst, cloud, depression, floyd, funnel, gust, hail, hurricane, landspout, storm, southeast, thunderstorm, thundertsorm, thundestorm, tornado, torndao, tstm, turbulence, typhoon, wall, waterspout, water spout, wayterspout, wind, wnd ."
Take a look at the overview of the column PROPDMGEXP
# Frequency of every magnitude code that occurs in PROPDMGEXP.
table(df[["PROPDMGEXP"]])
##
## - ? + 0 1 2 3 4 5
## 465934 1 8 5 216 25 13 4 4 28
## 6 7 8 B h H K m M
## 4 5 1 40 1 6 424665 7 11330
I decided to express the damage in millions, so I create a new column 'million'. I count m and M as one million and B as 1000 million, and ignore the rest.
# Property damage per storm record, expressed in millions.
# PROPDMGEXP only gives the unit of the damage figure, so the figure itself has
# to be multiplied by that unit; adding a flat 1 or 1000 per record would count
# events rather than damage. Only "M"/"m" (millions) and "B" (1000 millions)
# are converted; every other code, "K" included, contributes 0.
# NOTE(review): assumes the numeric damage figure lives in the PROPDMG column
# that accompanies PROPDMGEXP in the NOAA storm data -- confirm with names(df).
# NOTE: the figures printed further down in this report were rendered before
# this correction and must be regenerated by re-running the report.
millionPerUnit <- c(B = 1000, M = 1, m = 1)
unitFactor <- unname(millionPerUnit[as.character(df$PROPDMGEXP)])
unitFactor[is.na(unitFactor)] <- 0 # codes outside the table are ignored
df$million <- df$PROPDMG * unitFactor
# Ignored codes and missing damage figures count as zero damage, so no record
# is dropped from the later aggregation.
df$million[unitFactor == 0 | is.na(df$million)] <- 0
Next, we calculate the total economic consequences for each category (i.e. cold, heat, water, wind, lightning, and earth), and sort the result in descending order of that total.
# Total of the 'million' column per category, largest total first.
economicData <- aggregate(million ~ code, data = df, FUN = sum)
economicData <- economicData[order(economicData[["million"]], decreasing = TRUE), ]
Plot the economic consequences caused by the general kinds of events.
# One bar per category, in the order of economicData (largest total first).
with(economicData, barplot(million, names.arg = code))
title(
  main = "The Economic Consequences Caused by Severe Weather in the US",
  xlab = "General Types of Events",
  ylab = "Sum of Economic Consequences (in million)"
)
# The sentence names whichever category tops economicData, so the keyword list
# it quotes is looked up in the keyword table for that same category instead of
# always being the wind list.
topEconomicCode <- economicData[1, "code"]
topEconomicEvents <- paste(tolower(events[events$code == topEconomicCode, "EVTYPE"]), collapse = ", ")
paste("The event that causes greatest economic consequences is ", topEconomicCode,
  ", having caused ", economicData[1, "million"],
  " million of economic consequences. This kind of events includes ",
  topEconomicEvents, ".")
## [1] "The event that causes greatest economic consequences is wind , having caused 37379 million of economic consequences. This kind of events includes burst, cloud, depression, floyd, funnel, gust, hail, hurricane, landspout, storm, southeast, thunderstorm, thundertsorm, thundestorm, tornado, torndao, tstm, turbulence, typhoon, wall, waterspout, water spout, wayterspout, wind, wnd ."