This assignment uses the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database that can be found at the following link.
The database includes various measures, including estimates of fatalities, injuries and damage to property.
Additional information on the database can be found in the National Weather Service Storm Data Documentation and the National Climatic Data Center Storm Events FAQ.
The analysis consists of downloading and checking the data, transforming where necessary.
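The two questions this analysis set out to answer are: (1) which types of storm events cause the most deaths and injuries, and (2) which types of storm events cause the greatest combined property and crop damage costs?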
A brief summary of the top-level findings is given in the table below:
# Fetch the compressed storm database and unpack it into ./data.
# The download and decompression only run when the extracted CSV is missing,
# so re-running the analysis does not hit the network again.
data_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
archive_path <- "./data/StormData.csv.bz2"
csv_path <- "./data/StormData.csv"
if (!file.exists("data")) {
  dir.create("data")
}
library(R.utils)
if (!file.exists(csv_path)) {
  # mode = "wb" keeps the bz2 archive byte-exact on platforms that would
  # otherwise translate line endings; the default download method is used so
  # an external curl binary is not required.
  download.file(data_url, destfile = archive_path, mode = "wb")
  # remove = TRUE deletes the archive once the CSV has been written.
  bunzip2(archive_path, csv_path, remove = TRUE, skip = TRUE)
}
## [1] "./data/StormData.csv"
## attr(,"temporary")
## [1] FALSE
# Read the extracted CSV. Empty strings, "?" and "-" are treated as missing.
# stringsAsFactors = TRUE is set explicitly so that text columns are factors
# regardless of the R version's default; the levels()/table() inspection
# below relies on that.
stormdata <- read.csv(
  "./data/StormData.csv",
  na.strings = c("", "?", "-"),
  stringsAsFactors = TRUE
)
# Keep only the columns used in the analysis, selected by name rather than by
# position so the subset does not silently change if the column order does.
analysis_cols <- c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
stormdata_sub <- stormdata[, analysis_cols]
dim(stormdata_sub)
## [1] 902297 7
str(stormdata_sub)
## 'data.frame': 902297 obs. of 7 variables:
## $ EVTYPE : Factor w/ 984 levels " HIGH SURF ADVISORY",..: 833 833 833 833 833 833 833 833 833 833 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 16 levels "+","0","1","2",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 7 levels "0","2","B","k",..: NA NA NA NA NA NA NA NA NA NA ...
# List the distinct exponent codes recorded for property damage.
levels(stormdata_sub$PROPDMGEXP)
## [1] "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K" "m" "M"
# List the distinct exponent codes recorded for crop damage.
levels(stormdata_sub$CROPDMGEXP)
## [1] "0" "2" "B" "k" "K" "m" "M"
# Count how often each property-damage exponent code occurs.
table(stormdata_sub$PROPDMGEXP)
##
## + 0 1 2 3 4 5 6 7 8
## 5 216 25 13 4 4 28 4 5 1
## B h H K m M
## 40 1 6 424665 7 11330
# Count how often each crop-damage exponent code occurs.
table(stormdata_sub$CROPDMGEXP)
##
## 0 2 B k K m M
## 19 1 9 21 281832 1 1994
From the tables, it looks as if the errors make up a relatively small portion of the dataset.
I will, however, assume that the lowercase “b”, “m”, “h” and “k” codes were meant to be capitalised. I will change these for the sake of convenience, but I will leave the other irregular codes, such as the integers, as they are. I do not believe they will substantively affect the analysis.
# Translate damage-exponent codes into numeric multipliers.
#
# Letter codes are matched case-insensitively: H = 100, K = 1,000,
# M = 1,000,000, B = 1,000,000,000. Any other code is passed through
# as.numeric(), so digit codes keep their face value and non-numeric codes
# such as "+" become NA (with a coercion warning). Missing codes stay NA.
#
# exp_code: factor or character vector of exponent codes.
# Returns a numeric vector of the same length.
damage_multiplier <- function(exp_code) {
  exp_code <- toupper(as.character(exp_code))
  letter_value <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  is_letter <- exp_code %in% names(letter_value)
  multiplier <- rep(NA_real_, length(exp_code))
  multiplier[is_letter] <- unname(letter_value[exp_code[is_letter]])
  multiplier[!is_letter] <- as.numeric(exp_code[!is_letter])
  multiplier
}

# Both exponent columns go through the same helper so that lowercase and
# uppercase codes are treated identically for property and crop damage.
stormdata_sub$PROPDMGEXP <- damage_multiplier(stormdata_sub$PROPDMGEXP)
## Warning: NAs introduced by coercion
stormdata_sub$CROPDMGEXP <- damage_multiplier(stormdata_sub$CROPDMGEXP)

# Cost per record = recorded damage figure times its multiplier.
stormdata_sub$PropCost <- stormdata_sub$PROPDMGEXP * stormdata_sub$PROPDMG
stormdata_sub$CropCost <- stormdata_sub$CROPDMGEXP * stormdata_sub$CROPDMG
# na.rm = TRUE makes a missing property or crop cost count as zero in the total.
stormdata_sub$Cost <- rowSums(
  stormdata_sub[, c("PropCost", "CropCost")],
  na.rm = TRUE
)
# Combined health impact per record.
stormdata_sub$FAT_INJ <- stormdata_sub$FATALITIES + stormdata_sub$INJURIES
library(dplyr)
# Group the records by event type; by_type is reused for the cost summary.
by_type <- group_by(stormdata_sub, EVTYPE)
# Total deaths plus injuries per event type. The result is stored under a
# name that does not mask base::sum, and na.rm = TRUE matches the handling
# used for the cost totals.
health_totals <- summarise(
  by_type,
  DEATHS_INJURIES = sum(FAT_INJ, na.rm = TRUE)
)
# Sort from most to least harmful and drop event types with no casualties.
sum_ordered <- health_totals[order(-health_totals$DEATHS_INJURIES), ]
sum_ordered_sub <- sum_ordered[sum_ordered$DEATHS_INJURIES > 0, ]
head(sum_ordered_sub, 50)
## Source: local data frame [50 x 2]
##
## EVTYPE DEATHS_INJURIES
## (fctr) (dbl)
## 1 TORNADO 96979
## 2 EXCESSIVE HEAT 8428
## 3 TSTM WIND 7461
## 4 FLOOD 7259
## 5 LIGHTNING 6046
## 6 HEAT 3037
## 7 FLASH FLOOD 2755
## 8 ICE STORM 2064
## 9 THUNDERSTORM WIND 1621
## 10 WINTER STORM 1527
## .. ... ...
library(ggplot2)
# Take the ten most harmful event types. head() returns fewer rows when fewer
# than ten are available, whereas [1:10, ] would pad the result with NA rows.
sub10 <- head(sum_ordered_sub, 10)
# Fix the factor levels in ranked order so the bars are drawn in that order
# instead of alphabetically.
event_names <- as.character(sub10$EVTYPE)
STORM_TYPE <- factor(event_names, levels = event_names)
sub102 <- cbind(STORM_TYPE, sub10)
g1 <- ggplot(sub102, aes(STORM_TYPE))
# weight = DEATHS_INJURIES makes each bar's height the casualty total.
g1 +
  geom_bar(aes(weight = DEATHS_INJURIES)) +
  xlab("STORM TYPE") +
  ylab("DEATHS & INJURIES") +
  ggtitle("Storm Types responsible for 10 highest rates of deaths and injuries in the USA")
# Total property plus crop cost per event type, expressed in millions.
sum2 <- by_type %>%
  summarise(TOTALCOST = sum(Cost, na.rm = TRUE) / 1000000)
# Rank event types from most to least costly.
descending_cost <- order(-sum2$TOTALCOST)
sum2_ordered <- sum2[descending_cost, ]
# Keep only event types that caused some recorded cost.
has_cost <- sum2_ordered$TOTALCOST > 0
sum2_ordered_sub <- sum2_ordered[has_cost, ]
head(sum2_ordered_sub, n = 50)
## Source: local data frame [50 x 2]
##
## EVTYPE TOTALCOST
## (fctr) (dbl)
## 1 FLOOD 150319.678
## 2 HURRICANE/TYPHOON 71913.713
## 3 TORNADO 57352.114
## 4 STORM SURGE 43323.541
## 5 HAIL 18758.222
## 6 FLASH FLOOD 17562.129
## 7 DROUGHT 15018.672
## 8 HURRICANE 14610.229
## 9 RIVER FLOOD 10148.405
## 10 ICE STORM 8967.041
## .. ... ...
library(ggplot2)
# Take the ten costliest event types. head() returns fewer rows when fewer
# than ten are available, whereas [1:10, ] would pad the result with NA rows.
cost_sub10 <- head(sum2_ordered_sub, 10)
# Fix the factor levels in ranked order so the bars are drawn in that order
# instead of alphabetically.
event_names <- as.character(cost_sub10$EVTYPE)
STORM_TYPE <- factor(event_names, levels = event_names)
cost_sub102 <- cbind(STORM_TYPE, cost_sub10)
g1 <- ggplot(cost_sub102, aes(STORM_TYPE))
# weight = TOTALCOST makes each bar's height the cost total.
g1 +
  geom_bar(aes(weight = TOTALCOST)) +
  xlab("STORM TYPE") +
  ylab("TOTAL COST IN USD (millions)") +
  ggtitle("Storm Types responsible for 10 highest costs in terms of Property and Crop Damages")