U.S. National Oceanic and Atmospheric Administration’s (NOAA) prepared a storm database. That database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
** This project is aimed at finding out the worst weather event type in terms of population health and economic consequences in US from 1950 to 2011.**
The data analysis leading to the above result is elucidated below through various commands and functions.
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("dplyr")) install.packages("dplyr")
if (!require("knitr")) install.packages("knitr")
if (!require("R.utils")) install.packages("R.utils")
library(knitr)
setwd("D:/DataScienceJohnHopkins/Reproducible Research/CourseProject2")
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(warning=FALSE)
data_filename <- "repdata_data_StormData.csv.bz2"
if (!file.exists(data_filename)){
fileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(fileURL, data_filename)
library(R.utils)
bunzip2(data_filename, "repdata_data_StormData.csv", remove = FALSE, skip = TRUE)
}
if (!exists('storm_data')) {
storm_data <- read.csv("repdata_data_StormData.csv", stringsAsFactors = FALSE)
}
str(storm_data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
To reduce the computing time of processing this huge dataset, subset of variables with value > 0 were picked for the study. Those are the event types (variable “EVTYPE”), the figures related to population health impacts (variable “FATALITIES” & “INJURIES”) and economic consequences (variable“PropDMG”, “PROPDMGEXP”, “CROPDMG” & “CROPDMGEXP”):
subset1 <- subset(storm_data, EVTYPE != "?" & (FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0),
select = c("EVTYPE",
"FATALITIES",
"INJURIES",
"PROPDMG",
"PROPDMGEXP",
"CROPDMG",
"CROPDMGEXP"))
str(subset1)
## 'data.frame': 254632 obs. of 7 variables:
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
head(unique(subset1$EVTYPE), 20)
## [1] "TORNADO" "TSTM WIND"
## [3] "HAIL" "ICE STORM/FLASH FLOOD"
## [5] "WINTER STORM" "HURRICANE OPAL/HIGH WINDS"
## [7] "THUNDERSTORM WINDS" "HURRICANE ERIN"
## [9] "HURRICANE OPAL" "HEAVY RAIN"
## [11] "LIGHTNING" "THUNDERSTORM WIND"
## [13] "DENSE FOG" "RIP CURRENT"
## [15] "THUNDERSTORM WINS" "FLASH FLOODING"
## [17] "FLASH FLOOD" "TORNADO F0"
## [19] "THUNDERSTORM WINDS LIGHTNING" "THUNDERSTORM WINDS/HAIL"
length(unique(subset1$EVTYPE))
## [1] 487
The result of checking the construction yielded 985 different Event Types. That calls for the need to normalize and combine relevant Event Types.
# Convert all Event Types to upper cases.
subset1$EVTYPE <- toupper(subset1$EVTYPE)
# Combine similar Event Types
subset1$EVTYPE <- gsub('.*HEAT.*', 'HEAT', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*WARM.*', 'HEAT', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*HIGH.*TEMP.*', 'EXTREME HEAT', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*RECORD HIGH TEMPERATURES.*', 'EXTREME HEAT', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*STORM.*', 'STORM', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*FLOOD.*', 'FLOOD', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*WIND.*', 'WIND', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*TORNADO.*', 'TORNADO', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*CLOUD.*', 'CLOUD', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*MICROBURST.*', 'MICROBURST', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*BLIZZARD.*', 'BLIZZARD', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*COLD.*', 'COLD', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*SNOW.*', 'COLD', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*FREEZ.*', 'COLD', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*LOW TEMPERATURE RECORD.*', 'COLD', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*ICE.*', 'COLD', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*FROST.*', 'COLD', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*LO.*TEMP.*', 'COLD', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*HAIL.*', 'HAIL', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*DRY.*', 'DRY', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*DUST.*', 'DUST', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*RAIN.*', 'RAIN', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*LIGHTNING.*', 'LIGHTNING', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*SUMMARY.*', 'SUMMARY', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*WET.*', 'WET', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*FIRE.*', 'FIRE', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*FOG.*', 'FOG', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*VOLCANIC.*', 'VOLCANIC', subset1$EVTYPE)
subset1$EVTYPE <- gsub('.*SURF.*', 'SURF', subset1$EVTYPE)
# Check the Event Types after tidying
length(unique(subset1$EVTYPE))
## [1] 96
table(subset1$PROPDMGEXP)
##
## - + 0 2 3 4 5 6 7
## 11585 1 5 210 1 1 4 18 3 3
## B h H K m M
## 40 1 6 231427 7 11320
table(subset1$CROPDMGEXP)
##
## ? 0 B k K m M
## 152663 6 17 7 21 99932 1 1985
# Convert all strings to upper case:
subset1$PROPDMGEXP <- toupper(subset1$PROPDMGEXP)
subset1$CROPDMGEXP <- toupper(subset1$CROPDMGEXP)
# Update the multiplier factor of PROPDMG
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='')|(subset1$PROPDMGEXP=='-')|(subset1$PROPDMGEXP=='?')|(subset1$PROPDMGEXP=='+')|(subset1$PROPDMGEXP=='0')] <- 0
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='1')] <- 1
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='2')] <- 2
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='3')] <- 3
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='4')] <- 4
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='5')] <- 5
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='6')] <- 6
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='7')] <- 7
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='8')] <- 8
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='H')] <- 2
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='K')] <- 3
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='M')] <- 6
subset1$PROPDMGEXP1[(subset1$PROPDMGEXP=='B')] <- 9
# Update the multiplier factor of CROPDMG
subset1$CROPDMGEXP1[(subset1$CROPDMGEXP=='')|(subset1$CROPDMGEXP=='-')|(subset1$CROPDMGEXP=='?')|(subset1$CROPDMGEXP=='+')|(subset1$CROPDMGEXP=='0')] <- 0
subset1$CROPDMGEXP1[(subset1$CROPDMGEXP=='K')] <- 3
subset1$CROPDMGEXP1[(subset1$CROPDMGEXP=='M')] <- 6
subset1$CROPDMGEXP1[(subset1$CROPDMGEXP=='B')] <- 9
# Calculate cost of damages
subset1$PROPDMGCOST <- subset1$PROPDMG*10^as.numeric(subset1$PROPDMGEXP1)
subset1$CROPDMGCOST <- subset1$CROPDMG*10^as.numeric(subset1$CROPDMGEXP1)
subset2 <- aggregate( x = list(Health_Impact = subset1$FATALITIES + subset1$INJURIES),
by=list(EVENT_TYPE=subset1$EVTYPE),
FUN=sum, na.rm=TRUE)
subset2 <- subset2[order(subset2$Health_Impact, decreasing=T),]
subset3 <- aggregate( x = list(Damage_Cost = subset1$PROPDMGCOST + subset1$CROPDMGCOST),
by=list(EVENT_TYPE=subset1$EVTYPE),
FUN=sum, na.rm=TRUE)
subset3 <- subset3[order(subset3$Damage_Cost, decreasing=T),]
DamageCostChart <- ggplot(head(subset3,10), aes(x=reorder(EVENT_TYPE, -Damage_Cost), y=Damage_Cost, fill = EVENT_TYPE)) +
geom_bar(stat="identity") +
xlab("Event Type") + ylab("Total Damage Cost ($)") +
ggtitle("Pareto Chart of Top 10 Weather Events in US - Economic Impacts")
print(DamageCostChart)
head(subset3,10)
## EVENT_TYPE Damage_Cost
## 19 FLOOD 180591769935
## 76 STORM 79670614754
## 44 HURRICANE/TYPHOON 71913712800
## 78 TORNADO 57418279447
## 23 HAIL 18782880986
## 15 DROUGHT 15018672000
## 36 HURRICANE 14610229010
## 92 WIND 13857954768
## 18 FIRE 8899910130
## 11 COLD 4726752550
HealthImpactChart <- ggplot(head(subset2,10), aes(x=reorder(EVENT_TYPE, -Health_Impact), y=Health_Impact, fill = EVENT_TYPE)) +
geom_bar(stat="identity") +
xlab("Event Type") + ylab("Total Fatalities & Injures Qty.") +
ggtitle("Pareto Chart of Top 10 Weather Events in US - Health Impacts")
print(HealthImpactChart)
head(subset2,10)
## EVENT_TYPE Health_Impact
## 78 TORNADO 97043
## 24 HEAT 12421
## 92 WIND 10284
## 19 FLOOD 10127
## 76 STORM 7325
## 54 LIGHTNING 6048
## 11 COLD 2011
## 18 FIRE 1698
## 23 HAIL 1386
## 44 HURRICANE/TYPHOON 1339
The outcome of NOAA database study between 1950 and 2011 showing the worst weather events type are outlined below: