This document aims to assist in the decision making for the preparation for severe weather events and the prioritization of resources for different types of events. In a nutshell we are looking to answer these questions:
Across the United States, which types of events are most harmful with respect to population health? Across the United States, which types of events have the greatest economic consequences? This analysis is based on the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
Load and read the database:
setwd('D:\\Coursera\\Data_Science_Specialization\\05 - Reproducible Research\\Course Project 2')
zname = "StormData.csv.bz2"
url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# download data
if(!file.exists(zname))
{
download.file(url, zname)
}
# read the data
if(!exists('dd'))
{
dd = read.csv(file=zname)
}
# show the data structure
str(dd)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436781 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Extracting the count of FATALITIES and INJURIES for each type of event, and find the total count depending on the type of event. Extract from the resulting table the data for some most harmful event type and figure out the most harmful event types.
# the most harmful data is the count of FATALITIES and INJURIES
# extract the data for FATALITIES and INJURIES into new dataframe
harmful = (dd[, c("EVTYPE", "FATALITIES", "INJURIES")])
# show the summary of the harmful data
summary(harmful)
## EVTYPE FATALITIES INJURIES
## HAIL :288661 Min. : 0.0000 Min. : 0.0000
## TSTM WIND :219940 1st Qu.: 0.0000 1st Qu.: 0.0000
## THUNDERSTORM WIND: 82563 Median : 0.0000 Median : 0.0000
## TORNADO : 60652 Mean : 0.0168 Mean : 0.1557
## FLASH FLOOD : 54277 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## FLOOD : 25326 Max. :583.0000 Max. :1700.0000
## (Other) :170878
# extract the data with nonzero count of the FATALITIES and INJURIES
harmful = harmful[harmful$FATALITIES > 0 | harmful$INJURIES > 0, ]
# find the sum of FATALITIES and INJURIES grouped by the event type,
# store the sum of FATALITIES and INJURIES in the new column HARMFUL_CNT, and arrange the resulting dataset
# in descending order by the new HARMFUL_CNT variable
library(dplyr)
fatalityCnt <- dd %>% group_by(EVTYPE) %>%
summarise_at(vars(FATALITIES, INJURIES), funs(sum(., na.rm=TRUE))) %>%
mutate(HARMFUL_CNT = FATALITIES + INJURIES, ) %>%
arrange(desc(HARMFUL_CNT))
# set the count of the most harmful events to investigate
topHarmfulEventCnt = 5
# extract the top topHarmfulEventCnt events from the resultin fatalities dataframe
topHarmful = head(fatalityCnt, topHarmfulEventCnt)
# Now show the figure of the most harmful event types
par(mgp=c(5,1,0))
par(mar=c(15, 10, 5, 5))
barplot(topHarmful$HARMFUL_CNT, width=2, space=2, topHarmful$EVTYPE, cex.axis=1, cex.names=1,
las=2,main=paste0("Top ", as.character(topHarmfulEventCnt), " Event type by Total of FATALITIES and INJURIES"),
ylab= 'Total of FATALITIES and INJURIES')
For the economical consequences the relevant data columns are EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG. Create the table for these and show its summary:
# for the economical consequences the relevant data columns are EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG
# create the datatable for economical consequences
ec <- dd[,c("EVTYPE","PROPDMG","PROPDMGEXP","CROPDMG","CROPDMGEXP")]
summary(ec)
## EVTYPE PROPDMG PROPDMGEXP
## HAIL :288661 Min. : 0.00 :465934
## TSTM WIND :219940 1st Qu.: 0.00 K :424665
## THUNDERSTORM WIND: 82563 Median : 0.00 M : 11330
## TORNADO : 60652 Mean : 12.06 0 : 216
## FLASH FLOOD : 54277 3rd Qu.: 0.50 B : 40
## FLOOD : 25326 Max. :5000.00 5 : 28
## (Other) :170878 (Other): 84
## CROPDMG CROPDMGEXP
## Min. : 0.000 :618413
## 1st Qu.: 0.000 K :281832
## Median : 0.000 M : 1994
## Mean : 1.527 k : 21
## 3rd Qu.: 0.000 0 : 19
## Max. :990.000 B : 9
## (Other): 9
##Get the table for Major causes of Economical Damages grouped by various event types.
##```{r, cache=FALSE}
# function to transform the symbols like 'K', 'B', etc... to exponent
mult=function(x)
{
m=0;
if(x=="") {m=1}
else if(x=="k"| x=="K") {m=1000}
else if(x=="M" | x=="m") {m=1000000}
else if(x=="B") {m=1000000000}
else if(x=="0"){m=1}
else if(x=="1"){m=10}
else if(x=="2"){m=100}
else if(x=="3"){m=1000}
else if(x=="4"){m=10000}
else if(x=="5"){m=100000}
else if(x=="6"){m=1000000}
else if(x=="7"){m=10000000}
else if(x=="8"){m=100000000}
m
}
# transform the data in the ec dataframe to get the damage costs
ec$PROPDMG_MULT <- sapply(ec$PROPDMGEXP, mult)
ec$CROPDMG_MULT <- sapply(ec$CROPDMGEXP, mult)
# get the total economical damage costs arranged by the total economical damage costs:
ecTot <- ec %>% mutate(COST_TOTAL = PROPDMG_MULT * PROPDMG + CROPDMG_MULT * CROPDMG) %>%
group_by(EVTYPE) %>%
summarise_at(vars(COST_TOTAL), funs(sum(., na.rm=TRUE))) %>%
arrange(desc(COST_TOTAL))
# show summary of the total economical damage costs
summary(ecTot)
## EVTYPE COST_TOTAL
## HIGH SURF ADVISORY: 1 Min. :0.000e+00
## COASTAL FLOOD : 1 1st Qu.:0.000e+00
## FLASH FLOOD : 1 Median :0.000e+00
## LIGHTNING : 1 Mean :4.846e+08
## TSTM WIND : 1 3rd Qu.:8.500e+04
## TSTM WIND (G45) : 1 Max. :1.503e+11
## (Other) :979
# Now show the figure of the most economical damaging event types
cntToShow = 10
topEcTot = head(ecTot, cntToShow)
par(mgp=c(5,1,0))
par(mar=c(15, 10, 5, 5))
barplot(topEcTot$COST_TOTAL, width=2, space=2, topEcTot$EVTYPE, cex.axis=1, cex.names=1,
las=2,main=paste0("Top ", as.character(cntToShow), " Event type by total economical damage costs"),
ylab= 'Total economical damage costs')