Title: Analysis of fatalities,injuries and cost from US storm events from 1950-2011

Synopsis: The findings from this analaysis is summarized as follows:

The analysis set out to answer two key questions which are to determine, firstly, the meteorological event types that mostly affect the health of US human population, and secondly, those events which have the greatest economic cosequences for United States. To explored the first question, the magnitude of injuries and fatalities were analysed in relation to the meteorological event type, while for the second question, the amount of cost incurred or lost as a result of property or crop damage were assessed in relation to the event type. The worst fatalities and injuries were both due to tornadoes, with a total of 5636 fatalities and 91407 injuries over the past 61 years. The largest amount of economic loss in terms of property damage was due to ligthning/high wind or storms accounting for almost 4 million dollars loss over the study period. *The largest loss due to crop damage was a result of hails, which accounted for about almost 600,000 dollars loss over the years

DATA PROCESSING

Data set was downloaded from the web database directly and stored in local computer directory. Data was unzipped before loading

#Downloading data into local computer & working directory from the database website
setwd("D:/Courses/Data_Science_Coursera_Course")
if (!dir.exists("./Storm_datasets")) {dir.create("./Storm_datasets")}
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if(!file.exists("./Storm_datasets/datasets.zip")) {
download.file(url, destfile = "./Storm_datasets/datasets.zip") 
unzip("./Storm_datasets/datasets.zip", exdir = "./Storm_datasets")}

Dataset was read into R using the read.csv function

data <- read.csv("./Storm_datasets/datasets.zip", stringsAsFactors = FALSE)

Relevant R packages were opened

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

Data transformations were done firstly to clean up the EVTYPE variable using regular expressions on the the string along with the “sub” function as shown in the code below.

event2 <- toupper(data$EVTYPE)
event2 <- sub("(.*)AVALANCHE|AVALANCE(.*)[0-9]?", "AVALANCHE", event2)
event2 <- sub("^ASTRONOMICAL(.*)[0-9]?", "ASTRONOMICAL TIDE", event2)
event2 <- sub("(.*)BLIZZARD(.*)[0-9]?", "BLIZZARD", event2)
event2 <- sub("(.*)FLOOD(.*)[0-9]?", "FLOOD", event2)
event2 <- sub("(.*)WIND(.*)CHILL(.*)[0-9]?", "COLD/WIND CHILL", event2)
event2 <- sub("(.*)COLD(.*)[0-9]?", "COLD/WIND CHILL", event2)
event2 <- sub("^LOW TEMP(.*)[0-9]?", "COLD/WIND CHILL", event2)
event2 <- sub("HYPOTHERMIA(.*)[0-9]?", "COLD/WIND CHILL", event2)
event2 <- sub("FREEZING DRIZZLE|FREEZING SPRAY(.*)[0-9]?", "COLD/WIND CHILL", event2)
event2 <- sub("ICE ON ROAD(.*)[0-9]?", "ICY ROADS", event2)
event2 <- sub("GLAZE(.*)[0-9]?", "ICY ROADS", event2)
event2 <- sub("ICE(.*)[0-9]?", "ICY ROADS", event2)
event2 <- sub("(.*)ICY(.*)[0-9]?", "ICY ROADS", event2)
event2 <- sub("MUDSLIDE|LANDSLIDE|LANDSLIDES|MUDSLIDES", "LANDSLIDE", event2)
event2 <- sub("MUD(.*)SLIDE(S)?|LANDSLUMP|ROCK(.*)SLIDE(S)?", "LANDSLIDE", event2)
event2 <- sub("MARINE MISHAP", "MARINE ACCIDENT", event2)
event2 <- sub("(.*)DEBRIS(.*)[0-9]?", "DEBRIS FLOW", event2)
event2 <- sub("(.*)FOG(.*)[0-9]?", "FOG", event2)
event2 <- sub("(.*)SMOKE(.*)[0-9]?", "SMOKE", event2)
event2 <- sub("(.*)DROUGHT(.*)[0-9]?", "DROUGHT", event2)
event2 <- sub("(.*)DUST(.*)[0-9]?", "DUST", event2)
event2 <- sub("(.*)FROST(.*)([0-9])?", "FROST/FREEZE", event2)
event2 <- sub("(.*)FREEZE(.*)([0-9])?", "FROST/FREEZE", event2)
event2 <- sub("(.*)CLOUD(.*)([0-9])?", "FUNNEL CLOUD", event2)
event2 <- sub("(.*)HAIL(.*)([0-9])?", "HAIL", event2)
event2 <- sub("(.*)HEAT(.*)([0-9])?", "HEAT", event2)
event2 <- sub("UNSEASONABLY WARM(.*)", "HEAT", event2)
event2 <- sub("HYPERTHERMIA(.*)([0-9])?", "HEAT", event2)
event2 <- sub("(.*)RAIN(.*)([0-9])?", "HEAVY RAIN", event2)
event2 <- sub("(.*)SNOW(.*)([0-9])?", "SNOW", event2)
event2 <- sub("(.*)SURGE|TIDE(.*)([0-9])?", "STORM SURGE/TIDE", event2)
event2 <- sub("(.*)WIND[^?CHILL](.*)([0-9])?", "LIGHTNING/HIGH WIND/STORM", event2) #Omits expressions with CHILL in WIND CHILL.
event2 <- sub("(.*)STORM(.*)([0-9])?", "LIGHTNING/HIGH WIND/STORM", event2)
event2 <- sub("(.*)LIGHTNING(.*)([0-9])?", "LIGHTNING/HIGH WIND/STORM", event2)
event2 <- sub("(.*)WIND$", "LIGHTNING/HIGH WIND/STORM", event2) 
event2 <- sub("(.*)SURF(.*)([0-9])?", "HIGH SURF", event2)
event2 <- sub("(.*)HEAVY(.*)SEA(.*)([0-9])?", "HIGH SURF", event2)
event2 <- sub("(.*)HIGH(.*)SEA(S)?(.*)([0-9])?", "HIGH SURF", event2)
event2 <- sub("(.*)HIGH(.*)SWELLS(.*)([0-9])?", "HIGH SURF", event2)
event2 <- sub("(.*)HIGH(.*)WATER(.*)([0-9])?", "HIGH SURF", event2)
event2 <- sub("(.*)HIGH(.*)WAVES(.*)([0-9])?", "HIGH SURF", event2)
event2 <- sub("RAPIDLY RISING WATER([0-9])?", "HIGH SURF", event2)
event2 <- sub("ROUGH SEAS", "HIGH SURF", event2)
event2 <- sub("(.*)HURRICANE(.*)([0-9])?", "HURRICANE", event2)
event2 <- sub("(.*)TYPHOON(.*)([0-9])?", "HURRICANE", event2)
event2 <- sub("(.*)RIP(.*)([0-9])?", "RIP CURRENT", event2)
event2 <- sub("(.*)CURRENT(.*)", "RIP CURRENT", event2)
event2 <- sub("RIP CURRENT CURRENTS|RIP CURRENTS", "RIP CURRENT", event2)
event2 <- sub("(.*)SEICHE(.*)([0-9])?", "SEICHE", event2)
event2 <- sub("(.*)WATERSPOUT(.*)([0-9])", "TORNADO/WATERSPOUT", event2)
event2 <- sub("(.*)TORNADO(.*)([0-9])?", "TORNADO/WATERSPOUT", event2)
event2 <- sub("(.*)TSUNAMI(.*)([0-9])?", "TSUNAMI", event2)
event2 <- sub("(.*)^VOLC(.*)([0-9])?", "VOLCANIC ASH", event2)
event2 <- sub("(.*)FIRE(S)?(.*)([0-9])?", "WILDFIRES", event2)
event2 <- sub("(.*)WINTER(.*)WEATHER(.*)([0-9])?", "WINTER WEATHER", event2)
event2 <- sub("(.*)SUMMARY|RECORD|NONE(.*)", "NA", event2)
event2 <- sub("(.*)SUMMARY(.*)", "NA", event2)
event2 <- sub("(.*)WET(.*)([0-9])?", "EXCESSIVE WETNESS", event2)
event2 <- sub("(.*)PRECIP(.*)([0-9])?", "HEAVY PRECIPITATION", event2)
event2 <- sub("(.*)HEAVY MIX(.*)([0-9])?", "HEAVY PRECIPITATION", event2)
data$EVTYPE2 <- event2

Further processing was done to create a data set with only the relevant variables and to convert variables to more useful formats.

data2 <- data %>% select(STATE__, STATE, EVTYPE2, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP,  CROPDMG,CROPDMGEXP) %>% mutate(EVTYPE2 = as.factor(EVTYPE2), STATE = as.factor(STATE), FATALITIES =   as.numeric(FATALITIES), INJURIES = as.numeric(INJURIES), PROPDMG = as.numeric(PROPDMG), PROPDMGEXP = as.numeric(PROPDMGEXP), CROPDMG = as.numeric(CROPDMG), CROPDMGEXP = as.numeric(CROPDMGEXP))
## Warning: package 'bindrcpp' was built under R version 3.4.3
## Warning in evalq(as.numeric(PROPDMGEXP), <environment>): NAs introduced by
## coercion
## Warning in evalq(as.numeric(CROPDMGEXP), <environment>): NAs introduced by
## coercion

The other processing involved the subsetting of the data by the type of events to summarise fatality, injury, property and crop damage, by summing them up according to the type of event.

fatalitybyevent <- data2 %>% group_by(EVTYPE2) %>% summarise(sumfatal = sum(FATALITIES,na.rm=TRUE))
injurybyevent <- data2 %>% group_by(EVTYPE2) %>% summarise(suminjury = sum(INJURIES,na.rm=TRUE))
propdmgbyevent <- data2 %>% group_by(EVTYPE2) %>% summarise(sumpropdmg = sum(PROPDMG))
cropdmgbyevent <- data2 %>% group_by(EVTYPE2) %>% summarise(sumcrpdmg = sum(CROPDMG))

RESULTS

fatality <- fatalitybyevent %>% filter(sumfatal>mean(sumfatal))
fatality <- fatality %>% arrange(desc(sumfatal))
fatality5 <- fatality[1:5, ]
print(fatality)
## # A tibble: 14 x 2
##                      EVTYPE2 sumfatal
##                       <fctr>    <dbl>
##  1        TORNADO/WATERSPOUT     5636
##  2                      HEAT     3173
##  3 LIGHTNING/HIGH WIND/STORM     2791
##  4                     FLOOD     1525
##  5               RIP CURRENT      572
##  6                 AVALANCHE      225
##  7                 HIGH SURF      185
##  8                      SNOW      144
##  9                 HURRICANE      133
## 10                 ICY ROADS      114
## 11                HEAVY RAIN      113
## 12                  BLIZZARD      101
## 13                 WILDFIRES       90
## 14                       FOG       80

The table above shows the first 14 fatalities(ordered in terms of magnitude) and the event type causing them

propdmg <- propdmgbyevent %>% filter(sumpropdmg>mean(sumpropdmg))
propdmg <- propdmg %>% arrange(desc(sumpropdmg))
propdmg5 <- propdmg[1:5, ]
print(propdmg)
## # A tibble: 8 x 2
##                     EVTYPE2 sumpropdmg
##                      <fctr>      <dbl>
## 1 LIGHTNING/HIGH WIND/STORM 3950744.48
## 2        TORNADO/WATERSPOUT 3215747.86
## 3                     FLOOD 2436131.51
## 4                      HAIL  699320.38
## 5                      SNOW  149178.81
## 6                 WILDFIRES  125218.29
## 7                 ICY ROADS   76322.82
## 8                HEAVY RAIN   57938.21

The table above shows the first 8 event types causing the largest property damage(ordered in terms of magnitude)

cropdmg <- cropdmgbyevent %>% filter(sumcrpdmg>mean(sumcrpdmg))
cropdmg <- cropdmg %>% arrange(desc(sumcrpdmg))
cropdmg5 <- cropdmg[1:5, ]
print(cropdmg)
## # A tibble: 9 x 2
##                     EVTYPE2 sumcrpdmg
##                      <fctr>     <dbl>
## 1                      HAIL 585956.66
## 2                     FLOOD 364506.73
## 3 LIGHTNING/HIGH WIND/STORM 238461.45
## 4        TORNADO/WATERSPOUT 100026.72
## 5                   DROUGHT  33954.40
## 6                HEAVY RAIN  12457.30
## 7                 HURRICANE  11627.79
## 8              FROST/FREEZE   9640.41
## 9                 WILDFIRES   9565.74

The table above shows the first 9 event types causing the largest crop damages (ordered in terms of magnitude)

injury <- injurybyevent %>% filter(suminjury>mean(suminjury))
injury <- injury %>% arrange(desc(suminjury))
injury5 <- injury[1:5, ]
print(injury)
## # A tibble: 11 x 2
##                      EVTYPE2 suminjury
##                       <fctr>     <dbl>
##  1        TORNADO/WATERSPOUT     91407
##  2 LIGHTNING/HIGH WIND/STORM     18691
##  3                      HEAT      9226
##  4                     FLOOD      8604
##  5                 ICY ROADS      2411
##  6                 WILDFIRES      1608
##  7                      HAIL      1467
##  8                 HURRICANE      1333
##  9                      SNOW      1151
## 10                       FOG      1076
## 11                  BLIZZARD       805

The table above shows the first 11 event types causing the greatest number of injuries (ordered in terms of magnitude)

fatality5$EVTYPE2 <- tolower(fatality5$EVTYPE2)
injury5$EVTYPE2 <- tolower(injury5$EVTYPE2)
propdmg5$EVTYPE2 <- tolower(propdmg5$EVTYPE2)



qplot(EVTYPE2, data = fatality5, geom = "bar", weight = sumfatal,xlab = "Event Type", ylab = "Number of fatalities", main = "Total no. of fatalities in the US by storm event type:1950-2011") + scale_y_continuous("sumfatal")

qplot(EVTYPE2, data = injury5, geom = "bar", weight = suminjury,xlab = "Event Type", ylab = "Number of injuries", main = "Total no. of injuries in the US by storm event type:1950-2011") + scale_y_continuous("suminjury")

qplot(EVTYPE2, data = propdmg5, geom = "bar", weight = sumpropdmg, xlab = "Event Type", ylab = "Amount of property damage", main = "Total amount of property damage (US dollars) by storm event type:1950-2011") + scale_y_continuous("sumpropdmg")

#qplot(EVTYPE2, data = cropdmg5, geom = "bar", weight = sumcrpdmg, xlab = "Event Type", ylab = "Amount of #crop damage") + scale_y_continuous("sumcrpdmg")