The analysis set out to answer two key questions which are to determine, firstly, the meteorological event types that mostly affect the health of US human population, and secondly, those events which have the greatest economic cosequences for United States. To explored the first question, the magnitude of injuries and fatalities were analysed in relation to the meteorological event type, while for the second question, the amount of cost incurred or lost as a result of property or crop damage were assessed in relation to the event type. The worst fatalities and injuries were both due to tornadoes, with a total of 5636 fatalities and 91407 injuries over the past 61 years. The largest amount of economic loss in terms of property damage was due to ligthning/high wind or storms accounting for almost 4 million dollars loss over the study period. *The largest loss due to crop damage was a result of hails, which accounted for about almost 600,000 dollars loss over the years
Data set was downloaded from the web database directly and stored in local computer directory. Data was unzipped before loading
#Downloading data into local computer & working directory from the database website
setwd("D:/Courses/Data_Science_Coursera_Course")
if (!dir.exists("./Storm_datasets")) {dir.create("./Storm_datasets")}
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if(!file.exists("./Storm_datasets/datasets.zip")) {
download.file(url, destfile = "./Storm_datasets/datasets.zip")
unzip("./Storm_datasets/datasets.zip", exdir = "./Storm_datasets")}
Dataset was read into R using the read.csv function
data <- read.csv("./Storm_datasets/datasets.zip", stringsAsFactors = FALSE)
Relevant R packages were opened
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Data transformations were done firstly to clean up the EVTYPE variable using regular expressions on the the string along with the “sub” function as shown in the code below.
event2 <- toupper(data$EVTYPE)
event2 <- sub("(.*)AVALANCHE|AVALANCE(.*)[0-9]?", "AVALANCHE", event2)
event2 <- sub("^ASTRONOMICAL(.*)[0-9]?", "ASTRONOMICAL TIDE", event2)
event2 <- sub("(.*)BLIZZARD(.*)[0-9]?", "BLIZZARD", event2)
event2 <- sub("(.*)FLOOD(.*)[0-9]?", "FLOOD", event2)
event2 <- sub("(.*)WIND(.*)CHILL(.*)[0-9]?", "COLD/WIND CHILL", event2)
event2 <- sub("(.*)COLD(.*)[0-9]?", "COLD/WIND CHILL", event2)
event2 <- sub("^LOW TEMP(.*)[0-9]?", "COLD/WIND CHILL", event2)
event2 <- sub("HYPOTHERMIA(.*)[0-9]?", "COLD/WIND CHILL", event2)
event2 <- sub("FREEZING DRIZZLE|FREEZING SPRAY(.*)[0-9]?", "COLD/WIND CHILL", event2)
event2 <- sub("ICE ON ROAD(.*)[0-9]?", "ICY ROADS", event2)
event2 <- sub("GLAZE(.*)[0-9]?", "ICY ROADS", event2)
event2 <- sub("ICE(.*)[0-9]?", "ICY ROADS", event2)
event2 <- sub("(.*)ICY(.*)[0-9]?", "ICY ROADS", event2)
event2 <- sub("MUDSLIDE|LANDSLIDE|LANDSLIDES|MUDSLIDES", "LANDSLIDE", event2)
event2 <- sub("MUD(.*)SLIDE(S)?|LANDSLUMP|ROCK(.*)SLIDE(S)?", "LANDSLIDE", event2)
event2 <- sub("MARINE MISHAP", "MARINE ACCIDENT", event2)
event2 <- sub("(.*)DEBRIS(.*)[0-9]?", "DEBRIS FLOW", event2)
event2 <- sub("(.*)FOG(.*)[0-9]?", "FOG", event2)
event2 <- sub("(.*)SMOKE(.*)[0-9]?", "SMOKE", event2)
event2 <- sub("(.*)DROUGHT(.*)[0-9]?", "DROUGHT", event2)
event2 <- sub("(.*)DUST(.*)[0-9]?", "DUST", event2)
event2 <- sub("(.*)FROST(.*)([0-9])?", "FROST/FREEZE", event2)
event2 <- sub("(.*)FREEZE(.*)([0-9])?", "FROST/FREEZE", event2)
event2 <- sub("(.*)CLOUD(.*)([0-9])?", "FUNNEL CLOUD", event2)
event2 <- sub("(.*)HAIL(.*)([0-9])?", "HAIL", event2)
event2 <- sub("(.*)HEAT(.*)([0-9])?", "HEAT", event2)
event2 <- sub("UNSEASONABLY WARM(.*)", "HEAT", event2)
event2 <- sub("HYPERTHERMIA(.*)([0-9])?", "HEAT", event2)
event2 <- sub("(.*)RAIN(.*)([0-9])?", "HEAVY RAIN", event2)
event2 <- sub("(.*)SNOW(.*)([0-9])?", "SNOW", event2)
event2 <- sub("(.*)SURGE|TIDE(.*)([0-9])?", "STORM SURGE/TIDE", event2)
event2 <- sub("(.*)WIND[^?CHILL](.*)([0-9])?", "LIGHTNING/HIGH WIND/STORM", event2) #Omits expressions with CHILL in WIND CHILL.
event2 <- sub("(.*)STORM(.*)([0-9])?", "LIGHTNING/HIGH WIND/STORM", event2)
event2 <- sub("(.*)LIGHTNING(.*)([0-9])?", "LIGHTNING/HIGH WIND/STORM", event2)
event2 <- sub("(.*)WIND$", "LIGHTNING/HIGH WIND/STORM", event2)
event2 <- sub("(.*)SURF(.*)([0-9])?", "HIGH SURF", event2)
event2 <- sub("(.*)HEAVY(.*)SEA(.*)([0-9])?", "HIGH SURF", event2)
event2 <- sub("(.*)HIGH(.*)SEA(S)?(.*)([0-9])?", "HIGH SURF", event2)
event2 <- sub("(.*)HIGH(.*)SWELLS(.*)([0-9])?", "HIGH SURF", event2)
event2 <- sub("(.*)HIGH(.*)WATER(.*)([0-9])?", "HIGH SURF", event2)
event2 <- sub("(.*)HIGH(.*)WAVES(.*)([0-9])?", "HIGH SURF", event2)
event2 <- sub("RAPIDLY RISING WATER([0-9])?", "HIGH SURF", event2)
event2 <- sub("ROUGH SEAS", "HIGH SURF", event2)
event2 <- sub("(.*)HURRICANE(.*)([0-9])?", "HURRICANE", event2)
event2 <- sub("(.*)TYPHOON(.*)([0-9])?", "HURRICANE", event2)
event2 <- sub("(.*)RIP(.*)([0-9])?", "RIP CURRENT", event2)
event2 <- sub("(.*)CURRENT(.*)", "RIP CURRENT", event2)
event2 <- sub("RIP CURRENT CURRENTS|RIP CURRENTS", "RIP CURRENT", event2)
event2 <- sub("(.*)SEICHE(.*)([0-9])?", "SEICHE", event2)
event2 <- sub("(.*)WATERSPOUT(.*)([0-9])", "TORNADO/WATERSPOUT", event2)
event2 <- sub("(.*)TORNADO(.*)([0-9])?", "TORNADO/WATERSPOUT", event2)
event2 <- sub("(.*)TSUNAMI(.*)([0-9])?", "TSUNAMI", event2)
event2 <- sub("(.*)^VOLC(.*)([0-9])?", "VOLCANIC ASH", event2)
event2 <- sub("(.*)FIRE(S)?(.*)([0-9])?", "WILDFIRES", event2)
event2 <- sub("(.*)WINTER(.*)WEATHER(.*)([0-9])?", "WINTER WEATHER", event2)
event2 <- sub("(.*)SUMMARY|RECORD|NONE(.*)", "NA", event2)
event2 <- sub("(.*)SUMMARY(.*)", "NA", event2)
event2 <- sub("(.*)WET(.*)([0-9])?", "EXCESSIVE WETNESS", event2)
event2 <- sub("(.*)PRECIP(.*)([0-9])?", "HEAVY PRECIPITATION", event2)
event2 <- sub("(.*)HEAVY MIX(.*)([0-9])?", "HEAVY PRECIPITATION", event2)
data$EVTYPE2 <- event2
Further processing was done to create a data set with only the relevant variables and to convert variables to more useful formats.
data2 <- data %>% select(STATE__, STATE, EVTYPE2, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG,CROPDMGEXP) %>% mutate(EVTYPE2 = as.factor(EVTYPE2), STATE = as.factor(STATE), FATALITIES = as.numeric(FATALITIES), INJURIES = as.numeric(INJURIES), PROPDMG = as.numeric(PROPDMG), PROPDMGEXP = as.numeric(PROPDMGEXP), CROPDMG = as.numeric(CROPDMG), CROPDMGEXP = as.numeric(CROPDMGEXP))
## Warning: package 'bindrcpp' was built under R version 3.4.3
## Warning in evalq(as.numeric(PROPDMGEXP), <environment>): NAs introduced by
## coercion
## Warning in evalq(as.numeric(CROPDMGEXP), <environment>): NAs introduced by
## coercion
The other processing involved the subsetting of the data by the type of events to summarise fatality, injury, property and crop damage, by summing them up according to the type of event.
fatalitybyevent <- data2 %>% group_by(EVTYPE2) %>% summarise(sumfatal = sum(FATALITIES,na.rm=TRUE))
injurybyevent <- data2 %>% group_by(EVTYPE2) %>% summarise(suminjury = sum(INJURIES,na.rm=TRUE))
propdmgbyevent <- data2 %>% group_by(EVTYPE2) %>% summarise(sumpropdmg = sum(PROPDMG))
cropdmgbyevent <- data2 %>% group_by(EVTYPE2) %>% summarise(sumcrpdmg = sum(CROPDMG))
fatality <- fatalitybyevent %>% filter(sumfatal>mean(sumfatal))
fatality <- fatality %>% arrange(desc(sumfatal))
fatality5 <- fatality[1:5, ]
print(fatality)
## # A tibble: 14 x 2
## EVTYPE2 sumfatal
## <fctr> <dbl>
## 1 TORNADO/WATERSPOUT 5636
## 2 HEAT 3173
## 3 LIGHTNING/HIGH WIND/STORM 2791
## 4 FLOOD 1525
## 5 RIP CURRENT 572
## 6 AVALANCHE 225
## 7 HIGH SURF 185
## 8 SNOW 144
## 9 HURRICANE 133
## 10 ICY ROADS 114
## 11 HEAVY RAIN 113
## 12 BLIZZARD 101
## 13 WILDFIRES 90
## 14 FOG 80
The table above shows the first 14 fatalities(ordered in terms of magnitude) and the event type causing them
propdmg <- propdmgbyevent %>% filter(sumpropdmg>mean(sumpropdmg))
propdmg <- propdmg %>% arrange(desc(sumpropdmg))
propdmg5 <- propdmg[1:5, ]
print(propdmg)
## # A tibble: 8 x 2
## EVTYPE2 sumpropdmg
## <fctr> <dbl>
## 1 LIGHTNING/HIGH WIND/STORM 3950744.48
## 2 TORNADO/WATERSPOUT 3215747.86
## 3 FLOOD 2436131.51
## 4 HAIL 699320.38
## 5 SNOW 149178.81
## 6 WILDFIRES 125218.29
## 7 ICY ROADS 76322.82
## 8 HEAVY RAIN 57938.21
The table above shows the first 8 event types causing the largest property damage(ordered in terms of magnitude)
cropdmg <- cropdmgbyevent %>% filter(sumcrpdmg>mean(sumcrpdmg))
cropdmg <- cropdmg %>% arrange(desc(sumcrpdmg))
cropdmg5 <- cropdmg[1:5, ]
print(cropdmg)
## # A tibble: 9 x 2
## EVTYPE2 sumcrpdmg
## <fctr> <dbl>
## 1 HAIL 585956.66
## 2 FLOOD 364506.73
## 3 LIGHTNING/HIGH WIND/STORM 238461.45
## 4 TORNADO/WATERSPOUT 100026.72
## 5 DROUGHT 33954.40
## 6 HEAVY RAIN 12457.30
## 7 HURRICANE 11627.79
## 8 FROST/FREEZE 9640.41
## 9 WILDFIRES 9565.74
The table above shows the first 9 event types causing the largest crop damages (ordered in terms of magnitude)
injury <- injurybyevent %>% filter(suminjury>mean(suminjury))
injury <- injury %>% arrange(desc(suminjury))
injury5 <- injury[1:5, ]
print(injury)
## # A tibble: 11 x 2
## EVTYPE2 suminjury
## <fctr> <dbl>
## 1 TORNADO/WATERSPOUT 91407
## 2 LIGHTNING/HIGH WIND/STORM 18691
## 3 HEAT 9226
## 4 FLOOD 8604
## 5 ICY ROADS 2411
## 6 WILDFIRES 1608
## 7 HAIL 1467
## 8 HURRICANE 1333
## 9 SNOW 1151
## 10 FOG 1076
## 11 BLIZZARD 805
The table above shows the first 11 event types causing the greatest number of injuries (ordered in terms of magnitude)
fatality5$EVTYPE2 <- tolower(fatality5$EVTYPE2)
injury5$EVTYPE2 <- tolower(injury5$EVTYPE2)
propdmg5$EVTYPE2 <- tolower(propdmg5$EVTYPE2)
qplot(EVTYPE2, data = fatality5, geom = "bar", weight = sumfatal,xlab = "Event Type", ylab = "Number of fatalities", main = "Total no. of fatalities in the US by storm event type:1950-2011") + scale_y_continuous("sumfatal")
qplot(EVTYPE2, data = injury5, geom = "bar", weight = suminjury,xlab = "Event Type", ylab = "Number of injuries", main = "Total no. of injuries in the US by storm event type:1950-2011") + scale_y_continuous("suminjury")
qplot(EVTYPE2, data = propdmg5, geom = "bar", weight = sumpropdmg, xlab = "Event Type", ylab = "Amount of property damage", main = "Total amount of property damage (US dollars) by storm event type:1950-2011") + scale_y_continuous("sumpropdmg")
#qplot(EVTYPE2, data = cropdmg5, geom = "bar", weight = sumcrpdmg, xlab = "Event Type", ylab = "Amount of #crop damage") + scale_y_continuous("sumcrpdmg")