The main goal of this research is to answer the following questions: 1. Across the United States, which types of events are most harmful with respect to population health? 2. Across the United States, which types of events have the greatest economic consequences? ### Synopsis
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the raw storm records from the CSV file in the working directory
storm <- read.csv(file = "repdata_data_StormData.csv")
# Report the size of the data set, then list its column names
sprintf("Dimensions of storm: Observations %s, Variables %s", dim(storm)[1], dim(storm)[2])
## [1] "Dimensions of storm: Observations 902297, Variables 37"
colnames(storm)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# CLEANING DATA
# Check how the event type column is stored and count its distinct values
typeof(storm[["EVTYPE"]])
## [1] "integer"
diffEVTYPE <- unique(storm[["EVTYPE"]])
length(diffEVTYPE)
## [1] 985
The number of distinct event types is greater than the documentation specifies, so we normalize the event names with the following transformation:
# Normalize the event labels so that spelling variants collapse together:
# lower-case everything, replace each run of blanks/punctuation with a single
# space, and strip leading/trailing whitespace.
# The "+" quantifier has to sit outside the bracket expression; inside the
# brackets it is only a literal plus sign and runs are not collapsed.
event_types <- tolower(storm$EVTYPE)
event_types <- gsub("[[:blank:][:punct:]]+", " ", event_types)
event_types <- trimws(event_types)
length(unique(event_types))
## [1] 874
This reduces the number of distinct event types, although it is still larger than the documentation describes. We update the data with the normalized names.
# Store the normalized labels back into the data set
storm[["EVTYPE"]] <- event_types
As the number of distinct event types is still too large to present in full, we consider only the top 10. #### Question 1: Across the United States, which types of events are most harmful with respect to population health?
# Sum fatalities and injuries separately for every event type
Fatalities <- aggregate(FATALITIES ~ EVTYPE, data = storm, FUN = sum)
Injuries <- aggregate(INJURIES ~ EVTYPE, data = storm, FUN = sum)

# Keep the ten event types with the highest totals
FatalitiesTop10 <- head(
  Fatalities[order(Fatalities[["FATALITIES"]], decreasing = TRUE), ],
  n = 10
)
InjuriesTop10 <- head(
  Injuries[order(Injuries[["INJURIES"]], decreasing = TRUE), ],
  n = 10
)

# Horizontal bar charts, bars sorted by total; the fill legend is hidden
# because it only repeats the bar length
p1 <- ggplot(
  FatalitiesTop10,
  aes(x = reorder(EVTYPE, FATALITIES), y = FATALITIES, fill = FATALITIES)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Event type", y = "Total number of FATALITIES") +
  theme(legend.position = "none")

p2 <- ggplot(
  InjuriesTop10,
  aes(x = reorder(EVTYPE, INJURIES), y = INJURIES, fill = INJURIES)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Event type", y = "Total number of injuries") +
  theme(legend.position = "none")

# Render both charts
print(p1)
print(p2)
Tornadoes cause by far the largest number of deaths and injuries among all event types.
The damage exponents, which are recorded as letter or digit codes, must be converted to numeric powers of ten and applied to the damage values.
# Translate a damage-exponent code into the power of ten it stands for.
#
# Args:
#   e: a single exponent code, either a character string or a length-one
#      factor (which is what sapply()/vapply() pass when iterating over a
#      factor column).
# Returns:
#   The numeric exponent x, so that the damage multiplier is 10^x:
#   h -> 2 (hundred), k -> 3 (thousand), m -> 6 (million), b -> 9 (billion),
#   a digit -> that digit, and '', '-', '?', '+' -> 0 (value left unscaled).
# Raises:
#   An error ("Invalid exponent value.") for any other code, including NA.
exp_transform <- function(e) {
  # Work on the label itself: as.numeric() applied to a factor returns its
  # internal level code rather than the digit it displays, which would
  # silently produce a wrong exponent.
  e <- as.character(e)
  # Non-numeric labels become NA here; the coercion warning is expected and
  # handled by the branches below, so it is silenced for this one call only.
  digit <- suppressWarnings(as.numeric(e))

  if (e %in% c("h", "H")) {
    2
  } else if (e %in% c("k", "K")) {
    3
  } else if (e %in% c("m", "M")) {
    6
  } else if (e %in% c("b", "B")) {
    9
  } else if (!is.na(digit)) {
    # A digit is used directly as the exponent
    digit
  } else if (e %in% c("", "-", "?", "+")) {
    # Missing or placeholder exponent: 10^0 keeps the recorded value as is
    0
  } else {
    stop("Invalid exponent value.")
  }
}
# Convert every exponent code to its numeric power of ten, then scale the
# recorded damage figures. vapply() guarantees one numeric value per record
# (sapply() would silently change its return type on empty or odd input).
prop_dmg_exp <- vapply(
  storm$PROPDMGEXP,
  FUN = exp_transform,
  FUN.VALUE = numeric(1),
  USE.NAMES = FALSE
)
storm$PROPDMG_incl_exp <- storm$PROPDMG * 10^prop_dmg_exp
crop_dmg_exp <- vapply(
  storm$CROPDMGEXP,
  FUN = exp_transform,
  FUN.VALUE = numeric(1),
  USE.NAMES = FALSE
)
storm$CROPDMG_incl_exp <- storm$CROPDMG * 10^crop_dmg_exp
Aggregate data, get Top 10 and plot
# Total the exponent-adjusted damage for each event type
PROPDMG_incl_exp_per_evt <- aggregate(PROPDMG_incl_exp ~ EVTYPE, data = storm, FUN = sum)
CROPDMG_incl_exp_per_evt <- aggregate(CROPDMG_incl_exp ~ EVTYPE, data = storm, FUN = sum)
# Keep the ten event types with the highest property damage and show them
PROPDMG_Top10 <- head(
  PROPDMG_incl_exp_per_evt[
    order(PROPDMG_incl_exp_per_evt[["PROPDMG_incl_exp"]], decreasing = TRUE),
  ],
  n = 10
)
PROPDMG_Top10
## EVTYPE PROPDMG_incl_exp
## 138 flash flood 6.820237e+13
## 697 thunderstorm winds 2.086532e+13
## 741 tornado 1.078951e+12
## 209 hail 3.157558e+11
## 410 lightning 1.729433e+11
## 154 flood 1.446577e+11
## 366 hurricane typhoon 6.930584e+10
## 166 flooding 5.920826e+10
## 585 storm surge 4.332354e+10
## 270 heavy snow 1.793259e+10
# Keep the ten event types with the highest crop damage and show them
CROPDMG_Top10 <- head(
  CROPDMG_incl_exp_per_evt[
    order(CROPDMG_incl_exp_per_evt[["CROPDMG_incl_exp"]], decreasing = TRUE),
  ],
  n = 10
)
CROPDMG_Top10
## EVTYPE CROPDMG_incl_exp
## 84 drought 13972566000
## 154 flood 5661968450
## 519 river flood 5029459000
## 382 ice storm 5022113500
## 209 hail 3025974480
## 357 hurricane 2741910000
## 366 hurricane typhoon 2607872800
## 138 flash flood 1421317100
## 125 extreme cold 1312973000
## 185 frost freeze 1094186000
# Horizontal bar charts of the top-10 damage totals, bars sorted by amount;
# the fill legend is hidden because it only repeats the bar length
p3 <- ggplot(
  data = PROPDMG_Top10,
  aes(
    x = reorder(EVTYPE, PROPDMG_incl_exp),
    y = PROPDMG_incl_exp,
    fill = PROPDMG_incl_exp
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Event type", y = "Total number of property damage") +
  theme(legend.position = "none")

p4 <- ggplot(
  data = CROPDMG_Top10,
  aes(
    x = reorder(EVTYPE, CROPDMG_incl_exp),
    y = CROPDMG_incl_exp,
    fill = CROPDMG_incl_exp
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Event type", y = "Total number of crop damage") +
  theme(legend.position = "none")

# Render both charts
print(p3)
print(p4)
Flash floods and thunderstorm winds cause the largest property damage. The most severe weather event in terms of crop damage is drought.