This paper answers two questions posed by Johns Hopkins University:
1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health? 2. Across the United States, which types of events have the greatest economic consequences?
To conduct the analysis, the NOAA weather events database is used: the variables that define the event type and the number of injuries and fatalities answer the first
question, and the cost of property and crop damage answers the second. The software chosen to conduct the analysis is R, and the study is published on RPubs. The dataset is subsetted on the variables (EVTYPE, FATALITIES, INJURIES), and the variables (PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) are converted to PROPDMGVAL and CROPDMGVAL based on the dataset documentation.
First, the data is loaded from the comma-separated values file provided by the university. The dplyr library is also loaded to make handling the dataset easier.
## Dplyr library loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Storm database loaded from the bzip2-compressed csv file
bzData <- "./StormData.csv.bz2"
## Download the archive only when it is not already in the working directory
if (!file.exists(bzData)) {
  fileUrl <- paste0(
    "https://d396qusza40orc.cloudfront.net",
    "/repdata%2Fdata%2FStormData.csv.bz2"
  )
  ## mode = "wb" writes the compressed archive as binary on every platform
  download.file(fileUrl, bzData, mode = "wb")
}
## read.csv() reads the archive through a bzfile() connection
data <- read.csv(bzfile(bzData))
## Names of the columns
names(data)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
## Preview of the first six records of the data
head(data, n = 6L)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1 0 0 NA
## 2 0 0 NA
## 3 0 0 NA
## 4 0 0 NA
## 5 0 0 NA
## 6 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1 0 14.0 100 3 0 0 15 25.0
## 2 0 2.0 150 2 0 0 0 2.5
## 3 0 0.1 123 2 0 0 2 25.0
## 4 0 0.0 100 2 0 0 2 2.5
## 5 0 0.0 150 2 0 0 2 2.5
## 6 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1 K 0 3040 8812
## 2 K 0 3042 8755
## 3 K 0 3340 8742
## 4 K 0 3458 8626
## 5 K 0 3412 8642
## 6 K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3051 8806 1
## 2 0 0 2
## 3 0 0 3
## 4 0 0 4
## 5 0 0 5
## 6 0 0 6
After loading the data into R, the data is processed based on the two questions.
Firstly, PROPDMGEXP is converted to its numeric value as described in the database documentation; then the real value is calculated by multiplying
the PROPDMG and the PROPEXP variables to create a new PROPDMGVAL variable.
# Lookup table: one multiplier per PROPDMGEXP code.
# The codes "+", "-" and "?" are treated as invalid and get a multiplier of 0.
prop_lookup <- data.frame(
  code = c(
    "K", "M", "", "B", "m", "0", "5", "6", "4", "2",
    "3", "h", "7", "H", "1", "8", "+", "-", "?"
  ),
  multiplier = c(
    1000, 1e+06, 1, 1e+09, 1e+06, 1, 1e+05, 1e+06, 10000, 100,
    1000, 100, 1e+07, 100, 10, 1e+08, 0, 0, 0
  ),
  stringsAsFactors = FALSE
)
# match() returns NA for a code missing from the table, so such rows get an
# NA multiplier
data$PROPEXP <- prop_lookup$multiplier[match(data$PROPDMGEXP, prop_lookup$code)]
# Property damage value = reported amount times its multiplier
data$PROPDMGVAL <- data$PROPDMG * data$PROPEXP
Secondly, the same process is applied to the CROPDMGEXP and CROPDMG variables: the exponent code is converted to its numeric value, and the two values are then multiplied to create the CROPDMGVAL variable.
# Lookup table: one multiplier per CROPDMGEXP code.
# The code "?" is treated as invalid and gets a multiplier of 0.
crop_lookup <- data.frame(
  code = c("M", "K", "m", "B", "0", "k", "2", "", "?"),
  multiplier = c(1e+06, 1000, 1e+06, 1e+09, 1, 1000, 100, 1, 0),
  stringsAsFactors = FALSE
)
# match() returns NA for a code missing from the table, so such rows get an
# NA multiplier
data$CROPEXP <- crop_lookup$multiplier[match(data$CROPDMGEXP, crop_lookup$code)]
# Crop damage value = reported amount times its multiplier
data$CROPDMGVAL <- data$CROPDMG * data$CROPEXP
Thirdly, a subset of the database is created with the event type column and the value columns: FATALITIES and INJURIES to answer the first question, and PROPDMGVAL and CROPDMGVAL to answer
the second question.
# Keep only the event type and the four measures required for the analysis
event <- data[c(
  "EVTYPE",
  "FATALITIES",
  "INJURIES",
  "PROPDMGVAL",
  "CROPDMGVAL"
)]
Finally, four new datasets are created, one for each of the value columns in the event dataset. Each new dataset summarises the data by event type
so that the results can be plotted in the final section.
# Group the records by event type once; every summary below reuses the grouping
by_evtype <- group_by(event, EVTYPE)
# Total of each measure per event type, sorted from largest to smallest
event_injuries <- arrange(summarise(by_evtype, tot = sum(INJURIES)), desc(tot))
event_fatalities <- arrange(summarise(by_evtype, tot = sum(FATALITIES)), desc(tot))
event_propdmg <- arrange(summarise(by_evtype, tot = sum(PROPDMGVAL)), desc(tot))
event_cropdmg <- arrange(summarise(by_evtype, tot = sum(CROPDMGVAL)), desc(tot))
To answer the first question, two plots were created: the first shows the weather events with the highest injuries and the second shows the weather events with the highest fatalities. It is clear from the barplots shown below that tornado is by far the most harmful event with respect to population health.
# Two panels side by side; a bottom margin of 6.5 lines for the vertical
# (las = 3) axis labels
par(mfrow = c(1, 2), mar = c(6, 4, 2, 2) + 0.5)
# Positions of the first eight rows of each sorted summary
top8 <- seq_len(8)
barplot(
  event_injuries$tot[top8],
  names.arg = event_injuries$EVTYPE[top8],
  main = "Events with Highest Injuries",
  ylab = "Number of Injuries",
  las = 3,
  cex.names = 0.75
)
barplot(
  event_fatalities$tot[top8],
  names.arg = event_fatalities$EVTYPE[top8],
  main = "Events with Highest Fatalities",
  ylab = "Number of Fatalities",
  las = 3,
  cex.names = 0.75
)
To answer the second question, two plots were created: the first shows the weather events with the highest property damage and the second shows the weather events with the highest crop damage. In the first barplot shown below, flood is by far the most harmful with respect to property damage, doubling the second event, hurricane/typhoon. In the second plot it can be seen that for crops drought is the absolute winner; however, flood is second with almost 6 billion dollars, which, when summed with its property damage, exceeds 150 billion dollars. In conclusion, to answer the second question, flood is the event with the greatest economic consequences.
# Two panels side by side; a bottom margin of 8.5 lines for the vertical
# (las = 3) axis labels
par(mfrow = c(1, 2), mar = c(8, 4, 2, 2) + 0.5)
# Positions of the first eight rows of each sorted summary
top8 <- seq_len(8)
# Totals are divided by 1e9 so the y axis reads in billions of dollars
barplot(
  event_propdmg$tot[top8] / 1e9,
  names.arg = event_propdmg$EVTYPE[top8],
  main = "Events with Highest\n Property Damages",
  ylab = "Damage Cost ($ billions)",
  las = 3,
  cex.names = 0.75
)
barplot(
  event_cropdmg$tot[top8] / 1e9,
  names.arg = event_cropdmg$EVTYPE[top8],
  main = "Events With Highest\n Crop Damages",
  ylab = "Damage Cost ($ billions)",
  las = 3,
  cex.names = 0.75
)