This analysis report is done as a part of the Coursera’s Reproducible Research course held by John Hopkins University. The goal of the report is to is to explore U.S. National Oceanic and Atmospheric Administration’s (storm_data) storm database (https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2) and answer the following questions:
The storm_data storm database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The database covers the time period between 1950 and November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.
The following packages are needed to perform analysis.
# Needed libraries
library(dplyr)
library(knitr)
library(ggplot2)
library(gridExtra)
library(grid)
The data is downloaded from the following URL: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2
# Download the data
url_storm_data <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(url_storm_data, destfile="StormData.csv.bz2")
# Read the raw data
storm_data <- read.csv(bzfile("StormData.csv.bz2"))
head(storm_data)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
We can see that the data frame has total 37 variables and most of them are not relevant for this analysis. The following code will create a subset of the data, which contains only needeed variables.
# Subsetting the dataset
storm_data <- storm_data[ , c('EVTYPE', 'FATALITIES', 'INJURIES', 'PROPDMG', 'PROPDMGEXP',
'CROPDMG', 'CROPDMGEXP')]
# Exploring the crop exponent data
unique(storm_data$CROPDMGEXP)
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
To be able to calculate economic consequences, we need to first extract the exponent value of PROPDMGEXP and CROPDMGEXP columns.
where: - H,h = hundreds = 100 - K,k = kilos = thousands = 1,000 - M,m = millions = 1,000,000 - B,b = billions = 1,000,000,000
# Ddd columns for property and crop damage totals
storm_data$TOTPROPDMG <- 0
storm_data$TOTCROPDMG <- 0
numbers <- as.character(1:9)
# Calculating the property damage totals
storm_data$TOTPROPDMG[which(storm_data$PROPDMGEXP %in% numbers)] <-
storm_data$PROPDMG[which(storm_data$PROPDMGEXP %in% numbers)] * 1
storm_data$TOTPROPDMG[which(storm_data$PROPDMGEXP == "H")] <-
storm_data$PROPDMG[which(storm_data$PROPDMGEXP == "H")] * 100
storm_data$TOTPROPDMG[which(storm_data$PROPDMGEXP == "K")] <-
storm_data$PROPDMG[which(storm_data$PROPDMGEXP == "K")] * 1000
storm_data$TOTPROPDMG[which(storm_data$PROPDMGEXP == "M")] <-
storm_data$PROPDMG[which(storm_data$PROPDMGEXP == "M")] * 1000000
storm_data$TOTPROPDMG[which(storm_data$PROPDMGEXP == "B")] <-
storm_data$PROPDMG[which(storm_data$PROPDMGEXP == "B")] * 1000000000
# Calculating the crop damage totals
storm_data$TOTCROPDMG[which(storm_data$CROPDMGEXP %in% numbers)] <-
storm_data$CROPDMG[which(storm_data$CROPDMGEXP %in% numbers)] * 1
storm_data$TOTCROPDMG[which(storm_data$CROPDMGEXP == "H")] <-
storm_data$CROPDMG[which(storm_data$CROPDMGEXP == "H")] * 100
storm_data$TOTCROPDMG[which(storm_data$CROPDMGEXP == "K")] <-
storm_data$CROPDMG[which(storm_data$CROPDMGEXP == "K")] * 1000
storm_data$TOTCROPDMG[which(storm_data$CROPDMGEXP == "M")] <-
storm_data$CROPDMG[which(storm_data$CROPDMGEXP == "M")] * 1000000
storm_data$TOTCROPDMG[which(storm_data$CROPDMGEXP == "B")] <-
storm_data$CROPDMG[which(storm_data$CROPDMGEXP == "B")] * 1000000000
# up-shift EVTYPE column
storm_data$EVTYPE <- toupper(storm_data$EVTYPE)
# Aggregating the data by event and summarizing injuries
injury_data <- storm_data %>%
group_by(EVTYPE) %>%
summarize(INJURIES = sum(INJURIES)) %>%
arrange(desc(INJURIES))
# Aggregating the data by event and summarizing fatalies
fatal_data <- storm_data %>%
group_by(EVTYPE) %>%
summarize(FATALITIES = sum(FATALITIES)) %>%
arrange(desc(FATALITIES))
# Creates a barplot containing top 10 injuries & fatalities
plot1 <- ggplot(data=head(injury_data,10), aes(x=reorder(EVTYPE, INJURIES), y=INJURIES)) +
geom_bar(fill="olivedrab",stat="identity") + coord_flip() +
ylab("Total number of injuries") + xlab("Event type") +
ggtitle("The most harmful events with respect to population health") +
theme(legend.position="none")
plot2 <- ggplot(data=head(fatal_data,10), aes(x=reorder(EVTYPE, FATALITIES), y=FATALITIES)) +
geom_bar(fill="red4",stat="identity") + coord_flip() +
ylab("Total number of fatalities") + xlab("Event type") +
theme(legend.position="none")
grid.arrange(plot1, plot2, nrow =2)
We can very clearly see from the resulting graph that TORNADO and EXCESSIVE HEAT events cause most of fatalities. TORNADO events are also causing most of the injuries. These are the most harmful weather events with respect to population health.
# Aggregating the data by event and summarizing property damage totals
prop_data <- storm_data %>%
group_by(EVTYPE) %>%
summarize(TOTPROPDMG = sum(TOTPROPDMG)) %>%
arrange(desc(TOTPROPDMG))
# Convert values to millions
prop_data$TOTPROPDMG <- prop_data$TOTPROPDMG/1e6
# Creates a table containing top 10 property damagies
kable(head(prop_data, n=10),
row.name= FALSE, col.names = c("Event Type","Cost of Damage"),
caption = "Greatest property damagies in millions",
format = "html",
table.attr = "border=\"2\"")
| Event Type | Cost of Damage |
|---|---|
| FLOOD | 144657.710 |
| HURRICANE/TYPHOON | 69305.840 |
| TORNADO | 56925.661 |
| STORM SURGE | 43323.536 |
| FLASH FLOOD | 16140.812 |
| HAIL | 15727.367 |
| HURRICANE | 11868.319 |
| TROPICAL STORM | 7703.891 |
| WINTER STORM | 6688.497 |
| HIGH WIND | 5270.046 |
From the table above we can see that FLOOD, HURRICANE/TYPHOON and TORNADO events are causing most of the poperty damagies.
# Aggregating the data by event and summarizing crop damage totals
crop_data <- storm_data %>%
group_by(EVTYPE) %>%
summarize(TOTCROPDMG = sum(TOTCROPDMG)) %>%
arrange(desc(TOTCROPDMG))
# Convert values to millions
crop_data$TOTCROPDMG <- crop_data$TOTCROPDMG/1e6
# Creates a table containing top 10 crop damagies
kable(head(crop_data, n=10),
row.name= FALSE, col.names = c("Event Type","Cost of Damage"),
caption = "Greatest crop damagies in millions",
format = "html",
table.attr = "border=\"2\"")
| Event Type | Cost of Damage |
|---|---|
| DROUGHT | 13972.566 |
| FLOOD | 5661.968 |
| RIVER FLOOD | 5029.459 |
| ICE STORM | 5022.114 |
| HAIL | 3025.537 |
| HURRICANE | 2741.910 |
| HURRICANE/TYPHOON | 2607.873 |
| FLASH FLOOD | 1421.317 |
| EXTREME COLD | 1312.973 |
| FROST/FREEZE | 1094.186 |
While DROUGHT and FLOOD are causing most of the crop damagies.