#As part of the second project in the Coursera course Reproducible Research, we are asked to analyse the economic and health consequences of severe weather events during the years 1950 - 2011 using the NOAA database. The NOAA database tracks storms and weather events in the US. We are asked to answer the following questions:
#Q1: Across the United States, which types of events are most harmful with respect to population health?
#Q2: Across the United States, which types of events have the greatest economic consequences?
#The analysis shows that tornadoes are the most harmful events with respect to population health and that floods have the greatest economic consequences.
##Data Processing
#1 Loading packages. The packages must be installed prior to running the code.
library(ggplot2)
library(dplyr)
library(plyr)
library(Hmisc)
library(flextable)
#2 Reading the data. This assumes the data has been downloaded to the working directory. The storm data can be found at: NOAA data
# Path of the compressed storm data file, relative to the working directory.
stormfile <- "repdata_data_StormData.csv.bz2"
# read.csv() already defaults to a header row and a comma separator.
rawdata <- read.csv(stormfile)
#3 Transforming data and subsetting #According to NOAA, data recording started in Jan. 1950. At that time, only one event type was recorded - tornado. More event types were gradually added, and only from Jan. 1996 have all event types been recorded. Since our objective is to compare the effects of different weather events, we only need to include events that started no earlier than Jan. 1996.
# Parse the begin date of every event. Only the calendar date matters for the
# cutoff below, so parse to Date rather than using strptime(): strptime()
# returns POSIXlt (a list-like object that is awkward inside a data frame) and
# interprets the text in the session time zone. Any text after the date part
# of the string is ignored by the format.
rawdata$BGN_DATE <- as.Date(rawdata$BGN_DATE, format = "%m/%d/%Y")
# Keep events that began on or after 1 January 1996. The cutoff is an explicit
# Date instead of a string that R has to coerce; subset() drops rows whose
# date could not be parsed (NA).
maindata <- subset(rawdata, BGN_DATE >= as.Date("1996-01-01"))
# The full raw table is no longer needed; release it.
rm(rawdata)
#4 Selecting variables #Based on exploration of the raw data with the functions ‘str’, ‘names’, ‘dim’, ‘head’ and ‘range’, I conclude that there are 7 variables of interest for the two questions, namely: EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP.
#EVTYPE: Type of event #FATALITIES: Number of fatalities #INJURIES: Number of injuries #PROPDMG: Size of property damage #PROPDMGEXP: The exponent values for PROPDMG #CROPDMG: Size of crop damage #CROPDMGEXP: The exponent values for CROPDMG
# Keep only the seven columns used to answer the two questions.
maindata <- maindata[, c("EVTYPE", "FATALITIES", "INJURIES",
                         "PROPDMG", "PROPDMGEXP",
                         "CROPDMG", "CROPDMGEXP")]
#5 There are many unique event types in EVTYPE column. We simplify by capitalizing all letters in EVTYPE column. Further, I subset only non-zero data regarding our target numbers
# Upper-case the event labels so spellings that differ only in letter case
# collapse into one event type.
maindata$EVTYPE <- toupper(maindata$EVTYPE)
# Drop rows where all four impact measures are zero. This is the De Morgan
# form of "at least one measure is non-zero".
maindata <- maindata[with(maindata, !(FATALITIES == 0 &
                                        INJURIES == 0 &
                                        PROPDMG == 0 &
                                        CROPDMG == 0)), ]
##Results #We are now ready to answer the question:
#Q1: Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
# Sum fatalities and injuries by event type.
fatalities <- aggregate(FATALITIES ~ EVTYPE, data = maindata, FUN = sum)
injuries <- aggregate(INJURIES ~ EVTYPE, data = maindata, FUN = sum)
# Rank each table in descending order of its count (ties broken by event
# name) and keep the top ten. Base order() is used instead of arrange()/desc()
# because both dplyr and plyr are attached and export those names, so which
# implementation runs depends on attach order. head() returns fewer rows when
# fewer than ten event types exist, instead of padding with NA rows.
fatalities <- fatalities[order(-fatalities$FATALITIES, fatalities$EVTYPE), ]
fatalities <- head(fatalities, 10)
rownames(fatalities) <- NULL
injuries <- injuries[order(-injuries$INJURIES, injuries$EVTYPE), ]
injuries <- head(injuries, 10)
rownames(injuries) <- NULL
# Fatalities:
fatalities
# Injuries:
injuries
#From the output it is evident that tornadoes are the most harmful weather events with respect to health, looking across both fatalities and injuries.
#Supporting charts:
# Bar chart: total fatalities for each of the top-ten event types
ggplot(data = fatalities, mapping = aes(x = EVTYPE, y = FATALITIES)) +
  geom_col(fill = "red") +
  labs(x = "Event Type", y = "Fatalities") +
  # Rotate the event names so the long labels do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Bar chart: total injuries for each of the top-ten event types
ggplot(data = injuries, mapping = aes(x = EVTYPE, y = INJURIES)) +
  geom_col(fill = "red") +
  labs(x = "Event Type", y = "Injuries") +
  # Rotate the event names so the long labels do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
#Q2: Across the United States, which types of events have the greatest economic consequences? #We need to convert the exponent codes to real numbers.
# Translate damage-exponent codes into numeric powers of ten.
#
# Mapping (letters are case-insensitive): H -> 2, K -> 3, M -> 6, B -> 9,
# "+" -> 1, and "?", "-" and " " -> 0. Codes made only of digits are used as
# the exponent directly. Everything else (empty string, NA, unrecognised text)
# becomes 0, so the damage figure is taken at face value.
# NOTE(review): codes are expected to be single characters; a mixed string
# such as "K2" is treated as unrecognised - confirm against the data.
#
# code: vector of exponent codes (coerced to character).
# Returns a numeric vector of the same length as `code`.
exponent_to_power <- function(code) {
  code <- toupper(as.character(code))
  lookup <- c("H" = 2, "K" = 3, "M" = 6, "B" = 9,
              "+" = 1, "?" = 0, "-" = 0, " " = 0)
  power <- unname(lookup[code])
  # Digit-only codes are converted here, which avoids coercion warnings.
  is_digit <- !is.na(code) & grepl("^[0-9]+$", code)
  power[is_digit] <- as.numeric(code[is_digit])
  power[is.na(power)] <- 0
  power
}

# Replace the exponent codes in place with their numeric powers of ten.
maindata$PROPDMGEXP <- exponent_to_power(maindata$PROPDMGEXP)
maindata$CROPDMGEXP <- exponent_to_power(maindata$CROPDMGEXP)

# Damage totals = base amount * 10^exponent. Plain column assignment is used
# so the result does not depend on whether plyr or dplyr supplies mutate().
maindata$PROPDMGTOTAL <- maindata$PROPDMG * 10^maindata$PROPDMGEXP
maindata$CROPDMGTOTAL <- maindata$CROPDMG * 10^maindata$CROPDMGEXP
# Sum property and crop damage per event type.
Economic_data <- aggregate(cbind(PROPDMGTOTAL, CROPDMGTOTAL) ~ EVTYPE,
                           data = maindata, FUN = sum)
# Total economic loss = property damage + crop damage.
Economic_data$ECONOMIC_LOSS <- Economic_data$PROPDMGTOTAL +
  Economic_data$CROPDMGTOTAL
# Rank event types from the largest to the smallest total loss.
Economic_data <- Economic_data[order(Economic_data$ECONOMIC_LOSS,
                                     decreasing = TRUE), ]
# Keep the ten costliest event types. Columns are selected by name rather than
# by position, and head() does not pad with NA rows when fewer than ten event
# types are present.
worsteconomicevents <- head(Economic_data[, c("EVTYPE", "ECONOMIC_LOSS")], 10)
worsteconomicevents
# Bar chart: combined property and crop loss for the ten costliest event types
ggplot(data = worsteconomicevents,
       mapping = aes(x = EVTYPE, y = ECONOMIC_LOSS)) +
  geom_col(fill = "blue") +
  labs(
    title = "Total economic loss in the US in the period 1996 - 2011 by weather event",
    x = "Event Type",
    y = "Total Prop & Crop Damages (USD)"
  ) +
  # Rotate the event names so the long labels do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
#From the output it is evident that floods are the weather event with the most severe economic consequences.