Title: Weather Event Impacts on Human Health and the Economy

Synopsis

As part of the “Coursera and Johns Hopkins Bloomberg School of Public Health” Reproducible Research module, we analyze the National Weather Service's Storm Data from 1950 to November 2011 to answer the following questions:

  1. Weather event type causing the highest human injuries
  2. Weather event type causing the highest human fatalities
  3. Weather event type causing the highest loss in property and crop damages combined

The initial dataset can be found at https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2.

We will use:

Environment from which this document is produced:

Set the global options for R and knitr (including chunk caching) and load the libraries.

# Load knitr with library() so that a missing package stops the run right here;
# require() would only return FALSE and the opts_chunk call below would fail.
library(knitr)
# Set global chunk options: display the code, cache chunk results,
# and use a wide default figure size
opts_chunk$set(echo = TRUE, cache = TRUE,
               fig.width = 18, fig.height = 8)
# Strongly prefer fixed over scientific notation and print 4 significant digits
options("scipen" = 100, "digits" = 4)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

Data Processing

Download and read source datafile

  1. Create a local directory named data if not existing
  2. Download the file repdata-data-StormData.csv.bz2
  3. Read the file into R data frame
# Download datafile
# Create the local data directory on first run.
if (!file.exists("data")) {
  dir.create("data")
}
ad <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
bz <- "./data/repdata-data-StormData.csv.bz2"
# Fetch the archive only when it is not already on disk, so re-running the
# analysis does not download the same file again.
if (!file.exists(bz)) {
  # mode = "wb": the archive is binary and must not be written in text mode
  download.file(ad, bz, mode = "wb")
}

# load initial data (the compressed file is passed to read.csv as-is)
q1 <- read.csv(bz)

Data Processing for injuries and fatalities

  1. Restrict the data frame to the variables EVTYPE, FATALITIES and INJURIES
  2. Compute the total injuries per event type
  3. Compute the total fatalities per event type
  4. Extract the highest number of injuries and its associated event type
  5. Extract the highest number of fatalities and its associated event type
# restrict data to EVTYPE, FATALITIES, INJURIES
hp <- q1[, c("EVTYPE", "FATALITIES", "INJURIES")]
colnames(hp) <- c("event_type", "fatalities", "injuries")

# processing data for injuries analysis
# Total injuries per event type; the summary column is named directly
# instead of being renamed afterwards.
sinj <- group_by(hp, event_type) %>%
  summarize(total_injuries = sum(injuries))
# Keep only the event types with more than 100 injuries in total
sinj <- sinj[sinj$total_injuries > 100, ]

# Row(s) holding the maximum total, then the event type and the count
l <- sinj[sinj$total_injuries == max(sinj$total_injuries), ]
e <- l$event_type
n <- l$total_injuries

# processing data for fatalities analysis
sfat <- group_by(hp, event_type) %>%
  summarize(total_fatalities = sum(fatalities))
# Keep only the event types with more than 5 fatalities in total.
# The column name is spelled in full: `$` on a tibble does not partial-match,
# so an abbreviated name yields NULL and the filter would drop every row.
sfat <- sfat[sfat$total_fatalities > 5, ]

# Row(s) holding the maximum total, then the event type and the count
l1 <- sfat[sfat$total_fatalities == max(sfat$total_fatalities), ]
e1 <- l1$event_type
n1 <- l1$total_fatalities

Data Processing for Economic Loss (Properties and Crops) analysis

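  1. Restrict the dataset to EVTYPE and the damage amount and exponent columns (PROPDMG/PROPDMGEXP and CROPDMG/CROPDMGEXP), keeping only the rows whose exponent is K, M or B
  2. Convert all expenses into thousands of dollars
  3. Compute the total loss per event type for Property and for Crop separately
  4. Combine the 2 sets, Property and Crop, into 1 single dataset
  5. Add the losses for both property and crop and extract the highest number and its associated event type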
# Multiplier that converts an amount into thousands of dollars, keyed by the
# (upper-case) exponent letter: K = thousands, M = millions, B = billions.
k_mult <- c(K = 1, M = 1000, B = 1000000)

# Load damage cost property
# Cleanse data: keep only the rows whose EXP column is K, M or B (either case)
damp <- q1[, c("EVTYPE", "PROPDMG", "PROPDMGEXP")]
damp <- damp[(damp$PROPDMGEXP %in% c("K", "k", "M", "m", "B", "b")), ]
# Normalise the case once, so that every letter admitted by the filter above
# (including a lower-case "b") gets the multiplier it stands for.
damp$PROPDMGEXP <- toupper(as.character(damp$PROPDMGEXP))
damp$cost_k <- damp$PROPDMG * unname(k_mult[damp$PROPDMGEXP])

# Total property cost (in thousands of dollars) per event type
damps <- group_by(damp, EVTYPE) %>% summarize(sum(cost_k))
colnames(damps) <- c("event_type", "total_cost_k")
damps$good <- rep("Property", nrow(damps))

# Load damage cost crop, cleansed and converted exactly like the property data
damc <- q1[, c("EVTYPE", "CROPDMG", "CROPDMGEXP")]
damc <- damc[(damc$CROPDMGEXP %in% c("K", "k", "M", "m", "B", "b")), ]
damc$CROPDMGEXP <- toupper(as.character(damc$CROPDMGEXP))
damc$cost_k <- damc$CROPDMG * unname(k_mult[damc$CROPDMGEXP])

# Total crop cost (in thousands of dollars) per event type
damcs <- group_by(damc, EVTYPE) %>% summarize(sum(cost_k))
colnames(damcs) <- c("event_type", "total_cost_k")
damcs$good <- rep("Crop", nrow(damcs))

# merge the 2 datasets (one row per event type and kind of good)
dampcs <- rbind(damps, damcs)

# Get the max loss combined crop and property
cp <- group_by(dampcs, event_type) %>% summarize(sum(total_cost_k))
colnames(cp) <- c("event_type", "sum_total_cost_k")
# Row(s) of cp holding the largest combined loss
v <- cp[(cp$sum_total_cost_k == max(cp$sum_total_cost_k)), ]

1) What is the Weather event type causing the highest injuries

Figure (Total Injuries over weather event types)

# Bar chart of the total injuries for each event type held in sinj
g <- ggplot(sinj, aes(x = event_type, y = total_injuries))
plot10 <- g +
  geom_bar(
    stat = "identity", position = "identity",
    fill = "pink", colour = "white"
  ) +
  labs(x = "Type of Event", y = "Total injuries") +
  ggtitle("Total of injuries over event type \n") +
  # Rotate the x-axis labels by 90 degrees so the event names do not overlap
  theme(
    axis.text.x = element_text(
      angle = 90, hjust = 1, size = 10, colour = "black"
    )
  )
print(plot10)

plot of chunk injuries

2) What is the weather event type causing the highest number of fatalities

Figure (Total fatalities over weather event types)

# Bar chart of the total fatalities for each event type held in sfat
g <- ggplot(sfat, aes(x = event_type, y = total_fatalities))
plot1 <- g +
  geom_bar(
    stat = "identity", position = "identity",
    fill = "pink", colour = "white"
  ) +
  labs(x = "Type of Event", y = "Total fatalities") +
  ggtitle("Total of fatalities over event type \n") +
  # Rotate the x-axis labels by 90 degrees so the event names do not overlap
  theme(
    axis.text.x = element_text(
      angle = 90, hjust = 1, size = 10, colour = "black"
    )
  )
print(plot1)

plot of chunk fatalities

3) What is the Weather event type causing the highest loss in property and crop damages combined.

Figure (Cumulative loss of property and loss of crop over event types)

To keep the figure readable, the data is restricted to losses over 5 million dollars. This does not affect the search for the event types responsible for the highest losses, because those losses are in the billions. Restricting the display lets us focus on the most critical event types.

# For display, keep only the rows whose cost exceeds 5 million dollars
# (total_cost_k is expressed in thousands, hence the 5000 threshold)
dampcs <- dampcs[dampcs$total_cost_k > 5000, ]

# Bar chart of the log of the cost per event type, filled by kind of good.
# NOTE(review): no position is given, so the Property and Crop bars use
# geom_bar's default stacking; a stacked bar's height is then the sum of the
# two log values, not the log of the combined cost -- confirm this is intended.
ggplot(
  dampcs,
  aes(x = event_type, y = log(total_cost_k), fill = good)
) +
  geom_bar(stat = "identity", colour = "white") +
  labs(
    x = "Type of Event",
    y = "Total Cost in Log thousands of dollars"
  ) +
  ggtitle("Loss across US due to damage from different type of events related to Weather\n") +
  # Rotate the x-axis labels by 90 degrees so the event names do not overlap
  theme(
    axis.text.x = element_text(
      angle = 90, hjust = 1, size = 10, colour = "black"
    )
  )

plot of chunk econ

Results:

  1. The weather event type causing the most injuries is TORNADO with 91346 injuries.
  2. Distribution of the number of injuries.
# Min / quartiles / median / mean / max of the per-event-type injury totals in sinj
summary(sinj$total_injuries)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     129     251     545    3750    1360   91300
  1. The weather event type causing the most fatalities is TORNADO with 5633 fatalities.
  2. Distribution of the number of fatalities.
# Min / quartiles / median / mean / max of the per-event-type fatality totals in sfat
summary(sfat$total_fatalities)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       6      13      33     217     103    5630
  1. The weather event type causing the highest loss is FLOOD with 150319678.25 thousands of US dollars.
  2. Distribution of the amounts in thousands of US dollars.
# Min / quartiles / median / mean / max of the combined property + crop loss
# per event type in cp (thousands of dollars)
summary(cp$sum_total_cost_k)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0        16       225   1110000      6320 150000000