Data Processing

Load R Packages

#Load the R packages needed
#plyr is attached before dplyr so that dplyr's verbs (mutate, arrange,
#summarise, ...) are not masked by plyr's functions of the same name
suppressMessages(library(plyr))
suppressMessages(library(dplyr))
suppressWarnings(library(ggplot2))

Reading and Subsetting the Data

#Point the session at the folder that holds the raw data file
#NOTE(review): this path is machine-specific; adjust it before running elsewhere
setwd("C:/Users/User/Desktop")

#Read the complete storm data set
df1 <- read.csv("stormdata.bz2")

#Keep only the event type, casualty and damage columns used in the analysis,
#stored as a plain data frame
df <- as.data.frame(
      select(df1, EVTYPE, FATALITIES, INJURIES,
             PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
)

Data transformations

Exploring the INJURIES and FATALITIES variables and transforming them if necessary

#Check the storage type of the fatalities column
str(select(df, FATALITIES))

## 'data.frame':    902297 obs. of  1 variable:
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...

#Check the storage type of the injuries column
str(select(df, INJURIES))

## 'data.frame':    902297 obs. of  1 variable:
##  $ INJURIES: num  15 0 2 2 2 6 1 0 14 0 ...

Both FATALITIES and INJURIES are numeric variables, so no transformations or imputations are necessary.

Scaling PROPDMG with its respective exponent, PROPDMGEXP

#List every distinct exponent code that occurs in PROPDMGEXP
#(kept as a one-column data frame; row names are the first row where each code appears)
unique.propexp <- unique(select(df, PROPDMGEXP))
unique.propexp

##        PROPDMGEXP
## 1               K
## 11              M
## 54               
## 187564          B
## 187584          m
## 188780          +
## 189004          0
## 192527          5
## 198496          6
## 198689          ?
## 199104          4
## 199528          2
## 200331          3
## 209285          h
## 213319          7
## 216476          H
## 229327          -
## 233829          1
## 234228          8

#Pair every PROPDMGEXP code with the numeric multiplier it stands for,
#where K refers to multiplier of 1e3,
#M (or m) refers to 1e6,
#B refers to 1e9,
#H (or h) refers to 1e2,
#blanks or 0 refer to multiplier of 1,
#a digit n refers to multiplier of 10^n (5 refers to 1e5, 1 refers to 1e1 etc.),
#"+","-", "?" will be ignored and considered as multiplier of 0
#The multipliers are looked up by code value, so the result does not depend on
#the order in which the codes happen to appear in the data.
propexp.codes <- c("K", "M", "", "B", "m", "+", "0", "5", "6", "?",
                   "4", "2", "3", "h", "7", "H", "-", "1", "8")
numeric.propexp <- c(1e3, 1e6, 1, 1e9, 1e6, 0, 1, 1e5, 1e6, 0,
                     1e4, 1e2, 1e3, 1e2, 1e7, 1e2, 0, 1e1, 1e8)

#Position of each row's exponent code in the lookup table
#(as.character so the lookup works whether the column is a factor or character)
prop.idx <- match(as.character(df[, "PROPDMGEXP"]), propexp.codes)

#Stop on a code that has no multiplier instead of silently producing NA damage
if (anyNA(prop.idx)) {
      stop("Unrecognised PROPDMGEXP code(s): ",
           paste(unique(df[is.na(prop.idx), "PROPDMGEXP"]), collapse = ", "),
           call. = FALSE)
}

#Now we scale the original PROPDMG variable by its respective PROPDMGEXP
#(one vectorised multiplication; run this only once per freshly loaded df)
df[, "PROPDMG"] <- df[, "PROPDMG"] * numeric.propexp[prop.idx]

#We can take a look at the scaled PROPDMG
head(df %>% select(PROPDMG, PROPDMGEXP))

##   PROPDMG PROPDMGEXP
## 1   25000          K
## 2    2500          K
## 3   25000          K
## 4    2500          K
## 5    2500          K
## 6    2500          K

Scaling CROPDMG with its respective exponent, CROPDMGEXP

#List every distinct exponent code that occurs in CROPDMGEXP
#(kept as a one-column data frame; row names are the first row where each code appears)
unique.cropexp <- unique(select(df, CROPDMGEXP))
unique.cropexp

##        CROPDMGEXP
## 1                
## 187566          M
## 187571          K
## 187584          m
## 188633          B
## 192467          ?
## 192758          0
## 195667          k
## 221151          2
#Pair every CROPDMGEXP code with the numeric multiplier it stands for,
#where K (or k) refers to multiplier of 1e3,
#M (or m) refers to 1e6,
#B refers to 1e9,
#blanks or 0 refer to multiplier of 1
#2 refers to multiplier of 1e2 etc.
#"?" will be ignored and considered as multiplier of 0
#The multipliers are looked up by code value, so the result does not depend on
#the order in which the codes happen to appear in the data.
cropexp.codes <- c("", "M", "K", "m", "B", "?", "0", "k", "2")
numeric.cropexp <- c(1, 1e6, 1e3, 1e6, 1e9, 0, 1, 1e3, 1e2)

#Position of each row's exponent code in the lookup table
#(as.character so the lookup works whether the column is a factor or character)
crop.idx <- match(as.character(df[, "CROPDMGEXP"]), cropexp.codes)

#Stop on a code that has no multiplier instead of silently producing NA damage
if (anyNA(crop.idx)) {
      stop("Unrecognised CROPDMGEXP code(s): ",
           paste(unique(df[is.na(crop.idx), "CROPDMGEXP"]), collapse = ", "),
           call. = FALSE)
}

#Now we scale the original CROPDMG variable by its respective CROPDMGEXP
#(one vectorised multiplication; run this only once per freshly loaded df)
df[, "CROPDMG"] <- df[, "CROPDMG"] * numeric.cropexp[crop.idx]

An aggregate measure of harmfulness to population health

#Add HARM, a single score for harmfulness to population health:
#each fatality counts with weight 1.5 and each injury with weight 1.0,
#and the two weighted counts are added together.
df[["HARM"]] <- with(df, 1.5 * FATALITIES + 1.0 * INJURIES)

#Keep df as a plain data frame
df <- as.data.frame(df)

To measure harmfulness to population health, we can allocate higher weight to FATALITIES and a lower weight to INJURIES, i.e. 1.5 and 1.0 respectively, and sum them up.

An aggregate measure of economic consequences

#Add ECONCONS, the economic consequences of an event,
#defined as property damage plus crop damage
df[["ECONCONS"]] <- with(df, PROPDMG + CROPDMG)

#Keep df as a plain data frame
df <- as.data.frame(df)

Results

Sum HARM and ECONCONS by EVTYPE

#Total the health-harm score for each event type and store it as a data frame
sumharm <- tapply(df[["HARM"]], df[["EVTYPE"]], sum)
sdf.harm <- data.frame(EVTYPE = names(sumharm), sharm = sumharm)

#Total the economic consequences for each event type and store it as a data frame
sumecon <- tapply(df[["ECONCONS"]], df[["EVTYPE"]], sum)
sdf.econ <- data.frame(EVTYPE = names(sumecon), secon = sumecon)

#First rows of the harm totals
head(sdf.harm)

##                                      EVTYPE sharm
##    HIGH SURF ADVISORY    HIGH SURF ADVISORY     0
##  COASTAL FLOOD                COASTAL FLOOD     0
##  FLASH FLOOD                    FLASH FLOOD     0
##  LIGHTNING                        LIGHTNING     0
##  TSTM WIND                        TSTM WIND     0
##  TSTM WIND (G45)            TSTM WIND (G45)     0

#First rows of the economic totals
head(sdf.econ)

##                                      EVTYPE secon
##    HIGH SURF ADVISORY    HIGH SURF ADVISORY   200
##  COASTAL FLOOD                COASTAL FLOOD     0
##  FLASH FLOOD                    FLASH FLOOD    50
##  LIGHTNING                        LIGHTNING     0
##  TSTM WIND                        TSTM WIND   108
##  TSTM WIND (G45)            TSTM WIND (G45)     8

Top 10 events that are most harmful to population health

#Sort event types from highest to lowest total harm and keep the first ten rows
sdf1 <- as.data.frame(arrange(sdf.harm, -sharm)[1:10, ])

#Bar chart with one bar per event type; bar height is the summed harm score
g1 <- ggplot(sdf1, aes(x = EVTYPE, weight = sharm)) +
      geom_bar() +
      labs(x = "Event Type",
           y = "Measure of harmfulness to population health",
           title = "Top 10 events that are most harmful to population health") +
      theme(axis.text.x = element_text(size = 7, angle = 25))
g1

We can observe that Tornado is significantly more harmful to population health than the other events.

Top 10 events that have the greatest economic consequences

#Sort event types from highest to lowest total economic damage and keep the first ten rows
sdf2 <- as.data.frame(arrange(sdf.econ, -secon)[1:10, ])

#Bar chart with one bar per event type; bar height is the summed economic damage
g2 <- ggplot(sdf2, aes(x = EVTYPE, weight = secon)) +
      geom_bar() +
      labs(x = "Event Type",
           y = "Measure of economic damage",
           title = "Top 10 events that has the greatest economic consequences") +
      theme(axis.text.x = element_text(size = 7, angle = 25))
g2

We can observe that Tornado has the greatest economic consequences, followed by Excessive Heat (such as heat waves) and then TSTM Wind (Thunderstorm Wind).

Hence, the event that is most harmful to population health and has the greatest economic consequences is Tornado.

Analysis on Weather Events that are Most Harmful to the Population Health and the Economy using NOAA’s Storm Database

Jun Yitt, Cheah

March 31, 2017

Synopsis