Introduction

We always watch for weather-related events such as floods, hurricanes, and tornadoes. However, we are often not certain which types of events are most likely to occur, nor what consequences they bring. This study analyzes data from the U.S. National Oceanic and Atmospheric Administration (NOAA) storm database and documents the analysis performed to answer the following questions:

  1. Across the United States, which types of events are most harmful with respect to population health?

  2. Across the United States, which types of events have the greatest economic consequences?

This report is written for a government or municipal manager who might be responsible for preparing for severe weather events and who needs to prioritize resources for different types of events.

Preparation

  1. First, let's load the data. Since it is a large, compressed data set, we wrap the file in bzfile() before passing it to read.csv():

# Read the compressed CSV directly
        storm_data <- read.csv(bzfile("StormData.csv.bz2"))
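If StormData.csv.bz2 is not already in the working directory, it can be fetched first. A minimal sketch, assuming the course-provided URL (verify before use):

# Download the compressed data file if it is missing (URL is an assumption)
if (!file.exists("StormData.csv.bz2")) {
    download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
                  destfile = "StormData.csv.bz2", mode = "wb")
}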

  2. After loading the data, let's take a quick look at the data set to learn more about its structure:

str(storm_data)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
##  $ BGN_TIME  : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
##  $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
##  $ STATE     : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : Factor w/ 35 levels "","  N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_DATE  : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_TIME  : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ WFO       : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ZONENAMES : Factor w/ 25112 levels "","                                                                                                                               "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : Factor w/ 436781 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...
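Only a handful of these 37 variables are needed below, so memory use could be reduced by subsetting early. A minimal sketch using the column names from the str() output above; storm_small is a name introduced here for illustration, and the rest of the report continues to use the full storm_data:

# Keep only the columns this analysis actually uses
cols <- c("EVTYPE", "FATALITIES", "INJURIES",
          "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
storm_small <- storm_data[, cols]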

  3. To get ready for the analysis, we need to make sure the data can be aggregated; the following code accomplishes this task:

# Make sure fatality counts are numeric
storm_data$FATALITIES <- as.numeric(storm_data$FATALITIES)
# Convert injury counts to numeric as well
storm_data$INJURIES <- as.numeric(storm_data$INJURIES)
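As a quick defensive check (a sketch), we can confirm both columns are numeric before aggregating:

# Stop early if either column failed to convert
stopifnot(is.numeric(storm_data$FATALITIES), is.numeric(storm_data$INJURIES))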
  4. Next, we will use the EVTYPE column to find the most harmful event types.

# Transform to upper case first, so differently-cased duplicates merge
storm_data$EVTYPE <- toupper(storm_data$EVTYPE)
event_type <- sort(unique(storm_data$EVTYPE))

  5. As a result, we have the event types as follows (showing the first 50):

event_type[1:50]

# To build the requested charts, we convert the event types back to factors
storm_data$EVTYPE <- as.factor(storm_data$EVTYPE)
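Note that the raw EVTYPE values include entries with leading whitespace (e.g. "   HIGH SURF ADVISORY" in the str() output above). Trimming it, as sketched here, would merge those with their clean counterparts; this step is not applied in the results below:

# Trim leading/trailing whitespace from event types (sketch only)
storm_data$EVTYPE <- as.factor(gsub("^\\s+|\\s+$", "",
                                    as.character(storm_data$EVTYPE)))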

Consolidating Event Data

Lethal Events

Now let’s consolidate lethal events by aggregating the number of fatalities by event type

library(data.table)
fatalities <- as.data.table(subset(aggregate(FATALITIES ~ EVTYPE, data = storm_data, FUN = "sum"), FATALITIES > 0))
fatalities <- fatalities[order(-FATALITIES), ]
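Before charting, the deadliest event types can be inspected directly (a quick peek; output not shown):

# Top 10 event types by total fatalities
head(fatalities, 10)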

The chart summarizing these findings is shown below:

top20 <- fatalities[1:20, ]
library(ggplot2)
ggplot(data = top20, aes(EVTYPE, FATALITIES, fill = FATALITIES)) + geom_bar(stat = "identity") +
    xlab("Event") + ylab("Fatalities") + ggtitle("Top 20 Fatalities caused by Events") +
    coord_flip() + theme(legend.position = "none")
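Since EVTYPE is a factor, the bars above appear in alphabetical order. A sketch of an alternative that orders them by fatality count using reorder():

# Order bars by fatality count for easier ranking (sketch)
ggplot(data = top20, aes(reorder(EVTYPE, FATALITIES), FATALITIES)) +
    geom_bar(stat = "identity") + coord_flip() +
    xlab("Event") + ylab("Fatalities") + ggtitle("Top 20 Fatalities caused by Events")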

Injuries

injuries <- as.data.table(subset(aggregate(INJURIES ~ EVTYPE, data = storm_data, FUN = "sum"), INJURIES > 0))
injuries <- injuries[order(-INJURIES), ]
top20i <- injuries[1:20, ]

The chart summarizing the injuries is shown below:

ggplot(data = top20i, aes(EVTYPE, INJURIES, fill = INJURIES)) + geom_bar(stat = "identity") + 
    xlab("Event") + ylab("Injuries") + ggtitle("Top 20 Injuries caused by Events") + 
    coord_flip() + theme(legend.position = "none")

Economic Impact Analysis

Since different exponent codes (PROPDMGEXP) are used with the economic damage figures, we need to consolidate the values. To help with this, let's create a table of the unique exponents:

storm_data$PROPDMGEXP <- toupper(storm_data$PROPDMGEXP)
unique(storm_data$PROPDMGEXP)
##  [1] "K" "M" ""  "B" "+" "0" "5" "6" "?" "4" "2" "3" "H" "7" "-" "1" "8"
table(storm_data$PROPDMGEXP)
## 
##             -      ?      +      0      1      2      3      4      5 
## 465934      1      8      5    216     25     13      4      4     28 
##      6      7      8      B      H      K      M 
##      4      5      1     40      7 424665  11337

After normalizing the exponent codes, we need to convert each value/exponent pair into an actual dollar figure:

# This function converts a value and its exponent code into the corresponding numeric figure
calculateExp <- function(x, exp = "") {
    switch(exp, `-` = x * -1, `?` = x, `+` = x, `1` = x, `2` = x * (10^2), `3` = x * 
        (10^3), `4` = x * (10^4), `5` = x * (10^5), `6` = x * (10^6), `7` = x * 
        (10^7), `8` = x * (10^8), H = x * 100, K = x * 1000, M = x * 1e+06, 
        B = x * 1e+09, x)
}

applyCalculateExp <- function(vx, vexp) {
    if (length(vx) != length(vexp))
        stop("vx and vexp must be the same length")
    # Apply the conversion element-wise
    result <- rep(0, length(vx))
    for (i in seq_along(vx)) {
        result[i] <- calculateExp(vx[i], vexp[i])
    }
    result
}
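A quick sanity check of the conversion (a sketch; the expected values follow from the switch table above):

# "K" should multiply by one thousand, "B" by one billion
calculateExp(25, "K")   # expected: 25000
calculateExp(1.5, "B")  # expected: 1.5e+09

For long vectors, an equivalent lookup-table version avoids the element-wise loop. A minimal sketch, where applyCalculateExpVec is a name introduced here; note it treats the single "-" record as a plain multiplier of 1, unlike calculateExp:

# Named multiplier table covering the codes tabulated above (sketch)
multipliers <- c(`1` = 1, `2` = 1e2, `3` = 1e3, `4` = 1e4, `5` = 1e5,
                 `6` = 1e6, `7` = 1e7, `8` = 1e8,
                 H = 1e2, K = 1e3, M = 1e6, B = 1e9)
applyCalculateExpVec <- function(vx, vexp) {
    m <- unname(multipliers[vexp])  # NA for "", "?", "+", "0", "-"
    m[is.na(m)] <- 1                # unrecognized codes leave the value unchanged
    vx * m
}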

Now we can calculate the economic costs (property damage) caused by the events:

storm_data$EconomicCosts <- applyCalculateExp(as.numeric(storm_data$PROPDMG), storm_data$PROPDMGEXP)
summary(storm_data$EconomicCosts)
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -1.500e+01  0.000e+00  0.000e+00  4.746e+05  5.000e+02  1.150e+11
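The maximum of 1.15e+11 is striking; before trusting it, the underlying record can be inspected (a sketch; output not shown):

# Look at the single largest property-damage entry
storm_data[which.max(storm_data$EconomicCosts),
           c("EVTYPE", "PROPDMG", "PROPDMGEXP", "REMARKS")]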

Consolidating the Economic Costs

costs <- as.data.table(subset(aggregate(EconomicCosts ~ EVTYPE, data = storm_data, FUN = "sum"), EconomicCosts > 0))
costs <- costs[order(-EconomicCosts), ]
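This analysis covers property damage only; crop damage (CROPDMG and CROPDMGEXP in the str() output) could be folded in with the same exponent conversion. A minimal sketch, where CropCosts and TotalCosts are names introduced here; the chart below remains based on property damage only:

# Apply the same exponent conversion to crop damage (sketch)
storm_data$CROPDMGEXP <- toupper(storm_data$CROPDMGEXP)
storm_data$CropCosts <- applyCalculateExp(as.numeric(storm_data$CROPDMG),
                                          storm_data$CROPDMGEXP)
storm_data$TotalCosts <- storm_data$EconomicCosts + storm_data$CropCosts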

The chart for the top 20 events can be generated as follows:

library(scales)
top20c <- costs[1:20, ]
ggplot(data = top20c, aes(EVTYPE, EconomicCosts, fill = EconomicCosts)) + geom_bar(stat = "identity") +
    scale_y_continuous(labels = comma) + xlab("Event") + ylab("Economic costs in $") +
    ggtitle("Economic costs from Top 20 Events") + coord_flip() + theme(legend.position = "none")