Synopsis:

Using the NOAA storm data, we determine which types of weather events are most harmful to the health of the population and which types of weather events have the greatest economic impact.

Data Processing:

Load Data

# Fetch the compressed NOAA storm data set and load it into `storm`.
# The archive is only downloaded when it is not already present in the
# working directory, so re-running the analysis does not fetch it again.
# Delete the local file to force a fresh download.
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_file <- "StormData.csv.bz2"
if (!file.exists(storm_file)) {
  download.file(storm_url, storm_file, method = "curl")
}
# bzfile() lets read.csv() read the bzip2-compressed file directly.
storm <- read.csv(bzfile(storm_file))

Verify data has been loaded and look at the data type of the variables

# Show the structure of the loaded data frame (column names and types).
# The former `var.len = 2` argument was dropped: str() has no such parameter
# (the vector-length control is `vec.len`), so it was silently ignored and
# the output is the default layout.
str(storm)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
##  $ BGN_TIME  : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
##  $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
##  $ STATE     : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : Factor w/ 35 levels "","  N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_LOCATI: Factor w/ 54429 levels ""," Christiansburg",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_DATE  : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_TIME  : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_LOCATI: Factor w/ 34506 levels ""," CANTON"," TULIA",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ WFO       : Factor w/ 542 levels ""," CI","%SD",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ZONENAMES : Factor w/ 25112 levels "","                                                                                                                               "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : Factor w/ 436781 levels "","\t","\t\t",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Reformat Data for Analysis

In order to properly analyze the data, we must first reformat it to work with ‘R’.

Trim surrounding whitespace from EVTYPE and convert it to uppercase. Then combine levels that are duplicates or spelling variants of one another.

library(stringr)
# Normalise the free-text event types: convert to upper case, strip
# surrounding whitespace, then collapse spelling variants and related events
# into a single label per category.
storm$EVTYPE <- str_trim(toupper(storm$EVTYPE))

# Ordered "pattern" = "label" rules. Each rule is applied to the result of
# the previous ones, so the order matters: an entry that an earlier rule has
# already relabelled (e.g. to "WIND") is matched against the later patterns
# using its new label only.
evtype_rules <- c(
  "AVALANCE|AVALANCHE" = "AVALANCHE",
  "TORNADO|TORNDAO" = "TORNADO",
  # "TYPHOON" added: the original pattern only listed the misspelling
  # "TYPOON", so correctly spelled typhoon events were never merged.
  "HURRICANE|TSUNAMI|TYPHOON|TYPOON" = "HURRICANE",
  "TROPICAL STORM" = "TROPICAL STORM",
  "WIND|WINDS|WND|THUNDERSTORMW|WINS|TSTMW" = "WIND",
  "HAIL" = "HAIL",
  "FLOOD" = "FLOOD",
  "COLD" = "COLD",
  "HEAT" = "HEAT",
  "LIGHTNING|LIGHTING|LIGNTNING" = "LIGHTNING",
  "FIRE" = "WILD FIRE",
  # The label spelling "PERCIPITATION" is kept unchanged so that category
  # names stay the same as before.
  "RAIN|PERCIPITATION|SNOW|SLEET|PRECIP" = "PERCIPITATION",
  "WINTER|WINTRY|ICE|FREEZ|FROST" = "WINTER CONDITIONS"
)

for (i in seq_along(evtype_rules)) {
  hits <- grepl(names(evtype_rules)[i], storm$EVTYPE)
  storm$EVTYPE[hits] <- evtype_rules[[i]]
}

Because the units differ between instances of PROPDMG and CROPDMG, we need to calculate the true value of each in order to compare them against each other. We do this by converting PROPDMGEXP and CROPDMGEXP into their respective numbers if a recognized unit code (K, M or B) was used, scaled so that the resulting values are expressed in billions of dollars. If the unit code was not recognized, we set the value to 0.

library(plyr)
# Scale damage amounts by their unit code.
#
# amount: numeric vector of damage figures.
# unit:   vector of unit codes (character or factor), compared
#         case-insensitively.
#
# Returns `amount` multiplied by 0.000001 for "K", 0.001 for "M" and 1 for
# "B"; any other non-missing code yields 0, and a missing code yields NA.
dmgCalc <- function(amount, unit) {
  scale_by_code <- c(K = .000001, M = .001, B = 1)
  code <- toupper(unit)

  # Name-based lookup gives NA for every code that is not K, M or B.
  multiplier <- unname(scale_by_code[code])

  # Unrecognised (but present) codes contribute nothing; missing codes stay NA.
  multiplier[!is.na(code) & is.na(multiplier)] <- 0

  amount * multiplier
}

# Add scaled property and crop damage columns, each computed by dmgCalc()
# from the raw amount and its unit-code column.
storm[["PROPVALUE"]] <- dmgCalc(storm[["PROPDMG"]], storm[["PROPDMGEXP"]])
storm[["CROPVALUE"]] <- dmgCalc(storm[["CROPDMG"]], storm[["CROPDMGEXP"]])

Create a data frame that summarizes the following variables by EVTYPE: FATALITIES - number of fatalities; INJURIES - number of injuries; PROPVALUE - amount of property damage; CROPVALUE - amount of crop damage.

# Aggregate per event type: casualty counts, their combined total, and the
# scaled property/crop damage with their combined total.
df <- ddply(
  storm, "EVTYPE", summarise,
  Fatalities = sum(FATALITIES),
  Injuries   = sum(INJURIES),
  Health     = sum(FATALITIES + INJURIES),
  Property   = sum(PROPVALUE),
  Crop       = sum(CROPVALUE),
  Damage     = sum(PROPVALUE + CROPVALUE)
)

Now we subset it to keep only the event types where there were health outcomes, property damage or crop damage.

# Keep only event types with at least one positive total among fatalities,
# injuries, property damage and crop damage. Rows where the condition
# evaluates to NA are dropped.
has_impact <- with(df, Fatalities > 0 | Injuries > 0 | Property > 0 | Crop > 0)
df <- df[has_impact & !is.na(has_impact), , drop = FALSE]

Results:

Population Health

library(reshape2)
library(ggplot2)
# Reshape the fatality and injury totals to long format for event types
# whose combined Health total exceeds 800, then draw one stacked bar per
# event type.
health <- melt(
  df[df$Health > 800, c("EVTYPE", "Fatalities", "Injuries")],
  id.vars = "EVTYPE"
)
ggplot(health, aes(x = EVTYPE, y = value, fill = variable)) +
  geom_bar(stat = "identity", width = .8) +
  labs(
    title = "U.S. Weather Event Caused Fatality & Injury",
    x = "Event Type",
    y = "Number of People"
  ) +
  # Rotate the event-type labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot of chunk unnamed-chunk-7

Economic Consequences

# Reshape the property and crop damage totals to long format for event types
# whose combined Damage total exceeds 5, then draw one stacked bar per
# event type.
damage <- melt(
  df[df$Damage > 5, c("EVTYPE", "Property", "Crop")],
  id.vars = "EVTYPE"
)
ggplot(damage, aes(x = EVTYPE, y = value, fill = variable)) +
  geom_bar(stat = "identity", width = .8) +
  labs(
    title = "U.S. Weather Event Caused Property & Crop Damage",
    x = "Event Type",
    y = "Billions of Dollars"
  ) +
  # Rotate the event-type labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot of chunk unnamed-chunk-8