library(R.utils)
## Warning: package 'R.utils' was built under R version 3.1.3
## Loading required package: R.oo
## Warning: package 'R.oo' was built under R version 3.1.3
## Loading required package: R.methodsS3
## Warning: package 'R.methodsS3' was built under R version 3.1.3
## R.methodsS3 v1.7.0 (2015-02-19) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.19.0 (2015-02-27) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## 
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## 
## The following objects are masked from 'package:base':
## 
##     attach, detach, gc, load, save
## 
## R.utils v2.0.0 (2015-02-28) successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## 
## The following object is masked from 'package:utils':
## 
##     timestamp
## 
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, parse, warnings
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3

Synopsis

This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

For this analysis, I am interested in fatalities, injuries, property damage and crop damage caused by severe weather in the U.S. Therefore, I begin by subsetting the data to include only those four things.

Then I look at the health effects and economic effects seperately.

Data Processing

This is how I uploaded the NOAA data:

bunzip2("repdata-data-StormData.csv.bz2","StormData2.csv",remove=FALSE)
data <- read.csv("StormData.csv",as.is=TRUE,comment.char="")

This is a very large dataset, so the first thing I decided to do was subset it and select the rows that recorded positive values for economic or health damage:

q <- (data$FATALITIES > 0) | (data$INJURIES > 0) | (data$PROPDMG > 0) | (data$CROPDMG > 0)
data <- data[q,]

Cleaning and standardizing event types

data$EVTYPE <- toupper(x = data$EVTYPE)
data$EVTYPE <- gsub(" +", 
                    " ",
                    data$EVTYPE)
data$EVTYPE <- gsub(" $", 
                    "",
                    data$EVTYPE)

There are over 400 event types in the data; however, the National Weather Service Storm Data Documentation Table 2.1.1 only has around 100. This is the code that tries to bring these two numbers closer together and clean up the event types:

data$EVTYPE[grep("NON-THUNDERSTORM WIND|NON THUNDERSTORM WIND|GUSTY WIND|^WIND|^WINDS|HIGH WIND.|WIND DAMAGE",data$EVTYPE)] <- "HIGH WIND"       
data$EVTYPE[grep("STRONG WINDS|STRONG WIND",data$EVTYPE)]<-"STRONG WIND"
                    

data$EVTYPE[grep("THUNDEERSTORM|THUNDERESTORM|THUNDERSTROM|THUDERSTORM|THUNERSTORM|THUNDERTORM|TUNDERSTORM|TSTM WIND|STORM FORCE WINDS|THUNDERSTORMW|THUNDERSTORMWINDS|THUNDERSTORM WINS|TSTMW|THUNDERSTORMS WIND|THUNDERSTORM WINDS|THUNDERSTORM WIND/HAIL|THUNDERSTORM WINDS HAIL|THUNDERSTORM WINDSS|SEVERE THUNDERSTORM|THUNDERSTORM|THUNDERSTORM WIND......*",
                    data$EVTYPE)] <- "THUNDERSTORM WIND"

data$EVTYPE[grep("URBAN/SML STREAM FLD|FLASH FLOODING|FLOOD/FLASH FLOOD|RIVER FLOOD|URBAN FLOODING|URBAN/SMALL STREAM FLOOD|FLASH FLOODING/FLOOD|FLASH FLOODS|URBAN FLOOD|FLASH FLOOD/FLOOD|FLASH",
                    data$EVTYPE)] <-"FLASH FLOOD"

data$EVTYPE[grep("HEAVY RAINS/FLOODING|^FLOODING|^FLOOD.",
                    data$EVTYPE)]<-"FLOOD"

data$EVTYPE[grep("LAKE-EFFECT SNOW|EXCESSIVE SNOW|SQUAL|HEAVY SNOW",
                    data$EVTYPE)]<-"HEAVY SNOW"
data$EVTYPE[grep("HEAVY RAINS|^RAIN|PRECIPITATION",
                    data$EVTYPE)]<-"HEAVY RAIN"
data$EVTYPE[grep("HEAVY MIX|RECORD SNOW",
                    data$EVTYPE)]<-"WINTER STORM"

data$EVTYPE[grep("WINTER WEATHER|LIGHT SNOW|^SNOW|FREEZING RAIN|LIGHT FREEZING RAIN|SNOW FREEZING RAIN|FREEZING DRIZZLE|WINTRY|LAKE EFFECT SNOW|ICY ROADS|^ICE$",
                    data$EVTYPE)] <-"WINTER WEATHER"

data$EVTYPE[grep("FALLING SNOW/ICE|GLAZE ICE|HARD FREEZE|ICE FLOES",
                    data$EVTYPE)]<-"ICE STORM"

data$EVTYPE[grep("FIRE",
                    data$EVTYPE)]<-"WILDFIRE"

data$EVTYPE[grep("SLIDE",
                 data$EVTYPE)]<-"LANDSLIDE"

data$EVTYPE[grep("TORNADO",
                 data$EVTYPE)]<-"TORNADO"

data$EVTYPE[grep("HURRICANE|TYPHOON",
                 data$EVTYPE)]<-"HURRICANE"

data$EVTYPE[grep("UNSEASONABLY COLD|^COLD|EXTENDED COLD|UNSEASONABLE COLD|LOW TEMPERATURE",
                 data$EVTYPE)]<-"COLD/WIND CHILL"

data$EVTYPE[grep("EXTREME COLD/WIND CHILL|RECORD COLD|EXTREME WINDCHILL|HYPOTHERMIA/EXPOSURE",
                 data$EVTYPE)]<-"EXTREME COLD"

data$EVTYPE[grep("FROST|FREEZE",
                 data$EVTYPE)]<-"FROST/FREEZE"

data$EVTYPE[grep("^FOG",
                 data$EVTYPE)]<-"DENSE FOG"

data$EVTYPE[grep("BLIZZARD",
                 data$EVTYPE)]<-"BLIZZARD"

data$EVTYPE[grep("HAIL",
                 data$EVTYPE)]<-"HAIL"

data$EVTYPE[grep("RIP",
                 data$EVTYPE)]<-"RIP CURRENT"

data$EVTYPE[grep("SURF|HIGH WATER|HIGH SWELLS|HIGH TIDE",
                 data$EVTYPE)]<-"HIGH SURF"

data$EVTYPE[grep("HEAT",
                 data$EVTYPE)]<-"HEAT"

That is probably as good as it is going to get. Now, I will aggregate the event types with a smaller count than “OTHER” into the “OTHER” event type.

event.table <- sort(table(data$EVTYPE), decreasing = TRUE)
event.types.under.OTHER <- names(event.table[event.table < 34])

data$EVTYPE[data$EVTYPE %in% event.types.under.OTHER] <- "OTHER"
data$BGN_DATE <- as.Date(data$BGN_DATE,format="%m/%d/%Y %H:%M:%S")
data$BGN_DATE <- format(data$BGN_DATE, format="%Y")
data$BGN_DATE <- as.factor(data$BGN_DATE)
data$EVTYPE <- as.factor(data$EVTYPE)

My final event types table looks like this:

sort(table(data$EVTYPE),decreasing=TRUE)
## 
##   THUNDERSTORM WIND             TORNADO                HAIL 
##              119822               39969               26164 
##         FLASH FLOOD           LIGHTNING               FLOOD 
##               22662               13293               10253 
##           HIGH WIND         STRONG WIND          HEAVY SNOW 
##                6372                3472                1640 
##        WINTER STORM            WILDFIRE          HEAVY RAIN 
##                1518                1259                1154 
##                HEAT      WINTER WEATHER           ICE STORM 
##                 980                 902                 716 
##         RIP CURRENT      TROPICAL STORM        EXTREME COLD 
##                 643                 416                 340 
##               OTHER           AVALANCHE             DROUGHT 
##                 290                 268                 266 
##            BLIZZARD           HIGH SURF           HURRICANE 
##                 256                 240                 232 
##           LANDSLIDE           DENSE FOG         STORM SURGE 
##                 209                 182                 177 
##       COASTAL FLOOD        FROST/FREEZE     COLD/WIND CHILL 
##                 168                 153                 151 
##          DUST STORM          DUST DEVIL      DRY MICROBURST 
##                 103                  95                  78 
##    COASTAL FLOODING    STORM SURGE/TIDE          WATERSPOUT 
##                  61                  47                  47 
## TROPICAL DEPRESSION 
##                  35

Processing Crop and Property Damage Variables

#Property Damage = PROP
valid.letter <- c("H", "K", "M", "B")
multiplier <- c(100, 1000, 1e+06, 1e+09)
names(multiplier) <- valid.letter
data$PROPDMGEXP <- toupper(data$PROPDMGEXP)
valid <- data$PROPDMGEXP %in% valid.letter
data$PROPDMGEXP[!valid] <- ""
data$PROP[valid] <- data$PROPDMG[valid] * multiplier[data$PROPDMGEXP[valid]]

#Crop Damage = CROP
data$CROPDMGEXP <- toupper(data$CROPDMGEXP)
valid2 <- data$CROPDMGEXP %in% valid.letter
data$CROPDMGEXP[!valid2] <- ""
data$CROP[valid2] <- data$CROPDMG[valid2] * multiplier[data$CROPDMGEXP[valid2]]

Results

Which Weather Events Cause the Most Fatalities and Injuries?

F <- data %>% group_by(EVTYPE) %>%
        summarize(total.fatality = sum(FATALITIES),
                  mean.fatality = mean(FATALITIES),
                  total.injury = sum(INJURIES),
                  mean.injury = mean(INJURIES),
                  total = n(),na.rm=TRUE) %>%
        filter(mean.fatality > .1|total.fatality>500)%>%
        arrange(desc(total.fatality))


ggplot(F, aes(x = reorder(EVTYPE,total.fatality), y = total))+
        geom_point(aes(size = F$total.injury),color="red",alpha=.35)+
        geom_point(aes(color=F$mean.fatality, size = F$total.fatality),alpha=.85)+
        labs(x="Weather Event",y="Total Events",title="Fatal Weather Events Since 1950",size="Total Fatalities", color="Mean Fatality Rate")+
        scale_size_continuous(range=c(1,15), limit =c(10,2000),na.value=16)+
        theme(axis.text.x = element_text(face="bold", angle = 60, hjust = 1))

This scatterplot displays the weather events with a mean fatality rate above 10% or a total fatality count of 500 or more. The blue represents the number of fatalities, and the red represents the number of injuries.

Which Weather Events Cause the Most Property and Crop Damage?

D <- data %>% group_by(EVTYPE)%>%
        summarize(total = n(),
                  total.crop.damage = sum(CROP,
                  na.rm=TRUE),
                  mean.crop.damage = mean(CROP,
                  na.rm=TRUE),
                  total.property.damage = sum(PROP,
                  na.rm=TRUE),
                  mean.property.damage = mean(PROP,
                  na.rm=TRUE)) %>%
        filter(total.crop.damage>10000000 & total.property.damage>10000000) %>%
        arrange(desc(total.property.damage))

ggplot(D, aes(x = reorder(EVTYPE,total.property.damage), y = total))+
        geom_point(aes(size = D$total.crop.damage),color="dark green",alpha=.35)+
        geom_point(aes(color=D$mean.property.damage, size = D$total.property.damage),alpha=.45)+
        labs(x="Weather Event",y="Total Events",title="Crop/Property Damage Caused by Weather Events Since 1950",size="Total Damage", color="Mean Damage per Event")+
        scale_size_continuous(range=c(1,20), limit =c(100000,6600000000),na.value=21)+
        theme(axis.text.x = element_text(face="bold", angle = 60, hjust = 1))

This plot shows the weather events that have caused more than $10 million in both crop and property damage since 1950. The green points represent crop damage and the blue points represent property damage.