Synopsis

This report addresses an impact of severe weather on public health and economy of the United States of America. The data for this analysis originates from National Oceanic and Atmospheric Administration’s (NOAA) database. Author assumes correlation between casualties from bad weather and geographical location of the impacted area. Firstly raw data is analyzed and cleaned. Secondly “state”dataset from R datasets package is used to link event locations to geographical divisions of the United States.

Geographical division of the US

Severe weather can result in many deaths and injuries and can cause significant damage to the economy of the country. Weather patterns are closely connected with country’s geography. Therefore we consider weather impact on geographical state divisions of the United States. There are 9 Regional divisions used by the United States Census Bureau:

  1. Region 1: Northeast
  1. Region 2: Midwest (Prior to June 1984, the Midwest Region was designated as the North Central Region.)
  1. Region 3: South
  1. Region 4: West

To have visual illustration we use Census geographical map of the United States.

Data Processing

Loading required libraries

R package management tool “pacman” to load libraries

# R package management tool
if (!require("pacman")) {
  install.packages("pacman", repos = "http://cran.us.r-project.org")
}
## Loading required package: pacman
library(pacman)
# Load required libraries
p_load(plyr,dplyr,lattice,ggplot2,grid,R.utils,datasets,knitr)

Downloading, unzipping and reading data

Download data

# cache = TRUE
fileurl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
f <- file.path(getwd(),  "StormData.csv.bz2")
download.file(fileurl, f, method = "curl")

Unzip data

# Unzip bz2 file
bunzip2(f, "StormData.csv", remove = FALSE, skip = TRUE)
## [1] "StormData.csv"
## attr(,"temporary")
## [1] FALSE

Read data

# Read csv file
path <- file.path(getwd()) # path to the dataset
S <- tbl_df(read.csv(file.path(path, "StormData.csv"),header=TRUE, stringsAsFactors=FALSE))

Take a peek at dataset

dim(S)
## [1] 902297     37
str(S)

Cleaning up the dataset

The column “EVTYPE” is not clean. We will replace all occurences of events with ones permitted in Storm Data Event Table (See Storm Data Documentation).

S$EVTYPE <- toupper(S$EVTYPE) # Make all strings uppercase
R<- grep("LIGHTNING+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"LIGHTNING"
R<- grep("HEAVY SEAS+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"HIGH SEAS"
R<- grep("MARINE ACCIDENT", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"MARINE MISHAP"
R<- grep("MUD+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"LANDSLIDE"
R<- grep("HIGH WATER+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"STORM SURGE/TIDE"
R<- grep("FROST+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"FROST/FREEZE"
R<- grep("BRUSH+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"WILDFIRE"
R<- grep("TORRENTIAL+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"HEAVY RAIN"
R<- grep("HEAVY RAIN+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"HEAVY RAIN"
R<- grep("SMALL HAIL+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"HAIL"
R<- grep("FREEZING RAIN+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"SLEET"
R<- grep("FREEZING DRIZZLE+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"FROST/FREEZE"
R<- grep("ICE+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"FROST/FREEZE"
R<- grep("EXCESSIVE+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"HEAVY RAIN"
R<- grep("MIXED+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"SLEET"
R<- grep("ICY ROADS+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"FROST/FREEZE"
R<- grep("WINTR+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"WINTER WEATHER"
R<- grep("TORNADO+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"TORNADO"
R<- grep("BLACK ICE+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"FROST/FREEZE"
R<- grep("ROUGH+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"STORM SURGE/TIDE"
R<- grep("URBAN+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"FLOOD"
R<- grep("LOW+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"COLD"
R<- grep("TROPICAL+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"TROPICAL STORM"
R<- grep("WILD+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"WILDFIRE"
R<- grep("GLAZE+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"FROST/FREEZE"
R<- grep("FOG+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"DENSE FOG"
R<- grep("STORM SURGE+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"STORM SURGE/TIDE"
R<- grep("HYPOTHERMIA+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"COLD"
R<- grep("WINTER STORM+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"WINTER STORM"
R<- grep("THUN+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"TSTM WIND"
R <- grep("HURR+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"HURRICANE"
R <- grep("RIP+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"RIP CURRENT"
R <- grep("COLD+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"COLD"
R <- grep("HEAT+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"HEAT"
R <- grep("WIND+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"STRONG WIND"
R <- grep("FLOOD+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"FLOOD"
R <- grep("WINTER WEATHER+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"WINTER WEATHER"
R <- grep("HEAVY SURF+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"HIGH SURF"
R <- grep("SNOW+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"BLIZZARD"
R <- grep("UNSEASONABLY WARM+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"HEAT"
R <- grep("HAIL+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"HAIL"
R <- grep("LIGHT+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"LIGHTNING"
R <- grep("DUST DEVIL+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"DUST DEVIL"
R <- grep("FREEZE+", S$EVTYPE, ignore.case=TRUE)
S$EVTYPE[R]<-"FROST/FREEZE"
# Multiplot function
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

Impact of weather on the health across the US

Subsetting storm dataset to address fatalities and injuries

# select dates, state_id, state abbrev., fatalities, and event types
suppressWarnings(FAT <- select(S, BGN_DATE, STATE__, STATE, FATALITIES, EVTYPE))
# remove DC and maritime areas as their contribution is negligible
cond1 <- FAT$STATE__== 11 | FAT$STATE__ >=60
FAT <- FAT[!cond1,]
INJ <- select(S, BGN_DATE, STATE__, STATE, INJURIES, EVTYPE)
# remove DC and maritime areas as their contribution is negligible
cond1 <- INJ$STATE__== 11 | INJ$STATE__ >=60
INJ <- INJ[!cond1,]
# rename columns as date, state_id, state abbreviation, fatalities, events
names(FAT) <- c("DATE", "STATE.ID", "STATE.ABB", "FATALITIES", "EVENTS")
# rename columns as date, state_id, state, injuries, events
names(INJ)<- c("DATE", "STATE.ID", "STATE.ABB", "INJURIES", "EVENTS")
# further filtering of FAT and INJ datasets for cases with nonzero casualties
FAT<-filter(FAT,FATALITIES>0)
INJ <-filter(INJ,INJURIES>0)

We use “state”dataset from R datasets package. It contains information about 50 states of the United States of America. We use state.division factor giving state divisions (New England, Middle Atlantic, South Atlantic, East South Central, West South Central, East North Central, West North Central, Mountain, and Pacific).

state.data = cbind(data.frame(state.x77), state.abb, state.area, state.center,  state.division, state.name, state.region)
state.data <- select(state.data,  state.abb,  state.division,  state.name, state.region)
## Warning: failed to assign NativeSymbolInfo for env since env is already
## defined in the 'lazyeval' namespace
state2division <- select(state.data,  state.abb,  state.division,  state.name)
state2division <- arrange(state2division, by = state.division)
names(state.data) <- c("STATE.ABB", "STATE.DIVISION", "STATE.NAME", "STATE.REGION")
m_f=merge(FAT,state.data, by = "STATE.ABB")
m_i=merge(INJ,state.data, by = "STATE.ABB")
# aggregate total fatalities&injuries by the state division in descending order
Agg_F <- aggregate(FATALITIES ~ EVENTS+STATE.DIVISION, data = m_f, FUN = sum) 
Agg_I <- aggregate(INJURIES ~ EVENTS+STATE.DIVISION, data = m_i, FUN = sum) 
Agg_F <- arrange(Agg_F, desc(FATALITIES))
Agg_I <- arrange(Agg_I, desc(INJURIES))
# Agg_F <- filter(Agg_F, FATALITIES > 1)
# Agg_I <- filter(Agg_I, INJURIES > 1)
summary(Agg_F)
##     EVENTS                     STATE.DIVISION   FATALITIES     
##  Length:177         Pacific           :24     Min.   :   1.00  
##  Class :character   South Atlantic    :23     1st Qu.:   4.00  
##  Mode  :character   Mountain          :22     Median :  16.00  
##                     Middle Atlantic   :21     Mean   :  83.82  
##                     West South Central:20     3rd Qu.:  74.00  
##                     New England       :19     Max.   :1560.00  
##                     (Other)           :48
summary(Agg_I)
##     EVENTS                     STATE.DIVISION    INJURIES      
##  Length:185         Pacific           :29     Min.   :    1.0  
##  Class :character   South Atlantic    :23     1st Qu.:    8.0  
##  Mode  :character   Middle Atlantic   :22     Median :   68.0  
##                     West South Central:20     Mean   :  753.7  
##                     West North Central:19     3rd Qu.:  337.0  
##                     Mountain          :19     Max.   :21729.0  
##                     (Other)           :53
# Further filter of data based on its summary
Agg_F <- filter(Agg_F, FATALITIES > 7)
Agg_I <- filter(Agg_I, INJURIES > 10)
p1 <- ggplot(data = Agg_F, aes(fill = EVENTS, y = FATALITIES, x = EVENTS)) + geom_bar(position = "dodge", stat = "identity") + facet_wrap(~STATE.DIVISION,scales = "free_y") + theme(axis.text.x = element_text(angle=90, hjust=1)) + xlab("Types of Events") + ylab("Total Number of Fatalities") + ggtitle("Fatalities from the Severe Weather in the US Divisions from 1950 to 2011")
p2 <- ggplot(data = Agg_I, aes(fill = EVENTS, y = INJURIES, x = EVENTS)) + geom_bar(position = "dodge", stat = "identity") + facet_wrap(~STATE.DIVISION,scales = "free_y") + theme(axis.text.x = element_text(angle=90, hjust=1))+ xlab("Types of Events") + ylab("Total Number of Injuries") + ggtitle("Injuries from the Severe Weather in the US Divisions from 1950 to 2011") 
multiplot(p1,p2,rows=2)

## [1] 2

Impact of weather on the economy across the US

Subsetting storm dataset to address property and crop damage

# Subset dataset S for property and crop damage 
PROP <- select(S, BGN_DATE, STATE__, STATE, PROPDMG, PROPDMGEXP, EVTYPE)
# remove DC and maritime areas as their contribution is negligible
cond1 <- PROP$STATE__== 11 | PROP$STATE__ >=60
PROP <- PROP[!cond1,]
CROP <- select(S, BGN_DATE, STATE__, STATE, CROPDMG, CROPDMGEXP, EVTYPE)
# remove DC and maritime areas as their contribution is negligible
cond1 <- CROP$STATE__== 11 | CROP$STATE__ >=60
CROP <- CROP[!cond1,]
# analyze data columns realated to the property and crop damage exponents.
unique(CROP$CROPDMGEXP)
## [1] ""  "M" "K" "m" "B" "?" "0" "k" "2"
unique(PROP$PROPDMGEXP)
##  [1] "K" "M" ""  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
## [18] "1" "8"
# new variables cdmgexp and pdmgexp to handle  CROPDMGEXP and PROPDMGEXP
# new variables for total crop and property damage CROPDMGTOT and PROPDMGTOT
CROP$cdmgexp[CROP$CROPDMGEXP==""]  <- 1e0
CROP$cdmgexp[CROP$CROPDMGEXP=="M"] <- 1e6
CROP$cdmgexp[CROP$CROPDMGEXP=="K"] <- 1e3
CROP$cdmgexp[CROP$CROPDMGEXP=="m"] <- 1e6
CROP$cdmgexp[CROP$CROPDMGEXP=="B"] <- 1e9
CROP$cdmgexp[CROP$CROPDMGEXP=="?"] <- 1e0
CROP$cdmgexp[CROP$CROPDMGEXP=="0"] <- 1e0
CROP$cdmgexp[CROP$CROPDMGEXP=="k"] <- 1e3
CROP$cdmgexp[CROP$CROPDMGEXP=="2"] <- 1e2

PROP$pdmgexp[PROP$PROPDMGEXP=="H"] <- 1e2
PROP$pdmgexp[PROP$PROPDMGEXP=="K"] <- 1e3
PROP$pdmgexp[PROP$PROPDMGEXP=="M"] <- 1e6
PROP$pdmgexp[PROP$PROPDMGEXP==""]  <- 1e0
PROP$pdmgexp[PROP$PROPDMGEXP=="B"] <- 1e9
PROP$pdmgexp[PROP$PROPDMGEXP=="m"] <- 1e6
PROP$pdmgexp[PROP$PROPDMGEXP=="h"] <- 1e0
PROP$pdmgexp[PROP$PROPDMGEXP=="+"] <- 1e0
PROP$pdmgexp[PROP$PROPDMGEXP=="-"] <- 1e0
PROP$pdmgexp[PROP$PROPDMGEXP=="0"] <- 1e0
PROP$pdmgexp[PROP$PROPDMGEXP=="1"] <- 1e1
PROP$pdmgexp[PROP$PROPDMGEXP=="2"] <- 1e2
PROP$pdmgexp[PROP$PROPDMGEXP=="3"] <- 1e3
PROP$pdmgexp[PROP$PROPDMGEXP=="4"] <- 1e4
PROP$pdmgexp[PROP$PROPDMGEXP=="5"] <- 1e5
PROP$pdmgexp[PROP$PROPDMGEXP=="6"] <- 1e6
PROP$pdmgexp[PROP$PROPDMGEXP=="7"] <- 1e7
PROP$pdmgexp[PROP$PROPDMGEXP=="8"] <- 1e8
PROP$pdmgexp[PROP$PROPDMGEXP=="?"] <- 1e0
# Total crop and property damage
CROP$CROPDMGTOT <- as.numeric(CROP$cdmgexp) * CROP$CROPDMG
PROP$PROPDMGTOT <- as.numeric(PROP$pdmgexp) * PROP$PROPDMG
# removing unnecessary columns
CROP = subset(CROP, select = -c(BGN_DATE, STATE__, CROPDMG, CROPDMGEXP,cdmgexp))
PROP = subset(PROP, select = -c(BGN_DATE, STATE__, PROPDMG, PROPDMGEXP,pdmgexp))
# filter out zero values
CROP<-filter(CROP,CROPDMGTOT>0)
PROP <-filter(PROP,PROPDMGTOT>0)
# rename columns as state abbreviation, TOTAL.CROP.DMG, EVENTS
names(CROP) <- c("STATE.ABB", "EVENTS", "TOTAL.CROP.DMG")
# rename columns as state abbreviation, TOTAL PROPERTY DAMAGE, EVENTS
names(PROP)<- c("STATE.ABB", "EVENTS","TOTAL.PROP.DMG")

We use “state”dataset from R datasets package. It contains information about 50 states of the United States of America. We use state.region factor giving state regions (Northeast, North Central, South, and West).

state.data = cbind(data.frame(state.x77), state.abb, state.area, state.center,  state.division, state.name, state.region)
state.data <- select(state.data,  state.abb, state.region)
# state2region <- select(state.data,  state.abb,  state.region)
state.data <- arrange(state.data, by = state.region)
names(state.data) <- c("STATE.ABB", "STATE.REGION")
m_c=merge(CROP,state.data, by = "STATE.ABB")
m_p=merge(PROP,state.data, by = "STATE.ABB")
# aggregate total fatalities&injuries by the state division in descending order
Agg_P <- aggregate(TOTAL.PROP.DMG ~ EVENTS+STATE.REGION, data = m_p, FUN = sum) 
Agg_C <- aggregate(TOTAL.CROP.DMG ~ EVENTS+STATE.REGION, data = m_c, FUN = sum) 
Agg_P <- arrange(Agg_P, desc(TOTAL.PROP.DMG))
Agg_C <- arrange(Agg_C, desc(TOTAL.CROP.DMG))
summary(Agg_P)
##     EVENTS                 STATE.REGION TOTAL.PROP.DMG     
##  Length:140         Northeast    :30    Min.   :1.500e+02  
##  Class :character   South        :39    1st Qu.:1.000e+05  
##  Mode  :character   North Central:28    Median :3.611e+06  
##                     West         :43    Mean   :3.033e+09  
##                                         3rd Qu.:2.153e+08  
##                                         Max.   :1.205e+11
summary(Agg_C)
##     EVENTS                 STATE.REGION TOTAL.CROP.DMG     
##  Length:64          Northeast    :11    Min.   :1.550e+03  
##  Class :character   South        :16    1st Qu.:5.241e+06  
##  Mode  :character   North Central:17    Median :4.440e+07  
##                     West         :20    Mean   :7.559e+08  
##                                         3rd Qu.:5.280e+08  
##                                         Max.   :9.634e+09

Total Property and Crop Damage due to Severe Weather across US Regions from 1950 to 2011

# filtering data based on its summary
Agg_P <- filter(Agg_P, TOTAL.PROP.DMG > 2e9)
p1 <- ggplot(data = Agg_P, aes(fill = EVENTS, y = TOTAL.PROP.DMG, x = EVENTS)) + geom_bar(position = "dodge", stat = "identity") + facet_wrap(~STATE.REGION,scales = "free_y") + theme(axis.text.x = element_text(angle=90, hjust=1)) + xlab("Types of Events") + ylab("Total Property Damage ($USD)")
Agg_C <- filter(Agg_C, TOTAL.CROP.DMG > 1e8)
p2 <- ggplot(data = Agg_C, aes(fill = EVENTS, y = TOTAL.CROP.DMG, x = EVENTS)) + geom_bar(position = "dodge", stat = "identity") + facet_wrap(~STATE.REGION,scales = "free_y") + theme(axis.text.x = element_text(angle=90, hjust=1)) + xlab("Types of Events") + ylab("Total Crop Damage ($USD)")
multiplot(p1,p2,cols=2)

Results

As it was originally expected number of most harmful events from extreme weather is strongly correlated with the geographical location of the area. For example, avalanches are major cause of death in Mountain areas, while most injuries in the same area are coming from strong winds. Wildfire is the main source of injuries in the Pacific, where California is the largest state, while most of fatalities in the same region are due to heavy rains. Most devastating effect on human lives across most of the regions are coming from deadly tornadoes (first East South Central and second West South Central divisions).

Most of property damage is reported in the West due to heavy floods and from hurricanes in the South. Most of crop damage is originated from storm surge/tide in the South and from flood in the North Central region.

Conclusion

Deadliest weather events are tornadoes, which are widespread all across the country. Most damaging events for the economy are floods, followed by hurricanes and tornadoes.