by Alex Tuzhikov, August 15 2015

Quick preparation:

Loading libraries:

if(!require("data.table")){
  install.packages("data.table")
  library("data.table")
}
## Loading required package: data.table
if(!require("plyr")){
  install.packages("plyr")
  library("plyr")
}
## Loading required package: plyr
if(!require("dplyr")){
  install.packages("dplyr")
  library("dplyr")
}
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following objects are masked from 'package:data.table':
## 
##     between, last
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
if(!require("tidyr")){
  install.packages("tidyr")
  library("tidyr")
}
## Loading required package: tidyr
if(!require("knitr")){
  install.packages("knitr")
  library("knitr")
}
## Loading required package: knitr
if(!require("ggplot2")){
  install.packages("ggplot2")
  library("ggplot2")
}
## Loading required package: ggplot2
if(!require("lubridate")){
  install.packages("lubridate")
  library("lubridate")
}
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## 
## The following object is masked from 'package:plyr':
## 
##     here
## 
## The following objects are masked from 'package:data.table':
## 
##     hour, mday, month, quarter, wday, week, yday, year
if(!require("R.utils")){
  install.packages("R.utils")
  library("R.utils")
}
## Loading required package: R.utils
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.7.0 (2015-02-19) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.19.0 (2015-02-27) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## 
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## 
## The following objects are masked from 'package:base':
## 
##     attach, detach, gc, load, save
## 
## R.utils v2.1.0 (2015-05-27) successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## The following object is masked from 'package:utils':
## 
##     timestamp
## 
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, parse, warnings
if(!require("rgeos")){
  install.packages("rgeos")
  library("rgeos")
}
## Loading required package: rgeos
## rgeos version: 0.3-11, (SVN revision 479)
##  GEOS runtime version: 3.4.2-CAPI-1.8.2 r3921 
##  Linking to sp version: 1.1-0 
##  Polygon checking: TRUE
if(!require("rgdal")){
  install.packages("rgdal")
  library("rgdal")
}
## Loading required package: rgdal
## Loading required package: sp
## rgdal: version: 1.0-4, (SVN revision 548)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 1.11.2, released 2015/02/10
##  Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/gdal
##  Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
##  Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/proj
##  Linking to sp version: 1.1-1 
## 
## Attaching package: 'rgdal'
## 
## The following object is masked from 'package:R.oo':
## 
##     getDescription
if(!require("readr")){
  install.packages("readr")
  library("readr")
}
## Loading required package: readr
if(!require("maptools")){
  install.packages("maptools")
  library("maptools")
}
## Loading required package: maptools
## Checking rgeos availability: TRUE
if(!require("mapproj")){
  install.packages("mapproj")
  library("mapproj")
}
## Loading required package: mapproj
## Loading required package: maps
## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:plyr':
## 
##     ozone
if(!require("gridExtra")){
  install.packages("gridExtra")
  library("gridExtra")
}
## Loading required package: gridExtra
if(!require("scales")){
  install.packages("scales")
  library("scales")
}
## Loading required package: scales
## 
## Attaching package: 'scales'
## 
## The following objects are masked from 'package:readr':
## 
##     col_factor, col_numeric

Setting options:

opts_chunk$set(echo = TRUE,fig.width = 7, fig.height = 7)

Synopsis

Weather conditions are an important factor, which must be accounted for in any particular region nationwide. Severe weather, such as tornados, floods, draughts and lightning cause injuries or death among the populatin and might have a strong impact on the economic well-being of the region by causing damage to property, crop and infrastructure of the region. Therefore, it is imperative, that statistics over previous severe weather conditions are properly conduted and studied in order to raise awareness, minimize the malignant consequences and adjust budgets to ceratian expectaions of disasters in any particular region. Here we study the data on more than 60 years of natural disaster observations across the United States in order to show that tornados and floods may be considered as a leading cause of damage to public health and property, and vary across the United States.

Data processing

Obtaining the data

#check if the file has alerady been downloaded
storm.data<-"StormData"
storm.data.csv<-paste0(storm.data,".csv")
storm.data.bz<-paste0(storm.data.csv,".bz2")

if(!file.exists(storm.data.bz)){
  #download the file
  download.file(url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", 
                destfile=storm.data.bz, method = "curl")
}
#extract the data and select only the columns that may be valuable for this study
noaa.data<- read.csv(file=storm.data.bz, header=TRUE, stringsAsFactors = FALSE) %>% 
  select(STATE,COUNTYNAME,BGN_DATE,EVTYPE,MAG,FATALITIES,INJURIES,PROPDMG,PROPDMGEXP,CROPDMG,CROPDMGEXP)

Cleaning the data

Apparently, some of the data need to get tidy. See below:

head(noaa.data)
##   STATE COUNTYNAME           BGN_DATE  EVTYPE MAG FATALITIES INJURIES
## 1    AL     MOBILE  4/18/1950 0:00:00 TORNADO   0          0       15
## 2    AL    BALDWIN  4/18/1950 0:00:00 TORNADO   0          0        0
## 3    AL    FAYETTE  2/20/1951 0:00:00 TORNADO   0          0        2
## 4    AL    MADISON   6/8/1951 0:00:00 TORNADO   0          0        2
## 5    AL    CULLMAN 11/15/1951 0:00:00 TORNADO   0          0        2
## 6    AL LAUDERDALE 11/15/1951 0:00:00 TORNADO   0          0        6
##   PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1    25.0          K       0           
## 2     2.5          K       0           
## 3    25.0          K       0           
## 4     2.5          K       0           
## 5     2.5          K       0           
## 6     2.5          K       0

Here BGN_DATE should probably be of a more appropriate class, that represents a date.

#simply convert the dates and get rid of the old BGN_DATE column
noaa.data <- noaa.data %>% mutate(EVENT_DATE=mdy_hms(BGN_DATE)) %>% select(-c(BGN_DATE))

Now, there is still an issue with the PROPDMG value (property damage). Currently,

unique(noaa.data$PROPDMGEXP)
##  [1] "K" "M" ""  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
## [18] "1" "8"

and we can not use these values “as is”, so some conversion is due here as well. I will assign a numeric value, which corresponds to some power of ten to each of the values:

#            "K" "M" ""  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-" "1" "8"
power.val<-c( 3,  6,  0,  9,  6,  0,  0,  5,  6,  0,  4,  2,  3,  0,  7,  2,  0,  1,  8)
val.map<-mapvalues(x = noaa.data$PROPDMGEXP,
          from=c("K","M","", "B","m","+","0","5","6","?","4","2","3","h","7","H","-","1","8"),
          to=sapply(power.val,function(i){return(10^i)}))
rm(power.val)
noaa.data<- noaa.data %>% mutate(PROPDMG=PROPDMG*as.numeric(val.map)) %>% select(-c(PROPDMGEXP))
rm(val.map)

Now, the same thing occurs with the CROPDMG and CROPDMGEXP:

unique(noaa.data$CROPDMGEXP)
## [1] ""  "M" "K" "m" "B" "?" "0" "k" "2"
#           ""  "M" "K" "m" "B" "?" "0" "k" "2"
power.val<-c(0,  6,  3,  6,  9,  1,  1,  3,  2)
val.map<-mapvalues(x = noaa.data$CROPDMGEXP,
          from=c("","M","K","m","B","?","0","k","2"),
          to=sapply(power.val,function(i){return(10^i)}))
rm(power.val)
noaa.data<- noaa.data %>% mutate(CROPDMG=CROPDMG*as.numeric(val.map)) %>% select(-c(CROPDMGEXP))
rm(val.map)

Results

Now that the data may be considered clean, let’s try to answer the first question:
Across the United States, which types of events are most harmful with respect to population health?
To do so, let’s do a quick summary:

noaa.data.summary<- noaa.data %>% group_by(EVTYPE) %>% 
  summarize(total.fatalities=sum(FATALITIES), total.injuries=sum(INJURIES), 
            total.prop.dmg=sum(PROPDMG), total.crop.dmg=sum(CROPDMG)) %>%
  arrange(-(total.fatalities+total.injuries+total.prop.dmg+total.crop.dmg))
#quick summary
head(noaa.data.summary)
## Source: local data frame [6 x 5]
## 
##              EVTYPE total.fatalities total.injuries total.prop.dmg
## 1             FLOOD              470           6789   144657709807
## 2 HURRICANE/TYPHOON               64           1275    69305840000
## 3           TORNADO             5633          91346    56947380676
## 4       STORM SURGE               13             38    43323536000
## 5              HAIL               15           1361    15735267513
## 6       FLASH FLOOD              978           1777    16822673978
## Variables not shown: total.crop.dmg (dbl)

Apparently, there is a lot of weather conditions that cause damage health and property to a certain extent, but only some have a devastating effect. To comprehend the information it is best to take just the top part of the list. The distribution of total damage, caused by the disasters, sorted descending, shows that we can safely use the top 10 severe weather conditions as they have the most impact.

Let’s see which of the natural disasters affect health the most:

#let's assume total damage to the health of the popultaion to be a the total fatalities and injuries combined
noaa.data.summary.health<- noaa.data.summary %>% mutate(total.health.dmg=total.fatalities+total.injuries) %>%
  arrange(-total.health.dmg) %>% do({
    .$EVTYPE<-factor(.$EVTYPE, levels=.$EVTYPE)
    return(.[1:10,])
  })
#plot
gp.health<-ggplot(data=noaa.data.summary.health, mapping=aes(x=EVTYPE, y=total.health.dmg))+geom_bar(stat="identity",fill="red") +
  theme(axis.text.x=element_text(angle=90),legend.position="none",
        strip.background=element_blank(),
        plot.background=element_rect(fill = "white", colour = "white"),
         panel.background = element_blank())+
  ylab("Total Health Damage Cases")+xlab("Disaster Type")
#map the events
#get summary by state
noaa.data.health.by.state<- noaa.data %>% group_by(STATE, EVTYPE) %>% 
  summarize(total.health.dmg=sum(FATALITIES+INJURIES)) %>% filter(EVTYPE %in% noaa.data.summary.health$EVTYPE)
#select only the top ones
evtype<-c("TORNADO", "EXCESSIVE HEAT", "FLOOD", "LIGHTNING")
noaa.data.health.by.state.plot<- noaa.data.health.by.state %>% 
  filter(EVTYPE %in% evtype) %>% 
  mutate(EVTYPE=factor(EVTYPE, levels = evtype))

#prepare the hexagons
us.states.geo<-"us_states_hexgrid.geojson"
if(!file.exists(us.states.geo)){
  download.file("https://gist.githubusercontent.com/hrbrmstr/51f961198f65509ad863/raw/219173f69979f663aa9192fbe3e115ebd357ca9f/us_states_hexgrid.geojson", 
              us.states.geo, method = "curl")
}
state.hex<- readOGR(us.states.geo, "OGRGeoJSON")
## OGR data source with driver: GeoJSON 
## Source: "us_states_hexgrid.geojson", layer: "OGRGeoJSON"
## with 51 features
## It has 6 fields
centers <- cbind.data.frame(data.frame(gCentroid(state.hex, byid=TRUE), id=state.hex@data$iso3166_2))
#map to the US map
us.map <- fortify(state.hex, region="iso3166_2")
noaa.data.health.by.state.plot <- noaa.data.health.by.state.plot %>% filter(STATE %in% unique(us.map$id))
#plot
us.map.health.gp<-ggplot()+
  geom_map(data=us.map, map=us.map, mapping=aes(x=long, y=lat, map_id=id), color="white", size=0.5)+
  #now the actual layer
  geom_map(data=noaa.data.health.by.state.plot, map=us.map,
                    aes(fill=total.health.dmg, map_id=STATE))+
  geom_map(data=noaa.data.health.by.state.plot, map=us.map, aes(map_id=STATE), fill="white", alpha=0, color="white", show_guide=FALSE)+
  geom_text(data=centers, aes(label=id, x=x, y=y), color="white", size=3)+
  scale_fill_distiller(name="Damages to Health\nand\nCasualties", palette="Reds", 
                       na.value="white", labels=sprintf("%d%%", seq(0,100,25)))+
  coord_map()+facet_wrap(~EVTYPE,nrow = 2)+
  theme(panel.border=element_blank(),
        plot.title=element_text(face="bold", size=24),
        panel.grid=element_blank(),
        axis.ticks=element_blank(),
        axis.text=element_blank(),
        strip.background=element_blank(),
        plot.background=element_rect(fill = "white", colour = "white"),
        strip.text=element_text(face="bold", hjust=0, size=14),
        strip.background=element_rect(colour="white"),
        legend.position="bottom",
        panel.background = element_blank(),
        legend.title.align=1)+
  xlab("")+ylab("")

grid.arrange(gp.health, us.map.health.gp, ncol=2,widths=c(1,3))

So, apparently, tornados keep harassing the mid-west, while it can get really hot in Montana (surprising actually) and Texas is being flooded badly sometimes, while Florida has lightning issues. Unfortunately, the most devastating disaster, - tornado, is wastly spread across the central states (according to the map) and have the greatest impact on the national health overall.

That being said, let’s see for the second question:
Across the United States, which types of events have the greatest economic consequences?

To answer this question lets look at the damages to the property and crops.

#first the summary to see which variables are we looking for
noaa.data.summary.economy<- noaa.data.summary %>% mutate(total.econ.dmg=total.prop.dmg+total.crop.dmg) %>%
  arrange(-total.econ.dmg) %>% do({
    .$EVTYPE<-factor(.$EVTYPE, levels=.$EVTYPE)
    return(.[1:10,])
  })
#most damagind factors
evtype<-c("FLOOD", "HURRICANE/TYPHOON", "TORNADO", "STORM SURGE")
#get summary by state
noaa.data.economy.by.state<- noaa.data %>% group_by(STATE, EVTYPE) %>% 
  summarize(total.econ.dmg=sum(PROPDMG+CROPDMG)) %>% filter(EVTYPE %in% noaa.data.summary.economy$EVTYPE)
noaa.data.economy.by.state.plot<- noaa.data.economy.by.state %>% 
  filter(EVTYPE %in% evtype) %>% 
  mutate(EVTYPE=factor(EVTYPE, levels = evtype))
#plot the damage level
gp.economy<-ggplot(data=noaa.data.summary.economy, mapping=aes(x=EVTYPE, y=total.econ.dmg))+
  geom_bar(stat="identity",fill="chartreuse4") +
  theme(axis.text.x=element_text(angle=90),legend.position="none",
        strip.background=element_blank(),
        plot.background=element_rect(fill = "white", colour = "white"),
         panel.background = element_blank())+
  ylab("Total Economy Damage Cases")+xlab("Disaster Type")
#plot maps
us.map.economy.gp<-ggplot()+
  geom_map(data=us.map, map=us.map, mapping=aes(x=long, y=lat, map_id=id), color="white", size=0.5)+
  #now the actual layer
  geom_map(data=noaa.data.economy.by.state.plot, map=us.map,
                    aes(fill=total.econ.dmg, map_id=STATE))+
  geom_map(data=noaa.data.economy.by.state.plot, map=us.map, aes(map_id=STATE), fill="white", alpha=0, color="white", show_guide=FALSE)+
  geom_text(data=centers, aes(label=id, x=x, y=y), color="white", size=3)+
  scale_fill_distiller(name="Damages to Property\nand\nEconomy", palette="Greens", 
                       na.value="white", labels=sprintf("%d%%", seq(0,100,25)))+
  coord_map()+facet_wrap(~EVTYPE,nrow = 2)+
  theme(panel.border=element_blank(),
        plot.title=element_text(face="bold", size=24),
        panel.grid=element_blank(),
        axis.ticks=element_blank(),
        axis.text=element_blank(),
        strip.background=element_blank(),
        plot.background=element_rect(fill = "white", colour = "white"),
        strip.text=element_text(face="bold", hjust=0, size=14),
        strip.background=element_rect(colour="white"),
        legend.position="bottom",
        panel.background = element_blank(),
        legend.title.align=1)+
  xlab("")+ylab("")

grid.arrange(gp.economy, us.map.economy.gp, ncol=2,widths=c(1,3))

According to the plot above, the most damaging to the US economy are floods and typhoons, that mainly occur in California and Florida respectively.

As we have space for one more allowed plot, let’s see if the weather is getting more volotile over time in the it’s most devastaing manifestations.

#get only the worst disaster's data
evtype<-c("TORNADO", "EXCESSIVE HEAT", "FLOOD", "LIGHTNING", "HURRICANE/TYPHOON", "STORM SURGE")
#let's bring the rather uncomparable values to one scale - [0,1]
noaa.data.time.summary<- noaa.data %>% group_by(EVENT_DATE) %>% 
            filter(EVTYPE %in% evtype) %>%
  mutate(FATALITIES= rescale(FATALITIES,from =c(0,max(FATALITIES)),to = c(0,1)),
         INJURIES= rescale(INJURIES,from =c(0,max(INJURIES)),to = c(0,1)),
         PROPDMG= rescale(PROPDMG,from =c(0,max(PROPDMG)),to = c(0,1)),
         CROPDMG= rescale(CROPDMG,from =c(0,max(CROPDMG)),to = c(0,1))) %>%
  group_by(EVENT_DATE,EVTYPE) %>%
  summarize(mean.fatalities=mean(FATALITIES), mean.injuries=mean(INJURIES), 
            mean.propdmg=mean(PROPDMG), mean.cropdmg=mean(CROPDMG), magnitude=mean(MAG))
#plot
damage.gp<-ggplot(data=noaa.data.time.summary, mapping=aes(x=EVENT_DATE, y=mean.fatalities+mean.injuries+mean.propdmg+mean.cropdmg, group=EVTYPE, color=EVTYPE))+
  geom_smooth(method="loess", fill="steelblue", alpha=0.3)+ylab("Total Damage")+xlab("Date")+
  theme(plot.background=element_rect(fill = "white", colour = "white"),
        strip.text=element_text(face="bold", hjust=0, size=14),
        strip.background=element_rect(colour="white"),
        panel.background = element_blank(),
        legend.position="none")

magnitute.gp<-ggplot(data=noaa.data.time.summary, mapping=aes(x=EVENT_DATE, y=magnitude, group=EVTYPE, color=EVTYPE))+
  geom_smooth(method="loess", fill="steelblue", alpha=0.3)+ylab("Magnitude")+xlab("Date")+
  theme(plot.background=element_rect(fill = "white", colour = "white"),
        strip.text=element_text(face="bold", hjust=0, size=14),
        strip.background=element_rect(colour="white"),
        panel.background = element_blank(),
        legend.position="bottom",
        legend.key=element_rect(colour = "white"))+
  labs(colour="Disaster")

grid.arrange(damage.gp, magnitute.gp, nrow=2)

Surprisingly, even though the magnitude for some (namely STORM SURGE) dropped over the years on average, the damage it causes does not seem to have been affected and even increased. I sepculate that this might be a result of lowered vigilance while the infrastructure of the regions has been developing rapidly.