Synopsis

We use data from the NOAA storm database to address the following questions concerning the effect of severe weather events in the U.S. between 1950 and 2011:

  1. Across the United States, which types of events are most harmful with respect to population health?
  2. Across the United States, which types of events have the greatest economic consequences?

We measure effects on public health by the number of fatalities and injuries caused by the weather events and economic losses by the property and crop damgages. Analysis of the data shows that tornado and flood are the most harmful events for public health and that flood and hurricane the most harmful to the economics.

Data Processing

The raw data is of the form of a comma-separated-value file compressed via the bzip2 algorithm. You may want to refer some documentation of the database available:

the National Weather Service Storm Data Documentation

National Climatic Data Center Storm Events FAQ

Load the libraries

Load the necessary libraries to perform data transformation and plotting

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(reshape2)
## Loading required package: reshape2
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.4
require(gridExtra)
## Loading required package: gridExtra

Load the data

stormdata<-read.csv('repdata%2Fdata%2FStormData.csv.bz2',header=T,stringsAsFactors = F)
str(stormdata)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Clean the data

We first convert the names of the variables into its lower case and then subset the variables relevant to our analysis, namely

  1. ‘evtype’, the type of the weather
  2. ‘fatalities’, the number of fatalities
  3. ‘injuries’, the number of injuries
  4. ‘propdmg’, the amount of property damages
  5. ‘propdmgexp’, the unit of the value of property damages
  6. ‘cropdmg’, the amount of crop damages
  7. ‘cropdmgexp’, the unit of the value of crop damages
names(stormdata)<-tolower(names(stormdata))
data<-select(stormdata,c(evtype,fatalities,injuries,propdmg,propdmgexp,cropdmg,cropdmgexp))

#We drop the cases when the numeric variables are all zeros. 
data<-filter(data,!(fatalities==0&injuries==0&propdmg==0&cropdmg==0))

#We convert the selected character variables, i.e., 'evtype', 'propdmgexp', and 'cropdmgexp' into lower cases: 
data<-transform(data,evtype=tolower(evtype),propdmgexp=tolower(propdmgexp),cropdmgexp=tolower(cropdmgexp))

head(unique(data$evtype), 10)
##  [1] "tornado"                   "tstm wind"                
##  [3] "hail"                      "ice storm/flash flood"    
##  [5] "winter storm"              "hurricane opal/high winds"
##  [7] "thunderstorm winds"        "hurricane erin"           
##  [9] "hurricane opal"            "heavy rain"
#Next we correct misspelling names and classify correlated events
data$evtype<-gsub('s$|s/','',data$evtype)
data$evtype<-gsub('/$|-$','',data$evtype)
data$evtype<-gsub('^/','',data$evtype)
data$evtype<-gsub('\\?$','other',data$evtype)
data$evtype<-gsub('ing$','',data$evtype)
data$evtype<-gsub('\\\\$','/',data$evtype)
data$evtype<-gsub('^record','',data$evtype)
data$evtype<-gsub('^\\s+','',data$evtype)
data$evtype<-gsub('tstm','thunderstorm',data$evtype)
data$evtype<-gsub('thunderstorm wi$|thunderstorm win$|thunderestorm wind|thundeerstorm wind|thunerstorm wind|tunderstorm wind|thuderstorm wind|thunderstrom wind|thundertorm wind','thunderstorm wind',data$evtype)
data$evtype<-gsub('hvy','heavy',data$evtype)

data$evtype[grepl('^tornado|whirlwind|cold air tornado',data$evtype)]<-'tornado'
data$evtype[grepl('unseasonably warm and dry|heat wave drought',data$evtype)]<-'drought'
data$evtype[grepl('hurricane|typhoon',data$evtype)]<-'hurricane'
data$evtype[grepl('^thunderstorm|^severe thunderstorm',data$evtype)]<-'thunderstorm wind'
data$evtype[grepl('lightning thunderstorm wind|lightning and thunderstorm wind',data$evtype)]<-'lightning and thunderstorm'
data$evtype[grepl('non-thunderstorm wind|non thunderstorm wind|wind damage|non-severe wind damage|^wind and',data$evtype)]<-'wind'
data$evtype[grepl('^high wind|high wind damage|wind storm',data$evtype)]<-'high wind'
data$evtype[grepl('^hail|small hail',data$evtype)]<-'hail'
data$evtype[grepl('tropical storm',data$evtype)]<-'tropical storm'
data$evtype[grepl('winter weather mix|winter weather/mix|wintry mix|ice road|icy road|ice on road|snow and ice|ice and snow|snow/ice| snow/ ice/falling snow/ice',data$evtype)]<-'winter weather'
data$evtype[grepl('breakup flood|river flood|fld|urban flood|urban/small stream flood|minor flood|rural flood|major flood|ice jam flood|small stream flood|lake flood|urban and small stream floodin|river and stream flood|flood/river flood|heavy rain and flood|heavy rainflood|snowmelt flood|ice jam flood \\(minor|tidal flood',data$evtype)]<-'flood'
data$evtype[grepl('erosion/cstl flood|coastal flooding/erosion|coastal  flooding/erosion',data$evtype)]<-'coastal flood'
data$evtype[grepl('^flash flood|flood flash|flashflood|flash flood|flash/flood',data$evtype)]<-'flash flood'
data$evtype[grepl('mudslide|mud slide|landslide|rock slide',data$evtype)]<-'landslide'
data$evtype[grepl('gusty wind|gustnado|gusty wind/rain',data$evtype)]<-'gustnado'
data$evtype[grepl('high surf|heavy surf|high surf advisory|hazardous surf| heavy surf/high surf',data$evtype)]<-'high surf'
data$evtype[grepl('freeze|damaging freeze|frost\\\\freeze|hard freeze| agricultural freeze',data$evtype)]<-'frost/freeze'
data$evtype[grepl('coastalstorm',data$evtype)]<-'coastal storm'
data$evtype[grepl('hyperthermia/exposure|extreme heat',data$evtype)]<-'excessive heat'
data$evtype[grepl('heat wave|heat',data$evtype)]<-'heat'
data$evtype[grepl('cold|chill|low temperature|hypothermia',data$evtype)]<-'cold'
data$evtype[grepl('lightn|lightning',data$evtype)]<-'lightning'
data$evtype[grepl('wetnes',data$evtype)]<-'wetness'

Note that the ‘propdmgexp’ and ‘cropdmpexp’ contain non-numeric values such as ‘k’ for thousand, and ‘b’ for billions. We will map these values to numbers and calculate the absolute property damages and crop damages.

unique(data$propdmgexp)
##  [1] "k" "m" ""  "b" "+" "0" "5" "6" "4" "h" "2" "7" "3" "-"
unique(data$cropdmgexp)
## [1] ""  "m" "k" "b" "?" "0"
data$propdmgexp[!data$propdmgexp%in%c('h','k','m','b')]<-NA
data$propdmgexp[data$propdmgexp=='h']<-10^2
data$propdmgexp[data$propdmgexp=='k']<-10^3
data$propdmgexp[data$propdmgexp=='m']<-10^6
data$propdmgexp[data$propdmgexp=='b']<-10^9
data$propdmg<-data$propdmg*as.numeric(data$propdmgexp)
data$cropdmgexp[!data$cropdmgexp%in%c('k','m','b')]<-NA
data$cropdmgexp[data$cropdmgexp=='k']<-10^3
data$cropdmgexp[data$cropdmgexp=='m']<-10^6
data$cropdmgexp[data$cropdmgexp=='b']<-10^9
data$cropdmg<-data$cropdmg*as.numeric(data$cropdmgexp)

We create new variables called ‘pubhealth’ (the public health, measured by the sum of the fatalities and injuries) and ‘ecodmg’ (the ecomonic losses, estimated by the sum of damages in properties and crops) to address the two questions given in this project.

data<-mutate(data[complete.cases(data),],pubhealth=fatalities+injuries,ecodmg=propdmg+cropdmg)
data<-select(data,-c(propdmgexp,cropdmgexp))
#write.csv(data,file='cleanstormdata.csv',row.names=F)

Results

We aggregate and arrange the data according to different weather events. Then we merge the data of the fatalities and injuries and transform it to long from wide by melting the data. Similar process for the data of property and crop damages.

data<-transform(data,evtype=factor(evtype))
data_agg<-aggregate(.~evtype,data,sum)

fatals<-arrange(data_agg[,1:2],desc(fatalities))[1:5,]
injurs<-arrange(data_agg[,c(1,3)],desc(injuries))[1:5,]

pubhealth.m<-merge(fatals,injurs,by='evtype',all=T)
pubhealth.m<-melt(pubhealth.m,id.vars='evtype',measure.vars=c('fatalities','injuries'),variable.name='damagetype',value.name='number') 

proploss<-arrange(data_agg[,c(1,4)],desc(propdmg))[1:5,]
croploss<-arrange(data_agg[,c(1,5)],desc(cropdmg))[1:5,]
ecodmg.m<-merge(proploss,croploss,by='evtype',all=T)
ecodmg.m<-melt(ecodmg.m,id.vars='evtype',measure.vars=c('propdmg','cropdmg'),variable.name='damagetype',value.name='amount')

Below we draw the figures for the fatalities, injuries, property damgages and crop damages to have a preliminary insight on the impact of the severe weathers. The figure shows that the tornado is the most dangerous to cause fatalities and injuries while the flood is the most harmful to property and crops.

p1<-ggplot(pubhealth.m, aes(evtype, number)) +   
        geom_bar(aes(fill = damagetype), position = "dodge", stat="identity") +
        theme(axis.text.x = element_text(angle = 45,size=10, hjust = 1)) + 
        xlab("weather event") + ylab("public health") + 
        ggtitle("Top events to injuries & fatalities")


p2<-ggplot(ecodmg.m, aes(evtype, amount/10^9)) +   
        geom_bar(aes(fill = damagetype), position = "dodge", stat="identity") +
        theme(axis.text.x = element_text(angle = 45,size=10, hjust = 1,vjust=1))+ 
        xlab("weather event") + ylab("economic losses(billion dollar)") + 
        ggtitle("Top events to property & crop")
grid.arrange(p1,p2,ncol=2)

We plot a figure summarizing the top weather events affecting public health (estimated by the number of fatalities and the number of injuries) and economics (measured by the damages in properties and crops) in the following.

pubhealths<-arrange(data_agg[,c(1,6)],desc(pubhealth))[1:10,]%>%transform(evtype=as.character(evtype))
ecodmgs<-arrange(data_agg[,c(1,7)],desc(ecodmg))[1:10,]%>%transform(evtype=as.character(evtype))

par(mfrow=c(1,2))
pubhealths$evtype[6:length(pubhealths$evtype)]<-'others'
pubhealthsplot <- aggregate(pubhealth~evtype,pubhealths, sum)
with(pubhealthsplot,pie(pubhealth, evtype, radius=0.8, col=rainbow(10),main = "Events affecting public health",clockwise=T))

ecodmgs$evtype[6:length(ecodmgs$evtype)]<-'others'
ecodmgsplot <- aggregate(ecodmg~evtype,ecodmgs, sum)
with(ecodmgsplot,pie(ecodmg, evtype, radius=0.8, col=rainbow(10),main = "Events affecting economics",clockwise=T))

The figure above reveals that the most important cause of fatalities and injuries is tornado, followed by flood and the most harmful event of economics is flood, followed by hurricane.