We use data from the NOAA storm database to address the following questions concerning the effect of severe weather events in the U.S. between 1950 and 2011:
We measure effects on public health by the number of fatalities and injuries caused by the weather events and economic losses by the property and crop damgages. Analysis of the data shows that tornado and flood are the most harmful events for public health and that flood and hurricane the most harmful to the economics.
The raw data is of the form of a comma-separated-value file compressed via the bzip2 algorithm. You may want to refer some documentation of the database available:
the National Weather Service Storm Data Documentation
National Climatic Data Center Storm Events FAQ
Load the necessary libraries to perform data transformation and plotting
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(reshape2)
## Loading required package: reshape2
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.4
require(gridExtra)
## Loading required package: gridExtra
stormdata<-read.csv('repdata%2Fdata%2FStormData.csv.bz2',header=T,stringsAsFactors = F)
str(stormdata)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
We first convert the names of the variables into its lower case and then subset the variables relevant to our analysis, namely
names(stormdata)<-tolower(names(stormdata))
data<-select(stormdata,c(evtype,fatalities,injuries,propdmg,propdmgexp,cropdmg,cropdmgexp))
#We drop the cases when the numeric variables are all zeros.
data<-filter(data,!(fatalities==0&injuries==0&propdmg==0&cropdmg==0))
#We convert the selected character variables, i.e., 'evtype', 'propdmgexp', and 'cropdmgexp' into lower cases:
data<-transform(data,evtype=tolower(evtype),propdmgexp=tolower(propdmgexp),cropdmgexp=tolower(cropdmgexp))
head(unique(data$evtype), 10)
## [1] "tornado" "tstm wind"
## [3] "hail" "ice storm/flash flood"
## [5] "winter storm" "hurricane opal/high winds"
## [7] "thunderstorm winds" "hurricane erin"
## [9] "hurricane opal" "heavy rain"
#Next we correct misspelling names and classify correlated events
data$evtype<-gsub('s$|s/','',data$evtype)
data$evtype<-gsub('/$|-$','',data$evtype)
data$evtype<-gsub('^/','',data$evtype)
data$evtype<-gsub('\\?$','other',data$evtype)
data$evtype<-gsub('ing$','',data$evtype)
data$evtype<-gsub('\\\\$','/',data$evtype)
data$evtype<-gsub('^record','',data$evtype)
data$evtype<-gsub('^\\s+','',data$evtype)
data$evtype<-gsub('tstm','thunderstorm',data$evtype)
data$evtype<-gsub('thunderstorm wi$|thunderstorm win$|thunderestorm wind|thundeerstorm wind|thunerstorm wind|tunderstorm wind|thuderstorm wind|thunderstrom wind|thundertorm wind','thunderstorm wind',data$evtype)
data$evtype<-gsub('hvy','heavy',data$evtype)
data$evtype[grepl('^tornado|whirlwind|cold air tornado',data$evtype)]<-'tornado'
data$evtype[grepl('unseasonably warm and dry|heat wave drought',data$evtype)]<-'drought'
data$evtype[grepl('hurricane|typhoon',data$evtype)]<-'hurricane'
data$evtype[grepl('^thunderstorm|^severe thunderstorm',data$evtype)]<-'thunderstorm wind'
data$evtype[grepl('lightning thunderstorm wind|lightning and thunderstorm wind',data$evtype)]<-'lightning and thunderstorm'
data$evtype[grepl('non-thunderstorm wind|non thunderstorm wind|wind damage|non-severe wind damage|^wind and',data$evtype)]<-'wind'
data$evtype[grepl('^high wind|high wind damage|wind storm',data$evtype)]<-'high wind'
data$evtype[grepl('^hail|small hail',data$evtype)]<-'hail'
data$evtype[grepl('tropical storm',data$evtype)]<-'tropical storm'
data$evtype[grepl('winter weather mix|winter weather/mix|wintry mix|ice road|icy road|ice on road|snow and ice|ice and snow|snow/ice| snow/ ice/falling snow/ice',data$evtype)]<-'winter weather'
data$evtype[grepl('breakup flood|river flood|fld|urban flood|urban/small stream flood|minor flood|rural flood|major flood|ice jam flood|small stream flood|lake flood|urban and small stream floodin|river and stream flood|flood/river flood|heavy rain and flood|heavy rainflood|snowmelt flood|ice jam flood \\(minor|tidal flood',data$evtype)]<-'flood'
data$evtype[grepl('erosion/cstl flood|coastal flooding/erosion|coastal flooding/erosion',data$evtype)]<-'coastal flood'
data$evtype[grepl('^flash flood|flood flash|flashflood|flash flood|flash/flood',data$evtype)]<-'flash flood'
data$evtype[grepl('mudslide|mud slide|landslide|rock slide',data$evtype)]<-'landslide'
data$evtype[grepl('gusty wind|gustnado|gusty wind/rain',data$evtype)]<-'gustnado'
data$evtype[grepl('high surf|heavy surf|high surf advisory|hazardous surf| heavy surf/high surf',data$evtype)]<-'high surf'
data$evtype[grepl('freeze|damaging freeze|frost\\\\freeze|hard freeze| agricultural freeze',data$evtype)]<-'frost/freeze'
data$evtype[grepl('coastalstorm',data$evtype)]<-'coastal storm'
data$evtype[grepl('hyperthermia/exposure|extreme heat',data$evtype)]<-'excessive heat'
data$evtype[grepl('heat wave|heat',data$evtype)]<-'heat'
data$evtype[grepl('cold|chill|low temperature|hypothermia',data$evtype)]<-'cold'
data$evtype[grepl('lightn|lightning',data$evtype)]<-'lightning'
data$evtype[grepl('wetnes',data$evtype)]<-'wetness'
Note that the ‘propdmgexp’ and ‘cropdmpexp’ contain non-numeric values such as ‘k’ for thousand, and ‘b’ for billions. We will map these values to numbers and calculate the absolute property damages and crop damages.
unique(data$propdmgexp)
## [1] "k" "m" "" "b" "+" "0" "5" "6" "4" "h" "2" "7" "3" "-"
unique(data$cropdmgexp)
## [1] "" "m" "k" "b" "?" "0"
data$propdmgexp[!data$propdmgexp%in%c('h','k','m','b')]<-NA
data$propdmgexp[data$propdmgexp=='h']<-10^2
data$propdmgexp[data$propdmgexp=='k']<-10^3
data$propdmgexp[data$propdmgexp=='m']<-10^6
data$propdmgexp[data$propdmgexp=='b']<-10^9
data$propdmg<-data$propdmg*as.numeric(data$propdmgexp)
data$cropdmgexp[!data$cropdmgexp%in%c('k','m','b')]<-NA
data$cropdmgexp[data$cropdmgexp=='k']<-10^3
data$cropdmgexp[data$cropdmgexp=='m']<-10^6
data$cropdmgexp[data$cropdmgexp=='b']<-10^9
data$cropdmg<-data$cropdmg*as.numeric(data$cropdmgexp)
We create new variables called ‘pubhealth’ (the public health, measured by the sum of the fatalities and injuries) and ‘ecodmg’ (the ecomonic losses, estimated by the sum of damages in properties and crops) to address the two questions given in this project.
data<-mutate(data[complete.cases(data),],pubhealth=fatalities+injuries,ecodmg=propdmg+cropdmg)
data<-select(data,-c(propdmgexp,cropdmgexp))
#write.csv(data,file='cleanstormdata.csv',row.names=F)
We aggregate and arrange the data according to different weather events. Then we merge the data of the fatalities and injuries and transform it to long from wide by melting the data. Similar process for the data of property and crop damages.
data<-transform(data,evtype=factor(evtype))
data_agg<-aggregate(.~evtype,data,sum)
fatals<-arrange(data_agg[,1:2],desc(fatalities))[1:5,]
injurs<-arrange(data_agg[,c(1,3)],desc(injuries))[1:5,]
pubhealth.m<-merge(fatals,injurs,by='evtype',all=T)
pubhealth.m<-melt(pubhealth.m,id.vars='evtype',measure.vars=c('fatalities','injuries'),variable.name='damagetype',value.name='number')
proploss<-arrange(data_agg[,c(1,4)],desc(propdmg))[1:5,]
croploss<-arrange(data_agg[,c(1,5)],desc(cropdmg))[1:5,]
ecodmg.m<-merge(proploss,croploss,by='evtype',all=T)
ecodmg.m<-melt(ecodmg.m,id.vars='evtype',measure.vars=c('propdmg','cropdmg'),variable.name='damagetype',value.name='amount')
Below we draw the figures for the fatalities, injuries, property damgages and crop damages to have a preliminary insight on the impact of the severe weathers. The figure shows that the tornado is the most dangerous to cause fatalities and injuries while the flood is the most harmful to property and crops.
p1<-ggplot(pubhealth.m, aes(evtype, number)) +
geom_bar(aes(fill = damagetype), position = "dodge", stat="identity") +
theme(axis.text.x = element_text(angle = 45,size=10, hjust = 1)) +
xlab("weather event") + ylab("public health") +
ggtitle("Top events to injuries & fatalities")
p2<-ggplot(ecodmg.m, aes(evtype, amount/10^9)) +
geom_bar(aes(fill = damagetype), position = "dodge", stat="identity") +
theme(axis.text.x = element_text(angle = 45,size=10, hjust = 1,vjust=1))+
xlab("weather event") + ylab("economic losses(billion dollar)") +
ggtitle("Top events to property & crop")
grid.arrange(p1,p2,ncol=2)
We plot a figure summarizing the top weather events affecting public health (estimated by the number of fatalities and the number of injuries) and economics (measured by the damages in properties and crops) in the following.
pubhealths<-arrange(data_agg[,c(1,6)],desc(pubhealth))[1:10,]%>%transform(evtype=as.character(evtype))
ecodmgs<-arrange(data_agg[,c(1,7)],desc(ecodmg))[1:10,]%>%transform(evtype=as.character(evtype))
par(mfrow=c(1,2))
pubhealths$evtype[6:length(pubhealths$evtype)]<-'others'
pubhealthsplot <- aggregate(pubhealth~evtype,pubhealths, sum)
with(pubhealthsplot,pie(pubhealth, evtype, radius=0.8, col=rainbow(10),main = "Events affecting public health",clockwise=T))
ecodmgs$evtype[6:length(ecodmgs$evtype)]<-'others'
ecodmgsplot <- aggregate(ecodmg~evtype,ecodmgs, sum)
with(ecodmgsplot,pie(ecodmg, evtype, radius=0.8, col=rainbow(10),main = "Events affecting economics",clockwise=T))
The figure above reveals that the most important cause of fatalities and injuries is tornado, followed by flood and the most harmful event of economics is flood, followed by hurricane.