The aim of the project is to identify the most harmful weather events in terms of their effect on people’s health and economy across the USA.
The analysis is based on the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database, which collects characteristics of major storms and weather events in the United States, including their time, place, and estimates of fatalities, injuries, and property damage. For the purposes of the project, the data for the period 1996-2007 has been used.
The results of the analysis show the following:
Excessive Heat and Tornadoes are the primary causes of weather event fatalities (38% of all fatalities)
Tornadoes are also the major cause of injuries, accounting for 36% of total injuries
Cold and Snow lead to the highest number of fatal cases (14 on average per event) and Heat Waves to the highest number of injuries (70 on average per event)
Floods are the primary cause of property damage, contributing 39% of all weather event damage
Hurricanes/typhoons lead to the biggest property destruction per event
The biggest total damage to crops is caused by droughts (38% of total weather event damage)
For the analysis we use the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. The original data file is repdata-data-StormData.csv.bz2
## Load the raw NOAA storm data; the compressed file is passed to read.csv()
## directly.
## stringsAsFactors is set explicitly: later steps call levels() on text
## columns, and since R 4.0 read.csv() no longer creates factors by default.
stormdata <- read.csv(
  "repdata-data-StormData.csv.bz2",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
## Quick structural overview of the loaded data frame.
str(stormdata)
summary(stormdata)
head(stormdata)
For the analysis I will keep only the necessary variables: EVTYPE and all data describing the health and economic effects (FATALITIES, INJURIES and the DMG parameters). All other variables, including the place, time of event and magnitude, should be taken out.
However, to make sure that we compare apples to apples, it’s necessary to review the time period for which different types of weather events have been tracked.
## Review the time frame over which each weather event type has been tracked.
## first_number() returns the first element of a vector; used with aggregate()
## it picks, for every EVTYPE, the BGN_DATE of the first matching row.
first_number <- function(x) {
  x[1]
}
start_date <- aggregate(BGN_DATE ~ EVTYPE, FUN = first_number, data = stormdata)
## as.character() works whether BGN_DATE was read as a factor or as text.
start_date$BGN_DATE <- as.character(start_date$BGN_DATE)
## Keep only the part of each value before the first space (the date) and
## parse it as month/day/year. Vectorized, so no row count is hard-coded.
data <- as.Date(sub(" .*$", "", start_date$BGN_DATE), format = "%m/%d/%Y")
summary(data)
## Min. 1st Qu. Median Mean 3rd Qu.
## "1950-04-18" "1994-11-27" "1995-08-05" "1996-01-16" "1996-10-05"
## Max.
## "2007-03-08"
The starting observation dates for different event types vary from 1950 to 2007. Given that for the majority of event types the observations start in the period 1994-1996, we can select 1996 as the start date. This requires subsetting the data frame.
## Add a variable holding BGN_DATE as a Date (it is easier to add a new
## variable than to convert BGN_DATE itself). The part of each value before
## the first space is parsed as month/day/year; the conversion is vectorized,
## so the number of distinct dates is not hard-coded.
bgn_day <- sub(" .*$", "", as.character(stormdata$BGN_DATE))
stormdata$V38 <- as.Date(bgn_day, format = "%m/%d/%Y")
## Select the necessary variables by name rather than by column position.
keep_cols <- c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP",
  "V38"
)
stormdata <- stormdata[, keep_cols]
## Remove observations prior to 1996, the start year selected to make the
## event types comparable.
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.1.1
## which() drops rows whose date could not be parsed instead of turning them
## into all-NA rows.
stormdata <- stormdata[which(year(stormdata$V38) >= 1996), ]
Crop and Property data require preprocessing into single numeric columns
## Inspect the raw magnitude codes that accompany the damage figures.
summary(stormdata$PROPDMGEXP)
## - ? + 0 1 2 3 4 5
## 276185 0 0 0 1 0 0 0 0 0
## 6 7 8 B h H K m M
## 0 0 0 32 0 0 369938 0 7374
summary(stormdata$CROPDMGEXP)
## ? 0 2 B k K m M
## 373069 0 0 0 4 0 278686 0 1771
## plyr is no longer needed by this step; the call is kept so that any later
## code relying on it being attached keeps working.
library(plyr)
## Warning: package 'plyr' was built under R version 3.1.1

## Translate a vector of magnitude codes into powers of ten.
##   code : factor or character vector of exponent codes
## Returns a numeric vector of the same length:
##   "-", "+", "?"  -> 0        "h"/"H" -> 2      "k"/"K" -> 3
##   "m"/"M"        -> 6        "b"/"B" -> 9      digits  -> their own value
##   ""             -> 0  (no multiplier: the figure is used as recorded)
##   anything else  -> NA
decode_exponent <- function(code) {
  code <- as.character(code)
  letter_power <- c(
    "-" = 0, "+" = 0, "?" = 0,
    "h" = 2, "H" = 2,
    "k" = 3, "K" = 3,
    "m" = 6, "M" = 6,
    "b" = 9, "B" = 9
  )
  power <- unname(letter_power[code])
  is_digit <- grepl("^[0-9]+$", code)
  power[is_digit] <- as.numeric(code[is_digit])
  ## A blank code must leave the value untouched (10^0 = 1), not scale it by 10.
  power[!is.na(code) & code == ""] <- 0
  power
}

stormdata$PROPDMGEXP <- decode_exponent(stormdata$PROPDMGEXP)
stormdata$CROPDMGEXP <- decode_exponent(stormdata$CROPDMGEXP)
## Collapse value and exponent into a single numeric damage column.
stormdata$PROPDMG <- stormdata$PROPDMG * 10^stormdata$PROPDMGEXP
stormdata$CROPDMG <- stormdata$CROPDMG * 10^stormdata$CROPDMGEXP
Next, I aggregated the data by EVTYPE, applying two functions: sum, to see the total damage caused by each event type; and mean, because it’s important to look at mean values, as some events can be rare but devastating.
## For every impact measure: the total and the mean per event type, each
## followed by a copy ranked from the largest value to the smallest.

## Reorder an aggregate() result (EVTYPE, value) by its value column, descending.
rank_desc <- function(tbl) {
  tbl[order(tbl[[2]], decreasing = TRUE), ]
}

## Fatalities
fatalities <- aggregate(FATALITIES ~ EVTYPE, data = stormdata, FUN = sum)
fatalities_mean <- aggregate(FATALITIES ~ EVTYPE, data = stormdata, FUN = mean)
fatalitiess <- rank_desc(fatalities)
fatalitiess_mean <- rank_desc(fatalities_mean)

## Injuries
injuries <- aggregate(INJURIES ~ EVTYPE, data = stormdata, FUN = sum)
injuries_mean <- aggregate(INJURIES ~ EVTYPE, data = stormdata, FUN = mean)
injuriess <- rank_desc(injuries)
injuriess_mean <- rank_desc(injuries_mean)

## Property damage
prop <- aggregate(PROPDMG ~ EVTYPE, data = stormdata, FUN = sum)
prop_mean <- aggregate(PROPDMG ~ EVTYPE, data = stormdata, FUN = mean)
props <- rank_desc(prop)
props_mean <- rank_desc(prop_mean)

## Crop damage
crop <- aggregate(CROPDMG ~ EVTYPE, data = stormdata, FUN = sum)
crop_mean <- aggregate(CROPDMG ~ EVTYPE, data = stormdata, FUN = mean)
crops <- rank_desc(crop)
crops_mean <- rank_desc(crop_mean)
## Draw one horizontal bar chart of the five highest-ranked event types.
##   ranked : data frame already sorted in decreasing order of `value`,
##            containing an EVTYPE column
##   value  : name of the column holding the bar lengths
##   colour : bar fill colour
##   title  : panel title
## NOTE(review): font.main = 10 is outside the usual 1-5 font range; it is
## left unchanged here to keep the rendered titles as they were - confirm
## the intended face.
plot_top5 <- function(ranked, value, colour, title) {
  top <- ranked[1:5, ]
  barplot(
    top[[value]],
    names.arg = top$EVTYPE,
    col = colour,
    horiz = TRUE,
    las = 1,
    width = 1,
    ylim = c(0, 6),
    cex.names = 0.8,
    main = title,
    font.main = 10
  )
}

## Figure 1: effect on population health (2 x 2 grid, wide left margin for
## the event-type labels, outer top margin for the shared title).
par(mfrow = c(2, 2), oma = c(0, 0, 3, 1), mar = c(5, 10, 4, 2))
plot_top5(fatalitiess, "FATALITIES", "brown", "Total # of fatal cases")
plot_top5(fatalitiess_mean, "FATALITIES", "brown", "Mean # of fatal cases")
plot_top5(injuriess, "INJURIES", "chocolate", "Total # of injuries")
plot_top5(injuriess_mean, "INJURIES", "chocolate", "Mean # of injuries")
## side and font take integer codes; the former 3.5 / 1.5 are replaced by
## 3 (top) and 1 (plain).
mtext(
  "Weather events effect on population health",
  side = 3, line = 1, outer = TRUE, cex = 1.5, font = 1
)

## Figure 2: economic damage, same layout.
par(mfrow = c(2, 2), mar = c(5, 10, 4, 2), oma = c(0, 0, 3, 1))
plot_top5(props, "PROPDMG", "green4", "Total property damage")
plot_top5(props_mean, "PROPDMG", "green4", "Mean property damage")
plot_top5(crops, "CROPDMG", "seagreen1", "Total damage to crops")
plot_top5(crops_mean, "CROPDMG", "seagreen1", "Mean damage to crops")
mtext(
  "Economic damage from weather events",
  side = 3, line = 1, outer = TRUE, cex = 1.5, font = 1
)
## Fatalities: share of all fatalities for the top two event types, then the
## highest mean number of fatalities per event.
fatalitiess$FATALITIES[1] / sum(fatalitiess$FATALITIES)
## [1] 0.2058
fatalitiess$FATALITIES[2] / sum(fatalitiess$FATALITIES)
## [1] 0.173
fatalitiess_mean$FATALITIES[1]
## [1] 14
## Injuries: share of the leading event type and the highest mean per event.
injuriess$INJURIES[1] / sum(injuriess$INJURIES)
## [1] 0.3565
injuriess_mean$INJURIES[1]
## [1] 70
## Property damage: share of the leading event type and the highest mean per event.
props$PROPDMG[1] / sum(props$PROPDMG)
## [1] 0.3925
props_mean$PROPDMG[1]
## [1] 787566364
## Crop damage: share of the leading event type.
crops$CROPDMG[1] / sum(crops$CROPDMG)
## [1] 0.3846