Synopsis

In the recent past, extreme weather events have caused huge damage to human lives and property across the United States, leading to fatalities, injuries and economic crises. Minimizing the impact of severe weather events is a key concern for many municipalities and communities. This paper uses the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to find out which weather events cause the most casualties, crop damage and property damage.

Data Processing

The data is a compressed CSV file that is downloaded from the web. This section describes the data processing techniques used.

# Fetch the NOAA storm database and load it into `storm.Data`.
# The compressed archive is cached under ./NOAA, so re-running the analysis
# only downloads it when the local copy is missing.
data_dir <- "./NOAA"
archive_path <- file.path(data_dir, "SD.bz2")
if (!file.exists(data_dir)) {
  dir.create(data_dir)
}

surl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!file.exists(archive_path)) {
  # Binary mode: the file is a bzip2 archive, not text.
  download.file(surl, destfile = archive_path, mode = "wb")
}
# bzfile() decompresses on the fly; the CSV is never written to disk.
storm.Data <- read.csv(bzfile(archive_path), header = TRUE, sep = ",")
dim(storm.Data)
## [1] 902297     37
names(storm.Data)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

There are 902297 rows and 37 columns in the dataset. The dataset is further trimmed to keep only the columns relevant for this assignment, i.e. “BGN_DATE” (reduced to “Year”), “EVTYPE”, “FATALITIES”, “INJURIES”, “PROPDMG”, “PROPDMGEXP”, “CROPDMG” and “CROPDMGEXP”. The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. For the analysis, only the records of roughly the last 20 years, starting from 1991, were considered, and only events that caused at least one fatality or injury, or some property or crop damage.

# Translate damage-magnitude codes into numeric multipliers.
#
# codes: character or factor vector of PROPDMGEXP / CROPDMGEXP values.
# Returns a numeric vector of the same length: 1e3 for "K", 1e6 for "M" and
# 1e9 for "B" (matched case-insensitively). Blank, missing or any other code
# maps to 1, i.e. the recorded figure is taken at face value instead of
# turning into NA.
damage_multiplier <- function(codes) {
  codes <- toupper(gsub("[[:space:]]", "", as.character(codes)))
  multiplier <- c(1e3, 1e6, 1e9)[match(codes, c("K", "M", "B"))]
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

# Derive the event year from the begin date.
storm.Data$BGN_DATE <- as.Date(storm.Data$BGN_DATE, "%m/%d/%Y %H:%M:%S")
storm.Data$Year <- format(storm.Data$BGN_DATE, "%Y")

# Keep events after 1990 that caused at least one fatality or injury, or some
# property or crop damage. which() drops rows whose condition is NA instead of
# turning them into all-NA rows.
in_period <- as.integer(storm.Data$Year) > 1990
had_impact <- storm.Data$FATALITIES > 0 | storm.Data$INJURIES > 0 |
  storm.Data$PROPDMG > 0 | storm.Data$CROPDMG > 0
keep_cols <- c("Year", "EVTYPE", "FATALITIES", "INJURIES",
               "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
storm.Data <- storm.Data[which(in_period & had_impact), keep_cols]

# From here on PROPDMGEXP / CROPDMGEXP hold the damage in dollars
# (recorded figure times its magnitude multiplier), not the raw code.
# NOTE(review): the later aggregations in this file sum PROPDMG / CROPDMG
# (the unscaled figures), not these dollar columns - confirm which is intended.
storm.Data$PROPDMGEXP <- storm.Data$PROPDMG * damage_multiplier(storm.Data$PROPDMGEXP)
storm.Data$CROPDMGEXP <- storm.Data$CROPDMG * damage_multiplier(storm.Data$CROPDMGEXP)

# Casualties = fatalities + injuries per event.
storm.Data$casualities <- storm.Data$FATALITIES + storm.Data$INJURIES

What are the ten worst events causing maximum damage in the last 20 years?

library(dplyr)
# Casualties per record = fatalities + injuries.
storm.Data$casualities <- storm.Data$FATALITIES + storm.Data$INJURIES

# One row per raw event type with total casualties and the summed
# PROPDMG / CROPDMG figures.
by_event <- storm.Data %>% group_by(EVTYPE)
Year_SData <- by_event %>%
  summarise(
    casualities = sum(casualities, na.rm = TRUE),
    Prop.dmg = sum(PROPDMG, na.rm = TRUE),
    Crop.dmg = sum(CROPDMG, na.rm = TRUE)
  ) %>%
  data.frame()

# Event types of the ten largest totals in column `metric` of Year_SData,
# largest first.
top_events <- function(metric) {
  ranking <- order(-Year_SData[[metric]])
  head(Year_SData[[1]][ranking], 10)
}

# Events causing maximum casualties (injuries + fatalities)
top_events("casualities")
##  [1] TORNADO           EXCESSIVE HEAT    FLOOD            
##  [4] LIGHTNING         TSTM WIND         HEAT             
##  [7] FLASH FLOOD       ICE STORM         THUNDERSTORM WIND
## [10] WINTER STORM     
## 985 Levels:    HIGH SURF ADVISORY  COASTAL FLOOD ... WND
# Events causing maximum property damage
top_events("Prop.dmg")
##  [1] TORNADO            FLASH FLOOD        TSTM WIND         
##  [4] FLOOD              THUNDERSTORM WIND  HAIL              
##  [7] LIGHTNING          THUNDERSTORM WINDS HIGH WIND         
## [10] WINTER STORM      
## 985 Levels:    HIGH SURF ADVISORY  COASTAL FLOOD ... WND
# Events causing maximum crop damage
top_events("Crop.dmg")
##  [1] HAIL               FLASH FLOOD        FLOOD             
##  [4] TSTM WIND          TORNADO            THUNDERSTORM WIND 
##  [7] DROUGHT            THUNDERSTORM WINDS HIGH WIND         
## [10] HEAVY RAIN        
## 985 Levels:    HIGH SURF ADVISORY  COASTAL FLOOD ... WND

The worst 10 events causing maximum casualties are TORNADO, EXCESSIVE HEAT, FLOOD, LIGHTNING, TSTM WIND, HEAT, FLASH FLOOD, ICE STORM, THUNDERSTORM WIND and WINTER STORM. After a close inspection of the dataset, we still see some discrepancies in the data; for example, TSTM WIND, THUNDERSTORM WIND, THUNDERSTORM WINDS, etc. represent the same event, THUNDERSTORM, but appear as multiple entries in the dataset. Further cleansing of the dataset is done to remove these discrepancies.

Data Cleansing

# Build the yearly summary for one consolidated event category.
#
# data:     data frame with columns EVTYPE, Year, casualities, PROPDMG, CROPDMG.
# patterns: character vector of regular expressions. Rows whose EVTYPE matches
#           a pattern are collected pattern by pattern, so the patterns must be
#           mutually exclusive or a row would be counted more than once.
# label:    category name written to the EVTYPE column of the result.
# Returns a data frame with columns Year, Casualities, Property.Dmg, Crop.Dmg
# and EVTYPE, one row per year that has at least one matching event.
summarise_event <- function(data, patterns, label) {
  matched <- do.call(rbind, lapply(patterns, function(pattern) {
    data[grep(pattern, data$EVTYPE), ]
  }))
  yearly <- data.frame(summarise(
    group_by(matched, Year),
    Casualities = sum(casualities, na.rm = TRUE),
    Property.Dmg = sum(PROPDMG, na.rm = TRUE),
    Crop.Dmg = sum(CROPDMG, na.rm = TRUE)
  ))
  yearly$EVTYPE <- rep(label, nrow(yearly))
  yearly
}

# 1. THUNDERSTORM and TSTM wind
E.Thunderstorm <- summarise_event(storm.Data, c("^THUNDER", "^TSTM"), "THUNDERSTORM")
# 2. EXCESSIVE HEAT and HEAT WAVE are merged into the single category HEAT
E.Heat <- summarise_event(storm.Data, c("^HEAT", "^EXCESSIVE HEAT"), "HEAT")
# 3. FLOOD
E.flood <- summarise_event(storm.Data, "^FLOOD", "FLOODS")
# 4. FLASH FLOOD
E.flash.flood <- summarise_event(storm.Data, "^FLASH", "FLASH_FLOOD")
# 5. DROUGHT
E.drought <- summarise_event(storm.Data, "^DROUGHT", "DROUGHT")
# 6. TORNADO
E.tornado <- summarise_event(storm.Data, "^TORNADO", "TORNADO")
# 7. ICE STORM and WINTER STORM
E.winter.storm <- summarise_event(storm.Data, c("^ICE STORM", "^WINTER STORM"), "WINTER_STORM")
# 8. HURRICANE/TYPHOON
# The pattern must be spelled TYPHOON to match the event type; the category
# label keeps its original spelling because later code refers to it by name.
E.Hur.typ <- summarise_event(storm.Data, c("^HURRICANE", "^TYPHOON"), "HURRICANE_TYPOON")
# 9. TSUNAMI
E.tsunami <- summarise_event(storm.Data, "^TSUNAMI", "TSUNAMI")
# 10. HAIL
E.hail <- summarise_event(storm.Data, "^HAIL", "HAIL")

# Clean Dataset
# Stack the ten per-category yearly summaries into one long table.
event_tables <- list(
  E.hail, E.tsunami, E.Hur.typ, E.winter.storm, E.tornado,
  E.drought, E.flash.flood, E.flood, E.Heat, E.Thunderstorm
)
F_Storm.Data <- do.call(rbind, event_tables)
head(F_Storm.Data)
##   Year Casualities Property.Dmg Crop.Dmg EVTYPE
## 1 1991           8         0.00     0.00   HAIL
## 2 1992         109         0.00     0.00   HAIL
## 3 1993          17     37585.05 18280.35   HAIL
## 4 1994          27     53966.95 46148.00   HAIL
## 5 1995         206     22868.00 16968.96   HAIL
## 6 1996          83     31278.93 25026.00   HAIL

Event Types most harmful for population health

The following chart shows how these events have impacted the human population across the United States.

library(reshape2)  # dcast()

# Wide table: one row per year, one column of casualties per event category.
Casualities <- F_Storm.Data[, c("Year", "EVTYPE", "Casualities")]
Cas.Event <- dcast(Casualities, Year ~ EVTYPE, value.var = "Casualities")
Cas.Event[is.na(Cas.Event)] <- 0  # category with no events that year
Cas.Event$Year <- as.integer(Cas.Event$Year)

# Bands of the stacked area chart, bottom to top, with legend text and colour.
stack_order <- c("TORNADO", "FLASH_FLOOD", "HEAT", "DROUGHT", "HAIL",
                 "WINTER_STORM", "HURRICANE_TYPOON", "THUNDERSTORM",
                 "FLOODS", "TSUNAMI")
stack_labels <- c("Tornado", "FlashFlood", "Heat", "Drought", "Hail",
                  "Winter storm", "Hurricane/Typhoon", "Thunderstorm",
                  "Floods", "Tsunami")
stack_fill <- c("lightgreen", "royalblue", "plum", "maroon", "red",
                "orange", "purple", "darkgreen", "grey", "yellow")

# Upper edge of every band = running total of the categories up to and
# including it; the lower edge is the upper edge of the band beneath.
# Deriving both from one running total keeps the bands from overlapping.
layer_tops <- Reduce(`+`, as.list(Cas.Event[stack_order]), accumulate = TRUE)
layer_bottoms <- c(list(rep(0, nrow(Cas.Event))),
                   layer_tops[-length(layer_tops)])

ylim <- c(0, max(layer_tops[[length(layer_tops)]]))
# Each polygon runs left-to-right along its lower edge and back along its
# upper edge.
xx <- c(Cas.Event$Year, rev(Cas.Event$Year))

plot(x = Cas.Event$Year, y = Cas.Event$TORNADO, ylim = ylim, col = 'light blue',
     type = 'l', ylab = 'Casualities', xlab = 'Year',
     main = 'Worst Casualities By Weather Event\n Across United States (1991-2011)')
for (i in seq_along(stack_order)) {
  polygon(xx, c(layer_bottoms[[i]], rev(layer_tops[[i]])), col = stack_fill[i])
}

legend(2000, 9700, stack_labels, fill = stack_fill, cex = 0.8, bty = "n")

From the chart it is clear that tornadoes, heat, thunderstorms and floods are the events most harmful to the human population.

Crop Damage

Across the United States, which types of events have the greatest economic consequences? Again we use the dataset “F_Storm.Data”. For this analysis we have calculated the 20-year annual average crop damage (1991-2011), the 5-year annual average crop damage (2006-2011), and the crop damage for the year 2011.

library(dplyr)
library(reshape2)  # melt()
library(ggplot2)   # ggplot()
head(F_Storm.Data)
##   Year Casualities Property.Dmg Crop.Dmg EVTYPE
## 1 1991           8         0.00     0.00   HAIL
## 2 1992         109         0.00     0.00   HAIL
## 3 1993          17     37585.05 18280.35   HAIL
## 4 1994          27     53966.95 46148.00   HAIL
## 5 1995         206     22868.00 16968.96   HAIL
## 6 1996          83     31278.93 25026.00   HAIL

# Average yearly property / crop damage per event category over three
# windows: all years, years after 2005, and 2011 alone. Each mean is taken
# over the years that have a row for the category.
by_20year <- group_by(F_Storm.Data, EVTYPE)
Avg_An_20yrs_dmg <- data.frame(summarise(
  by_20year,
  Avg_Prop_Dmg_20yrs = round(mean(Property.Dmg, na.rm = TRUE), 2),
  Avg_Crop_Dmg_20yrs = round(mean(Crop.Dmg, na.rm = TRUE), 2)
))

by_5year <- group_by(F_Storm.Data[as.integer(F_Storm.Data$Year) > 2005, ], EVTYPE)
Avg_An_5yrs_dmg <- data.frame(summarise(
  by_5year,
  Avg_Prop_Dmg_5yrs = round(mean(Property.Dmg, na.rm = TRUE), 2),
  Avg_Crop_Dmg_5yrs = round(mean(Crop.Dmg, na.rm = TRUE), 2)
))

by_2011 <- group_by(F_Storm.Data[as.integer(F_Storm.Data$Year) == 2011, ], EVTYPE)
Avg_An_yr2011_dmg <- data.frame(summarise(
  by_2011,
  Avg_Prop_Dmg_2011 = round(mean(Property.Dmg, na.rm = TRUE), 2),
  Avg_Crop_Dmg_2011 = round(mean(Crop.Dmg, na.rm = TRUE), 2)
))

# Inner joins: a category missing from any window is dropped from the result.
Final_Damage <- merge(Avg_An_20yrs_dmg, Avg_An_5yrs_dmg, by = "EVTYPE")
Final_Damage <- merge(Final_Damage, Avg_An_yr2011_dmg, by = "EVTYPE")

# Long format for plotting: one row per (category, averaging window).
CrpDmg <- Final_Damage[, c("EVTYPE", "Avg_Crop_Dmg_20yrs",
                           "Avg_Crop_Dmg_5yrs", "Avg_Crop_Dmg_2011")]
CrpDmg <- melt(CrpDmg, id.vars = "EVTYPE")
names(CrpDmg) <- c("EVTYPE", "Annual.Avg", "Crop.Dmg")
ggplot(CrpDmg, aes(EVTYPE, Crop.Dmg)) +
    geom_bar(aes(fill = Annual.Avg), position = "dodge", stat = "identity") +
    xlab("") +
    ylab("Annual Average Crop Damage") +
    ggtitle("Weather Events Causing Worst Crop Damage \nin United States (1991-2011)") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.1)) +
    theme(plot.title = element_text(lineheight = .8, face = "bold"))

From the chart, the weather events causing the most crop damage are flood, hail, thunderstorm, tornado and flash flood.

Property Damage

For this analysis we have calculated the 20-year annual average property damage (1991-2011), the 5-year annual average property damage (2006-2011), and the property damage for the year 2011. Property damage caused by weather events from 1991-2011:

library(reshape2)
library(ggplot2)  # ggplot()

# Property-damage averages only, selected by name so that no crop column can
# slip in; reshaped to one row per (category, averaging window).
PropDmg <- Final_Damage[, c("EVTYPE", "Avg_Prop_Dmg_20yrs",
                            "Avg_Prop_Dmg_5yrs", "Avg_Prop_Dmg_2011")]
PropDmg <- melt(PropDmg, id.vars = "EVTYPE")
names(PropDmg) <- c("EVTYPE", "Annual.Avg", "Prop.Dmg")
ggplot(PropDmg, aes(EVTYPE, Prop.Dmg)) +
    geom_bar(aes(fill = Annual.Avg), position = "dodge", stat = "identity") +
    xlab("") +
    ylab("Annual Average Property Damage") +
    ggtitle("Weather Events Causing Worst Property Damage \nin United States (1991-2011)") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.1)) +
    theme(plot.title = element_text(lineheight = .8, face = "bold"))

From the above chart, the weather events causing maximum damage to property across the United States are tornado, thunderstorm, flash floods and floods.

Results

From the above analysis it is clear that tornadoes, heat, hail, floods and thunderstorms have the most harmful effect on life and property across the United States.