This project involves exploring the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.
By analyzing the database, this project identifies the types of major storms and weather events that have the greatest effect on population health (casualties) and the greatest economic consequences in the United States.
1.Load the Storm data, and remove unused variables
# Load the raw NOAA storm data and keep only the columns used by the analysis.
#
# The data directory is kept in a variable instead of being set with setwd(),
# so running this script no longer changes the session's working directory.
# If the archive is not found there, the current working directory is tried,
# which lets the script run on machines without that folder layout.
DataDir <- path.expand("~/Desktop/Coursera/Reproducible Research/Peer Assessment 2")
DataFile <- file.path(DataDir, "repdata-data-StormData.csv.bz2")
if (!file.exists(DataFile)) {
  DataFile <- "repdata-data-StormData.csv.bz2"
}
if (!file.exists(DataFile)) {
  stop("Cannot find repdata-data-StormData.csv.bz2 in '", DataDir,
       "' or in the working directory '", getwd(), "'.", call. = FALSE)
}
Data <- read.csv(bzfile(DataFile))

# Event type, casualty counts, and damage amounts with their exponent codes.
KeepCols <- c("EVTYPE", "FATALITIES", "INJURIES",
              "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
StormData <- Data[, KeepCols]
row.names(StormData) <- NULL
2.Trim the leading and trailing spaces in the EVTYPE variable, and uppercase the EVTYPE variable
# Strip leading AND trailing spaces from EVTYPE, then upper-case it.
# gsub() is required here: sub() replaces only the first match, so a value
# with spaces on both sides would keep its trailing spaces.
StormData$EVTYPE <- gsub("(^ +)|( +$)", "", StormData$EVTYPE)
StormData$EVTYPE <- toupper(StormData$EVTYPE)
3.Add a new variable Type that groups the StormData EVTYPE values by keyword, reducing the event types to 49 categories, including an NA category for events that match no keyword. (Events in the NA category need more detailed information before they can be analyzed.)
# Classify every event into a numeric Type code by keyword matching on EVTYPE.
#
# KeywordGroups[[k]] holds the keywords that map to Type code k. Keeping the
# keywords grouped by code (instead of two parallel 75-element lists) makes it
# impossible for the keyword list and the code list to drift out of step.
# Keywords are applied in the order listed, so when an EVTYPE contains several
# keywords the LAST matching keyword decides its Type. Events that match no
# keyword keep Type 0.
KeywordGroups <- list(
  c('AVALANC'),                                    # 1
  c('HURRICANE', 'TYPHOON'),                       # 2
  c('WIND', 'WND'),                                # 3
  c('HAIL'),                                       # 4
  c('RAIN', 'SHOW'),                               # 5
  c('FREEZ', 'LOW', 'COLD', 'CHILL',
    'WINTRY', 'WINTER', 'COOL', 'THERMIA'),        # 6
  c('SNOW'),                                       # 7
  c('THUNDERSTORM', 'STORM'),                      # 8
  c('FLOOD', 'FLD', 'STREAM'),                     # 9
  c('RIP CURRENT', 'SURF', 'WAVE', 'SURGE'),       # 10
  c('TORN'),                                       # 11
  c('LIG'),                                        # 12
  c('FUNNEL', 'WALL CLOUD'),                       # 13
  c('WATERSPO', 'WAYTER', 'WATER SPO'),            # 14
  c('BLIZZARD'),                                   # 15
  c('TIDE'),                                       # 16
  c('HIGH', 'WARM', 'HEAT', 'HOT'),                # 17
  c('MARINE MISHAP', 'MARINE ACCIDENT'),           # 18
  c('DUST'),                                       # 19
  c('MUD', 'SLIDE'),                               # 20
  c('FROST'),                                      # 21
  c('SEICHE'),                                     # 22
  c('EROS'),                                       # 23
  c('ASH'),                                        # 24
  c('GUSTNADO'),                                   # 25
  c('PRECIP'),                                     # 26
  c('SLEET'),                                      # 27
  c('FOG'),                                        # 28
  c('DRY', 'DRIE', 'DROUGHT'),                     # 29
  c('DOWNBURST'),                                  # 30
  c('WET'),                                        # 31
  c('DRIZZLE'),                                    # 32
  c('GLAZE'),                                      # 33
  c('TROPICAL DEPRESSION'),                        # 34
  c('VOLCANIC ERUPTION'),                          # 35
  c('VOG'),                                        # 36
  c('SEAS'),                                       # 37
  c('RED FLAG CRITERIA'),                          # 38
  c('SMOKE'),                                      # 39
  c('LANDSLUMP'),                                  # 40
  c('LANDSPOUT'),                                  # 41
  c('ICE', 'ICY'),                                 # 42
  c('FIRE'),                                       # 43
  c('MICROBURST'),                                 # 44
  c('TURBULENCE'),                                 # 45
  c('TSUNAMI'),                                    # 46
  c('DAM '),                                       # 47 (trailing space is intentional)
  c('DROWN')                                       # 48
)

# Flatten to one row per keyword, in application order.
Keywords <- unlist(KeywordGroups)
No <- rep(seq_along(KeywordGroups), vapply(KeywordGroups, length, integer(1)))
stopifnot(length(Keywords) == length(No))
TypeTable <- data.frame(Keywords = Keywords, No = No, stringsAsFactors = FALSE)

# Assign by column name rather than by position, so the result does not
# depend on Type happening to be the 8th column. The keywords contain no
# regex metacharacters, so fixed = TRUE matches exactly what the regex did.
StormData$Type <- 0
for (i in seq_along(TypeTable$No)) {
  location <- grep(TypeTable$Keywords[[i]], StormData$EVTYPE, fixed = TRUE)
  StormData$Type[location] <- TypeTable$No[[i]]
}
StormData <- StormData[order(StormData$Type), ]
3.Count the Casualties by summing the fatalities and injuries
# Casualties per event = deaths + injuries.
StormData[["CASUALTIES"]] <- with(StormData, FATALITIES + INJURIES)
4.Create a new dataset CasData by summing the Casualties by each event type
# Total casualties per event Type, labelled with a readable event name.
StormData$Type <- as.factor(StormData$Type)
TotCASUALTIES <- tapply(StormData$CASUALTIES, StormData$Type, sum)

# Display names indexed by Type code: EVENTNAMES[[code + 1]] is the label for
# Type `code`, and Type 0 (no keyword matched) is labelled 'NA'.
EVENTNAMES <- list(
  'NA',
  'AVALANCHE',
  'HURRICANE or TYPHOON',
  'WIND',
  'HAIL',
  'RAIN',
  'FREEZE or COLD or CHILL or WINTRY or WINTER or COOL or HYPOTHERMIA',
  'SNOW',
  'THUNDERSTORM or STORM',
  'FLOOD or STREAM',
  'RIP CURRENT or SURF',
  'TORNADO',
  'LIGHTNING',
  'FUNNEL CLOUD or WALL CLOUD',
  'WATERSPOUT',
  'BLIZZARD',
  'TIDE',
  'HIGH or WARM or HEAT or HOT',
  'MARINE MISHAP or MARINE ACCIDENT',
  'DUST',
  'MUD SLIDE',
  'FROST',
  'SEICHE',
  'EROSION',
  'ASH',
  'GUSTNADO',
  'PRECIPITATION',
  'SLEET',
  'FOG',
  'DRY',
  'DOWNBURST',
  'WET',
  'DRIZZLE',
  'GLAZE',
  'TROPICAL DEPRESSION',
  'VOLCANIC ERUPTION',
  'VOG',
  'SEAS',
  'RED FLAG CRITERIA',
  'SMOKE',
  'LANDSLUMP',
  'LANDSPOUT',
  'ICE or ICY',
  'FIRE',
  'MICROBURST',
  'TURBULENCE',
  'TSUNAMI',
  'DAM BREAK or DAM FAILURE',
  'DROWNING'
)

# Look labels up by the Type code carried in the tapply() result's names
# rather than by position: if some Type code never occurs in the data, a
# positional cbind() would silently pair totals with the wrong labels.
CasTypeCodes <- as.integer(names(TotCASUALTIES))
CasData <- data.frame(
  EVENTS = unlist(EVENTNAMES)[CasTypeCodes + 1L],
  CASUALTIES = as.numeric(TotCASUALTIES),
  stringsAsFactors = FALSE
)
row.names(CasData) <- NULL
5.Sort the CasData by the decreasing order of the Casualties
# Largest casualty totals first; missing totals go last.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
CasData <- CasData[order(CasData$CASUALTIES, decreasing = TRUE, na.last = TRUE), ]
row.names(CasData) <- NULL
6.Subset the CasData into a Top5CasData including the top Five event types
# Keep the first five rows and express casualties in hundreds (one decimal).
Top5CasData <- CasData[seq_len(5), ]
CasInHundreds <- Top5CasData[["CASUALTIES"]] / 100
Top5CasData[["CASUALTIES"]] <- round(CasInHundreds, digits = 1)
7.Trim the leading and trailing spaces in the PROPDMGEXP and CROPDMGEXP
# Strip leading AND trailing spaces from both exponent-code columns.
# gsub() is required: sub() replaces only the first match, so a value padded
# on both sides would keep its trailing spaces and then fail the %in% lookups.
StormData$PROPDMGEXP <- gsub("(^ +)|( +$)", "", StormData$PROPDMGEXP)
StormData$CROPDMGEXP <- gsub("(^ +)|( +$)", "", StormData$CROPDMGEXP)
8.Create CMultiplier and PMultiplier to reflect the real economic damage in numerical values
# Translate a damage exponent code into its numeric multiplier.
#
# exp_code: vector of exponent codes (character, factor, or numeric).
# Returns a numeric vector of the same length:
#   B/b -> 1e9, M/m -> 1e6, K/k -> 1e3, H/h -> 1e2, digits 1..8 -> 10^digit.
# Every other code ("", "-", "+", "?", "0", NA, ...) yields 0, which is what
# the analysis has always used for unrecognised codes.
ExpMultiplier <- function(exp_code) {
  lookup <- c(
    B = 1e9, M = 1e6, K = 1e3, H = 1e2,
    "1" = 1e1, "2" = 1e2, "3" = 1e3, "4" = 1e4,
    "5" = 1e5, "6" = 1e6, "7" = 1e7, "8" = 1e8
  )
  # Indexing by name returns NA for codes that are not in the table.
  multiplier <- unname(lookup[toupper(as.character(exp_code))])
  multiplier[is.na(multiplier)] <- 0
  multiplier
}

# The previous property-damage mapping tested c("K","K"), so a lowercase "k"
# was never recognised. Both columns now share one case-insensitive table,
# which also gives crop damage the H and digit codes that property damage
# already handled.
StormData$PMultiplier <- ExpMultiplier(StormData$PROPDMGEXP)
StormData$CMultiplier <- ExpMultiplier(StormData$CROPDMGEXP)
9.Multiply the base DMG by the multiplier, sum the PROPDMG and CROPDMG by each event type, and create a new dataset DMGData including the event types and total economic DMG
# Dollar damage per event = base amount * exponent multiplier, then summed
# per event Type for property and crop damage separately.
#
# Fix: the crop amount used to be written over CROPDMGEXP and the crop total
# was computed from PROPERDMG, so TotDMG was twice the property damage and
# contained no crop damage at all. Any figures printed before this fix must
# be regenerated.
StormData$PROPERDMG <- as.numeric(StormData$PROPDMG) * StormData$PMultiplier
TotPROPDMG <- tapply(StormData$PROPERDMG, StormData$Type, sum)
StormData$CROPERDMG <- as.numeric(StormData$CROPDMG) * StormData$CMultiplier
TotCROPDMG <- tapply(StormData$CROPERDMG, StormData$Type, sum)
TotDMG <- TotPROPDMG + TotCROPDMG

# Label each total through the Type code in the tapply() names, so a Type
# code that is absent from the data cannot shift the labels.
DmgTypeCodes <- as.integer(names(TotPROPDMG))
DMGData <- data.frame(
  EVENTS = unlist(EVENTNAMES)[DmgTypeCodes + 1L],
  ECODMG = as.numeric(TotDMG),
  stringsAsFactors = FALSE
)
row.names(DMGData) <- NULL
10.Sort the DMGData by the decreasing order of total economic damage
# Largest economic damage first; missing totals go last.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
DMGData <- DMGData[order(DMGData$ECODMG, decreasing = TRUE, na.last = TRUE), ]
row.names(DMGData) <- NULL
11.Subset the DMGData into Top5DMGData including the top Five event types
# Keep the first five rows and express damage in billions (three decimals).
Top5DMGData <- DMGData[seq_len(5), ]
DmgInBillions <- Top5DMGData[["ECODMG"]] / 1000000000
Top5DMGData[["ECODMG"]] <- round(DmgInBillions, digits = 3)
1.The top 5 event types most harmful to population health across the United States (casualties in hundreds):
# Show the five event types with the most casualties; CASUALTIES was divided
# by 100 and rounded to one decimal when Top5CasData was built.
Top5CasData
## EVENTS CASUALTIES
## 1 TORNADO 970.2
## 2 HIGH or WARM or HEAT or HOT 144.9
## 3 WIND 80.7
## 4 FLOOD or STREAM 73.9
## 5 LIGHTNING 60.5
Plot a bar chart of the Top5CasData
library(ggplot2)
# Bar chart of casualties (in hundreds) for the top five event types.
# Columns are referenced by bare name inside aes() so ggplot2 reads them from
# the data argument, and labs() receives named arguments directly instead of
# a wrapped list. The y-axis label typo ("Casulties") is corrected.
gg1 <- ggplot(Top5CasData, aes(x = EVENTS)) +
  geom_bar(aes(weight = CASUALTIES)) +
  labs(
    x = "Event Types",
    y = "Total Casualties (in hundreds)",
    title = "Top 5 Event Types of the Most Casualties"
  ) +
  # Event names are long; tilt them so they do not overlap.
  theme(axis.text.x = element_text(angle = 70, hjust = 1))
gg1
2.The top 5 event types with the greatest economic consequences across the United States (damage in billions of dollars):
# Show the five event types with the largest economic damage; ECODMG was
# divided by 1e9 and rounded to three decimals when Top5DMGData was built.
Top5DMGData
## EVENTS ECODMG
## 1 FLOOD or STREAM 301.24
## 2 HURRICANE or TYPHOON 170.51
## 3 TORNADO 117.10
## 4 RIP CURRENT or SURF 86.65
## 5 THUNDERSTORM or STORM 42.57
Plot a bar chart of the Top5DMGData
# Bar chart of economic damage (in billions of dollars) for the top five
# event types. Columns are referenced by bare name inside aes() so ggplot2
# reads them from the data argument, and labs() receives named arguments
# directly instead of a wrapped list.
gg2 <- ggplot(Top5DMGData, aes(x = EVENTS)) +
  geom_bar(aes(weight = ECODMG)) +
  labs(
    x = "Event Types",
    y = "Total Economic Damage (in $ Billion)",
    title = "Top 5 Event Types of the Greatest Economic Consequences (Damages)"
  ) +
  # Event names are long; tilt them so they do not overlap.
  theme(axis.text.x = element_text(angle = 70, hjust = 1))
gg2