Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.
This project using the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. To unswer the follwing questions:
The U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
The NOAA webset contains more detailed information about this database.
Download dataset and store data in local folder
setwd("C:/Users/may/Desktop/DataScience/Reproducible_Research/Final_project")
data_raw<-read.csv(bzfile("StormData.csv.bz2"),na.strings = "NA")
Data cleaning
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data<-select(data_raw,REFNUM, BGN_DATE, STATE, EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP,CROPDMG,CROPDMGEXP)
#clean the EVTYPE(Events type) variable: made all the char to lower case and replace all punct. to space
EVTYPE2<-tolower(data$EVTYPE)
EVTYPE2<-gsub("[[:blank:][:punct:]+]", " ", EVTYPE2)
data$EVTYPE<-as.factor(EVTYPE2)
#clean the PROPDMGEXP and CROPDMGEXP variable
change <- function(i) {
if (i %in% c('h', 'H'))
return(2)
else if (i %in% c('k', 'K'))
return(3)
else if (i %in% c('m', 'M'))
return(6)
else if (i %in% c('b', 'B'))
return(9)
else if (!is.na(as.numeric(i))) # if a digit
return(as.numeric(i))
else if (i %in% c('', '-', '?', '+'))
return(0)
else {
stop("Invalid")
}
}
pr_exp <- sapply(data$PROPDMGEXP, FUN=change)
cr_exp<-sapply(data$CROPDMGEXP, FUN=change)
#Calculate the property damage and crop damage based on PROPDMG/PROPDMGEXP and CROPDMG/CROPDMGEXP
data$PROPDMG2 <- data$PROPDMG * (10 ** pr_exp)
data$CROPDMG2 <- data$CROPDMG * (10 ** cr_exp)
harm<- data%>%
select(EVTYPE,FATALITIES,INJURIES)%>%
group_by(EVTYPE)%>%
summarise(fatal=sum(FATALITIES),injur=sum(INJURIES))%>%
mutate(fatInj=fatal+injur)
fatal<- head(harm[order(harm$fatal, decreasing = T), ], 10)
injury <- head(harm[order(harm$injur, decreasing = T), ], 10)
fetal_injury<-head(harm[order(harm$fatInj, decreasing = T), ], 10)
Table1. Top 10 fatal events
fatal[,c("EVTYPE","fatal")]
## # A tibble: 10 x 2
## EVTYPE fatal
## <fct> <dbl>
## 1 tornado 5633
## 2 excessive heat 1903
## 3 flash flood 978
## 4 heat 937
## 5 lightning 816
## 6 tstm wind 504
## 7 flood 470
## 8 rip current 368
## 9 high wind 248
## 10 avalanche 224
Table2. Top 10 injury evnets
injury[,c("EVTYPE","injur")]
## # A tibble: 10 x 2
## EVTYPE injur
## <fct> <dbl>
## 1 tornado 91346
## 2 tstm wind 6957
## 3 flood 6789
## 4 excessive heat 6525
## 5 lightning 5230
## 6 heat 2100
## 7 ice storm 1975
## 8 flash flood 1777
## 9 thunderstorm wind 1488
## 10 hail 1361
Table3. Top 10 fatal and injury evnets
fetal_injury[,c("EVTYPE","fatInj")]
## # A tibble: 10 x 2
## EVTYPE fatInj
## <fct> <dbl>
## 1 tornado 96979
## 2 excessive heat 8428
## 3 tstm wind 7461
## 4 flood 7259
## 5 lightning 6046
## 6 heat 3037
## 7 flash flood 2755
## 8 ice storm 2064
## 9 thunderstorm wind 1621
## 10 winter storm 1527
Plot for the top events
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
q1<-ggplot(fatal,aes(x=reorder(EVTYPE,-fatal),y=fatal))+
geom_bar(stat = "identity")+
xlab("Event type")+
ylab("Total number of fatalities")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ggtitle("Top 10 fatal events")
q2<-ggplot(injury,aes(x=reorder(EVTYPE,-injur),y=injur))+
geom_bar(stat = "identity")+
xlab("Event type")+
ylab("Total number of injuries")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ggtitle("Top 10 injury events")
q3<-ggplot(fetal_injury,aes(x=reorder(EVTYPE,-fatInj),y=fatInj))+
geom_bar(stat = "identity")+
xlab("Event type")+
ylab("Total number of fatalities and injuries")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ggtitle("Top 10 fatal&injury events")
grid.arrange(q1,q2,q3,nrow=1)
2.Across the United States, which types of sever weather events have the greatest economic consequences? In this part the variable PROPDMG2(property damage) and CROPDMG2(crop damage) are used to evaluate the events
eco<- data %>%
select(EVTYPE,PROPDMG2,CROPDMG2)%>%
group_by(EVTYPE)%>%
summarise(pro_sum=sum(PROPDMG2),crop_sum=sum(CROPDMG2))
eco<-eco[(eco$pro_sum>0|eco$crop_sum>0),]
Property<-head(eco[order(eco$pro_sum,decreasing=T),],10)
Crop<-head(eco[order(eco$crop_sum,decreasing=T),],10)
Table4. Top 10 property damage events
Property[,c("EVTYPE","pro_sum")]
## # A tibble: 10 x 2
## EVTYPE pro_sum
## <fct> <dbl>
## 1 flash flood 6.82e13
## 2 thunderstorm winds 2.09e13
## 3 tornado 1.08e12
## 4 hail 3.16e11
## 5 lightning 1.73e11
## 6 flood 1.45e11
## 7 hurricane typhoon 6.93e10
## 8 flooding 5.92e10
## 9 storm surge 4.33e10
## 10 heavy snow 1.79e10
Table5. Top 10 crop damage events
Crop[,c("EVTYPE","crop_sum")]
## # A tibble: 10 x 2
## EVTYPE crop_sum
## <fct> <dbl>
## 1 drought 13972566000
## 2 flood 5661968450
## 3 river flood 5029459000
## 4 ice storm 5022113500
## 5 hail 3025974480
## 6 hurricane 2741910000
## 7 hurricane typhoon 2607872800
## 8 flash flood 1421317100
## 9 extreme cold 1312973000
## 10 frost freeze 1094186000
Plot for the top events
g1<-ggplot(Property,aes(x=reorder(EVTYPE,-pro_sum),y=pro_sum))+
geom_bar(stat="identity")+
xlab("Event type")+
ylab("Property damage in US dollar")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ggtitle("Top 10 property damage events")
g2<-ggplot(Crop,aes(x=reorder(EVTYPE,-crop_sum),y=crop_sum))+
geom_bar(stat="identity")+
xlab("Event type")+
ylab("Crop damage in US dollar")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ggtitle("Top 10 Crop damage events")
grid.arrange(g1,g2,nrow=1)