This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States between 1950 and 2011, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. (In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.)
This report identifies the weather events that cause the greatest harm to public health (i.e. injuries and fatalities) and the greatest economic damage (i.e. property and crop damage) in the United States. It shows that tornadoes cause the most harm to public health, in both injuries and fatalities. For economic losses, floods cause the highest overall economic damage: floods have the highest property damage cost, while droughts have the highest crop damage cost.
The data used for this analysis were retrieved from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. The following links point to the data set and its documentation.
Data set: Data
Documentation of data: Data documentation
First, load the packages that were used to run the analysis.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Load the data set into the environment and examine the properties of the data frame.
# Read the bz2-compressed storm data CSV (read.csv already assumes a header
# row and comma separators) and list the available column names.
Data <- read.csv("repdata_data_StormData.csv.bz2")
names(Data)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# Show the type and first few values of every column in the raw data frame.
str(Data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436781 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
# Report the number of rows and columns in the raw data frame.
dim(Data)
## [1] 902297 37
Filtering data: Selecting the columns that are useful for the data analysis.
# Keep only the location, date/time, event type, casualty, and damage columns
# that the rest of the analysis uses.
data <- Data %>%
  select(
    STATE__, BGN_DATE, BGN_TIME, COUNTY, STATE, EVTYPE,
    FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
  )
Checking the levels of PROPDMGEXP and CROPDMGEXP
# List the distinct exponent codes that occur in the property damage column.
unique(data$PROPDMGEXP)
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
# List the distinct exponent codes that occur in the crop damage column.
unique(data$CROPDMGEXP)
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
# Convert the damage-exponent codes (PROPDMGEXP / CROPDMGEXP) into numeric
# multipliers stored in PROPEXP_int / CROPEXP_int.
#
# A single lookup is shared by both columns so they are decoded consistently:
#   ""                    -> 1   (blank code: the value is used unscaled)
#   "+", "-", "?"         -> 0   (the damage is counted as zero)
#   "H", "K", "M", "B"    -> 1e2, 1e3, 1e6, 1e9 (matched case-insensitively)
#   "0" .. "8"            -> 10^digit
# Any other code yields NA, so unexpected values remain visible downstream
# instead of being silently scaled.
damage_multiplier <- function(code) {
  keys <- c("", "+", "-", "?", "H", "K", "M", "B",
            "0", "1", "2", "3", "4", "5", "6", "7", "8")
  multipliers <- c(1, 0, 0, 0, 100, 1000, 1e+06, 1e+09,
                   1, 10, 100, 1000, 10000, 1e+05, 1e+06, 1e+07, 1e+08)
  # match() is used instead of name-based indexing because an empty string
  # cannot be looked up as a vector name.
  multipliers[match(toupper(as.character(code)), keys)]
}

#PROP DATA
data$PROPEXP_int <- damage_multiplier(data$PROPDMGEXP)
#CROP DATA
data$CROPEXP_int <- damage_multiplier(data$CROPDMGEXP)
# Add the derived columns used for ranking events:
#   PROPDMG_new / CROPDMG_new : reported damage scaled by its multiplier
#   harmful                   : fatalities plus injuries
#   Econharm                  : scaled property damage plus scaled crop damage
data <- mutate(
  data,
  PROPDMG_new = PROPDMG * PROPEXP_int,
  CROPDMG_new = CROPDMG * CROPEXP_int,
  harmful = FATALITIES + INJURIES,
  Econharm = PROPDMG_new + CROPDMG_new
)
# Preview the first rows to check the newly added multiplier and damage columns.
head(data)
## STATE__ BGN_DATE BGN_TIME COUNTY STATE EVTYPE FATALITIES INJURIES
## 1 1 4/18/1950 0:00:00 0130 97 AL TORNADO 0 15
## 2 1 4/18/1950 0:00:00 0145 3 AL TORNADO 0 0
## 3 1 2/20/1951 0:00:00 1600 57 AL TORNADO 0 2
## 4 1 6/8/1951 0:00:00 0900 89 AL TORNADO 0 2
## 5 1 11/15/1951 0:00:00 1500 43 AL TORNADO 0 2
## 6 1 11/15/1951 0:00:00 2000 77 AL TORNADO 0 6
## PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP PROPEXP_int CROPEXP_int PROPDMG_new
## 1 25.0 K 0 1000 1 25000
## 2 2.5 K 0 1000 1 2500
## 3 25.0 K 0 1000 1 25000
## 4 2.5 K 0 1000 1 2500
## 5 2.5 K 0 1000 1 2500
## 6 2.5 K 0 1000 1 2500
## CROPDMG_new harmful Econharm
## 1 0 15 25000
## 2 0 0 2500
## 3 0 2 25000
## 4 0 2 2500
## 5 0 2 2500
## 6 0 6 2500
# Total the casualty and damage figures for every event type (EVTYPE).
sum_each <- summarise(
  group_by(data, EVTYPE),
  sumFATAL = sum(FATALITIES),
  sumINJUR = sum(INJURIES),
  sum_combine = sum(harmful),
  sumPROP = sum(PROPDMG_new),
  sumCROP = sum(CROPDMG_new),
  sum_Econcombine = sum(Econharm)
)
Finding the top 5 events with the highest damage for each variable.
# For each summary measure, sort the per-event totals in descending order and
# keep the five largest rows.

# Public health measures
fatal <- head(arrange(sum_each, desc(sumFATAL)), n = 5)
injur <- head(arrange(sum_each, desc(sumINJUR)), n = 5)
Total_dmg <- head(arrange(sum_each, desc(sum_combine)), n = 5)

# Economic measures
prop <- head(arrange(sum_each, desc(sumPROP)), n = 5)
crop <- head(arrange(sum_each, desc(sumCROP)), n = 5)
Total_Econdmg <- head(arrange(sum_each, desc(sum_Econcombine)), n = 5)