The National Weather service gathers information regarding significant storm events in the United States. Data is taken from a variety of sources, such as governmental agencies, law enforcement agencies and media organizations, etc. This database includes counts of fatalities, injuries, property damage and crop damage on significant US weather events. The following report contains the r code necessary to process and plot the data, ultimately answering which types of events cause the most fatalities, injuries, property damage and crop damage.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if(!file.exists("repdata-data-StormData.csv.bz2")){
download.file(url, destfile = "repdata-data-StormData.csv.bz2")
}
data <- read.csv("repdata-data-StormData.csv.bz2")
names(data) <- tolower(names(data))
data$bgn_date <- as.Date(data$bgn_date, "%d/%m/%Y")
Property and crop damage estimates are recorded as a numeric value and character value???K, M, B, that means thousands, million and billion. Let’s combine this two columns for a more interpretable economic cost.
At first we create function that helps us to build a new columns with numerical values of property and crop damages. After, we summarize these columns and get the result in the form of overall economic losses from each event type.
#At first we create function that helps us to build a new column with numerical values of property and crop damages.
data_dmg <- function(x, y) {
if(x == "K") {
y * 1e3
} else if(x == "M") {
y * 1e6
} else if(y == "B") {
y * 1e9
}else {
y = 0
}
}
#receiving a numerical value of property and crop damage from two columns(propdmgexp and propdmg), and get overall economic losses from each event type.
data$propdmgexp <- as.character(data$propdmgexp)
data$property_dmg <- mapply(data_dmg, data$propdmgexp, data$propdmg)
data$cropdmgexp <- as.character(data$cropdmgexp)
data$crop_dmg <- mapply(data_dmg, data$cropdmgexp, data$cropdmg)
data$economic_dmg <- data$property_dmg + data$crop_dmg
Now we specify the columns that we need for the analysis
data <- data%>%
select(evtype, fatalities, injuries, economic_dmg)
In this dataset we have a huge problem associated with the amount of event types. And most of them are simply a repetition. The first thing we do is try to reduce the number of types.
# Consolidate event types. Change variant names into common ones.
data[grepl("THUND", data$evtype, ignore.case=T), c("evtype")] <- "THUNDERSTORM"
data[grepl("TSTM.*", data$evtype, ignore.case=T), c("evtype")] <- "THUNDERSTORM"
data[grepl("COLD|COOL", data$evtype, ignore.case=T), c("evtype")] <- "COLD"
data[grepl("HURRICANE", data$evtype, ignore.case=T), c("evtype")] <- "HURRICANE"
data[grepl("TORNADO", data$evtype, ignore.case=T), c("evtype")] <- "TORNADO"
data[grepl("NADO", data$evtype, ignore.case=T), c("evtype")] <- "TORNADO"
data[grepl("WA.*ER.*SPOUT", data$evtype, ignore.case=T), c("evtype")] <- "WATERSPOUT"
data[grepl("HAIL", data$evtype, ignore.case=T), c("evtype")] <- "HAIL"
data[grepl("DRY|DRI", data$evtype, ignore.case=T), c("evtype")] <- "DRY WEATHER"
data[grepl("WARM", data$evtype, ignore.case=T), c("evtype")] <- "WARM WEATHER"
data[grepl("ICE|ICY|SLEET|FREEZ|FROST", data$evtype, ignore.case=T), c("evtype")] <- "ICY WEATHER"
data[grepl("TROPICAL STORM", data$evtype, ignore.case=T), c("evtype")] <- "TROPICAL STORM"
data[grepl("WET", data$evtype, ignore.case=T), c("evtype")] <- "WET WEATHER"
data[grepl("TIDE|SURF", data$evtype, ignore.case=T), c("evtype")] <- "TIDES OR SURF"
data[grepl("RAIN|PRECIP", data$evtype, ignore.case=T), c("evtype")] <- "RAINY WEATHER"
data[grepl("COAST.*FLOOD", data$evtype, ignore.case=T), c("evtype")] <- "COASTAL FLOODING"
data[grepl("FLOOD|FLDG", data$evtype, ignore.case=T), c("evtype")] <- "FLOODING"
data[grepl("(RAPI|HIG).*WATER", data$evtype, ignore.case=T), c("evtype")] <- "FLOODING"
data[grepl("BLIZZARD", data$evtype, ignore.case=T), c("evtype")] <- "BLIZZARD"
data[grepl("WIND.*CHILL", data$evtype, ignore.case=T), c("evtype")] <- "WIND CHILL"
data[grepl("WIND", data$evtype, ignore.case=T), c("evtype")] <- "STRONG WINDS"
data[grepl("SNOW", data$evtype, ignore.case=T), c("evtype")] <- "SNOW"
data[grepl("FIRE", data$evtype, ignore.case=T), c("evtype")] <- "WILDFIRE"
data[grepl("FOG|VOG", data$evtype, ignore.case=T), c("evtype")] <- "FOG"
data[grepl("VOLCAN", data$evtype, ignore.case=T), c("evtype")] <- "VOLCANIC ERUPTION OR ASH"
data[grepl("CLOUD", data$evtype, ignore.case=T), c("evtype")] <- "CLOUDS"
data[grepl("HEAT|HOT", data$evtype, ignore.case=T), c("evtype")] <- "HEAT"
data[grepl("LIG.*ING", data$evtype, ignore.case=T), c("evtype")] <- "LIGHTNING"
data[grepl("DUST", data$evtype, ignore.case=T), c("evtype")] <- "DUST STORM"
data[grepl("SURGE", data$evtype, ignore.case=T), c("evtype")] <- "COASTAL FLOODING"
data[grepl("WINT", data$evtype, ignore.case=T), c("evtype")] <- "WINTER WEATHER"
data[grepl("AVALAN", data$evtype, ignore.case=T), c("evtype")] <- "AVALANCHE"
# Clean event types. Summarize all the data on the type of event, and delete all rows with no data
cleandata <- data %>%
group_by(evtype)%>%
summarise(count = n(), deaths = sum(fatalities), injuries = sum(injuries),
economic_dmg = sum(economic_dmg)) %>%
arrange(desc(deaths)) %>%
filter(deaths > 0 & injuries > 0 & economic_dmg > 0)
# The event types
cleandata$evtype
## [1] TORNADO HEAT FLOODING
## [4] LIGHTNING THUNDERSTORM STRONG WINDS
## [7] COLD RIP CURRENT WINTER WEATHER
## [10] AVALANCHE RIP CURRENTS TIDES OR SURF
## [13] SNOW HURRICANE ICY WEATHER
## [16] RAINY WEATHER BLIZZARD WILDFIRE
## [19] FOG TROPICAL STORM LANDSLIDE
## [22] DRY WEATHER TSUNAMI URBAN/SML STREAM FLD
## [25] DUST STORM HAIL COASTAL FLOODING
## [28] WARM WEATHER GLAZE HIGH SEAS
## [31] WATERSPOUT LANDSLIDES Marine Accident
## 192 Levels: ? APACHE COUNTY AVALANCHE BEACH EROSIN ... WND
# The summary of new clean dataset
summary(cleandata)
## evtype count deaths injuries
## AVALANCHE : 1 Min. : 1 Min. : 1.0 Min. : 1
## BLIZZARD : 1 1st Qu.: 313 1st Qu.: 24.0 1st Qu.: 79
## COASTAL FLOODING: 1 Median : 1835 Median : 101.0 Median : 320
## COLD : 1 Mean : 27038 Mean : 457.5 Mean : 4257
## DRY WEATHER : 1 3rd Qu.: 11991 3rd Qu.: 278.0 3rd Qu.: 1608
## DUST STORM : 1 Max. :336822 Max. :5636.0 Max. :91407
## (Other) :27
## economic_dmg
## Min. :1.000e+03
## 1st Qu.:6.893e+06
## Median :7.640e+08
## Mean :5.232e+09
## 3rd Qu.:3.259e+09
## Max. :5.210e+10
##
# First 10 rows
head(cleandata,10)
## # A tibble: 10 <U+00D7> 5
## evtype count deaths injuries economic_dmg
## <fctr> <int> <dbl> <dbl> <dbl>
## 1 TORNADO 60705 5636 91407 52096662590
## 2 HEAT 2662 3138 9224 524795030
## 3 FLOODING 82692 1528 8602 46270347420
## 4 LIGHTNING 15763 817 5231 940791370
## 5 THUNDERSTORM 336822 756 9545 11048485930
## 6 STRONG WINDS 26570 470 1953 5608409390
## 7 COLD 2484 451 320 1668345000
## 8 RIP CURRENT 470 368 232 1000
## 9 WINTER WEATHER 19690 278 1953 1758751750
## 10 AVALANCHE 387 225 170 3721800
As we can see from the last table, after data cleaning, we can build a plot of the most dangerous events to human health. We plot a top 15 Storm Event, from which it is clear that the leader of this ranking is the tornado that during the reporting period took more than 5,500 lives. Only the first three participants this ranking have brought more deaths than 1000 people each. The rest are below of this mark.
cleandata$evtype <- tolower(cleandata$evtype)
top15 <- cleandata[1:15,]
ggplot(data = top15, aes(x = reorder(evtype, deaths), y = deaths))+
geom_point(aes(colour=evtype, size=deaths))+
scale_y_continuous(breaks = seq(0, 6000, by = 1000))+
coord_flip()+
theme_bw()+
xlab("Storm Event")+
ylab("Deaths")+
ggtitle("Deaths By Different Weather Event")
Tornadoes also is the undisputed leader in number of injury to more than 9,000 peple during the reporting period. All other participants in ranking are located below the 1000 injury.
cleandata <- cleandata %>% arrange(desc(injuries))
top15 <- cleandata[1:15,]
ggplot(data = top15, aes(x = reorder(evtype, injuries), y = injuries))+
geom_point(aes(colour=evtype, size=injuries))+
scale_y_continuous(breaks = seq(0, 95000, by = 10000))+
coord_flip()+
theme_bw()+
xlab("Storm Event")+
ylab("Injuries")+
ggtitle("Injuries By Different Weather Event")
Next plot shows us the top 15 economic damage events by the reporting period.
cleandata <- cleandata %>% arrange(desc(economic_dmg))
top15 <- cleandata[1:15,]
ggplot(data = top15, aes(x = reorder(evtype, economic_dmg), y = economic_dmg/1000000000))+
geom_point(aes(colour=evtype, size=economic_dmg))+
coord_flip()+
theme_bw()+
xlab("Storm Event")+
ylab("Economic (Property & Crop) Damage in Billions")+
ggtitle("Economic Damage By Different Weather Event")
top15
## # A tibble: 15 <U+00D7> 5
## evtype count deaths injuries economic_dmg
## <chr> <int> <dbl> <dbl> <dbl>
## 1 tornado 60705 5636 91407 52096662590
## 2 flooding 82692 1528 8602 46270347420
## 3 hail 289283 15 1371 17216134320
## 4 hurricane 288 135 1328 14331472810
## 5 thunderstorm 336822 756 9545 11048485930
## 6 wildfire 4240 90 1608 6364910130
## 7 icy weather 4210 120 2223 5837637360
## 8 strong winds 26570 470 1953 5608409390
## 9 tropical storm 697 66 383 3259286550
## 10 winter weather 19690 278 1953 1758751750
## 11 cold 2484 451 320 1668345000
## 12 rainy weather 11991 108 308 1659508490
## 13 snow 17512 138 1107 1136942790
## 14 lightning 15763 817 5231 940791370
## 15 blizzard 2742 101 806 776973950