Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.
This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
The data for this assignment come in the form of a comma-separated-value file compressed via the bzip2 algorithm to reduce its size. You can download the file from the course web site:
We will focus on answering the following questions:
so let’s begin…
library(tidyverse)
## -- Attaching packages ------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 0.8.3
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts --------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
Note: we will be working in the current working directory, but you can change it if you wish.
# Fetch the compressed NOAA storm data (only when it is not already present in
# the working directory) and load it into `df_raw`.
#
# The workspace is deliberately NOT cleared with rm(list = ls()): wiping the
# global environment destroys whatever the user had loaded and does not give
# a truly fresh session anyway (attached packages and options survive).
storm_archive <- "repdata_data_StormData.csv.bz2"
if (!file.exists(storm_archive)) {
  fileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
  # Use R's default download method instead of requiring an external `curl`
  # binary, and mode = "wb" so the bzip2 archive is written byte-for-byte
  # (text mode can corrupt binary files on Windows).
  download.file(fileURL, destfile = storm_archive, mode = "wb")
}
# bzfile() decompresses on the fly; strings stay as character vectors.
df_raw <- read.csv(
  bzfile(storm_archive),
  header = TRUE,
  stringsAsFactors = FALSE
)
# Per-event-type totals of fatalities and injuries, keeping only the event
# types that recorded at least one fatality or one injury.
harmful_event <- df_raw %>%
  select(EVTYPE, FATALITIES, INJURIES) %>%
  group_by(EVTYPE) %>%
  summarize(
    FATALITIES = sum(FATALITIES, na.rm = TRUE),
    INJURIES = sum(INJURIES, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  filter(FATALITIES > 0 | INJURIES > 0)
The six event types with the most fatalities are:
# Event types ranked by total fatalities; print the first six rows.
harmful_event %>%
  select(EVTYPE, FATALITIES) %>%
  arrange(desc(FATALITIES)) %>%
  head(n = 6)
## # A tibble: 6 x 2
## EVTYPE FATALITIES
## <chr> <dbl>
## 1 TORNADO 5633
## 2 EXCESSIVE HEAT 1903
## 3 FLASH FLOOD 978
## 4 HEAT 937
## 5 LIGHTNING 816
## 6 TSTM WIND 504
and the six event types with the most injuries are:
# Event types ranked by total injuries; print the first six rows.
harmful_event %>%
  select(EVTYPE, INJURIES) %>%
  arrange(desc(INJURIES)) %>%
  head(n = 6)
## # A tibble: 6 x 2
## EVTYPE INJURIES
## <chr> <dbl>
## 1 TORNADO 91346
## 2 TSTM WIND 6957
## 3 FLOOD 6789
## 4 EXCESSIVE HEAT 6525
## 5 LIGHTNING 5230
## 6 HEAT 2100
In the same way, we can visualize the top 10 event types for each of these two variables:
# Bar chart of the ten event types with the highest fatality totals.
# reorder(..., -FATALITIES) sorts the bars from largest to smallest.
fat <- harmful_event %>%
  select(EVTYPE, FATALITIES) %>%
  arrange(desc(FATALITIES)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(EVTYPE, -FATALITIES), y = FATALITIES)) +
  geom_col() +
  labs(
    x = "Weather Event Type",
    y = "Number of Fatalities",
    title = "Top 10 Fatalities"
  ) +
  # Tilt the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bar chart of the ten event types with the highest injury totals.
# reorder(..., -INJURIES) sorts the bars from largest to smallest.
inj <- harmful_event %>%
  select(EVTYPE, INJURIES) %>%
  arrange(desc(INJURIES)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(EVTYPE, -INJURIES), y = INJURIES)) +
  geom_col() +
  labs(
    x = "Weather Event Type",
    y = "Number of Injuries",
    title = "Top 10 Injuries"
  ) +
  # Tilt the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Draw the fatalities and injuries charts side by side in a single row.
grid.arrange(fat, inj, nrow = 1)
# Use the PROPDMG and CROPDMG columns to answer the previous question.
#
# PROPDMG / CROPDMG hold only the leading figures of each damage estimate; the
# magnitude is stored separately in PROPDMGEXP / CROPDMGEXP ("K" = thousands,
# "M" = millions, "B" = billions). Summing the raw figures mixes thousands
# with billions, so each value is scaled before aggregating.
# NOTE: any printed tables captured before this fix show the unscaled sums and
# need to be regenerated.

# Translate a vector of magnitude codes into numeric multipliers.
# Matching is case-insensitive and ignores surrounding whitespace.
# NOTE(review): codes other than H/K/M/B (blank, digits, "+", "-", "?") are
# undocumented; they are treated as "no scaling" (multiplier 1) here --
# confirm the preferred treatment.
damage_multiplier <- function(exp_code) {
  known_codes <- c("H", "K", "M", "B")
  known_values <- c(1e2, 1e3, 1e6, 1e9)
  multiplier <- known_values[match(toupper(trimws(exp_code)), known_codes)]
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

# Per-event-type totals of scaled property and crop damage, keeping only the
# event types with a positive total in at least one of the two columns.
DMG <- df_raw %>%
  select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) %>%
  group_by(EVTYPE) %>%
  summarize(
    PROPDMG = sum(PROPDMG * damage_multiplier(PROPDMGEXP), na.rm = TRUE),
    CROPDMG = sum(CROPDMG * damage_multiplier(CROPDMGEXP), na.rm = TRUE)
  ) %>%
  ungroup() %>%
  filter(PROPDMG > 0 | CROPDMG > 0)
The six event types with the highest property damage are:
# Event types ranked by summed PROPDMG; print the first six rows.
DMG %>%
  select(EVTYPE, PROPDMG) %>%
  arrange(desc(PROPDMG)) %>%
  head(n = 6)
## # A tibble: 6 x 2
## EVTYPE PROPDMG
## <chr> <dbl>
## 1 TORNADO 3212258.
## 2 FLASH FLOOD 1420125.
## 3 TSTM WIND 1335966.
## 4 FLOOD 899938.
## 5 THUNDERSTORM WIND 876844.
## 6 HAIL 688693.
and the six event types with the highest crop damage are:
# Event types ranked by summed CROPDMG; print the first six rows.
DMG %>%
  select(EVTYPE, CROPDMG) %>%
  arrange(desc(CROPDMG)) %>%
  head(n = 6)
## # A tibble: 6 x 2
## EVTYPE CROPDMG
## <chr> <dbl>
## 1 HAIL 579596.
## 2 FLASH FLOOD 179200.
## 3 FLOOD 168038.
## 4 TSTM WIND 109203.
## 5 TORNADO 100019.
## 6 THUNDERSTORM WIND 66791.
In the same way, we can visualize the top 10 event types for each of these two variables:
# Bar chart of the ten event types with the highest summed PROPDMG.
# reorder(..., -PROPDMG) sorts the bars from largest to smallest.
pro <- DMG %>%
  select(EVTYPE, PROPDMG) %>%
  arrange(desc(PROPDMG)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(EVTYPE, -PROPDMG), y = PROPDMG)) +
  geom_col() +
  labs(
    x = "Weather Event Type",
    y = "Property Damage",
    title = "Top 10 Property Damage"
  ) +
  # Tilt the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bar chart of the ten event types with the highest summed CROPDMG.
# reorder(..., -CROPDMG) sorts the bars from largest to smallest.
cro <- DMG %>%
  select(EVTYPE, CROPDMG) %>%
  arrange(desc(CROPDMG)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(EVTYPE, -CROPDMG), y = CROPDMG)) +
  geom_bar(stat = "identity") +
  xlab("Weather Event Type") +
  # This axis shows crop damage; it was previously mislabelled
  # "Property Damage" (copied from the property chart).
  ylab("Crop Damage") +
  # Tilt the event names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Top 10 Crop Damage")

# Draw the property and crop damage charts side by side in a single row.
grid.arrange(pro, cro, nrow = 1)