This project explores the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database which tracks characteristics of major storms and weather events in the United States, including when and where they occured, type of event, estimates of relevant fatalities, injuries, and property damage.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages ------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
library(R.utils)
## Warning: package 'R.utils' was built under R version 3.6.3
## Loading required package: R.oo
## Warning: package 'R.oo' was built under R version 3.6.2
## Loading required package: R.methodsS3
## Warning: package 'R.methodsS3' was built under R version 3.6.2
## R.methodsS3 v1.8.0 (2020-02-14 07:10:20 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.23.0 successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
##
## throw
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, load, save
## R.utils v2.9.2 successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
## warnings
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18362)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252
## [2] LC_CTYPE=English_United Kingdom.1252
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] lubridate_1.7.4 R.utils_2.9.2 R.oo_1.23.0 R.methodsS3_1.8.0
## [5] knitr_1.26 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3
## [9] purrr_0.3.3 readr_1.3.1 tidyr_1.0.0 tibble_2.1.3
## [13] ggplot2_3.2.1 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.11 haven_2.2.0 lattice_0.20-38
## [5] colorspace_1.4-1 vctrs_0.2.0 generics_0.0.2 htmltools_0.4.0
## [9] yaml_2.2.0 rlang_0.4.1 pillar_1.4.2 glue_1.3.1
## [13] withr_2.1.2 DBI_1.0.0 dbplyr_1.4.2 modelr_0.1.5
## [17] readxl_1.3.1 lifecycle_0.1.0 munsell_0.5.0 gtable_0.3.0
## [21] cellranger_1.1.0 rvest_0.3.5 evaluate_0.14 broom_0.5.2
## [25] Rcpp_1.0.3 scales_1.0.0 backports_1.1.5 jsonlite_1.6
## [29] fs_1.3.1 hms_0.5.2 digest_0.6.22 stringi_1.4.3
## [33] grid_3.6.1 cli_1.1.0 tools_3.6.1 magrittr_1.5
## [37] lazyeval_0.2.2 crayon_1.3.4 pkgconfig_2.0.3 zeallot_0.1.0
## [41] xml2_1.2.2 reprex_0.3.0 assertthat_0.2.1 rmarkdown_1.17
## [45] httr_1.4.1 rstudioapi_0.10 R6_2.4.1 nlme_3.1-140
## [49] compiler_3.6.1
First we will download the file directly from the url provided. This method is preferred to enhance reproducibility.
if(!file.exists("/repdata_data_StormData.csv.bz2")){
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
destfile="./repdata_data_StormData.csv.bz2")
}
Now let’s load the data
stormdata <- read.csv("repdata_data_StormData.csv.bz2", sep=",", header=TRUE)
head(stormdata)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1 0 0 NA
## 2 0 0 NA
## 3 0 0 NA
## 4 0 0 NA
## 5 0 0 NA
## 6 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1 0 14.0 100 3 0 0 15 25.0
## 2 0 2.0 150 2 0 0 0 2.5
## 3 0 0.1 123 2 0 0 2 25.0
## 4 0 0.0 100 2 0 0 2 2.5
## 5 0 0.0 150 2 0 0 2 2.5
## 6 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1 K 0 3040 8812
## 2 K 0 3042 8755
## 3 K 0 3340 8742
## 4 K 0 3458 8626
## 5 K 0 3412 8642
## 6 K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3051 8806 1
## 2 0 0 2
## 3 0 0 3
## 4 0 0 4
## 5 0 0 5
## 6 0 0 6
According to our instructors: “There are 902297 records on 37 variables if you get anything else then you need to check your reading step before you go farther.” Let’s check if all is ok.
str(stormdata)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436781 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Ok, we are good to go.
As oriented by our instructors we’ll want to filter only the results newer than 1996. The first step to do that is separating the BGN_DATE to a Date format so we can filter it.
stormdata$BGN_DATE <- as.Date(stormdata$BGN_DATE, format = "%m/%d/%Y %H:%M:%S")
str(stormdata$BGN_DATE)
## Date[1:902297], format: "1950-04-18" "1950-04-18" "1951-02-20" "1951-06-08" "1951-11-15" ...
Now we filter events from 96 onward, I will use the last day of 1995 as cutting point
stormdata96 <- filter(stormdata, BGN_DATE > "1995-12-31")
Down to 353530 obs.
Now we will select only the variables we are interested
sd96 <- select(stormdata96, BGN_DATE, EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
#Remove unused dataset
remove(stormdata96)
Now let’s try to clean up the messy data on EVTYPE
sd96 <- sd96 %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Astronomical Low Tide", ignore_case = TRUE)) == TRUE, "Astronomical Low Tide", as.character(EVTYPE))) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Avalanche", ignore_case = TRUE)) == TRUE, "Avalanche", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Blizzard", ignore_case = TRUE)) == TRUE, "Blizzard", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Coastal Flood", ignore_case = TRUE)) == TRUE, "Coastal Flood", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Extreme Cold", ignore_case = TRUE)) == TRUE, "Extreme Cold/Wind Chill", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Cold", ignore_case = TRUE)) == TRUE, "Cold/Wind Chill", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Debris Flow", ignore_case = TRUE)) == TRUE, "Debris Flow", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Dense Fog", ignore_case = TRUE)) == TRUE, "Dense Fog", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Dense Smoke", ignore_case = TRUE)) == TRUE, "Dense Smoke", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Debris Flow", ignore_case = TRUE)) == TRUE, "Debris Flow", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Drought", ignore_case = TRUE)) == TRUE, "Drought", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Dust Devil", ignore_case = TRUE)) == TRUE, "Dust Devil", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Dust Devel", ignore_case = TRUE)), "Dust Devil",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Dust Storm", ignore_case = TRUE)) == TRUE, "Dust Storm", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Excessive Heat", ignore_case = TRUE)) == TRUE, "Excessive Heat", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Flash Flood", ignore_case = TRUE)) == TRUE, "Flash Flood", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Flood", ignore_case = TRUE)) == TRUE, "Flood", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Frost", ignore_case = TRUE)) == TRUE, "Frost/Freeze", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Freeze", ignore_case = TRUE)) == TRUE, "Frost/Freeze", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Funnel Cloud", ignore_case = TRUE)) == TRUE, "Funnel Cloud", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Freezing Fog", ignore_case = TRUE)) == TRUE, "Freezing Fog", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Hail", ignore_case = TRUE)) == TRUE, "Hail", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Heat", ignore_case = TRUE)) == TRUE, "Heat", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Heavy Rain", ignore_case = TRUE)) == TRUE, "Heavy Rain", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Rain (heavy)", ignore_case = TRUE)) == TRUE, "Heavy Rain", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Heavy Snow", ignore_case = TRUE)) == TRUE, "Heavy Snow", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("High Surf", ignore_case = TRUE)) == TRUE, "High Surf", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("High Wind", ignore_case = TRUE)) == TRUE, "High Wind", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Hurricane", ignore_case = TRUE)) == TRUE, "Hurricane (Typhoon)", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Typhoon", ignore_case = TRUE)), "Hurricane (Typhoon)",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Ice Storm", ignore_case = TRUE)) == TRUE, "Ice Storm", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Lake-Effect Snow", ignore_case = TRUE)) == TRUE, "Lake-Effect Snow", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Lake Effect Snow", ignore_case = TRUE)), "Lake-Effect Snow",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Lakeshore Flood", ignore_case = TRUE)) == TRUE, "Lakeshore Flood", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Lightning", ignore_case = TRUE)) == TRUE, "Lightning", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Marine Hail", ignore_case = TRUE)) == TRUE, "Marine Hail", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Marine High Wind", ignore_case = TRUE)) == TRUE, "Marine High Wind", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Marine Strong Wind", ignore_case = TRUE)) == TRUE, "Marine Strong Wind", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Marine Thunderstorm Wind", ignore_case = TRUE)) == TRUE, "Marine Thunderstorm Wind", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Rip Current", ignore_case = TRUE)) == TRUE, "Rip Current", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Seiche", ignore_case = TRUE)) == TRUE, "Seiche", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Sleet", ignore_case = TRUE)) == TRUE, "Sleet", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Storm Surge", ignore_case = TRUE)) == TRUE, "Storm Surge/Tide", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Strong Wind", ignore_case = TRUE)) == TRUE, "Strong Wind", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Thunderstorm Wind", ignore_case = TRUE)) == TRUE, "Thunderstorm Wind", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Tornado", ignore_case = TRUE)) == TRUE, "Tornado", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Tropical Depression", ignore_case = TRUE)) == TRUE, "Tropical Depression", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Tropical Storm", ignore_case = TRUE)) == TRUE, "Tropical Storm", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Tsunami", ignore_case = TRUE)) == TRUE, "Tsunami", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Volcanic Ash", ignore_case = TRUE)) == TRUE, "Volcanic Ash", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Waterspout", ignore_case = TRUE)) == TRUE, "Waterspout", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Wildfire", ignore_case = TRUE)) == TRUE, "Wildfire", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Winter Storm", ignore_case = TRUE)) == TRUE, "Winter Storm", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Winter Weather", ignore_case = TRUE)) == TRUE, "Winter Weather", EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("mix", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("summary", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("record", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("tstm", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("UNSEASONABLY", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("monthly", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("moderate", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("other", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("and", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("unusual", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("urban", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("season", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("damage", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("light", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("damage", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("jam", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("damage", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("forest", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("vog", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("very", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("accu", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("year", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("wall", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("wet", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("burst", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("blow", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("DROWNING", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Dry", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("mud", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("severe", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("WIND", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("gust", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("warm", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("wake", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("advisory", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("torrential", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("WINDS", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("WND", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("erosion", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("spell", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("severe", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("cool", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("month", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("excessive", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("first", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("drifting", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("early", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("road", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("sea", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("none", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("late", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("metro", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("prolonged", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("mountain", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("glaze", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("drizzle", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("pellet", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("severe", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("saharan", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("flag", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("patchy", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("severe", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("slide", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("eruption", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("rough", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("exposure", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("swells", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("ICE", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("FOG", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("heavy", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("SNOW", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("squall", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("rogue", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("RAIN", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("heavy", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("SMOKE", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("high water", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("COASTAL", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("THUNDERSTORM", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("hot", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("ASTRONOMICAL", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("DAM BREAK", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("FLOYD", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("BRUSH", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("stream", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Ice/Snow", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("hazardous", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Black Ice", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("accident", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("gradient", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Coastal Storm", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Whirlwind", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("ice fog", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Freezing rain", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
mutate(EVTYPE = as.factor(EVTYPE))
levels(sd96$EVTYPE)
## [1] "Astronomical Low Tide" "Avalanche" "Blizzard"
## [4] "Cold/Wind Chill" "Dense Fog" "Dense Smoke"
## [7] "Dust Devil" "Dust Storm" "Flood"
## [10] "Freezing Fog" "Freezing Spray" "Frost/Freeze"
## [13] "Funnel Cloud" "Hail" "Heat"
## [16] "High Surf" "High Wind" "Hurricane (Typhoon)"
## [19] "Ice Storm" "Lake-Effect Snow" "non-standardized"
## [22] "Rip Current" "Seiche" "Sleet"
## [25] "Snow" "Storm Surge/Tide" "Strong Wind"
## [28] "Thundersnow shower" "Thunderstorm Wind" "Tornado"
## [31] "Tropical Depression" "Tropical Storm" "Tsunami"
## [34] "Volcanic Ash" "Waterspout" "Wildfire"
## [37] "Wind" "Winter Storm" "Winter Weather"
We are down to 39 event types, It is actually less than the 48 types that exist and I opted to add an “non-standardized” classification for the ones I was not able to identify. Either way if you run the code I used above you will have the same results.
Now let’s deal with the other data inconsistencies Fixing PROPDMGEXP and CROPDMGEXP
sd96 <- sd96 %>% mutate(PROPDMGEXP = as.character(PROPDMGEXP)) %>%
mutate(PROPDMGEXP = if_else(str_detect(PROPDMGEXP, coll("b", ignore_case = TRUE)) == TRUE, "9", PROPDMGEXP)) %>%
mutate(PROPDMGEXP = if_else(str_detect(PROPDMGEXP, coll("m", ignore_case = TRUE)) == TRUE, "6", PROPDMGEXP)) %>%
mutate(PROPDMGEXP = if_else(str_detect(PROPDMGEXP, coll("k", ignore_case = TRUE)) == TRUE, "3", PROPDMGEXP)) %>%
mutate(PROPDMGEXP = if_else(str_detect(PROPDMGEXP, coll("h", ignore_case = TRUE)) == TRUE, "2", PROPDMGEXP)) %>%
mutate(PROPDMGEXP = if_else(str_detect(PROPDMGEXP, coll(" ", ignore_case = TRUE)) == TRUE, "0", PROPDMGEXP)) %>%
mutate(PROPDMGEXP = as.numeric(PROPDMGEXP)) %>%
mutate(PROPDMGEXP = if_else(is.na(PROPDMGEXP) == TRUE, 0, PROPDMGEXP)) %>%
mutate(TOTALPROPDMG = PROPDMG * 10^PROPDMGEXP)
str(sd96$PROPDMGEXP)
## num [1:653530] 3 3 3 3 3 0 3 3 3 3 ...
str(sd96$TOTALPROPDMG)
## num [1:653530] 380000 100000 3000 5000 2000 0 400000 12000 8000 12000 ...
sd96 <- sd96 %>% mutate(CROPDMGEXP = as.character(CROPDMGEXP)) %>%
mutate(CROPDMGEXP = if_else(str_detect(CROPDMGEXP, coll("b", ignore_case = TRUE)) == TRUE, "9", CROPDMGEXP)) %>%
mutate(CROPDMGEXP = if_else(str_detect(CROPDMGEXP, coll("m", ignore_case = TRUE)) == TRUE, "6", CROPDMGEXP)) %>%
mutate(CROPDMGEXP = if_else(str_detect(CROPDMGEXP, coll("k", ignore_case = TRUE)) == TRUE, "3", CROPDMGEXP)) %>%
mutate(CROPDMGEXP = if_else(str_detect(CROPDMGEXP, coll("h", ignore_case = TRUE)) == TRUE, "2", CROPDMGEXP)) %>%
mutate(CROPDMGEXP = if_else(str_detect(CROPDMGEXP, coll(" ", ignore_case = TRUE)) == TRUE, "0", CROPDMGEXP)) %>%
mutate(CROPDMGEXP = as.numeric(CROPDMGEXP)) %>%
mutate(CROPDMGEXP = if_else(is.na(CROPDMGEXP) == TRUE, 0, CROPDMGEXP)) %>%
mutate(TOTALCROPDMG = CROPDMG * 10^CROPDMGEXP)
str(sd96$CROPDMGEXP)
## num [1:653530] 3 0 0 0 0 0 0 0 0 0 ...
str(sd96$TOTALCROPDMG)
## num [1:653530] 38000 0 0 0 0 0 0 0 0 0 ...
head(sd96)
## BGN_DATE EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG
## 1 1996-01-06 Winter Storm 0 0 380 3 38
## 2 1996-01-11 Tornado 0 0 100 3 0
## 3 1996-01-11 non-standardized 0 0 3 3 0
## 4 1996-01-11 non-standardized 0 0 5 3 0
## 5 1996-01-11 non-standardized 0 0 2 3 0
## 6 1996-01-18 Hail 0 0 0 0 0
## CROPDMGEXP TOTALPROPDMG TOTALCROPDMG
## 1 3 380000 38000
## 2 0 100000 0
## 3 0 3000 0
## 4 0 5000 0
## 5 0 2000 0
## 6 0 0 0
Your data analysis must address the following questions:
fatality <- sd96 %>% group_by(EVTYPE) %>% summarise(sum = sum(FATALITIES)) %>% arrange(desc(sum))
top10_fatal <- fatality[1:10, ]
ggplot(top10_fatal, aes(x = reorder(EVTYPE,sum), y = sum, fill = EVTYPE)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Top 10 Weather Incidents with the most Fatalities", y = "Number of Fatalities", x = "")
injury <- sd96 %>% group_by(EVTYPE) %>% summarise(sum = sum(INJURIES)) %>% arrange(desc(sum))
top10_injury <- injury[1:10, ]
ggplot(top10_injury, aes(x = reorder(EVTYPE,sum), y = sum, fill = EVTYPE)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Top 10 Weather Incidents with the most Injuries", y = "Number of Injuries", x = "")
propertydmg <- sd96 %>% group_by(EVTYPE) %>% summarise(sum = sum(TOTALPROPDMG)) %>% arrange(desc(sum))
top10_propdmg <- propertydmg[1:10, ]
ggplot(top10_propdmg, aes(x = reorder(EVTYPE,sum), y = sum, fill = EVTYPE)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Top 10 Weather Incidents with the most property damage", y = "Damage in $", x = "")
cropdmg <- sd96 %>% group_by(EVTYPE) %>% summarise(sum = sum(TOTALCROPDMG)) %>% arrange(desc(sum))
top10_cropdmg <- cropdmg[1:10, ]
top10_cropdmg
## # A tibble: 10 x 2
## EVTYPE sum
## <fct> <dbl>
## 1 non-standardized 14893278520
## 2 Flood 6339575100
## 3 Hurricane (Typhoon) 5350107800
## 4 Hail 2561518700
## 5 Frost/Freeze 1368761000
## 6 Cold/Wind Chill 1339765500
## 7 Tropical Storm 677711000
## 8 High Wind 633561300
## 9 Heat 492578500
## 10 Thunderstorm Wind 398381000