Synopsis

This project explores the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database which tracks characteristics of major storms and weather events in the United States, including when and where they occured, type of event, estimates of relevant fatalities, injuries, and property damage.

Load needed packages

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages ------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
library(R.utils)
## Warning: package 'R.utils' was built under R version 3.6.3
## Loading required package: R.oo
## Warning: package 'R.oo' was built under R version 3.6.2
## Loading required package: R.methodsS3
## Warning: package 'R.methodsS3' was built under R version 3.6.2
## R.methodsS3 v1.8.0 (2020-02-14 07:10:20 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.23.0 successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
## 
##     throw
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## The following objects are masked from 'package:base':
## 
##     attach, detach, load, save
## R.utils v2.9.2 successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:utils':
## 
##     timestamp
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
##     warnings
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date

Session information provided to enhance the reproducibility

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18362)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252 
## [2] LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lubridate_1.7.4   R.utils_2.9.2     R.oo_1.23.0       R.methodsS3_1.8.0
##  [5] knitr_1.26        forcats_0.4.0     stringr_1.4.0     dplyr_0.8.3      
##  [9] purrr_0.3.3       readr_1.3.1       tidyr_1.0.0       tibble_2.1.3     
## [13] ggplot2_3.2.1     tidyverse_1.3.0  
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5 xfun_0.11        haven_2.2.0      lattice_0.20-38 
##  [5] colorspace_1.4-1 vctrs_0.2.0      generics_0.0.2   htmltools_0.4.0 
##  [9] yaml_2.2.0       rlang_0.4.1      pillar_1.4.2     glue_1.3.1      
## [13] withr_2.1.2      DBI_1.0.0        dbplyr_1.4.2     modelr_0.1.5    
## [17] readxl_1.3.1     lifecycle_0.1.0  munsell_0.5.0    gtable_0.3.0    
## [21] cellranger_1.1.0 rvest_0.3.5      evaluate_0.14    broom_0.5.2     
## [25] Rcpp_1.0.3       scales_1.0.0     backports_1.1.5  jsonlite_1.6    
## [29] fs_1.3.1         hms_0.5.2        digest_0.6.22    stringi_1.4.3   
## [33] grid_3.6.1       cli_1.1.0        tools_3.6.1      magrittr_1.5    
## [37] lazyeval_0.2.2   crayon_1.3.4     pkgconfig_2.0.3  zeallot_0.1.0   
## [41] xml2_1.2.2       reprex_0.3.0     assertthat_0.2.1 rmarkdown_1.17  
## [45] httr_1.4.1       rstudioapi_0.10  R6_2.4.1         nlme_3.1-140    
## [49] compiler_3.6.1

Data processing

First we will download the file directly from the url provided. This method is preferred to enhance reproducibility.

if(!file.exists("/repdata_data_StormData.csv.bz2")){
  download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
                destfile="./repdata_data_StormData.csv.bz2")
}

Now let’s load the data

stormdata <- read.csv("repdata_data_StormData.csv.bz2", sep=",", header=TRUE)
head(stormdata)
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE  EVTYPE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL TORNADO
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL TORNADO
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL TORNADO
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL TORNADO
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL TORNADO
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL TORNADO
##   BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1         0                                               0         NA
## 2         0                                               0         NA
## 3         0                                               0         NA
## 4         0                                               0         NA
## 5         0                                               0         NA
## 6         0                                               0         NA
##   END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1         0                      14.0   100 3   0          0       15    25.0
## 2         0                       2.0   150 2   0          0        0     2.5
## 3         0                       0.1   123 2   0          0        2    25.0
## 4         0                       0.0   100 2   0          0        2     2.5
## 5         0                       0.0   150 2   0          0        2     2.5
## 6         0                       1.5   177 2   0          0        6     2.5
##   PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1          K       0                                         3040      8812
## 2          K       0                                         3042      8755
## 3          K       0                                         3340      8742
## 4          K       0                                         3458      8626
## 5          K       0                                         3412      8642
## 6          K       0                                         3450      8748
##   LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1       3051       8806              1
## 2          0          0              2
## 3          0          0              3
## 4          0          0              4
## 5          0          0              5
## 6          0          0              6

According to our instructors: “There are 902297 records on 37 variables if you get anything else then you need to check your reading step before you go farther.” Let’s check if all is ok.

str(stormdata)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
##  $ BGN_TIME  : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
##  $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
##  $ STATE     : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : Factor w/ 35 levels "","  N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_DATE  : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_TIME  : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ WFO       : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ZONENAMES : Factor w/ 25112 levels "","                                                                                                               "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : Factor w/ 436781 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Ok, we are good to go.

As oriented by our instructors we’ll want to filter only the results newer than 1996. The first step to do that is separating the BGN_DATE to a Date format so we can filter it.

stormdata$BGN_DATE <- as.Date(stormdata$BGN_DATE, format = "%m/%d/%Y %H:%M:%S")
str(stormdata$BGN_DATE)
##  Date[1:902297], format: "1950-04-18" "1950-04-18" "1951-02-20" "1951-06-08" "1951-11-15" ...

Now we filter events from 96 onward, I will use the last day of 1995 as cutting point

stormdata96 <- filter(stormdata, BGN_DATE > "1995-12-31")

Down to 353530 obs.

Now we will select only the variables we are interested

sd96 <- select(stormdata96, BGN_DATE, EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
#Remove unused dataset
remove(stormdata96)

Now let’s try to clean up the messy data on EVTYPE

sd96 <- sd96 %>% 
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Astronomical Low Tide", ignore_case = TRUE)) == TRUE, "Astronomical Low Tide", as.character(EVTYPE))) %>% 
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Avalanche", ignore_case = TRUE)) == TRUE, "Avalanche", EVTYPE)) %>% 
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Blizzard", ignore_case = TRUE)) == TRUE, "Blizzard", EVTYPE)) %>% 
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Coastal Flood", ignore_case = TRUE)) == TRUE, "Coastal Flood", EVTYPE)) %>% 
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Extreme Cold", ignore_case = TRUE)) == TRUE, "Extreme Cold/Wind Chill", EVTYPE)) %>% 
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Cold", ignore_case = TRUE)) == TRUE, "Cold/Wind Chill", EVTYPE)) %>% 
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Debris Flow", ignore_case = TRUE)) == TRUE, "Debris Flow", EVTYPE)) %>% 
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Dense Fog", ignore_case = TRUE)) == TRUE, "Dense Fog", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Dense Smoke", ignore_case = TRUE)) == TRUE, "Dense Smoke", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Debris Flow", ignore_case = TRUE)) == TRUE, "Debris Flow", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Drought", ignore_case = TRUE)) == TRUE, "Drought", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Dust Devil", ignore_case = TRUE)) == TRUE, "Dust Devil", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Dust Devel", ignore_case = TRUE)), "Dust Devil",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Dust Storm", ignore_case = TRUE)) == TRUE, "Dust Storm", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Excessive Heat", ignore_case = TRUE)) == TRUE, "Excessive Heat", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Flash Flood", ignore_case = TRUE)) == TRUE, "Flash Flood", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Flood", ignore_case = TRUE)) == TRUE, "Flood", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Frost", ignore_case = TRUE)) == TRUE, "Frost/Freeze", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Freeze", ignore_case = TRUE)) == TRUE, "Frost/Freeze", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Funnel Cloud", ignore_case = TRUE)) == TRUE, "Funnel Cloud", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Freezing Fog", ignore_case = TRUE)) == TRUE, "Freezing Fog", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Hail", ignore_case = TRUE)) == TRUE, "Hail", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Heat", ignore_case = TRUE)) == TRUE, "Heat", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Heavy Rain", ignore_case = TRUE)) == TRUE, "Heavy Rain", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Rain (heavy)", ignore_case = TRUE)) == TRUE, "Heavy Rain", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Heavy Snow", ignore_case = TRUE)) == TRUE, "Heavy Snow", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("High Surf", ignore_case = TRUE)) == TRUE, "High Surf", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("High Wind", ignore_case = TRUE)) == TRUE, "High Wind", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Hurricane", ignore_case = TRUE)) == TRUE, "Hurricane (Typhoon)", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Typhoon", ignore_case = TRUE)), "Hurricane (Typhoon)",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Ice Storm", ignore_case = TRUE)) == TRUE, "Ice Storm", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Lake-Effect Snow", ignore_case = TRUE)) == TRUE, "Lake-Effect Snow", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Lake Effect Snow", ignore_case = TRUE)), "Lake-Effect Snow",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Lakeshore Flood", ignore_case = TRUE)) == TRUE, "Lakeshore Flood", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Lightning", ignore_case = TRUE)) == TRUE, "Lightning", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Marine Hail", ignore_case = TRUE)) == TRUE, "Marine Hail", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Marine High Wind", ignore_case = TRUE)) == TRUE, "Marine High Wind", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Marine Strong Wind", ignore_case = TRUE)) == TRUE, "Marine Strong Wind", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Marine Thunderstorm Wind", ignore_case = TRUE)) == TRUE, "Marine Thunderstorm Wind", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Rip Current", ignore_case = TRUE)) == TRUE, "Rip Current", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Seiche", ignore_case = TRUE)) == TRUE, "Seiche", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Sleet", ignore_case = TRUE)) == TRUE, "Sleet", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Storm Surge", ignore_case = TRUE)) == TRUE, "Storm Surge/Tide", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Strong Wind", ignore_case = TRUE)) == TRUE, "Strong Wind", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Thunderstorm Wind", ignore_case = TRUE)) == TRUE, "Thunderstorm Wind", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Tornado", ignore_case = TRUE)) == TRUE, "Tornado", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Tropical Depression", ignore_case = TRUE)) == TRUE, "Tropical Depression", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Tropical Storm", ignore_case = TRUE)) == TRUE, "Tropical Storm", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Tsunami", ignore_case = TRUE)) == TRUE, "Tsunami", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Volcanic Ash", ignore_case = TRUE)) == TRUE, "Volcanic Ash", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Waterspout", ignore_case = TRUE)) == TRUE, "Waterspout", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Wildfire", ignore_case = TRUE)) == TRUE, "Wildfire", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Winter Storm", ignore_case = TRUE)) == TRUE, "Winter Storm", EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Winter Weather", ignore_case = TRUE)) == TRUE, "Winter Weather", EVTYPE)) %>% 
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("mix", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("summary", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("record", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("tstm", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("UNSEASONABLY", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("monthly", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("moderate", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("other", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("and", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("unusual", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("urban", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("season", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("damage", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("light", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("damage", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("jam", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("damage", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("forest", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("vog", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("very", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("accu", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("year", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("wall", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("wet", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("burst", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("blow", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("DROWNING", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Dry", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("mud", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("severe", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("WIND", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("gust", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("warm", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("wake", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("advisory", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("torrential", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("WINDS", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("WND", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("erosion", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("spell", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("severe", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("cool", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("month", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("excessive", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("first", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("drifting", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("early", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("road", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("sea", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("none", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("late", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("metro", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("prolonged", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("mountain", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("glaze", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("drizzle", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("pellet", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("severe", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("saharan", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("flag", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("patchy", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("severe", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("slide", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("eruption", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("rough", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("exposure", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("swells", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("ICE", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("FOG", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("heavy", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("SNOW", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("squall", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("rogue", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("RAIN", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("heavy", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("SMOKE", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("high water", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("COASTAL", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("THUNDERSTORM", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("hot", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("ASTRONOMICAL", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("DAM BREAK", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("FLOYD", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("BRUSH", ignore_case = FALSE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("stream", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Ice/Snow", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("hazardous", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Black Ice", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("accident", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("gradient", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Coastal Storm", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Whirlwind", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("ice fog", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = if_else(str_detect(EVTYPE, coll("Freezing rain", ignore_case = TRUE)), "non-standardized",EVTYPE)) %>%
  mutate(EVTYPE = as.factor(EVTYPE))

levels(sd96$EVTYPE)
##  [1] "Astronomical Low Tide" "Avalanche"             "Blizzard"             
##  [4] "Cold/Wind Chill"       "Dense Fog"             "Dense Smoke"          
##  [7] "Dust Devil"            "Dust Storm"            "Flood"                
## [10] "Freezing Fog"          "Freezing Spray"        "Frost/Freeze"         
## [13] "Funnel Cloud"          "Hail"                  "Heat"                 
## [16] "High Surf"             "High Wind"             "Hurricane (Typhoon)"  
## [19] "Ice Storm"             "Lake-Effect Snow"      "non-standardized"     
## [22] "Rip Current"           "Seiche"                "Sleet"                
## [25] "Snow"                  "Storm Surge/Tide"      "Strong Wind"          
## [28] "Thundersnow shower"    "Thunderstorm Wind"     "Tornado"              
## [31] "Tropical Depression"   "Tropical Storm"        "Tsunami"              
## [34] "Volcanic Ash"          "Waterspout"            "Wildfire"             
## [37] "Wind"                  "Winter Storm"          "Winter Weather"

We are down to 39 event types, It is actually less than the 48 types that exist and I opted to add an “non-standardized” classification for the ones I was not able to identify. Either way if you run the code I used above you will have the same results.

Now let’s deal with the other data inconsistencies Fixing PROPDMGEXP and CROPDMGEXP

sd96 <- sd96 %>% mutate(PROPDMGEXP = as.character(PROPDMGEXP)) %>% 
  mutate(PROPDMGEXP = if_else(str_detect(PROPDMGEXP, coll("b", ignore_case = TRUE)) == TRUE, "9", PROPDMGEXP)) %>% 
  mutate(PROPDMGEXP = if_else(str_detect(PROPDMGEXP, coll("m", ignore_case = TRUE)) == TRUE, "6", PROPDMGEXP)) %>%
  mutate(PROPDMGEXP = if_else(str_detect(PROPDMGEXP, coll("k", ignore_case = TRUE)) == TRUE, "3", PROPDMGEXP)) %>%
  mutate(PROPDMGEXP = if_else(str_detect(PROPDMGEXP, coll("h", ignore_case = TRUE)) == TRUE, "2", PROPDMGEXP)) %>%
  mutate(PROPDMGEXP = if_else(str_detect(PROPDMGEXP, coll(" ", ignore_case = TRUE)) == TRUE, "0", PROPDMGEXP)) %>%
  mutate(PROPDMGEXP = as.numeric(PROPDMGEXP)) %>% 
  mutate(PROPDMGEXP = if_else(is.na(PROPDMGEXP) == TRUE, 0, PROPDMGEXP)) %>% 
  mutate(TOTALPROPDMG = PROPDMG * 10^PROPDMGEXP)

  str(sd96$PROPDMGEXP)
##  num [1:653530] 3 3 3 3 3 0 3 3 3 3 ...
  str(sd96$TOTALPROPDMG)
##  num [1:653530] 380000 100000 3000 5000 2000 0 400000 12000 8000 12000 ...
sd96 <- sd96 %>% mutate(CROPDMGEXP = as.character(CROPDMGEXP)) %>% 
  mutate(CROPDMGEXP = if_else(str_detect(CROPDMGEXP, coll("b", ignore_case = TRUE)) == TRUE, "9", CROPDMGEXP)) %>% 
  mutate(CROPDMGEXP = if_else(str_detect(CROPDMGEXP, coll("m", ignore_case = TRUE)) == TRUE, "6", CROPDMGEXP)) %>%
  mutate(CROPDMGEXP = if_else(str_detect(CROPDMGEXP, coll("k", ignore_case = TRUE)) == TRUE, "3", CROPDMGEXP)) %>%
  mutate(CROPDMGEXP = if_else(str_detect(CROPDMGEXP, coll("h", ignore_case = TRUE)) == TRUE, "2", CROPDMGEXP)) %>%
  mutate(CROPDMGEXP = if_else(str_detect(CROPDMGEXP, coll(" ", ignore_case = TRUE)) == TRUE, "0", CROPDMGEXP)) %>%
  mutate(CROPDMGEXP = as.numeric(CROPDMGEXP)) %>% 
  mutate(CROPDMGEXP = if_else(is.na(CROPDMGEXP) == TRUE, 0, CROPDMGEXP)) %>% 
  mutate(TOTALCROPDMG = CROPDMG * 10^CROPDMGEXP)

  str(sd96$CROPDMGEXP)
##  num [1:653530] 3 0 0 0 0 0 0 0 0 0 ...
  str(sd96$TOTALCROPDMG)
##  num [1:653530] 38000 0 0 0 0 0 0 0 0 0 ...
  head(sd96)
##     BGN_DATE           EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG
## 1 1996-01-06     Winter Storm          0        0     380          3      38
## 2 1996-01-11          Tornado          0        0     100          3       0
## 3 1996-01-11 non-standardized          0        0       3          3       0
## 4 1996-01-11 non-standardized          0        0       5          3       0
## 5 1996-01-11 non-standardized          0        0       2          3       0
## 6 1996-01-18             Hail          0        0       0          0       0
##   CROPDMGEXP TOTALPROPDMG TOTALCROPDMG
## 1          3       380000        38000
## 2          0       100000            0
## 3          0         3000            0
## 4          0         5000            0
## 5          0         2000            0
## 6          0            0            0

Questions

Your data analysis must address the following questions:

  1. Across the United States, which types of events (as indicated in the EVTYPR) are most harmful with respect to population health? TOP 10 fatalities
fatality <- sd96 %>% group_by(EVTYPE) %>% summarise(sum = sum(FATALITIES)) %>% arrange(desc(sum))
top10_fatal <- fatality[1:10, ]

ggplot(top10_fatal, aes(x = reorder(EVTYPE,sum), y = sum, fill = EVTYPE)) +
  geom_bar(stat = "identity") + 
  coord_flip() + 
  labs(title = "Top 10 Weather Incidents with the most Fatalities", y = "Number of Fatalities", x = "")

injury <- sd96 %>% group_by(EVTYPE) %>% summarise(sum = sum(INJURIES)) %>% arrange(desc(sum))

top10_injury <- injury[1:10, ]

ggplot(top10_injury, aes(x = reorder(EVTYPE,sum), y = sum, fill = EVTYPE)) +
  geom_bar(stat = "identity") + 
  coord_flip() + 
  labs(title = "Top 10 Weather Incidents with the most Injuries", y = "Number of Injuries", x = "")

  1. Across the United States, which types of events have the greatest economic consequences?
propertydmg <- sd96 %>% group_by(EVTYPE) %>% summarise(sum = sum(TOTALPROPDMG)) %>% arrange(desc(sum))

top10_propdmg <- propertydmg[1:10, ]

ggplot(top10_propdmg, aes(x = reorder(EVTYPE,sum), y = sum, fill = EVTYPE)) +
  geom_bar(stat = "identity") + 
  coord_flip() + 
  labs(title = "Top 10 Weather Incidents with the most property damage", y = "Damage in $", x = "")

cropdmg <- sd96 %>% group_by(EVTYPE) %>% summarise(sum = sum(TOTALCROPDMG)) %>% arrange(desc(sum))

top10_cropdmg <- cropdmg[1:10, ]
top10_cropdmg
## # A tibble: 10 x 2
##    EVTYPE                      sum
##    <fct>                     <dbl>
##  1 non-standardized    14893278520
##  2 Flood                6339575100
##  3 Hurricane (Typhoon)  5350107800
##  4 Hail                 2561518700
##  5 Frost/Freeze         1368761000
##  6 Cold/Wind Chill      1339765500
##  7 Tropical Storm        677711000
##  8 High Wind             633561300
##  9 Heat                  492578500
## 10 Thunderstorm Wind     398381000

CONCLUSION

According to my analysis:

-Heat is the event that causes more deaths

-Tornados is the event that causes more injuries

-Floods cause more property damage

-The sum of event that are coded incorrectly cause the most crop damage, for the ones correctly identified Floods cause the most damage