# loading
library(data.table); library(dplyr); library(tidyr)
library(ggplot2); library(gridExtra); library(plotly)
loader()
storm <- fread("./data/1119_DS-RR-w4_Storm/storm.csv")

# duplicates1
duo<- select(storm, -c(REFNUM,REMARKS)) %>% duplicated()
storm <- storm[!duo,]

# subsetting
storm <- storm[,BGN_DATE:= as.IDate(BGN_DATE, "%m/%d/%Y %H:%M:%S")][
        ,.(REFNUM, YEAR= year(BGN_DATE), BGN_DATE, EVTYPE, STATE,C_ZONE= COUNTY,
           DEATHS=FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG,
           CROPDMGEXP, REMARKS)]

# monetary units
storm <- storm[
  ,`:=` ("PROP ($, mln)"=mult(PROPDMG,PROPDMGEXP),
         "CROP ($, mln)"=mult(CROPDMG,CROPDMGEXP))][
           , c(1:8,14:15,13)]

# duplicates2
duo<- select(storm, -c(REFNUM, REMARKS)) %>% duplicated()
storm <- storm[!duo,]

Chapter 5.1: Statistical measures with boxplots

Look at how harm and damage variables are distributed by constructing boxplots. Since their values are obviously highly skewed towards zero, scale the y-axis as logarithm to base \(10\). First, harm to humans:

storm<- storm[!(REFNUM %in% c(605951, 605943)), ]
DEATHS <- storm[DEATHS >0, c(3,7)]
INJURIES <- storm[INJURIES >0, c(3,8)]
DEATHS <- DEATHS[, 2]; setnames(DEATHS, "DEATHS", "HARM")
INJURIES <- INJURIES[, 2]; setnames(INJURIES, "INJURIES", "HARM")

DEATHS$group <-"DEATHS"
INJURIES$group <-"INJURIES"
HARM <- rbind(DEATHS,INJURIES)
gHARM<-ggplot(HARM, aes(x=factor(group), y=log10(HARM)))+
  geom_boxplot(aes(fill=group), color= "darkgrey",
               outlier.colour="cornflowerblue", outlier.shape=16,
               outlier.size=2, notch=FALSE)+
  scale_fill_manual(name="", values = c("firebrick3","springgreen3"))+
  scale_x_discrete(name="")+
  scale_y_continuous("total harm, log10(count)")+
  labs(title ="HARM to HUMANS \n(interactive)")+
  theme(plot.title = element_text(size = rel(0.9)),
      axis.line = element_line(size = 3, colour = "grey80"))
gHARM<-ggplotly(gHARM)
gHARM$x$data[[1]]$marker$line$color = "cornflowerblue"
gHARM$x$data[[2]]$marker$line$color = "cornflowerblue"
gHARM$x$data[[1]]$marker$outliercolor = "cornflowerblue"
gHARM$x$data[[2]]$marker$outliercolor = "cornflowerblue"
gHARM$x$data[[1]]$marker$color = "cornflowerblue"
gHARM$x$data[[2]]$marker$color = "cornflowerblue"
gHARM

Number of deaths has a median of \(10^0=1\), a maximum of \(10^{2.77}\), and a 3rd quartile of \(10^{0.3}\). So, more than \(75\%\) of non-zero in deaths values \(<\) 1.995. As for injuries, its number has a median of \(10^{0.3}\), a maximum of \(10^{3.23}\), and a 3rd quartile of \(10^{0.6}\), so that more than \(75\%\) of non-zero in injuries values \(<\) 3.98.

Look at crops/ property boxplots:

PROP <- storm[`PROP ($, mln)` >0, c(3,9)]
CROP <- storm[`CROP ($, mln)` >0, c(3,10)]
PROP <- PROP[, 2]; setnames(PROP, "PROP ($, mln)", "DAMAGE($, mln)")
CROP <- CROP[, 2]; setnames(CROP, "CROP ($, mln)", "DAMAGE($, mln)")

PROP$group <-"PROPERTY"
CROP$group <-"CROPS"
DAMAGE <- rbind(CROP,PROP)
gDAM<-ggplot(DAMAGE, aes(x=factor(group), y=log10(`DAMAGE($, mln)`)))+
  geom_boxplot(aes(fill=group), color= "darkgrey",
               outlier.colour="cornflowerblue",
               outlier.shape=16, outlier.size=2, notch=FALSE)+
  scale_fill_manual(name="", values = c("darkorange3","mediumorchid3"))+
  scale_x_discrete(name="")+
  scale_y_continuous("damage, log10($, mln)")+
  labs(title ="DAMAGE to CROPS and PROPERTY \n(interactive)")+
  theme(plot.title = element_text(size = rel(0.9)),
      axis.line = element_line(size = 3, colour = "grey80"))
gDAM<-ggplotly(gDAM)
gDAM$x$data[[1]]$marker$line$color = "cornflowerblue"
gDAM$x$data[[2]]$marker$line$color = "cornflowerblue"
gDAM$x$data[[1]]$marker$outliercolor = "cornflowerblue"
gDAM$x$data[[2]]$marker$outliercolor = "cornflowerblue"
gDAM$x$data[[1]]$marker$color = "cornflowerblue"
gDAM$x$data[[2]]$marker$color = "cornflowerblue"
gDAM

Damage to crops in \(\$\) mln has a median of \(10^{-1.3}\), a maximum of \(10^{3.7}\), and a 3rd quartile of \(10^{-0.6}\). So, more than \(75\%\) of non-zero in crops damage values \(<\$\) 0.25 mln, that is about \(\$250\ 000\).

As for property, damage to it in \(\$\), mln has a median of \(10^{-1.52}\), a maximum of \(10^{5.06}\), and a 3rd quartile of \(10^{-0.82}\), so that more than \(75\%\) of non-zero in property damage values \(<\$150\ 000\).

Thus, while the maximum value of property damage is greater than crops one, the latter is noticeably higher in median and 3rd quartile.


Appendix

loader1

loader <- function() {
        library("R.utils"); library("data.table")
        myload <- function(url, year = "", zip = "gz") {
                dest <- paste0("./data/1119_DS-RR-w4_Storm/storm",
                               year, ".csv.", zip)
                storm <- paste0("./data/1119_DS-RR-w4_Storm/storm",
                                year, ".csv")
                if(!file.exists(dest)) {download.file(url, destfile = dest,
                                                      method = "curl")}
                if(!file.exists(storm)) {ifelse(zip=="gz",
                                              gunzip(dest, remove=FALSE),
                                              bunzip2(dest, remove=FALSE))
                }
        }
        myload(url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
               zip = "bz2")
}

mult

mult <- function(d,e) {
        e <- toupper(e)
        d <- ifelse(e %in% c("","-"), d,
                    ifelse(e %in% c("?","+"), as.numeric(d)*10^1,
                           ifelse(e == "H", as.numeric(d)*10^2,
                                  ifelse (e == "K", as.numeric(d)*10^3,
                                          ifelse(e == "M", as.numeric(d)*10^6,
                                                 ifelse (e =="B", as.numeric(d)*10^9,
                                                         paste0(d,e)))))))
        d<- round(as.numeric(d)/10^6,2)
        d
}

sessionInf (for reproducibility)

R version 4.0.3 (2020-10-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur 10.16

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] R.utils_2.10.1    R.oo_1.24.0       R.methodsS3_1.8.1 plotly_4.9.2.1   
[5] gridExtra_2.3     ggplot2_3.3.2     tidyr_1.1.2       dplyr_1.0.2      
[9] data.table_1.13.4

loaded via a namespace (and not attached):
 [1] pillar_1.4.7      compiler_4.0.3    tools_4.0.3       digest_0.6.27    
 [5] jsonlite_1.7.2    evaluate_0.14     lifecycle_0.2.0   tibble_3.0.4     
 [9] gtable_0.3.0      viridisLite_0.3.0 pkgconfig_2.0.3   rlang_0.4.9      
[13] crosstalk_1.1.0.1 yaml_2.2.1        xfun_0.19         withr_2.3.0      
[17] stringr_1.4.0     httr_1.4.2        knitr_1.30        generics_0.1.0   
[21] vctrs_0.3.5       htmlwidgets_1.5.2 grid_4.0.3        tidyselect_1.1.0 
[25] glue_1.4.2        R6_2.5.0          rmarkdown_2.5     purrr_0.3.4      
[29] magrittr_2.0.1    scales_1.1.1      ellipsis_0.3.1    htmltools_0.5.0  
[33] colorspace_2.0-0  labeling_0.4.2    stringi_1.5.3     lazyeval_0.2.2   
[37] munsell_0.5.0     crayon_1.3.4