# loading
library(data.table); library(dplyr); library(tidyr)
library(ggplot2); library(gridExtra); library(plotly)
loader()
storm <- fread("./data/1119_DS-RR-w4_Storm/storm.csv")
# duplicates1
duo<- select(storm, -c(REFNUM,REMARKS)) %>% duplicated()
storm <- storm[!duo,]
# subsetting
storm <- storm[,BGN_DATE:= as.IDate(BGN_DATE, "%m/%d/%Y %H:%M:%S")][
,.(REFNUM, YEAR= year(BGN_DATE), BGN_DATE, EVTYPE, STATE,C_ZONE= COUNTY,
DEATHS=FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG,
CROPDMGEXP, REMARKS)]
# monetary units
storm <- storm[
,`:=` ("PROP ($, mln)"=mult(PROPDMG,PROPDMGEXP),
"CROP ($, mln)"=mult(CROPDMG,CROPDMGEXP))][
, c(1:8,14:15,13)]
# duplicates2
duo<- select(storm, -c(REFNUM, REMARKS)) %>% duplicated()
storm <- storm[!duo,]Look at how harm and damage variables are distributed by constructing boxplots. Since their values are obviously highly skewed towards zero, scale the y-axis as logarithm to base \(10\). First, harm to humans:
storm<- storm[!(REFNUM %in% c(605951, 605943)), ]
DEATHS <- storm[DEATHS >0, c(3,7)]
INJURIES <- storm[INJURIES >0, c(3,8)]
DEATHS <- DEATHS[, 2]; setnames(DEATHS, "DEATHS", "HARM")
INJURIES <- INJURIES[, 2]; setnames(INJURIES, "INJURIES", "HARM")
DEATHS$group <-"DEATHS"
INJURIES$group <-"INJURIES"
HARM <- rbind(DEATHS,INJURIES)
gHARM<-ggplot(HARM, aes(x=factor(group), y=log10(HARM)))+
geom_boxplot(aes(fill=group), color= "darkgrey",
outlier.colour="cornflowerblue", outlier.shape=16,
outlier.size=2, notch=FALSE)+
scale_fill_manual(name="", values = c("firebrick3","springgreen3"))+
scale_x_discrete(name="")+
scale_y_continuous("total harm, log10(count)")+
labs(title ="HARM to HUMANS \n(interactive)")+
theme(plot.title = element_text(size = rel(0.9)),
axis.line = element_line(size = 3, colour = "grey80"))
gHARM<-ggplotly(gHARM)
gHARM$x$data[[1]]$marker$line$color = "cornflowerblue"
gHARM$x$data[[2]]$marker$line$color = "cornflowerblue"
gHARM$x$data[[1]]$marker$outliercolor = "cornflowerblue"
gHARM$x$data[[2]]$marker$outliercolor = "cornflowerblue"
gHARM$x$data[[1]]$marker$color = "cornflowerblue"
gHARM$x$data[[2]]$marker$color = "cornflowerblue"
gHARMNumber of deaths has a median of \(10^0=1\), a maximum of \(10^{2.77}\), and a 3rd quartile of \(10^{0.3}\). So, more than \(75\%\) of non-zero in deaths values \(<\) 1.995. As for injuries, its number has a median of \(10^{0.3}\), a maximum of \(10^{3.23}\), and a 3rd quartile of \(10^{0.6}\), so that more than \(75\%\) of non-zero in injuries values \(<\) 3.98.
Look at crops/ property boxplots:
PROP <- storm[`PROP ($, mln)` >0, c(3,9)]
CROP <- storm[`CROP ($, mln)` >0, c(3,10)]
PROP <- PROP[, 2]; setnames(PROP, "PROP ($, mln)", "DAMAGE($, mln)")
CROP <- CROP[, 2]; setnames(CROP, "CROP ($, mln)", "DAMAGE($, mln)")
PROP$group <-"PROPERTY"
CROP$group <-"CROPS"
DAMAGE <- rbind(CROP,PROP)
gDAM<-ggplot(DAMAGE, aes(x=factor(group), y=log10(`DAMAGE($, mln)`)))+
geom_boxplot(aes(fill=group), color= "darkgrey",
outlier.colour="cornflowerblue",
outlier.shape=16, outlier.size=2, notch=FALSE)+
scale_fill_manual(name="", values = c("darkorange3","mediumorchid3"))+
scale_x_discrete(name="")+
scale_y_continuous("damage, log10($, mln)")+
labs(title ="DAMAGE to CROPS and PROPERTY \n(interactive)")+
theme(plot.title = element_text(size = rel(0.9)),
axis.line = element_line(size = 3, colour = "grey80"))
gDAM<-ggplotly(gDAM)
gDAM$x$data[[1]]$marker$line$color = "cornflowerblue"
gDAM$x$data[[2]]$marker$line$color = "cornflowerblue"
gDAM$x$data[[1]]$marker$outliercolor = "cornflowerblue"
gDAM$x$data[[2]]$marker$outliercolor = "cornflowerblue"
gDAM$x$data[[1]]$marker$color = "cornflowerblue"
gDAM$x$data[[2]]$marker$color = "cornflowerblue"
gDAMDamage to crops in \(\$\) mln has a median of \(10^{-1.3}\), a maximum of \(10^{3.7}\), and a 3rd quartile of \(10^{-0.6}\). So, more than \(75\%\) of non-zero in crops damage values \(<\$\) 0.25 mln, that is about \(\$250\ 000\).
As for property, damage to it in \(\$\), mln has a median of \(10^{-1.52}\), a maximum of \(10^{5.06}\), and a 3rd quartile of \(10^{-0.82}\), so that more than \(75\%\) of non-zero in property damage values \(<\$150\ 000\).
Thus, while the maximum value of property damage is greater than crops one, the latter is noticeably higher in median and 3rd quartile.
loader1loader <- function() {
library("R.utils"); library("data.table")
myload <- function(url, year = "", zip = "gz") {
dest <- paste0("./data/1119_DS-RR-w4_Storm/storm",
year, ".csv.", zip)
storm <- paste0("./data/1119_DS-RR-w4_Storm/storm",
year, ".csv")
if(!file.exists(dest)) {download.file(url, destfile = dest,
method = "curl")}
if(!file.exists(storm)) {ifelse(zip=="gz",
gunzip(dest, remove=FALSE),
bunzip2(dest, remove=FALSE))
}
}
myload(url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
zip = "bz2")
}multmult <- function(d,e) {
e <- toupper(e)
d <- ifelse(e %in% c("","-"), d,
ifelse(e %in% c("?","+"), as.numeric(d)*10^1,
ifelse(e == "H", as.numeric(d)*10^2,
ifelse (e == "K", as.numeric(d)*10^3,
ifelse(e == "M", as.numeric(d)*10^6,
ifelse (e =="B", as.numeric(d)*10^9,
paste0(d,e)))))))
d<- round(as.numeric(d)/10^6,2)
d
}sessionInf (for reproducibility)R version 4.0.3 (2020-10-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur 10.16
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] R.utils_2.10.1 R.oo_1.24.0 R.methodsS3_1.8.1 plotly_4.9.2.1
[5] gridExtra_2.3 ggplot2_3.3.2 tidyr_1.1.2 dplyr_1.0.2
[9] data.table_1.13.4
loaded via a namespace (and not attached):
[1] pillar_1.4.7 compiler_4.0.3 tools_4.0.3 digest_0.6.27
[5] jsonlite_1.7.2 evaluate_0.14 lifecycle_0.2.0 tibble_3.0.4
[9] gtable_0.3.0 viridisLite_0.3.0 pkgconfig_2.0.3 rlang_0.4.9
[13] crosstalk_1.1.0.1 yaml_2.2.1 xfun_0.19 withr_2.3.0
[17] stringr_1.4.0 httr_1.4.2 knitr_1.30 generics_0.1.0
[21] vctrs_0.3.5 htmlwidgets_1.5.2 grid_4.0.3 tidyselect_1.1.0
[25] glue_1.4.2 R6_2.5.0 rmarkdown_2.5 purrr_0.3.4
[29] magrittr_2.0.1 scales_1.1.1 ellipsis_0.3.1 htmltools_0.5.0
[33] colorspace_2.0-0 labeling_0.4.2 stringi_1.5.3 lazyeval_0.2.2
[37] munsell_0.5.0 crayon_1.3.4