Data Originally from Kaggle at: https://www.kaggle.com/backblaze/hard-drive-test-data.
But after experiencing issues with 64bit values, the data were obtained from the original source here: https://www.backblaze.com/b2/hard-drive-test-data.html.
Load the relevant libraries.
# rm(list = ls())
# .rs.restartR()
# data munging
library("tidyverse")
library("data.table")
# munge dates
library("lubridate")
# feature engineering and data prep
library("recipes")
library("caret")
library("DMwR")
# to explore missing data
library("visdat")
library("naniar")
# modeling
library("h2o")
Session Info.
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods base
other attached packages:
[1] h2o_3.24.0.1 naniar_0.4.2 visdat_0.5.3 DMwR_0.4.1
[5] caret_6.0-84 lattice_0.20-38 recipes_0.1.5 lubridate_1.7.4
[9] data.table_1.12.2 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.1
[13] purrr_0.3.2 readr_1.3.1 tidyr_0.8.3 tibble_2.1.1
[17] ggplot2_3.1.1 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] httr_1.4.0 jsonlite_1.6 splines_3.6.0 foreach_1.4.4
[5] gtools_3.8.1 prodlim_2018.04.18 modelr_0.1.4 assertthat_0.2.1
[9] TTR_0.23-4 stats4_3.6.0 cellranger_1.1.0 yaml_2.2.0
[13] ipred_0.9-9 pillar_1.4.0 backports_1.1.4 glue_1.3.1
[17] digest_0.6.19 rvest_0.3.4 colorspace_1.4-1 htmltools_0.3.6
[21] Matrix_1.2-17 plyr_1.8.4 timeDate_3043.102 pkgconfig_2.0.2
[25] broom_0.5.2 haven_2.1.0 scales_1.0.0 gdata_2.18.0
[29] gower_0.2.1 lava_1.6.5 generics_0.0.2 withr_2.1.2
[33] ROCR_1.0-7 nnet_7.3-12 lazyeval_0.2.2 cli_1.1.0
[37] quantmod_0.4-14 survival_2.44-1.1 magrittr_1.5 crayon_1.3.4
[41] readxl_1.3.1 evaluate_0.13 nlme_3.1-140 MASS_7.3-51.4
[45] gplots_3.0.1.1 xts_0.11-2 xml2_1.2.0 class_7.3-15
[49] tools_3.6.0 hms_0.4.2 munsell_0.5.0 packrat_0.5.0
[53] compiler_3.6.0 caTools_1.17.1.2 rlang_0.3.4 RCurl_1.95-4.12
[57] iterators_1.0.10 rstudioapi_0.10 bitops_1.0-6 base64enc_0.1-3
[61] rmarkdown_1.12 gtable_0.3.0 ModelMetrics_1.2.2 codetools_0.2-16
[65] abind_1.4-5 curl_3.3 reshape2_1.4.3 R6_2.4.0
[69] zoo_1.8-5 knitr_1.23 KernSmooth_2.23-15 stringi_1.4.3
[73] Rcpp_1.0.1 rpart_4.1-15 tidyselect_0.2.5 xfun_0.7
Setup the root directory.
Setting wd as the working directory.
wd <- getwd()
wd
[1] "/Users/mdturse/Desktop/Analytics/hard_drive_failure"
The site where the data are made available is https://www.backblaze.com/b2/hard-drive-test-data.html.
Download the .zip file.
# base url
url <- "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2018.zip"
# create a temporary directory
td <- tempdir()
# create the placeholder file
tf <- tempfile(tmpdir = td, fileext = ".zip")
# download into the placeholder file
download.file(url, tf)
trying URL 'https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2018.zip'
Content type 'application/zip' length 547619262 bytes (522.3 MB)
==================================================
downloaded 522.3 MB
Download just the relevant .csv files.
# get the names of the files
fname <- unzip(tf, list = TRUE)
head(fname)
fname_csv <-
fname %>%
filter(str_detect(Name, "data_Q4_2018/2018-\\d{2}-\\d{2}\\.csv$")) %>%
arrange(Name) %>%
# head(5) %>%
pull(Name)
# unzip the files to the temporary directory
fname_csv %>%
map(~unzip(tf,
files = .x,
exdir = td,
overwrite = TRUE
)
)
[[1]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-01.csv"
[[2]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-02.csv"
[[3]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-03.csv"
[[4]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-04.csv"
[[5]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-05.csv"
[[6]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-06.csv"
[[7]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-07.csv"
[[8]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-08.csv"
[[9]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-09.csv"
[[10]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-10.csv"
[[11]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-11.csv"
[[12]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-12.csv"
[[13]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-13.csv"
[[14]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-14.csv"
[[15]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-15.csv"
[[16]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-16.csv"
[[17]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-17.csv"
[[18]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-18.csv"
[[19]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-19.csv"
[[20]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-20.csv"
[[21]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-21.csv"
[[22]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-22.csv"
[[23]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-23.csv"
[[24]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-24.csv"
[[25]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-25.csv"
[[26]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-26.csv"
[[27]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-27.csv"
[[28]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-28.csv"
[[29]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-29.csv"
[[30]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-30.csv"
[[31]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-10-31.csv"
[[32]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-01.csv"
[[33]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-02.csv"
[[34]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-03.csv"
[[35]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-04.csv"
[[36]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-05.csv"
[[37]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-06.csv"
[[38]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-07.csv"
[[39]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-08.csv"
[[40]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-09.csv"
[[41]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-10.csv"
[[42]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-11.csv"
[[43]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-12.csv"
[[44]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-13.csv"
[[45]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-14.csv"
[[46]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-15.csv"
[[47]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-16.csv"
[[48]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-17.csv"
[[49]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-18.csv"
[[50]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-19.csv"
[[51]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-20.csv"
[[52]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-21.csv"
[[53]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-22.csv"
[[54]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-23.csv"
[[55]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-24.csv"
[[56]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-25.csv"
[[57]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-26.csv"
[[58]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-27.csv"
[[59]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-28.csv"
[[60]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-29.csv"
[[61]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-11-30.csv"
[[62]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-01.csv"
[[63]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-02.csv"
[[64]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-03.csv"
[[65]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-04.csv"
[[66]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-05.csv"
[[67]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-06.csv"
[[68]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-07.csv"
[[69]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-08.csv"
[[70]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-09.csv"
[[71]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-10.csv"
[[72]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-11.csv"
[[73]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-12.csv"
[[74]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-13.csv"
[[75]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-14.csv"
[[76]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-15.csv"
[[77]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-16.csv"
[[78]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-17.csv"
[[79]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-18.csv"
[[80]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-19.csv"
[[81]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-20.csv"
[[82]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-21.csv"
[[83]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-22.csv"
[[84]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-23.csv"
[[85]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-24.csv"
[[86]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-25.csv"
[[87]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-26.csv"
[[88]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-27.csv"
[[89]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-28.csv"
[[90]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-29.csv"
[[91]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-30.csv"
[[92]]
[1] "/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/data_Q4_2018/2018-12-31.csv"
# fpath is the full path to the extracted files
fpath <-
fname_csv %>%
map(~file.path(td, .x)
)
Read in the .csv files and stack them into a single .rds file.
hd <-
fpath %>%
map(~fread(.x, verbose = FALSE)
)
Previous fread() session was not cleaned up properly. Cleaned up ok at the beginning of this fread() call.|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
hd_dt <- bind_rows(hd)
# fwrite(hd_dt,
# paste0(wd,
# "/Data/Interim/",
# "hd_dt.csv"
# )
# )
# saveRDS(hd_dt,
# paste0(wd,
# "/Data/Interim/",
# "hd_dt.Rds"
# )
# )
# hd_dt <-
# readRDS(paste0(wd,
# "/Data/Interim/",
# "hd_dt.Rds"
# )
# )
# message("hd")
# class(hd)
# glimpse(hd[1])
#
# message("hd_dt")
# glimpse(hd_dt)
rm(fname, fpath, fname_csv, td, tf, url, hd)
Experimenting with converting integer64 values to smaller numeric values.
# class(hd_dt)
# hd_dt2 <- hd_dt
# 12,000,138,625,024
# hd_dt$capacity_bytes <- as.numeric(hd_dt$capacity_bytes / 1000000)# * 1000000
# hd_dt$smart_7_raw <- as.numeric(hd_dt$smart_7_raw / 1000000)# * 1000000
# hd_dt$smart_188_raw <- as.numeric(hd_dt$smart_188_raw / 1000000)# * 1000000
# hd_dt$smart_240_raw <- as.numeric(hd_dt$smart_240_raw / 1000000)# * 1000000
# hd_dt$smart_241_raw <- as.numeric(hd_dt$smart_241_raw / 1000000)# * 1000000
# hd_dt$smart_242_raw <- as.numeric(hd_dt$smart_242_raw / 1000000)# * 1000000
# class(hd_dt)
# glimpse(hd_dt)
# hd_dt$capacity_bytes3 <- as.integer(hd_dt$capacity_bytes)
Separate model into pieces - manufacturer and model.
hd2 <-
hd_dt %>%
# head(1000) %>%
separate(col = model,
into = c("manu", "model2"),
sep = "\\s",
fill = "left",
remove = TRUE
) %>%
mutate(date = as_date(date),
serial_number = factor(serial_number),
manu = case_when(is.na(manu) ~ "(Missing)",
TRUE ~ manu
) %>%
factor(),
model2 = factor(model2),
failure = factor(failure)
) %>%
select(-matches("normalized")
) %>%
as.data.table %>%
setkey(manu, model2, serial_number, date)
Expected 2 pieces. Additional pieces discarded in 500 rows [3037766, 3139107, 3240409, 3341661, 3442910, 3544156, 3582868, 3645402, 3684079, 3709513, 3746563, 3746576, 3785233, 3810667, 3847716, 3847729, 3886390, 3911822, 3918813, 3948873, ...].
class(hd2)
[1] "data.table" "data.frame"
glimpse(hd2)
Observations: 9,357,609
Variables: 68
$ date [3m[38;5;246m<date>[39m[23m 2018-10-01, 2018-10-02, 2018-10-03, 2018-10-04, 2018-10-05, 2…
$ serial_number [3m[38;5;246m<fct>[39m[23m ZA20NH66, ZA20NH66, ZA20NH66, ZA20NH66, ZA20NH66, ZA20NH66, ZA…
$ manu [3m[38;5;246m<fct>[39m[23m (Missing), (Missing), (Missing), (Missing), (Missing), (Missin…
$ model2 [3m[38;5;246m<fct>[39m[23m ST10000NM0086, ST10000NM0086, ST10000NM0086, ST10000NM0086, ST…
$ capacity_bytes [3m[38;5;246m<dbl>[39m[23m 4.941067e-311, 4.941067e-311, 4.941067e-311, 4.941067e-311, 4.…
$ failure [3m[38;5;246m<fct>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smart_1_raw [3m[38;5;246m<int>[39m[23m 98368032, 78733432, 74038736, 77143848, 117522120, 68864496, 5…
$ smart_2_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_3_raw [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smart_4_raw [3m[38;5;246m<int>[39m[23m 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ smart_5_raw [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smart_7_raw [3m[38;5;246m<integr64>[39m[23m 1543760090, 1547844538, 1552162785, 1557445945, 156214729…
$ smart_8_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_9_raw [3m[38;5;246m<int>[39m[23m 9635, 9659, 9683, 9707, 9731, 9755, 9779, 9803, 9819, 9852, 98…
$ smart_10_raw [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smart_11_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_12_raw [3m[38;5;246m<int>[39m[23m 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ smart_13_raw [3m[38;5;246m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_15_raw [3m[38;5;246m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_16_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_17_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_22_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_23_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_24_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_168_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_170_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_173_raw [3m[38;5;246m<integr64>[39m[23m 9218868437227407266, 9218868437227407266, 921886843722740…
$ smart_174_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_177_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_179_raw [3m[38;5;246m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_181_raw [3m[38;5;246m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_182_raw [3m[38;5;246m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_183_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_184_raw [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smart_187_raw [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smart_188_raw [3m[38;5;246m<integr64>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ smart_189_raw [3m[38;5;246m<int>[39m[23m 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, …
$ smart_190_raw [3m[38;5;246m<int>[39m[23m 23, 23, 24, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, 23, 23…
$ smart_191_raw [3m[38;5;246m<int>[39m[23m 105566, 105752, 105987, 106238, 106501, 106664, 106947, 107166…
$ smart_192_raw [3m[38;5;246m<int>[39m[23m 64, 64, 65, 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 68…
$ smart_193_raw [3m[38;5;246m<int>[39m[23m 669, 670, 671, 672, 673, 674, 675, 676, 677, 681, 682, 685, 68…
$ smart_194_raw [3m[38;5;246m<int>[39m[23m 23, 23, 24, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, 23, 23…
$ smart_195_raw [3m[38;5;246m<int>[39m[23m 98368032, 78733432, 74038736, 77143848, 117522120, 68864496, 5…
$ smart_196_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_197_raw [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smart_198_raw [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smart_199_raw [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smart_200_raw [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smart_201_raw [3m[38;5;246m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_218_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_220_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_222_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_223_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_224_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_225_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_226_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_231_raw [3m[38;5;246m<integr64>[39m[23m 9218868437227407266, 9218868437227407266, 921886843722740…
$ smart_232_raw [3m[38;5;246m<integr64>[39m[23m 9218868437227407266, 9218868437227407266, 921886843722740…
$ smart_233_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_235_raw [3m[38;5;246m<integr64>[39m[23m 9218868437227407266, 9218868437227407266, 921886843722740…
$ smart_240_raw [3m[38;5;246m<integr64>[39m[23m 9546, 9570, 9594, 9618, 9642, 9665, 9690, 9714, 9729, 976…
$ smart_241_raw [3m[38;5;246m<integr64>[39m[23m 46154421893, 46193919421, 46245435021, 46307985421, 46380…
$ smart_242_raw [3m[38;5;246m<integr64>[39m[23m 97015934425, 97203507833, 97391775273, 97577094617, 97791…
$ smart_250_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_251_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_252_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_254_raw [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_255_raw [3m[38;5;246m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
summary(hd2)
date serial_number manu
Min. :2018-10-01 175PP3HDT: 92 (Missing) :7096719
1st Qu.:2018-10-24 175PP3I4T: 92 HGST :1929277
Median :2018-11-17 175PP3I5T: 92 TOSHIBA : 206129
Mean :2018-11-16 175PP3I6T: 92 WDC : 68637
3rd Qu.:2018-12-09 175PP3I8T: 92 ST500LM012: 54777
Max. :2018-12-31 175PP3I9T: 92 Hitachi : 1388
(Other) :9357057 (Other) : 682
model2 capacity_bytes failure smart_1_raw
ST12000NM0007 :2460704 Min. :0 0:9357216 Min. : 0
ST4000DM000 :2156756 1st Qu.:0 1: 393 1st Qu.: 2389506
HMS5C4040BLE640:1335577 Median :0 Median : 83386312
ST8000NM0055 :1312180 Mean :0 Mean : 92613745
ST8000DM002 : 901613 3rd Qu.:0 3rd Qu.:164073018
HMS5C4040ALE640: 426449 Max. :0 Max. :844902462
(Other) : 764330 NA's :363 NA's :363
smart_2_raw smart_3_raw smart_4_raw smart_5_raw
Min. : 0 Min. : 0.0 Min. : 1.000 Min. : 0.00
1st Qu.: 100 1st Qu.: 0.0 1st Qu.: 3.000 1st Qu.: 0.00
Median : 102 Median : 0.0 Median : 5.000 Median : 0.00
Mean : 88 Mean : 229.9 Mean : 8.638 Mean : 5.76
3rd Qu.: 104 3rd Qu.: 0.0 3rd Qu.: 8.000 3rd Qu.: 0.00
Max. :1061 Max. :11042.0 Max. :25118.000 Max. :55320.00
NA's :7165913 NA's :863 NA's :863 NA's :863
smart_7_raw smart_8_raw smart_9_raw smart_10_raw
Min. : 0 Min. : 0 Min. : 0 Min. : 0
1st Qu.: 6874406 1st Qu.:41 1st Qu.: 8913 1st Qu.: 0
Median : 371331439 Median :42 Median :15247 Median : 0
Mean : 1668407196 Mean :35 Mean :17252 Mean : 22
3rd Qu.: 942362808 3rd Qu.:42 3rd Qu.:25131 3rd Qu.: 0
Max. :10649354203107 Max. :45 Max. :70441 Max. :327680
NA's : 863 NA's :7165913 NA's :363 NA's :863
smart_11_raw smart_12_raw smart_13_raw smart_15_raw smart_16_raw
Min. : 0 Min. : 0.000 Mode:logical Mode:logical Min. : 62
1st Qu.: 0 1st Qu.: 2.000 NA's:9357609 NA's:9357609 1st Qu.: 68
Median : 0 Median : 4.000 Median : 79
Mean : 349 Mean : 5.949 Mean : 79
3rd Qu.: 507 3rd Qu.: 7.000 3rd Qu.: 83
Max. :9225 Max. :1231.000 Max. :171
NA's :9234013 NA's :363 NA's :9357109
smart_17_raw smart_22_raw smart_23_raw smart_24_raw
Min. : 62 Min. : 68 Min. :0 Min. :0
1st Qu.: 68 1st Qu.:100 1st Qu.:0 1st Qu.:0
Median : 79 Median :100 Median :0 Median :0
Mean : 79 Mean :100 Mean :0 Mean :0
3rd Qu.: 83 3rd Qu.:100 3rd Qu.:0 3rd Qu.:0
Max. :171 Max. :100 Max. :0 Max. :0
NA's :9357109 NA's :9197269 NA's :9251432 NA's :9251432
smart_168_raw smart_170_raw smart_173_raw smart_174_raw
Min. :0 Min. : 165 Min. : 4294967297 Min. :1
1st Qu.:0 1st Qu.: 319 1st Qu.:9218868437227407266 1st Qu.:1
Median :0 Median : 391 Median :9218868437227407266 Median :1
Mean :0 Mean : 392 Mean :9217302923573511133 Mean :2
3rd Qu.:0 3rd Qu.: 411 3rd Qu.:9218868437227407266 3rd Qu.:2
Max. :0 Max. :1397 Max. :9218868437227407266 Max. :3
NA's :9357109 NA's :9357109 NA's : 6413250 NA's :9357109
smart_177_raw smart_179_raw smart_181_raw smart_182_raw smart_183_raw
Min. :0 Mode:logical Mode:logical Mode:logical Min. : 0
1st Qu.:0 NA's:9357609 NA's:9357609 NA's:9357609 1st Qu.: 0
Median :1 Median : 0
Mean :1 Mean : 3
3rd Qu.:1 3rd Qu.: 0
Max. :4 Max. :37728
NA's :9357109 NA's :7055154
smart_184_raw smart_187_raw smart_188_raw smart_189_raw
Min. : 0 Min. : 0.0 Min. : 0 Min. : 0
1st Qu.: 0 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0
Median : 0 Median : 0.0 Median : 0 Median : 0
Mean : 0 Mean : 0.1 Mean : 80816376 Mean : 7
3rd Qu.: 0 3rd Qu.: 0.0 3rd Qu.: 0 3rd Qu.: 0
Max. :72 Max. :524.0 Max. :601305711020 Max. :65535
NA's :4721683 NA's :2261196 NA's : 2261196 NA's :4721683
smart_190_raw smart_191_raw smart_192_raw smart_193_raw
Min. :14.0 Min. : 0 Min. : 0.0 Min. : 1
1st Qu.:24.0 1st Qu.: 0 1st Qu.: 0.0 1st Qu.: 361
Median :29.0 Median : 1 Median : 12.0 Median : 2426
Mean :29.2 Mean : 10960 Mean : 167.7 Mean : 15292
3rd Qu.:34.0 3rd Qu.: 9318 3rd Qu.: 93.0 3rd Qu.: 13107
Max. :56.0 Max. :4752298 Max. :65535.0 Max. :1104852
NA's :2261196 NA's :4448060 NA's :762 NA's :56221
smart_194_raw smart_195_raw smart_196_raw smart_197_raw
Min. :12.00 Min. : 0 Min. : 0 Min. : 0.000
1st Qu.:25.00 1st Qu.: 59043538 1st Qu.: 0 1st Qu.: 0.000
Median :29.00 Median :121289036 Median : 0 Median : 0.000
Mean :29.12 Mean :120787181 Mean : 1 Mean : 0.097
3rd Qu.:33.00 3rd Qu.:181994984 3rd Qu.: 0 3rd Qu.: 0.000
Max. :56.00 Max. :244140616 Max. :1759 Max. :10016.000
NA's :363 NA's :4373803 NA's :7097276 NA's :863
smart_198_raw smart_199_raw smart_200_raw smart_201_raw
Min. : 0.000 Min. : 0.000 Min. : 0 Mode:logical
1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0 NA's:9357609
Median : 0.000 Median : 0.000 Median : 0
Mean : 0.092 Mean : 0.616 Mean : 4771
3rd Qu.: 0.000 3rd Qu.: 0.000 3rd Qu.: 0
Max. :10016.000 Max. :4139.000 Max. :850585
NA's :863 NA's :863 NA's :6663044
smart_218_raw smart_220_raw smart_222_raw smart_223_raw
Min. :0 Min. : 0 Min. : 3 Min. : 0
1st Qu.:0 1st Qu.: 0 1st Qu.: 1078 1st Qu.: 0
Median :0 Median : 393217 Median : 2079 Median : 0
Mean :0 Mean : 47079213 Mean : 7378 Mean : 165
3rd Qu.:0 3rd Qu.: 68681730 3rd Qu.:11085 3rd Qu.: 0
Max. :0 Max. :287440904 Max. :33653 Max. :9225
NA's :9357109 NA's :9151485 NA's :9151485 NA's :9096526
smart_224_raw smart_225_raw smart_226_raw smart_231_raw
Min. :0 Min. : 34185 Min. :159 Min. : 109951162777699
1st Qu.:0 1st Qu.: 103221 1st Qu.:261 1st Qu.:9218868437227407266
Median :0 Median : 164134 Median :529 Median :9218868437227407266
Mean :0 Mean : 397460 Mean :410 Mean :9217302942244275773
3rd Qu.:0 3rd Qu.: 639641 3rd Qu.:534 3rd Qu.:9218868437227407266
Max. :0 Max. :2390237 Max. :650 Max. :9218868437227407266
NA's :9151485 NA's :9302650 NA's :9151485 NA's : 6413250
smart_232_raw smart_233_raw smart_235_raw
Min. : 270582939648 Min. : 480 Min. : 1015083328
1st Qu.:9218868437227407266 1st Qu.: 653 1st Qu.:9218868437227407266
Median :9218868437227407266 Median : 902 Median :9218868437227407266
Mean :9217302923633694609 Mean :1178 Mean :9217949159185155570
3rd Qu.:9218868437227407266 3rd Qu.:1380 3rd Qu.:9218868437227407266
Max. :9218868437227407266 Max. :5355 Max. :9218868437227407266
NA's : 6413250 NA's :9357109 NA's : 4689096
smart_240_raw smart_241_raw smart_242_raw
Min. : 0 Min. : 0 Min. : 1
1st Qu.: 7552 1st Qu.: 37510271200 1st Qu.: 67912675784
Median : 12571 Median : 43794509624 Median : 92947379492
Mean : 1796550365729 Mean : 42077709475 Mean : 94024240191
3rd Qu.: 24691 3rd Qu.: 49428172152 3rd Qu.: 111958104848
Max. :281462091840688 Max. :179003923936 Max. :32377330241963
NA's : 2048040 NA's : 2259950 NA's : 2259950
smart_250_raw smart_251_raw smart_252_raw smart_254_raw
Min. : 3153310 Min. : 14518 Min. :0 Min. :0
1st Qu.: 7663916 1st Qu.: 36075 1st Qu.:0 1st Qu.:0
Median :10025026 Median : 49629 Median :0 Median :0
Mean :15392228 Mean : 60436 Mean :0 Mean :0
3rd Qu.:15888884 3rd Qu.: 76800 3rd Qu.:0 3rd Qu.:0
Max. :54639907 Max. :154487 Max. :0 Max. :0
NA's :9357463 NA's :9357463 NA's :9357463 NA's :9350964
smart_255_raw
Mode:logical
NA's:9357609
# View(hd2 %>% filter(model2 == "HDS5C3030ALA630"))
# View(head(hd2, 1000))
Get some basic counts for manufacturer, model, and serial number.
hd2 %>%
count(manu, model2, serial_number) %>%
arrange(desc(n))
hd2 %>%
count(manu) %>%
arrange(desc(n))
hd2 %>%
count(model2) %>%
arrange(desc(n))
NA
Explore failure rates.
table(hd2$failure)
0 1
9357216 393
prop.table(table(hd2$failure))
0 1
9.999580e-01 4.199791e-05
cnts_manu <-
hd2 %>%
count(manu)
cnts_manu_failure <-
hd2 %>%
filter(failure == "1") %>%
count(manu)
manu_failures <-
cnts_manu %>%
rename(cnt_overall = n) %>%
inner_join(y = cnts_manu_failure %>%
rename(cnt_failure = n),
by = c("manu" = "manu")
) %>%
mutate(failure_pct = cnt_failure / cnt_overall) %>%
arrange(desc(failure_pct))
manu_failures
rm(cnts_manu, cnts_manu_failure)
Create train_valid and test datasets.
toshiba <-
hd2 %>%
filter(manu == "TOSHIBA") %>%
mutate_if(is.factor, factor) %>%
select_if(negate(is.logical))
fwrite(toshiba,
paste0(wd,
"/Data/Interim/",
"toshiba.csv"
)
)
# toshiba <-
# fread(paste0(wd,
# "/Data/Interim/",
# "toshiba.csv"
# )
# )
# toshiba[ , date := as_date(date)]
# toshiba[ , serial_number := factor(serial_number)]
# toshiba[ , manu := factor(manu)]
# toshiba[ , model2 := factor(model2)]
str(toshiba)
'data.frame': 206129 obs. of 61 variables:
$ date : Date, format: "2018-10-01" "2018-10-02" ...
$ serial_number : Factor w/ 2343 levels "175PP3HDT","175PP3I4T",..: 1410 1410 1410 1410 1410 1410 1410 1410 1410 1410 ...
$ manu : Factor w/ 1 level "TOSHIBA": 1 1 1 1 1 1 1 1 1 1 ...
$ model2 : Factor w/ 7 levels "HDWE160","HDWF180",..: 1 1 1 1 1 1 1 1 1 1 ...
$ capacity_bytes: num 2.96e-311 2.96e-311 2.96e-311 2.96e-311 2.96e-311 ...
$ failure : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ smart_1_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_2_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_3_raw : int 10345 10345 10345 10345 10345 10345 10345 10345 10345 10345 ...
$ smart_4_raw : int 2 2 2 2 2 2 2 2 2 2 ...
$ smart_5_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_7_raw :integer64 0 0 0 0 0 0 0 0 ...
$ smart_8_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_9_raw : int 4254 4278 4302 4326 4350 4374 4397 4421 4446 4469 ...
$ smart_10_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_11_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_12_raw : int 2 2 2 2 2 2 2 2 2 2 ...
$ smart_16_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_17_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_22_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_23_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_24_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_168_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_170_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_173_raw :integer64 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 ...
$ smart_174_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_177_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_183_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_184_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_187_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_188_raw :integer64 NA NA NA NA NA NA NA NA ...
$ smart_189_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_190_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_191_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_192_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_193_raw : int 4 4 4 4 4 4 4 4 4 4 ...
$ smart_194_raw : int 22 22 22 22 22 22 22 22 22 22 ...
$ smart_195_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_196_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_197_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_198_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_199_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_200_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_218_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_220_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_222_raw : int 4254 4278 4302 4326 4350 4373 4397 4421 4445 4469 ...
$ smart_223_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_224_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_225_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_226_raw : int 630 630 630 630 630 630 630 630 630 630 ...
$ smart_231_raw :integer64 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 ...
$ smart_232_raw :integer64 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 ...
$ smart_233_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_235_raw :integer64 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 ...
$ smart_240_raw :integer64 0 0 0 0 0 0 0 0 ...
$ smart_241_raw :integer64 NA NA NA NA NA NA NA NA ...
$ smart_242_raw :integer64 NA NA NA NA NA NA NA NA ...
$ smart_250_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_251_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_252_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_254_raw : int NA NA NA NA NA NA NA NA NA NA ...
summary(toshiba)
date serial_number manu model2
Min. :2018-10-01 175PP3HDT: 92 TOSHIBA:206129 HDWE160 : 914
1st Qu.:2018-10-26 175PP3I4T: 92 HDWF180 : 1820
Median :2018-11-17 175PP3I5T: 92 MD04ABA400V: 13188
Mean :2018-11-16 175PP3I6T: 92 MD04ABA500V: 4095
3rd Qu.:2018-12-09 175PP3I8T: 92 MG07ACA14TA:106177
Max. :2018-12-31 175PP3I9T: 92 MQ01ABF050 : 48732
(Other) :205577 MQ01ABF050M: 31203
capacity_bytes failure smart_1_raw smart_2_raw smart_3_raw smart_4_raw
Min. :0 0:206108 Min. :0 Min. :0 Min. : 517 Min. : 1.000
1st Qu.:0 1: 21 1st Qu.:0 1st Qu.:0 1st Qu.: 1508 1st Qu.: 2.000
Median :0 Median :0 Median :0 Median : 7562 Median : 3.000
Mean :0 Mean :0 Mean :0 Mean : 5155 Mean : 4.388
3rd Qu.:0 3rd Qu.:0 3rd Qu.:0 3rd Qu.: 7816 3rd Qu.: 6.000
Max. :0 Max. :0 Max. :0 Max. :11042 Max. :380.000
NA's :5 NA's :5 NA's :5 NA's :5 NA's :5
smart_5_raw smart_7_raw smart_8_raw smart_9_raw smart_10_raw
Min. : 0.000 Min. :0 Min. :0 Min. : 3 Min. :0
1st Qu.: 0.000 1st Qu.:0 1st Qu.:0 1st Qu.: 1097 1st Qu.:0
Median : 0.000 Median :0 Median :0 Median : 2102 Median :0
Mean : 2.699 Mean :0 Mean :0 Mean : 7404 Mean :0
3rd Qu.: 0.000 3rd Qu.:0 3rd Qu.:0 3rd Qu.:11093 3rd Qu.:0
Max. :2752.000 Max. :0 Max. :0 Max. :33674 Max. :0
NA's :5 NA's :5 NA's :5 NA's :5 NA's :5
smart_11_raw smart_12_raw smart_16_raw smart_17_raw smart_22_raw
Min. : NA Min. : 1.000 Min. : NA Min. : NA Min. : NA
1st Qu.: NA 1st Qu.: 2.000 1st Qu.: NA 1st Qu.: NA 1st Qu.: NA
Median : NA Median : 3.000 Median : NA Median : NA Median : NA
Mean :NaN Mean : 4.201 Mean :NaN Mean :NaN Mean :NaN
3rd Qu.: NA 3rd Qu.: 6.000 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA
Max. : NA Max. :67.000 Max. : NA Max. : NA Max. : NA
NA's :206129 NA's :5 NA's :206129 NA's :206129 NA's :206129
smart_23_raw smart_24_raw smart_168_raw smart_170_raw
Min. :0 Min. :0 Min. : NA Min. : NA
1st Qu.:0 1st Qu.:0 1st Qu.: NA 1st Qu.: NA
Median :0 Median :0 Median : NA Median : NA
Mean :0 Mean :0 Mean :NaN Mean :NaN
3rd Qu.:0 3rd Qu.:0 3rd Qu.: NA 3rd Qu.: NA
Max. :0 Max. :0 Max. : NA Max. : NA
NA's :99952 NA's :99952 NA's :206129 NA's :206129
smart_173_raw smart_174_raw smart_177_raw smart_183_raw
Min. :9218868437227407266 Min. : NA Min. : NA Min. : NA
1st Qu.:9218868437227407266 1st Qu.: NA 1st Qu.: NA 1st Qu.: NA
Median :9218868437227407266 Median : NA Median : NA Median : NA
Mean :9218868437227405446 Mean :NaN Mean :NaN Mean :NaN
3rd Qu.:9218868437227407266 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA
Max. :9218868437227407266 Max. : NA Max. : NA Max. : NA
NA's : 143538 NA's :206129 NA's :206129 NA's :206129
smart_184_raw smart_187_raw smart_188_raw smart_189_raw
Min. : NA Min. : NA Min. :9218868437227407266 Min. : NA
1st Qu.: NA 1st Qu.: NA 1st Qu.:9218868437227407266 1st Qu.: NA
Median : NA Median : NA Median : NA Median : NA
Mean :NaN Mean :NaN Mean : NA Mean :NaN
3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA
Max. : NA Max. : NA Max. : NA Max. : NA
NA's :206129 NA's :206129 NA's : 206129 NA's :206129
smart_190_raw smart_191_raw smart_192_raw smart_193_raw smart_194_raw
Min. : NA Min. : 0 Min. : 0.0000 Min. : 2 Min. :12.00
1st Qu.: NA 1st Qu.: 0 1st Qu.: 0.0000 1st Qu.: 59 1st Qu.:27.00
Median : NA Median : 0 Median : 0.0000 Median : 80 Median :30.00
Mean :NaN Mean : 2683 Mean : 0.6801 Mean : 2626 Mean :29.74
3rd Qu.: NA 3rd Qu.: 2 3rd Qu.: 1.0000 3rd Qu.: 1487 3rd Qu.:33.00
Max. : NA Max. :4752298 Max. :60.0000 Max. :108361 Max. :49.00
NA's :206129 NA's :5 NA's :5 NA's :5 NA's :5
smart_195_raw smart_196_raw smart_197_raw smart_198_raw
Min. : NA Min. : 0.0000 Min. : 0.00000 Min. : 0.00000
1st Qu.: NA 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.00000
Median : NA Median : 0.0000 Median : 0.00000 Median : 0.00000
Mean :NaN Mean : 0.4245 Mean : 0.02391 Mean : 0.00249
3rd Qu.: NA 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.00000
Max. : NA Max. :497.0000 Max. :256.00000 Max. :57.00000
NA's :206129 NA's :5 NA's :5 NA's :5
smart_199_raw smart_200_raw smart_218_raw smart_220_raw
Min. : 0.00000 Min. : NA Min. : NA Min. : 0
1st Qu.: 0.00000 1st Qu.: NA 1st Qu.: NA 1st Qu.: 0
Median : 0.00000 Median : NA Median : NA Median : 393217
Mean : 0.04074 Mean :NaN Mean :NaN Mean : 47079213
3rd Qu.: 0.00000 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: 68681730
Max. :32.00000 Max. : NA Max. : NA Max. :287440904
NA's :5 NA's :206129 NA's :206129 NA's :5
smart_222_raw smart_223_raw smart_224_raw smart_225_raw smart_226_raw
Min. : 3 Min. :0 Min. :0 Min. : NA Min. :159.0
1st Qu.: 1078 1st Qu.:0 1st Qu.:0 1st Qu.: NA 1st Qu.:261.0
Median : 2079 Median :0 Median :0 Median : NA Median :529.0
Mean : 7378 Mean :0 Mean :0 Mean :NaN Mean :409.9
3rd Qu.:11085 3rd Qu.:0 3rd Qu.:0 3rd Qu.: NA 3rd Qu.:534.0
Max. :33653 Max. :0 Max. :0 Max. : NA Max. :650.0
NA's :5 NA's :5 NA's :5 NA's :206129 NA's :5
smart_231_raw smart_232_raw smart_233_raw
Min. :9218868437227407266 Min. :9218868437227407266 Min. : NA
1st Qu.:9218868437227407266 1st Qu.:9218868437227407266 1st Qu.: NA
Median :9218868437227407266 Median :9218868437227407266 Median : NA
Mean :9218868437227405446 Mean :9218868437227405446 Mean :NaN
3rd Qu.:9218868437227407266 3rd Qu.:9218868437227407266 3rd Qu.: NA
Max. :9218868437227407266 Max. :9218868437227407266 Max. : NA
NA's : 143538 NA's : 143538 NA's :206129
smart_235_raw smart_240_raw smart_241_raw
Min. :9218868437227407266 Min. :0 Min. :9218868437227407266
1st Qu.:9218868437227407266 1st Qu.:0 1st Qu.:9218868437227407266
Median :9218868437227407266 Median :0 Median : NA
Mean :9218868437227405394 Mean :0 Mean : NA
3rd Qu.:9218868437227407266 3rd Qu.:0 3rd Qu.: NA
Max. :9218868437227407266 Max. :0 Max. : NA
NA's : 104252 NA's :5 NA's : 206129
smart_242_raw smart_250_raw smart_251_raw smart_252_raw
Min. :9218868437227407266 Min. : NA Min. : NA Min. : NA
1st Qu.:9218868437227407266 1st Qu.: NA 1st Qu.: NA 1st Qu.: NA
Median : NA Median : NA Median : NA Median : NA
Mean : NA Mean :NaN Mean :NaN Mean :NaN
3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA
Max. : NA Max. : NA Max. : NA Max. : NA
NA's : 206129 NA's :206129 NA's :206129 NA's :206129
smart_254_raw
Min. : NA
1st Qu.: NA
Median : NA
Mean :NaN
3rd Qu.: NA
Max. : NA
NA's :206129
One-hot encode the data.
message("toshiba")
toshiba
str(toshiba)
'data.frame': 206129 obs. of 61 variables:
$ date : Date, format: "2018-10-01" "2018-10-02" ...
$ serial_number : Factor w/ 2343 levels "175PP3HDT","175PP3I4T",..: 1410 1410 1410 1410 1410 1410 1410 1410 1410 1410 ...
$ manu : Factor w/ 1 level "TOSHIBA": 1 1 1 1 1 1 1 1 1 1 ...
$ model2 : Factor w/ 7 levels "HDWE160","HDWF180",..: 1 1 1 1 1 1 1 1 1 1 ...
$ capacity_bytes: num 2.96e-311 2.96e-311 2.96e-311 2.96e-311 2.96e-311 ...
$ failure : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ smart_1_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_2_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_3_raw : int 10345 10345 10345 10345 10345 10345 10345 10345 10345 10345 ...
$ smart_4_raw : int 2 2 2 2 2 2 2 2 2 2 ...
$ smart_5_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_7_raw :integer64 0 0 0 0 0 0 0 0 ...
$ smart_8_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_9_raw : int 4254 4278 4302 4326 4350 4374 4397 4421 4446 4469 ...
$ smart_10_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_11_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_12_raw : int 2 2 2 2 2 2 2 2 2 2 ...
$ smart_16_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_17_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_22_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_23_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_24_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_168_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_170_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_173_raw :integer64 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 ...
$ smart_174_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_177_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_183_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_184_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_187_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_188_raw :integer64 NA NA NA NA NA NA NA NA ...
$ smart_189_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_190_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_191_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_192_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_193_raw : int 4 4 4 4 4 4 4 4 4 4 ...
$ smart_194_raw : int 22 22 22 22 22 22 22 22 22 22 ...
$ smart_195_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_196_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_197_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_198_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_199_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_200_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_218_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_220_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_222_raw : int 4254 4278 4302 4326 4350 4373 4397 4421 4445 4469 ...
$ smart_223_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_224_raw : int 0 0 0 0 0 0 0 0 0 0 ...
$ smart_225_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_226_raw : int 630 630 630 630 630 630 630 630 630 630 ...
$ smart_231_raw :integer64 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 ...
$ smart_232_raw :integer64 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 ...
$ smart_233_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_235_raw :integer64 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 9218868437227407266 ...
$ smart_240_raw :integer64 0 0 0 0 0 0 0 0 ...
$ smart_241_raw :integer64 NA NA NA NA NA NA NA NA ...
$ smart_242_raw :integer64 NA NA NA NA NA NA NA NA ...
$ smart_250_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_251_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_252_raw : int NA NA NA NA NA NA NA NA NA NA ...
$ smart_254_raw : int NA NA NA NA NA NA NA NA NA NA ...
one_hot <- dummyVars(failure ~ .,
data = toshiba %>%
select(-date,
-serial_number,
-manu
) %>%
select_if(negate(is.logical)
)
)
toshiba_one_hot <-
toshiba %>%
select(failure,
date,
# serial_number,
manu) %>%
bind_cols(predict(object = one_hot,
newdata = toshiba %>%
select(-date,
-serial_number,
-manu
) %>%
select_if(negate(is.logical)
)
) %>%
as.data.frame()
)
variable 'failure' is not a factor
message("toshiba_one_hot")
toshiba_one_hot
str(toshiba_one_hot)
'data.frame': 206129 obs. of 66 variables:
$ failure : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ date : Date, format: "2018-10-01" "2018-10-02" ...
$ manu : Factor w/ 1 level "TOSHIBA": 1 1 1 1 1 1 1 1 1 1 ...
$ model2.HDWE160 : num 1 1 1 1 1 1 1 1 1 1 ...
$ model2.HDWF180 : num 0 0 0 0 0 0 0 0 0 0 ...
$ model2.MD04ABA400V: num 0 0 0 0 0 0 0 0 0 0 ...
$ model2.MD04ABA500V: num 0 0 0 0 0 0 0 0 0 0 ...
$ model2.MG07ACA14TA: num 0 0 0 0 0 0 0 0 0 0 ...
$ model2.MQ01ABF050 : num 0 0 0 0 0 0 0 0 0 0 ...
$ model2.MQ01ABF050M: num 0 0 0 0 0 0 0 0 0 0 ...
$ capacity_bytes : num 2.96e-311 2.96e-311 2.96e-311 2.96e-311 2.96e-311 ...
$ smart_1_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_2_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_3_raw : num 10345 10345 10345 10345 10345 ...
$ smart_4_raw : num 2 2 2 2 2 2 2 2 2 2 ...
$ smart_5_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_7_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_8_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_9_raw : num 4254 4278 4302 4326 4350 ...
$ smart_10_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_11_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_12_raw : num 2 2 2 2 2 2 2 2 2 2 ...
$ smart_16_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_17_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_22_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_23_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_24_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_168_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_170_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_173_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_174_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_177_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_183_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_184_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_187_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_188_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_189_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_190_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_191_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_192_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_193_raw : num 4 4 4 4 4 4 4 4 4 4 ...
$ smart_194_raw : num 22 22 22 22 22 22 22 22 22 22 ...
$ smart_195_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_196_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_197_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_198_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_199_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_200_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_218_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_220_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_222_raw : num 4254 4278 4302 4326 4350 ...
$ smart_223_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_224_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_225_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_226_raw : num 630 630 630 630 630 630 630 630 630 630 ...
$ smart_231_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_232_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_233_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_235_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_240_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_241_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_242_raw : num 0 0 0 0 0 0 0 0 0 0 ...
$ smart_250_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_251_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_252_raw : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_254_raw : num NA NA NA NA NA NA NA NA NA NA ...
summary(toshiba_one_hot)
failure date manu model2.HDWE160 model2.HDWF180
0:206108 Min. :2018-10-01 TOSHIBA:206129 Min. :0.000000 Min. :0.000000
1: 21 1st Qu.:2018-10-26 1st Qu.:0.000000 1st Qu.:0.000000
Median :2018-11-17 Median :0.000000 Median :0.000000
Mean :2018-11-16 Mean :0.004434 Mean :0.008829
3rd Qu.:2018-12-09 3rd Qu.:0.000000 3rd Qu.:0.000000
Max. :2018-12-31 Max. :1.000000 Max. :1.000000
model2.MD04ABA400V model2.MD04ABA500V model2.MG07ACA14TA model2.MQ01ABF050
Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.00000 Median :0.00000 Median :1.0000 Median :0.0000
Mean :0.06398 Mean :0.01987 Mean :0.5151 Mean :0.2364
3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
model2.MQ01ABF050M capacity_bytes smart_1_raw smart_2_raw smart_3_raw
Min. :0.0000 Min. :0 Min. :0 Min. :0 Min. : 517
1st Qu.:0.0000 1st Qu.:0 1st Qu.:0 1st Qu.:0 1st Qu.: 1508
Median :0.0000 Median :0 Median :0 Median :0 Median : 7562
Mean :0.1514 Mean :0 Mean :0 Mean :0 Mean : 5155
3rd Qu.:0.0000 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0 3rd Qu.: 7816
Max. :1.0000 Max. :0 Max. :0 Max. :0 Max. :11042
NA's :5 NA's :5 NA's :5 NA's :5
smart_4_raw smart_5_raw smart_7_raw smart_8_raw smart_9_raw
Min. : 1.000 Min. : 0.000 Min. :0 Min. :0 Min. : 3
1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.:0 1st Qu.:0 1st Qu.: 1097
Median : 3.000 Median : 0.000 Median :0 Median :0 Median : 2102
Mean : 4.388 Mean : 2.699 Mean :0 Mean :0 Mean : 7404
3rd Qu.: 6.000 3rd Qu.: 0.000 3rd Qu.:0 3rd Qu.:0 3rd Qu.:11093
Max. :380.000 Max. :2752.000 Max. :0 Max. :0 Max. :33674
NA's :5 NA's :5 NA's :5 NA's :5
smart_10_raw smart_11_raw smart_12_raw smart_16_raw smart_17_raw
Min. :0 Min. : NA Min. : 1.000 Min. : NA Min. : NA
1st Qu.:0 1st Qu.: NA 1st Qu.: 2.000 1st Qu.: NA 1st Qu.: NA
Median :0 Median : NA Median : 3.000 Median : NA Median : NA
Mean :0 Mean :NaN Mean : 4.201 Mean :NaN Mean :NaN
3rd Qu.:0 3rd Qu.: NA 3rd Qu.: 6.000 3rd Qu.: NA 3rd Qu.: NA
Max. :0 Max. : NA Max. :67.000 Max. : NA Max. : NA
NA's :5 NA's :206129 NA's :5 NA's :206129 NA's :206129
smart_22_raw smart_23_raw smart_24_raw smart_168_raw smart_170_raw
Min. : NA Min. :0 Min. :0 Min. : NA Min. : NA
1st Qu.: NA 1st Qu.:0 1st Qu.:0 1st Qu.: NA 1st Qu.: NA
Median : NA Median :0 Median :0 Median : NA Median : NA
Mean :NaN Mean :0 Mean :0 Mean :NaN Mean :NaN
3rd Qu.: NA 3rd Qu.:0 3rd Qu.:0 3rd Qu.: NA 3rd Qu.: NA
Max. : NA Max. :0 Max. :0 Max. : NA Max. : NA
NA's :206129 NA's :99952 NA's :99952 NA's :206129 NA's :206129
smart_173_raw smart_174_raw smart_177_raw smart_183_raw smart_184_raw
Min. :0 Min. : NA Min. : NA Min. : NA Min. : NA
1st Qu.:0 1st Qu.: NA 1st Qu.: NA 1st Qu.: NA 1st Qu.: NA
Median :0 Median : NA Median : NA Median : NA Median : NA
Mean :0 Mean :NaN Mean :NaN Mean :NaN Mean :NaN
3rd Qu.:0 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA
Max. :0 Max. : NA Max. : NA Max. : NA Max. : NA
NA's :62591 NA's :206129 NA's :206129 NA's :206129 NA's :206129
smart_187_raw smart_188_raw smart_189_raw smart_190_raw smart_191_raw
Min. : NA Min. :0 Min. : NA Min. : NA Min. : 0
1st Qu.: NA 1st Qu.:0 1st Qu.: NA 1st Qu.: NA 1st Qu.: 0
Median : NA Median :0 Median : NA Median : NA Median : 0
Mean :NaN Mean :0 Mean :NaN Mean :NaN Mean : 2683
3rd Qu.: NA 3rd Qu.:0 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: 2
Max. : NA Max. :0 Max. : NA Max. : NA Max. :4752298
NA's :206129 NA's :206129 NA's :206129 NA's :5
smart_192_raw smart_193_raw smart_194_raw smart_195_raw smart_196_raw
Min. : 0.0000 Min. : 2 Min. :12.00 Min. : NA Min. : 0.0000
1st Qu.: 0.0000 1st Qu.: 59 1st Qu.:27.00 1st Qu.: NA 1st Qu.: 0.0000
Median : 0.0000 Median : 80 Median :30.00 Median : NA Median : 0.0000
Mean : 0.6801 Mean : 2626 Mean :29.74 Mean :NaN Mean : 0.4245
3rd Qu.: 1.0000 3rd Qu.: 1487 3rd Qu.:33.00 3rd Qu.: NA 3rd Qu.: 0.0000
Max. :60.0000 Max. :108361 Max. :49.00 Max. : NA Max. :497.0000
NA's :5 NA's :5 NA's :5 NA's :206129 NA's :5
smart_197_raw smart_198_raw smart_199_raw smart_200_raw
Min. : 0.00000 Min. : 0.00000 Min. : 0.00000 Min. : NA
1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.: NA
Median : 0.00000 Median : 0.00000 Median : 0.00000 Median : NA
Mean : 0.02391 Mean : 0.00249 Mean : 0.04074 Mean :NaN
3rd Qu.: 0.00000 3rd Qu.: 0.00000 3rd Qu.: 0.00000 3rd Qu.: NA
Max. :256.00000 Max. :57.00000 Max. :32.00000 Max. : NA
NA's :5 NA's :5 NA's :5 NA's :206129
smart_218_raw smart_220_raw smart_222_raw smart_223_raw smart_224_raw
Min. : NA Min. : 0 Min. : 3 Min. :0 Min. :0
1st Qu.: NA 1st Qu.: 0 1st Qu.: 1078 1st Qu.:0 1st Qu.:0
Median : NA Median : 393217 Median : 2079 Median :0 Median :0
Mean :NaN Mean : 47079213 Mean : 7378 Mean :0 Mean :0
3rd Qu.: NA 3rd Qu.: 68681730 3rd Qu.:11085 3rd Qu.:0 3rd Qu.:0
Max. : NA Max. :287440904 Max. :33653 Max. :0 Max. :0
NA's :206129 NA's :5 NA's :5 NA's :5 NA's :5
smart_225_raw smart_226_raw smart_231_raw smart_232_raw smart_233_raw
Min. : NA Min. :159.0 Min. :0 Min. :0 Min. : NA
1st Qu.: NA 1st Qu.:261.0 1st Qu.:0 1st Qu.:0 1st Qu.: NA
Median : NA Median :529.0 Median :0 Median :0 Median : NA
Mean :NaN Mean :409.9 Mean :0 Mean :0 Mean :NaN
3rd Qu.: NA 3rd Qu.:534.0 3rd Qu.:0 3rd Qu.:0 3rd Qu.: NA
Max. : NA Max. :650.0 Max. :0 Max. :0 Max. : NA
NA's :206129 NA's :5 NA's :62591 NA's :62591 NA's :206129
smart_235_raw smart_240_raw smart_241_raw smart_242_raw smart_250_raw
Min. :0 Min. :0 Min. :0 Min. :0 Min. : NA
1st Qu.:0 1st Qu.:0 1st Qu.:0 1st Qu.:0 1st Qu.: NA
Median :0 Median :0 Median :0 Median :0 Median : NA
Mean :0 Mean :0 Mean :0 Mean :0 Mean :NaN
3rd Qu.:0 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0 3rd Qu.: NA
Max. :0 Max. :0 Max. :0 Max. :0 Max. : NA
NA's :101877 NA's :206129
smart_251_raw smart_252_raw smart_254_raw
Min. : NA Min. : NA Min. : NA
1st Qu.: NA 1st Qu.: NA 1st Qu.: NA
Median : NA Median : NA Median : NA
Mean :NaN Mean :NaN Mean :NaN
3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA
Max. : NA Max. : NA Max. : NA
NA's :206129 NA's :206129 NA's :206129
rm(one_hot, toshiba)
Create separate train_valid and the test data sets.
set.seed(123456789)
train_index <-
createDataPartition(y = toshiba_one_hot$failure,
p = .70,
times = 1,
list = FALSE
)
hd2_train_valid <-
toshiba_one_hot[train_index, ] %>%
select(-date)
hd2_test <-
toshiba_one_hot[-train_index, ] %>%
select(-date)
Create separate train and valid datasets.
set.seed(123456789)
train_index <-
createDataPartition(y = hd2_train_valid$failure,
p = .70,
times = 1,
list = FALSE
)
hd2_train <- hd2_train_valid[train_index, ]
hd2_valid <- hd2_train_valid[-train_index, ]
message("hd2_train$failure")
hd2_train$failure
table(hd2_train$failure)
0 1
100994 11
prop.table(table(hd2_train$failure))
0 1
0.9998910945 0.0001089055
class(hd2_train)
[1] "data.frame"
glimpse(hd2_train)
Observations: 101,005
Variables: 65
$ failure [3m[38;5;246m<fct>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ manu [3m[38;5;246m<fct>[39m[23m TOSHIBA, TOSHIBA, TOSHIBA, TOSHIBA, TOSHIBA, TOSHIBA, TOSH…
$ model2.HDWE160 [3m[38;5;246m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ model2.HDWF180 [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ model2.MD04ABA400V [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ model2.MD04ABA500V [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ model2.MG07ACA14TA [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ model2.MQ01ABF050 [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ model2.MQ01ABF050M [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ capacity_bytes [3m[38;5;246m<dbl>[39m[23m 2.964974e-311, 2.964974e-311, 2.964974e-311, 2.964974e-311…
$ smart_1_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_2_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_3_raw [3m[38;5;246m<dbl>[39m[23m 10345, 10345, 10345, 10345, 10345, 10345, 10345, 10345, 10…
$ smart_4_raw [3m[38;5;246m<dbl>[39m[23m 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ smart_5_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_7_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_8_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_9_raw [3m[38;5;246m<dbl>[39m[23m 4302, 4326, 4350, 4374, 4421, 4446, 4469, 4494, 4518, 4542…
$ smart_10_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_11_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_12_raw [3m[38;5;246m<dbl>[39m[23m 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ smart_16_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_17_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_22_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_23_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_24_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_168_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_170_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_173_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_174_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_177_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_183_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_184_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_187_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_188_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_189_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_190_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_191_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_192_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_193_raw [3m[38;5;246m<dbl>[39m[23m 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
$ smart_194_raw [3m[38;5;246m<dbl>[39m[23m 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22…
$ smart_195_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_196_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_197_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_198_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_199_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_200_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_218_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_220_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_222_raw [3m[38;5;246m<dbl>[39m[23m 4302, 4326, 4350, 4373, 4421, 4445, 4469, 4494, 4518, 4541…
$ smart_223_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_224_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_225_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_226_raw [3m[38;5;246m<dbl>[39m[23m 630, 630, 630, 630, 630, 630, 630, 630, 630, 630, 630, 630…
$ smart_231_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_232_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_233_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_235_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_240_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_241_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_242_raw [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ smart_250_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_251_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_252_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ smart_254_raw [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
Use the recipe package for multiple feature engineering steps.
# create the recipe object
rec_obj <- recipe(failure ~ ., data = hd2_train)
# implement each recipe step
rec_steps <-
rec_obj %>%
step_shuffle(all_predictors()) %>%
step_nzv(all_predictors()) %>% # helps reduce the number of variables with near zero variance (including NA values)
step_log(all_predictors(), offset = 1) %>% # puts less emphasis on "outliers"
step_center(all_predictors()) %>%
step_scale(all_predictors()) %>%
step_medianimpute(all_predictors())
rec_steps
Data Recipe
Inputs:
Operations:
Shuffled all_predictors()
Sparse, unbalanced variable filter on all_predictors()
Log transformation on all_predictors()
Centering for all_predictors()
Scaling for all_predictors()
Median Imputation for all_predictors()
# create the recipe based on the TRAIN data set
trained_rec <- prep(rec_steps, training = hd2_train)
trained_rec
Data Recipe
Inputs:
Training data contained 101005 data points and 101005 incomplete rows.
Operations:
Shuffled manu, model2.HDWE160, model2.HDWF180, ... [trained]
Sparse, unbalanced variable filter removed manu, model2.HDWE160, ... [trained]
Log transformation on model2.MD04ABA400V, ... [trained]
Centering for model2.MD04ABA400V, ... [trained]
Scaling for model2.MD04ABA400V, ... [trained]
Median Imputation for model2.MD04ABA400V, ... [trained]
# apply the recipe to the the train, valid, and test datasets
train_data <- bake(trained_rec, new_data = hd2_train)
valid_data <- bake(trained_rec, new_data = hd2_valid)
test_data <- bake(trained_rec, new_data = hd2_test)
# View(hd2_train %>% head(1000))
# View(train_data %>% head(1000))
message("train_data")
train_data
dim(train_data)
[1] 101005 16
str(train_data)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 101005 obs. of 16 variables:
$ failure : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ model2.MD04ABA400V: num -0.262 -0.262 -0.262 3.822 -0.262 ...
$ model2.MG07ACA14TA: num -1.032 -1.032 0.969 0.969 0.969 ...
$ model2.MQ01ABF050 : num -0.557 -0.557 -0.557 -0.557 -0.557 ...
$ model2.MQ01ABF050M: num -0.42 -0.42 -0.42 -0.42 -0.42 ...
$ capacity_bytes : num NA NA NA NA NA NA NA NA NA NA ...
$ smart_3_raw : num 0.855 -0.937 0.866 0.86 0.84 ...
$ smart_4_raw : num 1.033 3.518 1.033 0.241 0.548 ...
$ smart_9_raw : num 1.149 0.53 -1.952 1.272 -0.429 ...
$ smart_12_raw : num -1.275 -0.596 0.822 1.046 -0.596 ...
$ smart_191_raw : num 1.1738 1.398 -0.5619 0.0564 -0.5619 ...
$ smart_192_raw : num -0.728 -0.728 0.675 -0.728 0.675 ...
$ smart_193_raw : num -0.62 -0.872 -0.896 -0.404 1.154 ...
$ smart_194_raw : num 0.137 0.903 -0.205 1.18 0.61 ...
$ smart_222_raw : num -0.509 -0.506 -0.79 1.125 -0.405 ...
$ smart_226_raw : num 0.837 -0.958 -0.911 0.814 1.081 ...
summary(train_data)
failure model2.MD04ABA400V model2.MG07ACA14TA model2.MQ01ABF050 model2.MQ01ABF050M
0:100994 Min. :-0.2617 Min. :-1.0324 Min. :-0.5565 Min. :-0.4204
1: 11 1st Qu.:-0.2617 1st Qu.:-1.0324 1st Qu.:-0.5565 1st Qu.:-0.4204
Median :-0.2617 Median : 0.9686 Median :-0.5565 Median :-0.4204
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.2617 3rd Qu.: 0.9686 3rd Qu.:-0.5565 3rd Qu.:-0.4204
Max. : 3.8218 Max. : 0.9686 Max. : 1.7968 Max. : 2.3786
capacity_bytes smart_3_raw smart_4_raw smart_9_raw
Min. : NA Min. :-2.6661165 Min. :-1.301765 Min. :-4.522747
1st Qu.: NA 1st Qu.:-1.2737431 1st Qu.:-0.618948 1st Qu.:-0.691515
Median : NA Median : 0.8187356 Median :-0.134482 Median :-0.260400
Mean :NaN Mean : 0.0000162 Mean :-0.000003 Mean :-0.000005
3rd Qu.: NA 3rd Qu.: 0.8615058 3rd Qu.: 0.807930 3rd Qu.: 0.873419
Max. : NA Max. : 1.3107509 Max. : 7.538825 Max. : 1.627442
NA's :101005
smart_12_raw smart_191_raw smart_192_raw smart_193_raw
Min. :-1.274805 Min. :-0.561869 Min. :-0.728452 Min. :-2.210091
1st Qu.:-0.596015 1st Qu.:-0.561869 1st Qu.:-0.728452 1st Qu.:-0.704389
Median :-0.114405 Median :-0.561869 Median :-0.728452 Median :-0.553552
Mean :-0.000002 Mean :-0.000011 Mean :-0.000014 Mean :-0.000011
3rd Qu.: 0.822450 3rd Qu.: 0.418071 3rd Qu.: 0.675337 3rd Qu.: 0.906041
Max. : 4.628689 Max. :13.151356 Max. : 7.597056 Max. : 3.064668
smart_194_raw smart_222_raw smart_226_raw
Min. :-4.314398 Min. :-4.390568 Min. :-2.131422
1st Qu.:-0.384310 1st Qu.:-0.677479 1st Qu.:-0.920394
Median : 0.137049 Median :-0.243378 Median : 0.809652
Mean : 0.000003 Mean :-0.000005 Mean : 0.000016
3rd Qu.: 0.610210 3rd Qu.: 0.869443 3rd Qu.: 0.832710
Max. : 2.585678 Max. : 1.604504 Max. : 1.307047
message("valid_data")
valid_data
summary(valid_data)
failure model2.MD04ABA400V model2.MG07ACA14TA model2.MQ01ABF050
0:43282 Min. :-0.261653 Min. :-1.032430 Min. :-0.5565456
1: 4 1st Qu.:-0.261653 1st Qu.:-1.032430 1st Qu.:-0.5565456
Median :-0.261653 Median : 0.968579 Median :-0.5565456
Mean :-0.002132 Mean :-0.008719 Mean : 0.0001171
3rd Qu.:-0.261653 3rd Qu.: 0.968579 3rd Qu.:-0.5565456
Max. : 3.821821 Max. : 0.968579 Max. : 1.7967801
model2.MQ01ABF050M capacity_bytes smart_3_raw smart_4_raw
Min. :-0.42041 Min. : NA Min. :-2.666116 Min. :-1.301765
1st Qu.:-0.42041 1st Qu.: NA 1st Qu.:-1.282368 1st Qu.:-0.618948
Median :-0.42041 Median : NA Median : 0.699483 Median :-0.134482
Mean : 0.01374 Mean :NaN Mean :-0.009821 Mean :-0.001311
3rd Qu.:-0.42041 3rd Qu.: NA 3rd Qu.: 0.861672 3rd Qu.: 0.807930
Max. : 2.37859 Max. : NA Max. : 1.310751 Max. : 7.538825
NA's :43286
smart_9_raw smart_12_raw smart_191_raw smart_192_raw
Min. :-3.720713 Min. :-1.274805 Min. :-0.561869 Min. :-0.728452
1st Qu.:-0.703210 1st Qu.:-0.596015 1st Qu.:-0.561869 1st Qu.:-0.728452
Median :-0.254274 Median :-0.114405 Median :-0.561869 Median :-0.728452
Mean : 0.000725 Mean :-0.001094 Mean : 0.005503 Mean :-0.003547
3rd Qu.: 0.870413 3rd Qu.: 0.822450 3rd Qu.: 0.418071 3rd Qu.: 0.675337
Max. : 1.627422 Max. : 4.628689 Max. :13.151554 Max. : 7.597056
smart_193_raw smart_194_raw smart_222_raw smart_226_raw
Min. :-2.210091 Min. :-4.314398 Min. :-3.608709 Min. :-2.131422
1st Qu.:-0.704389 1st Qu.:-0.384310 1st Qu.:-0.677479 1st Qu.:-0.920394
Median :-0.553552 Median : 0.137049 Median :-0.239555 Median : 0.809652
Mean : 0.003183 Mean : 0.008034 Mean : 0.000771 Mean :-0.009393
3rd Qu.: 0.919132 3rd Qu.: 0.758692 3rd Qu.: 0.866452 3rd Qu.: 0.832710
Max. : 3.064668 Max. : 2.482194 Max. : 1.604484 Max. : 1.295670
message("test_data")
test_data
summary(test_data)
failure model2.MD04ABA400V model2.MG07ACA14TA model2.MQ01ABF050
0:61832 Min. :-0.261653 Min. :-1.0324299 Min. :-0.5565456
1: 6 1st Qu.:-0.261653 1st Qu.:-1.0324299 1st Qu.:-0.5565456
Median :-0.261653 Median : 0.9685791 Median :-0.5565456
Mean : 0.000176 Mean : 0.0004011 Mean :-0.0006953
3rd Qu.:-0.261653 3rd Qu.: 0.9685791 3rd Qu.:-0.5565456
Max. : 3.821821 Max. : 0.9685791 Max. : 1.7967801
model2.MQ01ABF050M capacity_bytes smart_3_raw smart_4_raw
Min. :-0.420412 Min. : NA Min. :-2.666116 Min. :-1.301765
1st Qu.:-0.420412 1st Qu.: NA 1st Qu.:-1.276325 1st Qu.:-0.618948
Median :-0.420412 Median : NA Median : 0.819079 Median :-0.134482
Mean : 0.001354 Mean :NaN Mean :-0.001042 Mean :-0.003516
3rd Qu.:-0.420412 3rd Qu.: NA 3rd Qu.: 0.861672 3rd Qu.: 0.807930
Max. : 2.378595 Max. : NA Max. : 1.310751 Max. : 7.538825
NA's :61838
smart_9_raw smart_12_raw smart_191_raw smart_192_raw
Min. :-3.1516761 Min. :-1.274805 Min. :-0.561869 Min. :-0.7285
1st Qu.:-0.7038315 1st Qu.:-0.596015 1st Qu.:-0.561869 1st Qu.:-0.7285
Median :-0.2607243 Median :-0.114405 Median :-0.561869 Median :-0.7285
Mean :-0.0009615 Mean :-0.004237 Mean : 0.001789 Mean : 0.0011
3rd Qu.: 0.8703519 3rd Qu.: 0.822450 3rd Qu.: 0.418071 3rd Qu.: 0.6753
Max. : 1.6274017 Max. : 4.628689 Max. :13.151357 Max. : 7.5971
smart_193_raw smart_194_raw smart_222_raw smart_226_raw
Min. :-2.2100905 Min. :-4.314398 Min. :-3.430757 Min. :-2.131422
1st Qu.:-0.7043892 1st Qu.:-0.384310 1st Qu.:-0.679943 1st Qu.:-0.920394
Median :-0.5535520 Median : 0.137049 Median :-0.243698 Median : 0.809652
Mean :-0.0004676 Mean : 0.002912 Mean :-0.000962 Mean : 0.001625
3rd Qu.: 0.9005704 3rd Qu.: 0.610210 3rd Qu.: 0.866392 3rd Qu.: 0.837295
Max. : 3.0646676 Max. : 2.585678 Max. : 1.604484 Max. : 1.314603
Add weight_col, as this is a potential parameter to be used. The vlaue of 100 is arbitrarily chosen to make predicting failure 100 times more important than predicting non_failure.
train_data <-
train_data %>%
mutate(weight_col = case_when(failure == 1 ~ 100,
failure == 0 ~ 1,
TRUE ~ 0
)
)
valid_data <-
valid_data %>%
mutate(weight_col = case_when(failure == 1 ~ 100,
failure == 0 ~ 1,
TRUE ~ 0
)
)
test_data <-
test_data %>%
mutate(weight_col = case_when(failure == 1 ~ 100,
failure == 0 ~ 1,
TRUE ~ 0
)
)
message("train_data")
train_data
dim(train_data)
[1] 101005 17
summary(train_data)
failure model2.MD04ABA400V model2.MG07ACA14TA model2.MQ01ABF050 model2.MQ01ABF050M
0:100994 Min. :-0.2617 Min. :-1.0324 Min. :-0.5565 Min. :-0.4204
1: 11 1st Qu.:-0.2617 1st Qu.:-1.0324 1st Qu.:-0.5565 1st Qu.:-0.4204
Median :-0.2617 Median : 0.9686 Median :-0.5565 Median :-0.4204
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.2617 3rd Qu.: 0.9686 3rd Qu.:-0.5565 3rd Qu.:-0.4204
Max. : 3.8218 Max. : 0.9686 Max. : 1.7968 Max. : 2.3786
capacity_bytes smart_3_raw smart_4_raw smart_9_raw
Min. : NA Min. :-2.6661165 Min. :-1.301765 Min. :-4.522747
1st Qu.: NA 1st Qu.:-1.2737431 1st Qu.:-0.618948 1st Qu.:-0.691515
Median : NA Median : 0.8187356 Median :-0.134482 Median :-0.260400
Mean :NaN Mean : 0.0000162 Mean :-0.000003 Mean :-0.000005
3rd Qu.: NA 3rd Qu.: 0.8615058 3rd Qu.: 0.807930 3rd Qu.: 0.873419
Max. : NA Max. : 1.3107509 Max. : 7.538825 Max. : 1.627442
NA's :101005
smart_12_raw smart_191_raw smart_192_raw smart_193_raw
Min. :-1.274805 Min. :-0.561869 Min. :-0.728452 Min. :-2.210091
1st Qu.:-0.596015 1st Qu.:-0.561869 1st Qu.:-0.728452 1st Qu.:-0.704389
Median :-0.114405 Median :-0.561869 Median :-0.728452 Median :-0.553552
Mean :-0.000002 Mean :-0.000011 Mean :-0.000014 Mean :-0.000011
3rd Qu.: 0.822450 3rd Qu.: 0.418071 3rd Qu.: 0.675337 3rd Qu.: 0.906041
Max. : 4.628689 Max. :13.151356 Max. : 7.597056 Max. : 3.064668
smart_194_raw smart_222_raw smart_226_raw weight_col
Min. :-4.314398 Min. :-4.390568 Min. :-2.131422 Min. : 1.000
1st Qu.:-0.384310 1st Qu.:-0.677479 1st Qu.:-0.920394 1st Qu.: 1.000
Median : 0.137049 Median :-0.243378 Median : 0.809652 Median : 1.000
Mean : 0.000003 Mean :-0.000005 Mean : 0.000016 Mean : 1.011
3rd Qu.: 0.610210 3rd Qu.: 0.869443 3rd Qu.: 0.832710 3rd Qu.: 1.000
Max. : 2.585678 Max. : 1.604504 Max. : 1.307047 Max. :100.000
message("valid_data")
valid_data
summary(valid_data)
failure model2.MD04ABA400V model2.MG07ACA14TA model2.MQ01ABF050
0:43282 Min. :-0.261653 Min. :-1.032430 Min. :-0.5565456
1: 4 1st Qu.:-0.261653 1st Qu.:-1.032430 1st Qu.:-0.5565456
Median :-0.261653 Median : 0.968579 Median :-0.5565456
Mean :-0.002132 Mean :-0.008719 Mean : 0.0001171
3rd Qu.:-0.261653 3rd Qu.: 0.968579 3rd Qu.:-0.5565456
Max. : 3.821821 Max. : 0.968579 Max. : 1.7967801
model2.MQ01ABF050M capacity_bytes smart_3_raw smart_4_raw
Min. :-0.42041 Min. : NA Min. :-2.666116 Min. :-1.301765
1st Qu.:-0.42041 1st Qu.: NA 1st Qu.:-1.282368 1st Qu.:-0.618948
Median :-0.42041 Median : NA Median : 0.699483 Median :-0.134482
Mean : 0.01374 Mean :NaN Mean :-0.009821 Mean :-0.001311
3rd Qu.:-0.42041 3rd Qu.: NA 3rd Qu.: 0.861672 3rd Qu.: 0.807930
Max. : 2.37859 Max. : NA Max. : 1.310751 Max. : 7.538825
NA's :43286
smart_9_raw smart_12_raw smart_191_raw smart_192_raw
Min. :-3.720713 Min. :-1.274805 Min. :-0.561869 Min. :-0.728452
1st Qu.:-0.703210 1st Qu.:-0.596015 1st Qu.:-0.561869 1st Qu.:-0.728452
Median :-0.254274 Median :-0.114405 Median :-0.561869 Median :-0.728452
Mean : 0.000725 Mean :-0.001094 Mean : 0.005503 Mean :-0.003547
3rd Qu.: 0.870413 3rd Qu.: 0.822450 3rd Qu.: 0.418071 3rd Qu.: 0.675337
Max. : 1.627422 Max. : 4.628689 Max. :13.151554 Max. : 7.597056
smart_193_raw smart_194_raw smart_222_raw smart_226_raw
Min. :-2.210091 Min. :-4.314398 Min. :-3.608709 Min. :-2.131422
1st Qu.:-0.704389 1st Qu.:-0.384310 1st Qu.:-0.677479 1st Qu.:-0.920394
Median :-0.553552 Median : 0.137049 Median :-0.239555 Median : 0.809652
Mean : 0.003183 Mean : 0.008034 Mean : 0.000771 Mean :-0.009393
3rd Qu.: 0.919132 3rd Qu.: 0.758692 3rd Qu.: 0.866452 3rd Qu.: 0.832710
Max. : 3.064668 Max. : 2.482194 Max. : 1.604484 Max. : 1.295670
weight_col
Min. : 1.000
1st Qu.: 1.000
Median : 1.000
Mean : 1.009
3rd Qu.: 1.000
Max. :100.000
message("test_data")
test_data
summary(test_data)
failure model2.MD04ABA400V model2.MG07ACA14TA model2.MQ01ABF050
0:61832 Min. :-0.261653 Min. :-1.0324299 Min. :-0.5565456
1: 6 1st Qu.:-0.261653 1st Qu.:-1.0324299 1st Qu.:-0.5565456
Median :-0.261653 Median : 0.9685791 Median :-0.5565456
Mean : 0.000176 Mean : 0.0004011 Mean :-0.0006953
3rd Qu.:-0.261653 3rd Qu.: 0.9685791 3rd Qu.:-0.5565456
Max. : 3.821821 Max. : 0.9685791 Max. : 1.7967801
model2.MQ01ABF050M capacity_bytes smart_3_raw smart_4_raw
Min. :-0.420412 Min. : NA Min. :-2.666116 Min. :-1.301765
1st Qu.:-0.420412 1st Qu.: NA 1st Qu.:-1.276325 1st Qu.:-0.618948
Median :-0.420412 Median : NA Median : 0.819079 Median :-0.134482
Mean : 0.001354 Mean :NaN Mean :-0.001042 Mean :-0.003516
3rd Qu.:-0.420412 3rd Qu.: NA 3rd Qu.: 0.861672 3rd Qu.: 0.807930
Max. : 2.378595 Max. : NA Max. : 1.310751 Max. : 7.538825
NA's :61838
smart_9_raw smart_12_raw smart_191_raw smart_192_raw
Min. :-3.1516761 Min. :-1.274805 Min. :-0.561869 Min. :-0.7285
1st Qu.:-0.7038315 1st Qu.:-0.596015 1st Qu.:-0.561869 1st Qu.:-0.7285
Median :-0.2607243 Median :-0.114405 Median :-0.561869 Median :-0.7285
Mean :-0.0009615 Mean :-0.004237 Mean : 0.001789 Mean : 0.0011
3rd Qu.: 0.8703519 3rd Qu.: 0.822450 3rd Qu.: 0.418071 3rd Qu.: 0.6753
Max. : 1.6274017 Max. : 4.628689 Max. :13.151357 Max. : 7.5971
smart_193_raw smart_194_raw smart_222_raw smart_226_raw
Min. :-2.2100905 Min. :-4.314398 Min. :-3.430757 Min. :-2.131422
1st Qu.:-0.7043892 1st Qu.:-0.384310 1st Qu.:-0.679943 1st Qu.:-0.920394
Median :-0.5535520 Median : 0.137049 Median :-0.243698 Median : 0.809652
Mean :-0.0004676 Mean : 0.002912 Mean :-0.000962 Mean : 0.001625
3rd Qu.: 0.9005704 3rd Qu.: 0.610210 3rd Qu.: 0.866392 3rd Qu.: 0.837295
Max. : 3.0646676 Max. : 2.585678 Max. : 1.604484 Max. : 1.314603
weight_col
Min. : 1.00
1st Qu.: 1.00
Median : 1.00
Mean : 1.01
3rd Qu.: 1.00
Max. :100.00
Explore missing data, and remove any variables where more more than 50% of the values are missing.
# Visualize missing data
vis_miss(train_data %>% sample_frac(size = .10))
miss_var_summary(train_data)
# Remove variables where more than 50% of the values are missing. This is done for train, valid, and test datasets.
miss_50pct_below <-
miss_var_summary(train_data) %>%
filter(pct_miss <= 50) %>%
pull(variable)
miss_50pct_below
[1] "failure" "model2.MD04ABA400V" "model2.MG07ACA14TA" "model2.MQ01ABF050"
[5] "model2.MQ01ABF050M" "smart_3_raw" "smart_4_raw" "smart_9_raw"
[9] "smart_12_raw" "smart_191_raw" "smart_192_raw" "smart_193_raw"
[13] "smart_194_raw" "smart_222_raw" "smart_226_raw" "weight_col"
train_data <-
train_data %>%
select(one_of(miss_50pct_below)
)
valid_data <-
valid_data %>%
select(one_of(miss_50pct_below)
)
test_data <-
test_data %>%
select(one_of(miss_50pct_below)
)
dim(train_data)
[1] 101005 16
dim(valid_data)
[1] 43286 16
dim(test_data)
[1] 61838 16
Use DMwR::SMOTE to rebalance the clasess of failure.
message("train_data$failure")
train_data$failure
str(train_data)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 101005 obs. of 16 variables:
$ failure : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ model2.MD04ABA400V: num -0.262 -0.262 -0.262 3.822 -0.262 ...
$ model2.MG07ACA14TA: num -1.032 -1.032 0.969 0.969 0.969 ...
$ model2.MQ01ABF050 : num -0.557 -0.557 -0.557 -0.557 -0.557 ...
$ model2.MQ01ABF050M: num -0.42 -0.42 -0.42 -0.42 -0.42 ...
$ smart_3_raw : num 0.855 -0.937 0.866 0.86 0.84 ...
$ smart_4_raw : num 1.033 3.518 1.033 0.241 0.548 ...
$ smart_9_raw : num 1.149 0.53 -1.952 1.272 -0.429 ...
$ smart_12_raw : num -1.275 -0.596 0.822 1.046 -0.596 ...
$ smart_191_raw : num 1.1738 1.398 -0.5619 0.0564 -0.5619 ...
$ smart_192_raw : num -0.728 -0.728 0.675 -0.728 0.675 ...
$ smart_193_raw : num -0.62 -0.872 -0.896 -0.404 1.154 ...
$ smart_194_raw : num 0.137 0.903 -0.205 1.18 0.61 ...
$ smart_222_raw : num -0.509 -0.506 -0.79 1.125 -0.405 ...
$ smart_226_raw : num 0.837 -0.958 -0.911 0.814 1.081 ...
$ weight_col : num 1 1 1 1 1 1 1 1 1 1 ...
summary(train_data)
failure model2.MD04ABA400V model2.MG07ACA14TA model2.MQ01ABF050 model2.MQ01ABF050M
0:100994 Min. :-0.2617 Min. :-1.0324 Min. :-0.5565 Min. :-0.4204
1: 11 1st Qu.:-0.2617 1st Qu.:-1.0324 1st Qu.:-0.5565 1st Qu.:-0.4204
Median :-0.2617 Median : 0.9686 Median :-0.5565 Median :-0.4204
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.2617 3rd Qu.: 0.9686 3rd Qu.:-0.5565 3rd Qu.:-0.4204
Max. : 3.8218 Max. : 0.9686 Max. : 1.7968 Max. : 2.3786
smart_3_raw smart_4_raw smart_9_raw smart_12_raw
Min. :-2.6661165 Min. :-1.301765 Min. :-4.522747 Min. :-1.274805
1st Qu.:-1.2737431 1st Qu.:-0.618948 1st Qu.:-0.691515 1st Qu.:-0.596015
Median : 0.8187356 Median :-0.134482 Median :-0.260400 Median :-0.114405
Mean : 0.0000162 Mean :-0.000003 Mean :-0.000005 Mean :-0.000002
3rd Qu.: 0.8615058 3rd Qu.: 0.807930 3rd Qu.: 0.873419 3rd Qu.: 0.822450
Max. : 1.3107509 Max. : 7.538825 Max. : 1.627442 Max. : 4.628689
smart_191_raw smart_192_raw smart_193_raw smart_194_raw
Min. :-0.561869 Min. :-0.728452 Min. :-2.210091 Min. :-4.314398
1st Qu.:-0.561869 1st Qu.:-0.728452 1st Qu.:-0.704389 1st Qu.:-0.384310
Median :-0.561869 Median :-0.728452 Median :-0.553552 Median : 0.137049
Mean :-0.000011 Mean :-0.000014 Mean :-0.000011 Mean : 0.000003
3rd Qu.: 0.418071 3rd Qu.: 0.675337 3rd Qu.: 0.906041 3rd Qu.: 0.610210
Max. :13.151356 Max. : 7.597056 Max. : 3.064668 Max. : 2.585678
smart_222_raw smart_226_raw weight_col
Min. :-4.390568 Min. :-2.131422 Min. : 1.000
1st Qu.:-0.677479 1st Qu.:-0.920394 1st Qu.: 1.000
Median :-0.243378 Median : 0.809652 Median : 1.000
Mean :-0.000005 Mean : 0.000016 Mean : 1.011
3rd Qu.: 0.869443 3rd Qu.: 0.832710 3rd Qu.: 1.000
Max. : 1.604504 Max. : 1.307047 Max. :100.000
table(train_data$failure)
0 1
100994 11
prop.table(table(train_data$failure))
0 1
0.9998910945 0.0001089055
set.seed(123456789)
train_data_SMOTE <-
SMOTE(failure ~ .,
data = as.data.frame(train_data),
k = 10,
perc.over = 100000,
perc.under = 1300
)
message("train_data_SMOTE$failure")
train_data_SMOTE$failure
table(train_data_SMOTE$failure)
0 1
143000 11011
prop.table(table(train_data_SMOTE$failure))
0 1
0.92850511 0.07149489
class(train_data_SMOTE)
[1] "data.frame"
glimpse(train_data_SMOTE)
Observations: 154,011
Variables: 16
$ failure [3m[38;5;246m<fct>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ model2.MD04ABA400V [3m[38;5;246m<dbl>[39m[23m -0.2616528, -0.2616528, -0.2616528, -0.2616528, -0.2616528…
$ model2.MG07ACA14TA [3m[38;5;246m<dbl>[39m[23m -1.0324299, -1.0324299, 0.9685791, 0.9685791, -1.0324299, …
$ model2.MQ01ABF050 [3m[38;5;246m<dbl>[39m[23m -0.5565456, -0.5565456, 1.7967801, 1.7967801, -0.5565456, …
$ model2.MQ01ABF050M [3m[38;5;246m<dbl>[39m[23m -0.4204122, -0.4204122, -0.4204122, -0.4204122, 2.3785946,…
$ smart_3_raw [3m[38;5;246m<dbl>[39m[23m 0.8432514, 0.8975866, -1.2980393, 0.8353004, -1.3981441, 0…
$ smart_4_raw [3m[38;5;246m<dbl>[39m[23m 0.5483349, -0.6189480, 0.8079298, -0.1344818, -1.3017648, …
$ smart_9_raw [3m[38;5;246m<dbl>[39m[23m -0.40651706, 0.04642314, -0.70507547, -0.53517251, 1.61166…
$ smart_12_raw [3m[38;5;246m<dbl>[39m[23m 1.4195612, -0.5960145, 0.2591607, 0.8224503, -1.2748053, 0…
$ smart_191_raw [3m[38;5;246m<dbl>[39m[23m -0.56186948, -0.56186948, -0.56186948, -0.56186948, -0.561…
$ smart_192_raw [3m[38;5;246m<dbl>[39m[23m -0.7284523, -0.7284523, -0.7284523, -0.7284523, 1.4965019,…
$ smart_193_raw [3m[38;5;246m<dbl>[39m[23m -0.61275161, -0.14665595, 1.33890134, -0.61978132, 0.13065…
$ smart_194_raw [3m[38;5;246m<dbl>[39m[23m 0.2996742, 0.1370488, 0.2996742, -0.2045625, 0.2996742, -0…
$ smart_222_raw [3m[38;5;246m<dbl>[39m[23m -0.7689613, -0.5777751, 1.5570162, -0.9099636, -0.6433306,…
$ smart_226_raw [3m[38;5;246m<dbl>[39m[23m -1.9681499022, 0.8189011527, -1.5469111861, 0.8510008909, …
$ weight_col [3m[38;5;246m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
summary(train_data_SMOTE)
failure model2.MD04ABA400V model2.MG07ACA14TA model2.MQ01ABF050 model2.MQ01ABF050M
0:143000 Min. :-0.26165 Min. :-1.03243 Min. :-0.5565 Min. :-0.420412
1: 11011 1st Qu.:-0.26165 1st Qu.:-1.03243 1st Qu.:-0.5565 1st Qu.:-0.420412
Median :-0.26165 Median : 0.96858 Median :-0.5565 Median :-0.420412
Mean :-0.01743 Mean : 0.01308 Mean :-0.0050 Mean :-0.009432
3rd Qu.:-0.26165 3rd Qu.: 0.96858 3rd Qu.:-0.5565 3rd Qu.:-0.420412
Max. : 3.82182 Max. : 0.96858 Max. : 1.7968 Max. : 2.378595
smart_3_raw smart_4_raw smart_9_raw smart_12_raw
Min. :-2.666116 Min. :-1.30177 Min. :-4.52275 Min. :-1.27480
1st Qu.:-0.986453 1st Qu.:-0.61895 1st Qu.:-0.66166 1st Qu.:-0.59601
Median : 0.617853 Median :-0.13448 Median :-0.15333 Median :-0.11441
Mean :-0.004027 Mean :-0.03874 Mean : 0.03135 Mean : 0.01031
3rd Qu.: 0.860508 3rd Qu.: 0.54834 3rd Qu.: 0.88166 3rd Qu.: 0.82245
Max. : 1.310751 Max. : 7.53883 Max. : 1.62742 Max. : 4.62869
smart_191_raw smart_192_raw smart_193_raw smart_194_raw
Min. :-0.56187 Min. :-0.728452 Min. :-2.21009 Min. :-4.31440
1st Qu.:-0.56187 1st Qu.:-0.728452 1st Qu.:-0.70439 1st Qu.:-0.57059
Median :-0.56187 Median :-0.728452 Median :-0.55355 Median : 0.13705
Mean : 0.04032 Mean : 0.000833 Mean :-0.02579 Mean :-0.00806
3rd Qu.: 0.41807 3rd Qu.: 0.675337 3rd Qu.: 0.81290 3rd Qu.: 0.61021
Max. :13.15136 Max. : 7.597056 Max. : 3.06467 Max. : 2.48219
smart_222_raw smart_226_raw weight_col
Min. :-3.25973 Min. :-2.131422 Min. : 1.000
1st Qu.:-0.69114 1st Qu.:-0.911040 1st Qu.: 1.000
Median :-0.25596 Median : 0.805015 Median : 1.000
Mean :-0.02421 Mean :-0.009757 Mean : 8.078
3rd Qu.: 0.85049 3rd Qu.: 0.832710 3rd Qu.: 1.000
Max. : 1.60450 Max. : 1.295670 Max. :100.000
Start h2o.
h2o.init()
H2O is not running yet, starting it now...
Note: In case of errors look at the following log files:
/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/h2o_mdturse_started_from_r.out
/var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//Rtmpco3HiE/h2o_mdturse_started_from_r.err
java version "1.8.0_121"
Java(TM) SE Runtime Environment (build 1.8.0_121-b13)
Java HotSpot(TM) 64-Bit Server VM (build 25.121-b13, mixed mode)
Starting H2O JVM and connecting: ... Connection successful!
R is connected to the H2O cluster:
H2O cluster uptime: 4 seconds 9 milliseconds
H2O cluster timezone: America/New_York
H2O data parsing timezone: UTC
H2O cluster version: 3.24.0.1
H2O cluster version age: 1 month and 21 days
H2O cluster name: H2O_started_from_R_mdturse_qhg316
H2O cluster total nodes: 1
H2O cluster total memory: 1.78 GB
H2O cluster total cores: 4
H2O cluster allowed cores: 4
H2O cluster healthy: TRUE
H2O Connection ip: localhost
H2O Connection port: 54321
H2O Connection proxy: NA
H2O Internal Security: FALSE
H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4
R Version: R version 3.6.0 (2019-04-26)
h2o.no_progress() # Turn off progress bars
Convert to the datasets to h2o objects.
# Multiple sets (train, train_no_weight, SMOTE, SMOTE_no_weight) are used to test the effects of these different datasets.
train_h2o <- as.h2o(train_data)
train_h2o_no_weight <- as.h2o(train_data %>% select(-weight_col))
train_h2o_SMOTE <- as.h2o(train_data_SMOTE)
train_h2o_SMOTE_no_weight <- as.h2o(train_data_SMOTE %>% select(-weight_col))
valid_h2o <- as.h2o(valid_data)
valid_h2o_no_weight <- as.h2o(valid_data %>% select(-weight_col))
test_h2o <- as.h2o(test_data)
test_h2o_no_weight <- as.h2o(test_data %>% select(-weight_col))
# Save the data
saveRDS(train_data,
paste0(wd,
"/Data/Processed/",
"train_data.rds"
)
)
saveRDS(train_data %>%
select(-weight_col),
paste0(wd,
"/Data/Processed/",
"train_data_no_weight.rds"
)
)
saveRDS(valid_data,
paste0(wd,
"/Data/Processed/",
"valid_data.rds"
)
)
saveRDS(valid_data %>%
select(-weight_col),
paste0(wd,
"/Data/Processed/",
"valid_data_no_weight.rds"
)
)
saveRDS(test_data,
paste0(wd,
"/Data/Processed/",
"test_data.rds"
)
)
saveRDS(test_data %>%
select(-weight_col),
paste0(wd,
"/Data/Processed/",
"test_data_no_weight.rds"
)
)
Set the relevant variable names.
y <- "failure"
x <-
setdiff(names(train_h2o),
# setdiff(names(train_h2o_no_weight),
# setdiff(names(train_h2o_SMOTE),
# setdiff(names(train_h2o_SMOTE_no_weight),
y
)
Run h2o.automl.
NOTES
train_h2o using weight_col & balance_classes = TRUE does not function - finds a quick “solution”
train_h2o NOT using weight_col & balance_classes = TRUE does function
MODEL USED: 3) train_h2o using weight_col & balance_classes = FALSE
train_h2o_SMOTE using weight_col & balance_classes = FALSE train_h2o_SMOTE_no_weight NOT using weight_col & balance_classes = TRUE
# user system elapsed
# 22.355 7.826 1310.738
# ~ 22 min
start <- proc.time()
automl_models_h2o <-
h2o.automl(x = x,
y = y,
training_frame = train_h2o,
# training_frame = train_h2o_SMOTE_no_weight,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
nfolds = 10,
# balance_classes = TRUE,
balance_classes = FALSE,
weights_column = "weight_col",
max_runtime_secs = 3600, # 1 hour
# max_runtime_secs = 60,
max_models = 10,
stopping_metric = "AUTO",
seed = 123456789
)
h2o.time <- proc.time() - start
h2o.time
user system elapsed
20.010 7.500 1303.079
rm(start)
Pull out the “leader” model.
automl_leader <- automl_models_h2o@leader
automl_leader
Model Details:
==============
H2OBinomialModel: gbm
Model ID: GBM_4_AutoML_20190522_081032
Model Summary:
H2OBinomialMetrics: gbm
** Reported on training data. **
MSE: 1.121729e-05
RMSE: 0.003349223
LogLoss: 0.0007321303
Mean Per-Class Error: 0
AUC: 1
pr_auc: 0.9090909
Gini: 1
Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
Maximum Metrics: Maximum metrics at their respective thresholds
Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
H2OBinomialMetrics: gbm
** Reported on validation data. **
MSE: 0.009422029
RMSE: 0.09706713
LogLoss: 0.0699779
Mean Per-Class Error: 0.1479252
AUC: 0.7775172
pr_auc: 0.01888206
Gini: 0.5550344
Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
Maximum Metrics: Maximum metrics at their respective thresholds
Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
H2OBinomialMetrics: gbm
** Reported on cross-validation data. **
** 10-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
MSE: 0.0109958
RMSE: 0.1048609
LogLoss: 0.08324178
Mean Per-Class Error: 0.427777
AUC: 0.4566617
pr_auc: 0.008994093
Gini: -0.08667662
Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
Maximum Metrics: Maximum metrics at their respective thresholds
Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
Cross-Validation Metrics Summary:
# save each individual model
model_path_h2o <-
h2o.saveModel(object = automl_leader,
path = paste0(wd,
"/Models/"
),
force = TRUE
)
saveRDS(model_path_h2o,
paste0(wd,
"/Models/",
"model_path_h2o.rds"
)
)
# load the model
# model_path_h2o <-
# read_rds(path = paste0(wd,
# "/Models/",
# "model_path_h2o.rds"
# )
# )
#
# print(model_path_h2o)
#
# automl_leader <-
# h2o.loadModel(path = model_path_h2o)
Inspect the leaderboard.
automl_models_h2o@leaderboard
[12 rows x 6 columns]
Investigate variable importance of the leader model.
leader_models_varimp <- h2o.varimp(object = automl_leader)
leader_models_varimp_plot <- h2o.varimp_plot(model = automl_leader, num_of_features = 10)
leader_models_varimp
Variable Importances:
leader_models_varimp_plot
[,1]
[1,] 1.5
[2,] 3.5
[3,] 5.5
[4,] 7.5
[5,] 9.5
[6,] 11.5
[7,] 13.5
[8,] 15.5
[9,] 17.5
[10,] 19.5
Create predictions from the models.
pred_h2o <- h2o.predict(automl_leader, newdata = test_h2o)
# save as a .rds file
saveRDS(pred_h2o,
paste0(wd,
"/Models/",
"pred_h2o.rds"
)
)
# pred_h2o <-
# read_rds(path = paste0(wd,
# "/Models/",
# "pred_h2o.rds"
# )
# )
Look at performance stats using the test data.
perf_stats_test_h2o <- h2o.performance(automl_leader, newdata = test_h2o)
perf_stats_test_h2o
H2OBinomialMetrics: gbm
MSE: 0.009857633
RMSE: 0.09928561
LogLoss: 0.07323695
Mean Per-Class Error: 0.3187832
AUC: 0.7210951
pr_auc: 0.0195803
Gini: 0.4421901
Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
Maximum Metrics: Maximum metrics at their respective thresholds
Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# save as a .rds file
saveRDS(perf_stats_test_h2o,
paste0(wd,
"/Models/",
"perf_stats_test_h2o.rds"
)
)
# perf_stats_test_h2o <-
# read_rds(path = paste0(wd,
# "/Models/",
# "perf_stats_test_h2o.rds"
# )
# )
Investigate test error.
error_tbl_h2o <-
test_data %>%
bind_cols(pred_h2o %>% as_tibble()
) %>%
rename(obs = failure,
pred = predict
)
confusionMatrix(data = error_tbl_h2o$pred,
reference = error_tbl_h2o$obs,
positive = "1",
mode = "everything"
)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 43901 2
1 17931 4
Accuracy : 0.71
95% CI : (0.7064, 0.7136)
No Information Rate : 0.9999
P-Value [Acc > NIR] : 1
Kappa : 3e-04
Mcnemar's Test P-Value : <2e-16
Sensitivity : 6.667e-01
Specificity : 7.100e-01
Pos Pred Value : 2.230e-04
Neg Pred Value : 1.000e+00
Precision : 2.230e-04
Recall : 6.667e-01
F1 : 4.459e-04
Prevalence : 9.703e-05
Detection Rate : 6.469e-05
Detection Prevalence : 2.900e-01
Balanced Accuracy : 6.883e-01
'Positive' Class : 1
# save as a .rds file
saveRDS(error_tbl_h2o,
paste0(wd,
"/Models/",
"error_tbl_h2o.rds"
)
)
# error_tbl_h2o <-
# read_rds(path = paste0(wd,
# "/Models/",
# "error_tbl_h2o.rds"
# )
# )