Descriptive Statistics
library(lares)
library(beepr)
library(dplyr)
### SOURCE: https://rpubs.com/laresbernardo/freqs-distr-corr
# The data we'll use is the Titanic dataset
data(dft)
dim(dft)
## [1] 891 11
head(dft)
## PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare
## 1 1 FALSE 3 male 22 1 0 A/5 21171 7.2500
## 2 2 TRUE 1 female 38 1 0 PC 17599 71.2833
## 3 3 TRUE 3 female 26 0 0 STON/O2. 3101282 7.9250
## 4 4 TRUE 1 female 35 1 0 113803 53.1000
## 5 5 FALSE 3 male 35 0 0 373450 8.0500
## 6 6 FALSE 3 male NA 0 0 330877 8.4583
## Cabin Embarked
## 1 S
## 2 C85 C
## 3 S
## 4 C123 S
## 5 S
## 6 Q
df_str(dft, return = "plot")
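
# Not in the original post: lares also ships a missingness() helper for a
# quick per-column NA summary; a minimal sketch, assuming its current
# CRAN signature:
missingness(dft, plot = TRUE)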

dft %>% freqs(Survived, plot = T, results = F)

dft %>% freqs(Survived, Pclass, plot = T, results = F)

# Per sex and class, how many survived?
dft %>% freqs(Sex, Pclass, Survived, plot = T, results = F)

dft %>% freqs(Ticket, plot = T, results = F)
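
# Ticket has hundreds of unique values, so the plot above only shows the most
# frequent ones. A hedged sketch, assuming freqs() keeps its 'top' argument,
# to control how many categories are displayed:
dft %>% freqs(Ticket, top = 10, plot = T, results = F)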

# Let's customize the plot a bit:
dft %>%
  mutate(Survived = ifelse(Survived, "Did survive", "Did not survive")) %>%
  freqs(Pclass, Survived, plot = T,
        title = "People who survived the Titanic by Class",
        subtitle = paste("Bernardo Lares:", Sys.Date()),
        results = F)

corr_cross(dft)
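
# corr_cross() ranks all pairwise correlations at once. To inspect how one
# variable correlates with everything else, lares also offers corr_var();
# a minimal sketch, assuming it dummy-encodes the logical as Survived_TRUE:
dft %>% corr_var(Survived_TRUE, top = 10)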

Classification
# Drop identifier and mostly-empty columns before modeling
df <- subset(dft, select = -c(Ticket, PassengerId, Cabin))
# Binary classification on Survived, treating "TRUE" as the positive class;
# max_models = 1 keeps the AutoML run short
r <- h2o_automl(df, y = Survived, max_models = 1, impute = FALSE, target = "TRUE")
## # A tibble: 2 × 5
## tag n p order pcum
## <lgl> <int> <dbl> <int> <dbl>
## 1 FALSE 549 61.6 1 61.6
## 2 TRUE 342 38.4 2 100
## train_size test_size
## 623 268
## model_id auc logloss aucpr
## 1 GLM_1_AutoML_15_20250219_220529 0.8327892 0.4805857 0.8091657
## mean_per_class_error rmse mse
## 1 0.236994 0.3945223 0.1556479
## Model (1/1): GLM_1_AutoML_15_20250219_220529
## Independent Variable: Survived
## Type: Classification (2 classes)
## Algorithm: GLM
## Split: 70% training data (of 891 observations)
## Seed: 0
##
## Test metrics:
## " AUC = 0.89006"
## " ACC = 0.14925"
## " PRC = 0.097701"
## " TPR = 0.19318"
## " TNR = 0.12778"
##
## Most important variables:
## " Sex.female (20.9%)"
## " Sex.male (20.7%)"
## " Pclass.3 (16.7%)"
## " Pclass.1 (14.5%)"
## " Age (7.1%)"
r$plots$metrics
## $gains
## $response
## $conf_matrix
## $ROC
## (one plot is rendered for each of these four elements)

r$metrics
## $dictionary
## [1] "AUC: Area Under the Curve"
## [2] "ACC: Accuracy"
## [3] "PRC: Precision = Positive Predictive Value"
## [4] "TPR: Sensitivity = Recall = Hit rate = True Positive Rate"
## [5] "TNR: Specificity = Selectivity = True Negative Rate"
## [6] "Logloss (Error): Logarithmic loss [Neutral classification: 0.69315]"
## [7] "Gain: When best n deciles selected, what % of the real target observations are picked?"
## [8] "Lift: When best n deciles selected, how much better than random is?"
##
## $confusion_matrix
## Pred
## Real FALSE TRUE
## FALSE 23 157
## TRUE 71 17
##
## $gain_lift
## # A tibble: 10 × 10
## percentile value random target total gain optimal lift response score
## <fct> <chr> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 TRUE 10.1 27 27 30.7 30.7 205. 30.7 83.8
## 2 2 TRUE 20.1 23 27 56.8 61.4 182. 26.1 70.4
## 3 3 TRUE 30.2 16 27 75 92.0 148. 18.2 59.2
## 4 4 TRUE 39.9 7 26 83.0 100 108. 7.95 42.9
## 5 5 TRUE 50 4 27 87.5 100 75 4.55 30.2
## 6 6 TRUE 60.1 3 27 90.9 100 51.3 3.41 20.2
## 7 7 TRUE 69.8 2 26 93.2 100 33.5 2.27 16.1
## 8 8 TRUE 79.9 2 27 95.5 100 19.5 2.27 13.8
## 9 9 TRUE 89.9 3 27 98.9 100 9.94 3.41 11.5
## 10 10 TRUE 100 1 27 100 100 0 1.14 3.84
##
## $metrics
## AUC ACC PRC TPR TNR
## 1 0.89006 0.14925 0.097701 0.19318 0.12778
##
## $cv_metrics
## # A tibble: 22 × 8
## metric mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 accuracy 0.790 0.0589 0.744 0.784 0.792 0.742 0.887
## 2 auc 0.836 0.0305 0.818 0.806 0.825 0.851 0.882
## 3 err 0.210 0.0589 0.256 0.216 0.208 0.258 0.113
## 4 err_cou… 26.2 7.36 32 27 26 32 14
## 5 f0point5 0.748 0.0819 0.690 0.759 0.712 0.695 0.887
## 6 f1 0.752 0.0495 0.719 0.716 0.745 0.742 0.837
## 7 f2 0.760 0.0494 0.751 0.677 0.782 0.796 0.793
## 8 lift_to… 2.20 0.505 2.36 2.40 1.33 2.25 2.64
## 9 logloss 0.477 0.0464 0.509 0.520 0.474 0.482 0.401
## 10 max_per… 0.282 0.0574 0.278 0.346 0.218 0.333 0.234
## # ℹ 12 more rows
##
## $max_metrics
## metric threshold value idx
## 1 max f1 0.35665939 0.7262774 224
## 2 max f2 0.12098587 0.7957474 351
## 3 max f0point5 0.68872487 0.7793765 111
## 4 max accuracy 0.58866640 0.7913323 145
## 5 max precision 0.97613179 1.0000000 0
## 6 max recall 0.07555791 1.0000000 389
## 7 max specificity 0.97613179 1.0000000 0
## 8 max absolute_mcc 0.58866640 0.5636429 145
## 9 max min_per_class_accuracy 0.37918777 0.7559055 214
## 10 max mean_per_class_accuracy 0.58866640 0.7655667 145
## 11 max tns 0.97613179 369.0000000 0
## 12 max fns 0.97613179 253.0000000 0
## 13 max fps 0.02157964 369.0000000 399
## 14 max tps 0.07555791 254.0000000 389
## 15 max tnr 0.97613179 1.0000000 0
## 16 max fnr 0.97613179 0.9960630 0
## 17 max fpr 0.02157964 1.0000000 399
## 18 max tpr 0.07555791 1.0000000 389
head(r$importance)
## variable relative_importance scaled_importance importance
## 1 Sex.female 1.1970952 1.0000000 0.20920596
## 2 Sex.male 1.1867412 0.9913508 0.20739650
## 3 Pclass.3 0.9572464 0.7996410 0.16728967
## 4 Pclass.1 0.8288406 0.6923765 0.14484929
## 5 Age 0.4074980 0.3404057 0.07121490
## 6 Embarked.S 0.2552351 0.2132121 0.04460524
r$plots$importance
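
# The returned list also keeps the scored test set (r$scores_test in recent
# lares versions), so the test metrics can be recomputed by hand; a hedged
# sketch, assuming those element names:
model_metrics(tag = r$scores_test$tag, score = r$scores_test$score)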

r <- h2o_automl(df, Pclass, ignore = c("Fare", "Cabin"), max_time = 30, plots = FALSE)
## # A tibble: 3 × 5
## tag n p order pcum
## <fct> <int> <dbl> <int> <dbl>
## 1 n_3 491 55.1 1 55.1
## 2 n_1 216 24.2 2 79.4
## 3 n_2 184 20.6 3 100
## train_size test_size
## 623 268
## model_id mean_per_class_error logloss rmse
## 1 GLM_1_AutoML_16_20250219_220538 0.4742325 0.8170807 0.5409388
## 2 GBM_1_AutoML_16_20250219_220538 0.5056661 0.8593457 0.5595994
## 3 DRF_1_AutoML_16_20250219_220538 0.5057530 0.9395588 0.5459812
## mse
## 1 0.2926147
## 2 0.3131515
## 3 0.2980955
## Model (1/3): GLM_1_AutoML_16_20250219_220538
## Independent Variable: Pclass
## Type: Classification (3 classes)
## Algorithm: GLM
## Split: 70% training data (of 891 observations)
## Seed: 0
##
## Test metrics:
## " AUC = 0.76337"
## " ACC = 0.64179"
##
## Most important variables:
## " Embarked.Q (25.3%)"
## " Embarked.C (13.5%)"
## " Embarked.S (13.3%)"
## " Age (11.9%)"
## " Survived.FALSE (10.6%)"
# We set plots = FALSE, so no metric plots were stored this time
r$plots$metrics
## NULL
r$metrics
## $dictionary
## [1] "AUC: Area Under the Curve"
## [2] "ACC: Accuracy"
## [3] "PRC: Precision = Positive Predictive Value"
## [4] "TPR: Sensitivity = Recall = Hit rate = True Positive Rate"
## [5] "TNR: Specificity = Selectivity = True Negative Rate"
## [6] "Logloss (Error): Logarithmic loss [Neutral classification: 0.69315]"
## [7] "Gain: When best n deciles selected, what % of the real target observations are picked?"
## [8] "Lift: When best n deciles selected, how much better than random is?"
##
## $confusion_matrix
## # A tibble: 3 × 4
## `Real x Pred` n_3 n_1 n_2
## <fct> <int> <int> <int>
## 1 n_3 127 10 12
## 2 n_1 18 37 8
## 3 n_2 32 16 8
##
## $metrics
## AUC ACC
## 1 0.76337 0.64179
##
## $metrics_tags
## # A tibble: 3 × 9
## tag n p AUC order ACC PRC TPR TNR
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 n_3 149 55.6 0.810 1 0.731 0.718 0.852 0.580
## 2 n_1 63 23.5 0.829 2 0.806 0.587 0.587 0.873
## 3 n_2 56 20.9 0.651 3 0.746 0.286 0.143 0.906
##
## $cv_metrics
## # A tibble: 14 × 8
## metric mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 accur… 0.641 0.0541 0.56 0.696 0.616 0.677 0.653
## 2 auc NaN 0 NaN NaN NaN NaN NaN
## 3 err 0.359 0.0541 0.44 0.304 0.384 0.323 0.347
## 4 err_c… 44.8 6.83 55 38 48 40 43
## 5 loglo… 0.815 0.0807 0.911 0.703 0.839 0.767 0.853
## 6 max_p… 0.847 0.0639 0.829 0.947 0.783 0.810 0.867
## 7 mean_… 0.520 0.0243 0.479 0.529 0.516 0.536 0.538
## 8 mean_… 0.480 0.0243 0.521 0.471 0.484 0.464 0.462
## 9 mse 0.292 0.0332 0.337 0.249 0.298 0.274 0.304
## 10 null_… 251. 13.3 268. 249. 254. 231. 252.
## 11 pr_auc NaN 0 NaN NaN NaN NaN NaN
## 12 r2 0.575 0.0744 0.485 0.682 0.606 0.568 0.535
## 13 resid… 203. 20.2 228. 176. 210. 190. 212.
## 14 rmse 0.540 0.0307 0.581 0.499 0.546 0.523 0.551
##
## $hit_ratio
## k hit_ratio
## 1 1 0.6436597
## 2 2 0.8780096
## 3 3 1.0000000
head(r$importance)
## variable relative_importance scaled_importance importance
## 1 Embarked.Q 2.841431 1.0000000 0.2525001
## 2 Embarked.C 1.516046 0.5335503 0.1347215
## 3 Embarked.S 1.494055 0.5258107 0.1327673
## 4 Age 1.342508 0.4724759 0.1193002
## 5 Survived.FALSE 1.191114 0.4191951 0.1058468
## 6 Survived.TRUE 1.156447 0.4069947 0.1027662
r$plots$importance
## NULL
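
# With plots = FALSE the importance chart wasn't stored either, but the
# table is, so it can be drawn manually; a minimal sketch with lares'
# mplot_importance(), assuming it accepts variable and importance vectors:
mplot_importance(var = r$importance$variable, imp = r$importance$importance)
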
Correlations
data("starwars")
# Let's get rid of the lists inside the dataframe
df <- select(starwars, -starships, -vehicles, -films)
head(df)
## # A tibble: 6 × 11
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Luke Sky… 172 77 blond fair blue 19 male mascu…
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu…
## 3 R2-D2 96 32 <NA> white, bl… red 33 none mascu…
## 4 Darth Va… 202 136 none white yellow 41.9 male mascu…
## 5 Leia Org… 150 49 brown light brown 19 fema… femin…
## 6 Owen Lars 178 120 brown, gr… light blue 52 male mascu…
## # ℹ 2 more variables: homeworld <chr>, species <chr>
corr_cross(df)
## Returning only the top 25. You may override with the 'top' argument

dft <- select(dft, -Cabin, -Ticket)
corr_cross(dft, top = 15)
## Returning only the top 15. You may override with the 'top' argument

corr_cross(df, type = 2)

corr_cross(df, type = 2, contains = "species")
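
# As with the Titanic data, single-variable correlations can be pulled out
# too; a hedged sketch focusing on mass (note Jabba is a heavy outlier here):
df %>% corr_var(mass, top = 10)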
