Desc Stat

library(lares)
library(beepr)
library(dplyr)
### SOURCE: https://rpubs.com/laresbernardo/freqs-distr-corr
# The data we'll use is the Titanic dataset
data(dft)
dim(dft)
## [1] 891  11
head(dft)
##   PassengerId Survived Pclass    Sex Age SibSp Parch           Ticket    Fare
## 1           1    FALSE      3   male  22     1     0        A/5 21171  7.2500
## 2           2     TRUE      1 female  38     1     0         PC 17599 71.2833
## 3           3     TRUE      3 female  26     0     0 STON/O2. 3101282  7.9250
## 4           4     TRUE      1 female  35     1     0           113803 53.1000
## 5           5    FALSE      3   male  35     0     0           373450  8.0500
## 6           6    FALSE      3   male  NA     0     0           330877  8.4583
##   Cabin Embarked
## 1              S
## 2   C85        C
## 3              S
## 4  C123        S
## 5              S
## 6              Q
df_str(dft, return = "plot")

dft %>% freqs(Survived, plot = T, results = F)

dft %>% freqs(Survived, Pclass, plot = T, results = F)

# Per sex and class, how many survived?
dft %>% freqs(Sex, Pclass, Survived, plot = T, results = F)

dft %>% freqs(Ticket, plot = T, results = F)

# Let's customize the plot a bit....
dft %>% 
  mutate(Survived = ifelse(Survived == 1, "Did survive", "Did not survive")) %>%
  freqs(Pclass, Survived, plot = T, 
        title = "People who survived the Titanic by Class",
        subtitle = paste("Bernardo Lares:", Sys.Date()),
        results = F)

corr_cross(dft)

Classification

df <- subset(dft, select = -c(Ticket, PassengerId, Cabin))
r <- h2o_automl(df, y = Survived, max_models = 1, impute = FALSE, target = "TRUE")
## # A tibble: 2 × 5
##   tag       n     p order  pcum
##   <lgl> <int> <dbl> <int> <dbl>
## 1 FALSE   549  61.6     1  61.6
## 2 TRUE    342  38.4     2 100  
## train_size  test_size 
##        623        268 
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
##                          model_id       auc   logloss     aucpr
## 1 GLM_1_AutoML_15_20250219_220529 0.8327892 0.4805857 0.8091657
##   mean_per_class_error      rmse       mse
## 1             0.236994 0.3945223 0.1556479
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
## Model (1/1): GLM_1_AutoML_15_20250219_220529
## Independent Variable: Survived
## Type: Classification (2 classes)
## Algorithm: GLM
## Split: 70% training data (of 891 observations)
## Seed: 0
## 
## Test metrics:
## "   AUC = 0.89006"
## "   ACC = 0.14925"
## "   PRC = 0.097701"
## "   TPR = 0.19318"
## "   TNR = 0.12778"
## 
## Most important variables:
## "   Sex.female (20.9%)"
## "   Sex.male (20.7%)"
## "   Pclass.3 (16.7%)"
## "   Pclass.1 (14.5%)"
## "   Age (7.1%)"
r$plots$metrics
## $gains

## 
## $response

## 
## $conf_matrix

## 
## $ROC

r$metrics
## $dictionary
## [1] "AUC: Area Under the Curve"                                                             
## [2] "ACC: Accuracy"                                                                         
## [3] "PRC: Precision = Positive Predictive Value"                                            
## [4] "TPR: Sensitivity = Recall = Hit rate = True Positive Rate"                             
## [5] "TNR: Specificity = Selectivity = True Negative Rate"                                   
## [6] "Logloss (Error): Logarithmic loss [Neutral classification: 0.69315]"                   
## [7] "Gain: When best n deciles selected, what % of the real target observations are picked?"
## [8] "Lift: When best n deciles selected, how much better than random is?"                   
## 
## $confusion_matrix
##        Pred
## Real    FALSE TRUE
##   FALSE    23  157
##   TRUE     71   17
## 
## $gain_lift
## # A tibble: 10 × 10
##    percentile value random target total  gain optimal   lift response score
##    <fct>      <chr>  <dbl>  <int> <int> <dbl>   <dbl>  <dbl>    <dbl> <dbl>
##  1 1          TRUE    10.1     27    27  30.7    30.7 205.      30.7  83.8 
##  2 2          TRUE    20.1     23    27  56.8    61.4 182.      26.1  70.4 
##  3 3          TRUE    30.2     16    27  75      92.0 148.      18.2  59.2 
##  4 4          TRUE    39.9      7    26  83.0   100   108.       7.95 42.9 
##  5 5          TRUE    50        4    27  87.5   100    75        4.55 30.2 
##  6 6          TRUE    60.1      3    27  90.9   100    51.3      3.41 20.2 
##  7 7          TRUE    69.8      2    26  93.2   100    33.5      2.27 16.1 
##  8 8          TRUE    79.9      2    27  95.5   100    19.5      2.27 13.8 
##  9 9          TRUE    89.9      3    27  98.9   100     9.94     3.41 11.5 
## 10 10         TRUE   100        1    27 100     100     0        1.14  3.84
## 
## $metrics
##       AUC     ACC      PRC     TPR     TNR
## 1 0.89006 0.14925 0.097701 0.19318 0.12778
## 
## $cv_metrics
## # A tibble: 22 × 8
##    metric     mean     sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
##    <chr>     <dbl>  <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
##  1 accuracy  0.790 0.0589      0.744      0.784      0.792      0.742      0.887
##  2 auc       0.836 0.0305      0.818      0.806      0.825      0.851      0.882
##  3 err       0.210 0.0589      0.256      0.216      0.208      0.258      0.113
##  4 err_cou… 26.2   7.36       32         27         26         32         14    
##  5 f0point5  0.748 0.0819      0.690      0.759      0.712      0.695      0.887
##  6 f1        0.752 0.0495      0.719      0.716      0.745      0.742      0.837
##  7 f2        0.760 0.0494      0.751      0.677      0.782      0.796      0.793
##  8 lift_to…  2.20  0.505       2.36       2.40       1.33       2.25       2.64 
##  9 logloss   0.477 0.0464      0.509      0.520      0.474      0.482      0.401
## 10 max_per…  0.282 0.0574      0.278      0.346      0.218      0.333      0.234
## # ℹ 12 more rows
## 
## $max_metrics
##                         metric  threshold       value idx
## 1                       max f1 0.35665939   0.7262774 224
## 2                       max f2 0.12098587   0.7957474 351
## 3                 max f0point5 0.68872487   0.7793765 111
## 4                 max accuracy 0.58866640   0.7913323 145
## 5                max precision 0.97613179   1.0000000   0
## 6                   max recall 0.07555791   1.0000000 389
## 7              max specificity 0.97613179   1.0000000   0
## 8             max absolute_mcc 0.58866640   0.5636429 145
## 9   max min_per_class_accuracy 0.37918777   0.7559055 214
## 10 max mean_per_class_accuracy 0.58866640   0.7655667 145
## 11                     max tns 0.97613179 369.0000000   0
## 12                     max fns 0.97613179 253.0000000   0
## 13                     max fps 0.02157964 369.0000000 399
## 14                     max tps 0.07555791 254.0000000 389
## 15                     max tnr 0.97613179   1.0000000   0
## 16                     max fnr 0.97613179   0.9960630   0
## 17                     max fpr 0.02157964   1.0000000 399
## 18                     max tpr 0.07555791   1.0000000 389
head(r$importance)
##     variable relative_importance scaled_importance importance
## 1 Sex.female           1.1970952         1.0000000 0.20920596
## 2   Sex.male           1.1867412         0.9913508 0.20739650
## 3   Pclass.3           0.9572464         0.7996410 0.16728967
## 4   Pclass.1           0.8288406         0.6923765 0.14484929
## 5        Age           0.4074980         0.3404057 0.07121490
## 6 Embarked.S           0.2552351         0.2132121 0.04460524
r$plots$importance

r <- h2o_automl(df, Pclass, ignore = c("Fare", "Cabin"), max_time = 30, plots = FALSE)
## # A tibble: 3 × 5
##   tag       n     p order  pcum
##   <fct> <int> <dbl> <int> <dbl>
## 1 n_3     491  55.1     1  55.1
## 2 n_1     216  24.2     2  79.4
## 3 n_2     184  20.6     3 100  
## train_size  test_size 
##        623        268 
##                          model_id mean_per_class_error   logloss      rmse
## 1 GLM_1_AutoML_16_20250219_220538            0.4742325 0.8170807 0.5409388
## 2 GBM_1_AutoML_16_20250219_220538            0.5056661 0.8593457 0.5595994
## 3 DRF_1_AutoML_16_20250219_220538            0.5057530 0.9395588 0.5459812
##         mse
## 1 0.2926147
## 2 0.3131515
## 3 0.2980955
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
## Model (1/3): GLM_1_AutoML_16_20250219_220538
## Independent Variable: Pclass
## Type: Classification (3 classes)
## Algorithm: GLM
## Split: 70% training data (of 891 observations)
## Seed: 0
## 
## Test metrics:
## "   AUC = 0.76337"
## "   ACC = 0.64179"
## 
## Most important variables:
## "   Embarked.Q (25.3%)"
## "   Embarked.C (13.5%)"
## "   Embarked.S (13.3%)"
## "   Age (11.9%)"
## "   Survived.FALSE (10.6%)"
r$plots$metrics
## NULL
r$metrics
## $dictionary
## [1] "AUC: Area Under the Curve"                                                             
## [2] "ACC: Accuracy"                                                                         
## [3] "PRC: Precision = Positive Predictive Value"                                            
## [4] "TPR: Sensitivity = Recall = Hit rate = True Positive Rate"                             
## [5] "TNR: Specificity = Selectivity = True Negative Rate"                                   
## [6] "Logloss (Error): Logarithmic loss [Neutral classification: 0.69315]"                   
## [7] "Gain: When best n deciles selected, what % of the real target observations are picked?"
## [8] "Lift: When best n deciles selected, how much better than random is?"                   
## 
## $confusion_matrix
## # A tibble: 3 × 4
##   `Real x Pred`   n_3   n_1   n_2
##   <fct>         <int> <int> <int>
## 1 n_3             127    10    12
## 2 n_1              18    37     8
## 3 n_2              32    16     8
## 
## $metrics
##       AUC     ACC
## 1 0.76337 0.64179
## 
## $metrics_tags
## # A tibble: 3 × 9
##   tag       n     p   AUC order   ACC   PRC   TPR   TNR
##   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 n_3     149  55.6 0.810     1 0.731 0.718 0.852 0.580
## 2 n_1      63  23.5 0.829     2 0.806 0.587 0.587 0.873
## 3 n_2      56  20.9 0.651     3 0.746 0.286 0.143 0.906
## 
## $cv_metrics
## # A tibble: 14 × 8
##    metric    mean      sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
##    <chr>    <dbl>   <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
##  1 accur…   0.641  0.0541      0.56       0.696      0.616      0.677      0.653
##  2 auc    NaN      0         NaN        NaN        NaN        NaN        NaN    
##  3 err      0.359  0.0541      0.44       0.304      0.384      0.323      0.347
##  4 err_c…  44.8    6.83       55         38         48         40         43    
##  5 loglo…   0.815  0.0807      0.911      0.703      0.839      0.767      0.853
##  6 max_p…   0.847  0.0639      0.829      0.947      0.783      0.810      0.867
##  7 mean_…   0.520  0.0243      0.479      0.529      0.516      0.536      0.538
##  8 mean_…   0.480  0.0243      0.521      0.471      0.484      0.464      0.462
##  9 mse      0.292  0.0332      0.337      0.249      0.298      0.274      0.304
## 10 null_… 251.    13.3       268.       249.       254.       231.       252.   
## 11 pr_auc NaN      0         NaN        NaN        NaN        NaN        NaN    
## 12 r2       0.575  0.0744      0.485      0.682      0.606      0.568      0.535
## 13 resid… 203.    20.2       228.       176.       210.       190.       212.   
## 14 rmse     0.540  0.0307      0.581      0.499      0.546      0.523      0.551
## 
## $hit_ratio
##   k hit_ratio
## 1 1 0.6436597
## 2 2 0.8780096
## 3 3 1.0000000
head(r$importance)
##         variable relative_importance scaled_importance importance
## 1     Embarked.Q            2.841431         1.0000000  0.2525001
## 2     Embarked.C            1.516046         0.5335503  0.1347215
## 3     Embarked.S            1.494055         0.5258107  0.1327673
## 4            Age            1.342508         0.4724759  0.1193002
## 5 Survived.FALSE            1.191114         0.4191951  0.1058468
## 6  Survived.TRUE            1.156447         0.4069947  0.1027662
r$plots$importance
## NULL

Importance

data("starwars")
# Let's get rid of the lists inside the dataframe
df <- select(starwars, -starships, -vehicles, -films)
head(df)
## # A tibble: 6 × 11
##   name      height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
## 2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
## 3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
## 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
## 5 Leia Org…    150    49 brown      light      brown           19   fema… femin…
## 6 Owen Lars    178   120 brown, gr… light      blue            52   male  mascu…
## # ℹ 2 more variables: homeworld <chr>, species <chr>
corr_cross(df)
## Returning only the top 25. You may override with the 'top' argument

dft <- select(dft, -Cabin, -Ticket)
corr_cross(dft, top = 15)
## Returning only the top 15. You may override with the 'top' argument

corr_cross(df, type = 2)

corr_cross(df, type = 2, contains = "species")