——————————————————————————

Feature Engineering and Selection: A Practical Approach for Predictive Models

by Max Kuhn and Kjell Johnson

——————————————————————————

Code for Section 4.3 at

https://bookdown.org/max/FES/visualizations-for-categorical-data-exploring-the-okcupid-data.html

——————————————————————————

Code requires these packages:

library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.10     ✔ recipes      1.3.1 
## ✔ dials        1.4.2      ✔ rsample      1.3.1 
## ✔ dplyr        1.1.4      ✔ tailor       0.1.0 
## ✔ ggplot2      4.0.0      ✔ tidyr        1.3.1 
## ✔ infer        1.0.9      ✔ tune         2.0.0 
## ✔ modeldata    1.5.1      ✔ workflows    1.3.0 
## ✔ parsnip      1.3.3      ✔ workflowsets 1.1.1 
## ✔ purrr        1.1.0      ✔ yardstick    1.3.2
## Warning: package 'dials' was built under R version 4.4.3
## Warning: package 'scales' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'infer' was built under R version 4.4.3
## Warning: package 'modeldata' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'tailor' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'tune' was built under R version 4.4.3
## Warning: package 'workflows' was built under R version 4.4.3
## Warning: package 'workflowsets' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.2
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(scales)
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.4.3
library(vcd)
## Warning: package 'vcd' was built under R version 4.4.3
## Loading required package: grid
library(colorspace)
## Warning: package 'colorspace' was built under R version 4.4.2
l10_breaks <- scales::trans_breaks("log10", function(x) 10^x)
l10_labels <- scales::trans_format("log10", scales::math_format(10^.x))

theme_set(theme_bw())
load("C:/Users/Lenovo/Downloads/pRADYTHA/MACHINE LEARNING/TUGAS 1/okc.RData")
binom_stats <- function(x, ...) {
  x <- x$Class[!is.na(x$Class)]
  res <- prop.test(x = sum(x == "stem"), n = length(x), ...)
  data.frame(Proportion  = unname(res$estimate), 
             Lower = res$conf.int[1],
             Upper = res$conf.int[2])
}

stem_rate <- mean(okc_train$Class == "stem")

religion_rates <- 
  okc_train %>%
  group_by(religion) %>%
  do(binom_stats(.)) %>%
  arrange(Proportion) %>%
  ungroup() %>%
  mutate(religion = gsub("religion_", "", religion),
         religion = reorder(factor(religion), Proportion))

okc_train <- 
  okc_train %>% 
  mutate(
    religion2 = gsub("religion_", "", as.character(religion)),
    religion2 = factor(religion2, levels = as.character(religion_rates$religion))
  )

bars <- 
  ggplot(okc_train, aes(x = religion2, fill = Class)) +
  geom_bar(position = position_dodge()) + scale_fill_brewer(palette = "Paired") +
  xlab("") +
  theme(legend.position = "top", axis.text = element_text(size = 8)) +
  ggtitle("(a)")

stacked_vars <-
  ggplot(okc_train, aes(x = religion2, fill = Class)) + geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Paired") +
  xlab("") + ylab("Proportion") +
  theme(legend.position = "none", axis.text = element_text(size = 8)) +
  ggtitle("(b)")

ci_plots <- 
  ggplot(religion_rates, aes(x = religion, y = Proportion)) +
  geom_hline(yintercept = stem_rate, col = "red", alpha = .35, lty = 2) + 
  geom_point() +
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = .1) +
  theme(axis.text = element_text(size = 8)) +
  xlab("") +
  ggtitle("(c)")

# https://bookdown.org/max/FES/visualizations-for-categorical-data-exploring-the-okcupid-data.html#fig:eda-religion
# grid.arrange(bars, stacked_vars, ci_plots, ncol = 1, heights= c(4, 3, 3))
gam_dat <- 
  okc_train %>% 
  dplyr::select(essay_length, Class) %>% 
  arrange(essay_length)

gam_small <- 
  gam_dat %>%
  distinct(essay_length) 

gam_mod <- mgcv::gam(Class ~ s(essay_length), data = gam_dat, family = binomial())

gam_small <- gam_small %>%
  mutate(
    link = -predict(gam_mod, gam_small, type = "link"),
    se = predict(gam_mod, gam_small, type = "link", se.fit = TRUE)$se.fit,
    upper = link + qnorm(.975) * se,
    lower = link - qnorm(.975) * se,
    lower = binomial()$linkinv(lower),
    upper = binomial()$linkinv(upper),
    probability = binomial()$linkinv(link)
  )

brks <- l10_breaks(exp(okc_train$essay_length))

essay_hist <- 
  ggplot(okc_train, aes(x = exp(essay_length))) + 
  geom_histogram(binwidth = .1, col = "#FEB24C", fill = "#FED976") + 
  facet_wrap(~ Class, ncol = 1) + 
  scale_x_log10(breaks = brks, labels = l10_labels) +
  xlab("Essay Character Length") + 
  theme_bw() +
  theme(plot.margin = unit(c(0,1,0,1.2), "cm")) + 
  ggtitle("(a)")

essay_gam <- 
  ggplot(gam_small, aes(x = exp(essay_length))) + 
  geom_line(aes(y = probability)) + 
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = "grey", alpha = .5) + 
  geom_hline(yintercept = stem_rate, col = "red", alpha = .35, lty = 2)  + 
  scale_x_log10(breaks = brks, labels = l10_labels) +
  theme_bw() + 
  xlab("") +
  theme(plot.margin = unit(c(0,1,0,1.2), "cm"))+ 
  ggtitle("(b)")

# https://bookdown.org/max/FES/visualizations-for-categorical-data-exploring-the-okcupid-data.html#fig:eda-essay-length
# grid.arrange(essay_hist, essay_gam, ncol = 1, heights= c(2, 1.25))
okc_train <- 
  okc_train %>% 
  mutate(
    drugs = factor(as.character(drugs),
                   levels = c("drugs_missing", "never", "sometimes", "often")),
    drinks = factor(as.character(drinks),
                    levels = c("drinks_missing", "not_at_all", "rarely", 
                               "socially", "often", "very_often", "desperately"))
  )

dd_tab <- table(okc_train$drugs, okc_train$drinks, dnn = c("Drugs", "Alcohol"))


# Formatting for slightly better printing
plot_tab <- dd_tab
dimnames(plot_tab)[[1]][1] <- "missing"
dimnames(plot_tab)[[2]] <- gsub("_", " ", dimnames(plot_tab)[[2]])
dimnames(plot_tab)[[2]][1] <- "missing"
dimnames(plot_tab)[[2]][6] <- "often\n"
dimnames(plot_tab)[[2]][6] <- "very often"
dimnames(plot_tab)[[2]][7] <- "\ndesperately"

# https://bookdown.org/max/FES/visualizations-for-categorical-data-exploring-the-okcupid-data.html#fig:eda-mosaic
mosaic(
  t(plot_tab),
  highlighting = TRUE,
  highlighting_fill = rainbow_hcl,
  margins = unit(c(6, 1, 1, 8), "lines"),
  labeling = labeling_border(
    rot_labels = c(90, 0, 0, 0),
    just_labels = c("left", "right",
                    "center",  "right"),
    offset_varnames = unit(c(3, 1, 1, 4), "lines")
  ),
  keep_aspect_ratio = FALSE
)

ca_obj <- CA(dd_tab, graph = FALSE)

ca_drugs <- as.data.frame(ca_obj$row$coord)
ca_drugs$label <- gsub("_", " ", rownames(ca_drugs))
ca_drugs$Variable <- "Drugs"

ca_drinks <- as.data.frame(ca_obj$col$coord)
ca_drinks$label <- gsub("_", " ", rownames(ca_drinks))
ca_drinks$Variable <- "Alcohol"

ca_rng <- extendrange(c(ca_drinks$`Dim 1`, ca_drinks$`Dim 2`))
ca_x <- paste0("Dimension #1 (",
               round(ca_obj$eig["dim 1", "percentage of variance"], 0),
               "%)")
ca_y <- paste0("Dimension #2 (",
               round(ca_obj$eig["dim 2", "percentage of variance"], 0),
               "%)")

ca_coord <- rbind(ca_drugs, ca_drinks)

# https://bookdown.org/max/FES/visualizations-for-categorical-data-exploring-the-okcupid-data.html#fig:eda-ca
ca_plot <-
  ggplot(ca_coord, aes(x = `Dim 1`, y = `Dim 2`, col = Variable)) + 
  geom_vline(xintercept = 0) + 
  geom_hline(yintercept = 0) + 
  geom_text(aes(label = label)) + 
  xlim(ca_rng) + ylim(ca_rng) + 
  xlab(ca_x) + ylab(ca_y) + 
  coord_equal()
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_Indonesia.utf8  LC_CTYPE=English_Indonesia.utf8   
## [3] LC_MONETARY=English_Indonesia.utf8 LC_NUMERIC=C                      
## [5] LC_TIME=English_Indonesia.utf8    
## 
## time zone: Asia/Jakarta
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] colorspace_2.1-1   vcd_1.4-13         FactoMineR_2.12    mgcv_1.9-1        
##  [5] nlme_3.1-164       gridExtra_2.3      yardstick_1.3.2    workflowsets_1.1.1
##  [9] workflows_1.3.0    tune_2.0.0         tidyr_1.3.1        tailor_0.1.0      
## [13] rsample_1.3.1      recipes_1.3.1      purrr_1.1.0        parsnip_1.3.3     
## [17] modeldata_1.5.1    infer_1.0.9        ggplot2_4.0.0      dplyr_1.1.4       
## [21] dials_1.4.2        scales_1.4.0       broom_1.0.10       tidymodels_1.4.1  
## 
## loaded via a namespace (and not attached):
##  [1] rlang_1.1.6          magrittr_2.0.3       furrr_0.3.1         
##  [4] compiler_4.4.1       vctrs_0.6.5          lhs_1.2.0           
##  [7] pkgconfig_2.0.3      fastmap_1.2.0        backports_1.5.0     
## [10] rmarkdown_2.29       prodlim_2025.04.28   xfun_0.49           
## [13] cachem_1.1.0         jsonlite_1.8.9       flashClust_1.01-2   
## [16] parallel_4.4.1       cluster_2.1.6        R6_2.5.1            
## [19] bslib_0.8.0          RColorBrewer_1.1-3   parallelly_1.45.1   
## [22] rpart_4.1.23         lmtest_0.9-40        lubridate_1.9.4     
## [25] jquerylib_0.1.4      estimability_1.5.1   Rcpp_1.0.13-1       
## [28] knitr_1.49           future.apply_1.20.0  zoo_1.8-12          
## [31] Matrix_1.7-0         splines_4.4.1        nnet_7.3-19         
## [34] timechange_0.3.0     tidyselect_1.2.1     rstudioapi_0.17.1   
## [37] yaml_2.3.10          timeDate_4041.110    codetools_0.2-20    
## [40] listenv_0.9.1        lattice_0.22-6       tibble_3.3.0        
## [43] withr_3.0.2          S7_0.2.0             evaluate_1.0.1      
## [46] future_1.67.0        survival_3.6-4       pillar_1.11.0       
## [49] DT_0.34.0            generics_0.1.3       globals_0.18.0      
## [52] xtable_1.8-4         leaps_3.2            class_7.3-22        
## [55] glue_1.8.0           emmeans_1.11.2-8     scatterplot3d_0.3-44
## [58] tools_4.4.1          data.table_1.16.2    gower_1.0.2         
## [61] mvtnorm_1.3-3        ipred_0.9-15         cli_3.6.5           
## [64] DiceDesign_1.10      lava_1.8.1           gtable_0.3.6        
## [67] GPfit_1.0-9          sass_0.4.9           digest_0.6.37       
## [70] ggrepel_0.9.6        htmlwidgets_1.6.4    farver_2.1.2        
## [73] htmltools_0.5.8.1    lifecycle_1.0.4      hardhat_1.4.2       
## [76] multcompView_0.1-10  MASS_7.3-60.2