Tugas 5.4 Supervised Encoding Methods

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.4.3

## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──

## ✔ broom        1.0.10     ✔ recipes      1.3.1 
## ✔ dials        1.4.2      ✔ rsample      1.3.1 
## ✔ dplyr        1.1.4      ✔ tailor       0.1.0 
## ✔ ggplot2      4.0.0      ✔ tidyr        1.3.1 
## ✔ infer        1.0.9      ✔ tune         2.0.0 
## ✔ modeldata    1.5.1      ✔ workflows    1.3.0 
## ✔ parsnip      1.3.3      ✔ workflowsets 1.1.1 
## ✔ purrr        1.1.0      ✔ yardstick    1.3.2

## Warning: package 'dials' was built under R version 4.4.3

## Warning: package 'scales' was built under R version 4.4.3

## Warning: package 'dplyr' was built under R version 4.4.2

## Warning: package 'ggplot2' was built under R version 4.4.3

## Warning: package 'infer' was built under R version 4.4.3

## Warning: package 'modeldata' was built under R version 4.4.3

## Warning: package 'parsnip' was built under R version 4.4.3

## Warning: package 'purrr' was built under R version 4.4.3

## Warning: package 'recipes' was built under R version 4.4.3

## Warning: package 'rsample' was built under R version 4.4.3

## Warning: package 'tailor' was built under R version 4.4.3

## Warning: package 'tidyr' was built under R version 4.4.2

## Warning: package 'tune' was built under R version 4.4.3

## Warning: package 'workflows' was built under R version 4.4.3

## Warning: package 'workflowsets' was built under R version 4.4.3

## Warning: package 'yardstick' was built under R version 4.4.3

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()

library(embed)

## Warning: package 'embed' was built under R version 4.4.3

library(plotly)

## Warning: package 'plotly' was built under R version 4.4.3

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 4.4.2

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

options(width = 120)

load("C:/Users/Lenovo/Downloads/pRADYTHA/MACHINE LEARNING/TUGAS 1/okc.RData")
load("C:/Users/Lenovo/Downloads/pRADYTHA/MACHINE LEARNING/TUGAS 1/okc_binary.RData")

partial_rec <- 
  recipe(Class ~ ., data = okc_train) %>%
  step_lencode_glm(
    where_town,
    outcome = vars(Class)
  ) %>%
  prep()
# Get raw rates and log-odds
okc_props <- 
  okc_train %>%
  group_by(where_town) %>%
  summarise(
    rate = mean(Class == "stem"),
    raw  = log(rate/(1-rate)),
    n = length(Class)
  ) %>%
  mutate(where_town = as.character(where_town))

okc_props

## # A tibble: 51 × 4
##    where_town          rate   raw     n
##    <chr>              <dbl> <dbl> <int>
##  1 alameda           0.157  -1.68   616
##  2 albany            0.192  -1.44   146
##  3 belmont           0.234  -1.19   167
##  4 belvedere_tiburon 0.0857 -2.37    35
##  5 benicia           0.107  -2.13   122
##  6 berkeley          0.163  -1.64  2676
##  7 burlingame        0.181  -1.51   248
##  8 castro_valley     0.124  -1.95   225
##  9 corte_madera      0.111  -2.08    54
## 10 daly_city         0.134  -1.86   417
## # ℹ 41 more rows

library(tidymodels)
library(embed)

# Gabungkan data
keywords <- names(okc_train_binary)[-1]

okc_embed <- okc_train %>% 
  dplyr::select(Class, where_town, profile) %>%
  full_join(okc_train_binary, by = "profile")

# Pastikan keywords numeric
okc_embed[, keywords] <- apply(okc_embed[, keywords], 2, as.numeric)

# Pakai GLM encoding
set.seed(355)
nnet_rec <- 
  recipe(Class ~ ., data = okc_embed) %>% 
  step_lencode_glm(where_town, outcome = vars(Class)) %>%
  prep()

# Transform data sesuai recipe
okc_embed_ready <- bake(nnet_rec, new_data = okc_embed)

partial_pooled <- 
  tidy(partial_rec, number = 1) %>%
  dplyr::select(-terms, -id) %>%
  setNames(c("where_town", "partial"))

glm_enc <- 
  tidy(nnet_rec, number = 1) %>%
  dplyr::select(-terms, -id) %>%
  setNames(c("where_town", "glm_encoding"))

all_est <- 
  partial_pooled %>%
  full_join(okc_props, by = "where_town") %>%
  inner_join(glm_enc, by = "where_town") %>%
  dplyr::select(where_town, rate, n, raw, partial, glm_encoding)

odds_rng <- extendrange(c(all_est$raw, all_est$partial, all_est$glm_encoding), f = 0.01)

odds_1 <- 
  ggplot(all_est) +
  aes(x = raw, y = partial, size = log10(n)) + 
  scale_size(range = c(.1, 6)) +
  geom_abline(alpha = .4, lty = 2)  +
  xlim(odds_rng) +
  ylim(odds_rng) +
  xlab("Raw Log-Odds") +
  ylab("Shrunken Log-Odds") + 
  geom_point(aes(text = gsub("_", " ", where_town)), alpha = .4)

## Warning in geom_point(aes(text = gsub("_", " ", where_town)), alpha = 0.4): Ignoring unknown aesthetics: text

odds_2 <- 
  ggplot(all_est) +
  aes(x = .5*(raw + partial), y = raw - partial, size = log10(n)) + 
  scale_size(range= c(.1, 6)) + 
  geom_hline(alpha = .4, lty = 2, yintercept = 0) + 
  xlab("Average Estimate") +
  ylab("Raw - Shrunken") + 
  geom_point(aes(text = gsub("_", " ", where_town)), alpha = .4)

## Warning in geom_point(aes(text = gsub("_", " ", where_town)), alpha = 0.4): Ignoring unknown aesthetics: text

odds_1 <- ggplotly(odds_1, tooltip = "text")
odds_2 <- ggplotly(odds_2, tooltip = "text") 

plotly::subplot(odds_1, odds_2, nrows = 1, margin = .05, titleX = TRUE, titleY = TRUE)

glm_plot <- 
  glm_enc %>%
  full_join(okc_props, by = "where_town") %>%
  mutate(location = gsub("_", " ", where_town)) %>%
  ggplot(aes(x = glm_encoding, y = raw)) + 
  geom_point(aes(size = log10(n), text = location), alpha = .4) +
  xlab("GLM Encoding") +
  ylab("Raw Odds-Ratio") +
  scale_size(range = c(.1, 6)) +
  theme(legend.position = "top")

## Warning in geom_point(aes(size = log10(n), text = location), alpha = 0.4): Ignoring unknown aesthetics: text

ggplotly(glm_plot, tooltip = "text")

Tugas 5.4 Supervised Encoding Methods

pradytha galuh

2025-09-14

Including Plots