summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE
parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.10 ✔ recipes 1.3.1
## ✔ dials 1.4.2 ✔ rsample 1.3.1
## ✔ dplyr 1.1.4 ✔ tailor 0.1.0
## ✔ ggplot2 4.0.0 ✔ tidyr 1.3.1
## ✔ infer 1.0.9 ✔ tune 2.0.0
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.3.3 ✔ workflowsets 1.1.1
## ✔ purrr 1.1.0 ✔ yardstick 1.3.2
## Warning: package 'dials' was built under R version 4.4.3
## Warning: package 'scales' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'infer' was built under R version 4.4.3
## Warning: package 'modeldata' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'tailor' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'tune' was built under R version 4.4.3
## Warning: package 'workflows' was built under R version 4.4.3
## Warning: package 'workflowsets' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(embed)
## Warning: package 'embed' was built under R version 4.4.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
options(width = 120)
load("C:/Users/Lenovo/Downloads/pRADYTHA/MACHINE LEARNING/TUGAS 1/okc.RData")
load("C:/Users/Lenovo/Downloads/pRADYTHA/MACHINE LEARNING/TUGAS 1/okc_binary.RData")
partial_rec <-
recipe(Class ~ ., data = okc_train) %>%
step_lencode_glm(
where_town,
outcome = vars(Class)
) %>%
prep()
# Get raw rates and log-odds
okc_props <-
okc_train %>%
group_by(where_town) %>%
summarise(
rate = mean(Class == "stem"),
raw = log(rate/(1-rate)),
n = length(Class)
) %>%
mutate(where_town = as.character(where_town))
okc_props
## # A tibble: 51 × 4
## where_town rate raw n
## <chr> <dbl> <dbl> <int>
## 1 alameda 0.157 -1.68 616
## 2 albany 0.192 -1.44 146
## 3 belmont 0.234 -1.19 167
## 4 belvedere_tiburon 0.0857 -2.37 35
## 5 benicia 0.107 -2.13 122
## 6 berkeley 0.163 -1.64 2676
## 7 burlingame 0.181 -1.51 248
## 8 castro_valley 0.124 -1.95 225
## 9 corte_madera 0.111 -2.08 54
## 10 daly_city 0.134 -1.86 417
## # ℹ 41 more rows
library(tidymodels)
library(embed)
# Gabungkan data
keywords <- names(okc_train_binary)[-1]
okc_embed <- okc_train %>%
dplyr::select(Class, where_town, profile) %>%
full_join(okc_train_binary, by = "profile")
# Pastikan keywords numeric
okc_embed[, keywords] <- apply(okc_embed[, keywords], 2, as.numeric)
# Pakai GLM encoding
set.seed(355)
nnet_rec <-
recipe(Class ~ ., data = okc_embed) %>%
step_lencode_glm(where_town, outcome = vars(Class)) %>%
prep()
# Transform data sesuai recipe
okc_embed_ready <- bake(nnet_rec, new_data = okc_embed)
partial_pooled <-
tidy(partial_rec, number = 1) %>%
dplyr::select(-terms, -id) %>%
setNames(c("where_town", "partial"))
glm_enc <-
tidy(nnet_rec, number = 1) %>%
dplyr::select(-terms, -id) %>%
setNames(c("where_town", "glm_encoding"))
all_est <-
partial_pooled %>%
full_join(okc_props, by = "where_town") %>%
inner_join(glm_enc, by = "where_town") %>%
dplyr::select(where_town, rate, n, raw, partial, glm_encoding)
odds_rng <- extendrange(c(all_est$raw, all_est$partial, all_est$glm_encoding), f = 0.01)
odds_1 <-
ggplot(all_est) +
aes(x = raw, y = partial, size = log10(n)) +
scale_size(range = c(.1, 6)) +
geom_abline(alpha = .4, lty = 2) +
xlim(odds_rng) +
ylim(odds_rng) +
xlab("Raw Log-Odds") +
ylab("Shrunken Log-Odds") +
geom_point(aes(text = gsub("_", " ", where_town)), alpha = .4)
## Warning in geom_point(aes(text = gsub("_", " ", where_town)), alpha = 0.4): Ignoring unknown aesthetics: text
odds_2 <-
ggplot(all_est) +
aes(x = .5*(raw + partial), y = raw - partial, size = log10(n)) +
scale_size(range= c(.1, 6)) +
geom_hline(alpha = .4, lty = 2, yintercept = 0) +
xlab("Average Estimate") +
ylab("Raw - Shrunken") +
geom_point(aes(text = gsub("_", " ", where_town)), alpha = .4)
## Warning in geom_point(aes(text = gsub("_", " ", where_town)), alpha = 0.4): Ignoring unknown aesthetics: text
odds_1 <- ggplotly(odds_1, tooltip = "text")
odds_2 <- ggplotly(odds_2, tooltip = "text")
plotly::subplot(odds_1, odds_2, nrows = 1, margin = .05, titleX = TRUE, titleY = TRUE)
glm_plot <-
glm_enc %>%
full_join(okc_props, by = "where_town") %>%
mutate(location = gsub("_", " ", where_town)) %>%
ggplot(aes(x = glm_encoding, y = raw)) +
geom_point(aes(size = log10(n), text = location), alpha = .4) +
xlab("GLM Encoding") +
ylab("Raw Odds-Ratio") +
scale_size(range = c(.1, 6)) +
theme(legend.position = "top")
## Warning in geom_point(aes(size = log10(n), text = location), alpha = 0.4): Ignoring unknown aesthetics: text
ggplotly(glm_plot, tooltip = "text")