Code requires these packages:
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.10 ✔ recipes 1.3.1
## ✔ dials 1.4.2 ✔ rsample 1.3.1
## ✔ dplyr 1.1.4 ✔ tailor 0.1.0
## ✔ ggplot2 4.0.0 ✔ tidyr 1.3.1
## ✔ infer 1.0.9 ✔ tune 2.0.0
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.3.3 ✔ workflowsets 1.1.1
## ✔ purrr 1.1.0 ✔ yardstick 1.3.2
## Warning: package 'dials' was built under R version 4.4.3
## Warning: package 'scales' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'infer' was built under R version 4.4.3
## Warning: package 'modeldata' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'tailor' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'tune' was built under R version 4.4.3
## Warning: package 'workflows' was built under R version 4.4.3
## Warning: package 'workflowsets' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(scales)
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.4.3
library(vcd)
## Warning: package 'vcd' was built under R version 4.4.3
## Loading required package: grid
library(colorspace)
## Warning: package 'colorspace' was built under R version 4.4.2
l10_breaks <- scales::trans_breaks("log10", function(x) 10^x)
l10_labels <- scales::trans_format("log10", scales::math_format(10^.x))
theme_set(theme_bw())
load("C:/Users/Lenovo/Downloads/pRADYTHA/MACHINE LEARNING/TUGAS 1/okc.RData")
binom_stats <- function(x, ...) {
x <- x$Class[!is.na(x$Class)]
res <- prop.test(x = sum(x == "stem"), n = length(x), ...)
data.frame(Proportion = unname(res$estimate),
Lower = res$conf.int[1],
Upper = res$conf.int[2])
}
stem_rate <- mean(okc_train$Class == "stem")
religion_rates <-
okc_train %>%
group_by(religion) %>%
do(binom_stats(.)) %>%
arrange(Proportion) %>%
ungroup() %>%
mutate(religion = gsub("religion_", "", religion),
religion = reorder(factor(religion), Proportion))
okc_train <-
okc_train %>%
mutate(
religion2 = gsub("religion_", "", as.character(religion)),
religion2 = factor(religion2, levels = as.character(religion_rates$religion))
)
bars <-
ggplot(okc_train, aes(x = religion2, fill = Class)) +
geom_bar(position = position_dodge()) + scale_fill_brewer(palette = "Paired") +
xlab("") +
theme(legend.position = "top", axis.text = element_text(size = 8)) +
ggtitle("(a)")
stacked_vars <-
ggplot(okc_train, aes(x = religion2, fill = Class)) + geom_bar(position = "fill") +
scale_fill_brewer(palette = "Paired") +
xlab("") + ylab("Proportion") +
theme(legend.position = "none", axis.text = element_text(size = 8)) +
ggtitle("(b)")
ci_plots <-
ggplot(religion_rates, aes(x = religion, y = Proportion)) +
geom_hline(yintercept = stem_rate, col = "red", alpha = .35, lty = 2) +
geom_point() +
geom_errorbar(aes(ymin = Lower, ymax = Upper), width = .1) +
theme(axis.text = element_text(size = 8)) +
xlab("") +
ggtitle("(c)")
# https://bookdown.org/max/FES/visualizations-for-categorical-data-exploring-the-okcupid-data.html#fig:eda-religion
# grid.arrange(bars, stacked_vars, ci_plots, ncol = 1, heights= c(4, 3, 3))
gam_dat <-
okc_train %>%
dplyr::select(essay_length, Class) %>%
arrange(essay_length)
gam_small <-
gam_dat %>%
distinct(essay_length)
gam_mod <- mgcv::gam(Class ~ s(essay_length), data = gam_dat, family = binomial())
gam_small <- gam_small %>%
mutate(
link = -predict(gam_mod, gam_small, type = "link"),
se = predict(gam_mod, gam_small, type = "link", se.fit = TRUE)$se.fit,
upper = link + qnorm(.975) * se,
lower = link - qnorm(.975) * se,
lower = binomial()$linkinv(lower),
upper = binomial()$linkinv(upper),
probability = binomial()$linkinv(link)
)
brks <- l10_breaks(exp(okc_train$essay_length))
essay_hist <-
ggplot(okc_train, aes(x = exp(essay_length))) +
geom_histogram(binwidth = .1, col = "#FEB24C", fill = "#FED976") +
facet_wrap(~ Class, ncol = 1) +
scale_x_log10(breaks = brks, labels = l10_labels) +
xlab("Essay Character Length") +
theme_bw() +
theme(plot.margin = unit(c(0,1,0,1.2), "cm")) +
ggtitle("(a)")
essay_gam <-
ggplot(gam_small, aes(x = exp(essay_length))) +
geom_line(aes(y = probability)) +
geom_ribbon(aes(ymin = lower, ymax = upper), fill = "grey", alpha = .5) +
geom_hline(yintercept = stem_rate, col = "red", alpha = .35, lty = 2) +
scale_x_log10(breaks = brks, labels = l10_labels) +
theme_bw() +
xlab("") +
theme(plot.margin = unit(c(0,1,0,1.2), "cm"))+
ggtitle("(b)")
# https://bookdown.org/max/FES/visualizations-for-categorical-data-exploring-the-okcupid-data.html#fig:eda-essay-length
# grid.arrange(essay_hist, essay_gam, ncol = 1, heights= c(2, 1.25))
okc_train <-
okc_train %>%
mutate(
drugs = factor(as.character(drugs),
levels = c("drugs_missing", "never", "sometimes", "often")),
drinks = factor(as.character(drinks),
levels = c("drinks_missing", "not_at_all", "rarely",
"socially", "often", "very_often", "desperately"))
)
dd_tab <- table(okc_train$drugs, okc_train$drinks, dnn = c("Drugs", "Alcohol"))
# Formatting for slightly better printing
plot_tab <- dd_tab
dimnames(plot_tab)[[1]][1] <- "missing"
dimnames(plot_tab)[[2]] <- gsub("_", " ", dimnames(plot_tab)[[2]])
dimnames(plot_tab)[[2]][1] <- "missing"
dimnames(plot_tab)[[2]][6] <- "often\n"
dimnames(plot_tab)[[2]][6] <- "very often"
dimnames(plot_tab)[[2]][7] <- "\ndesperately"
# https://bookdown.org/max/FES/visualizations-for-categorical-data-exploring-the-okcupid-data.html#fig:eda-mosaic
mosaic(
t(plot_tab),
highlighting = TRUE,
highlighting_fill = rainbow_hcl,
margins = unit(c(6, 1, 1, 8), "lines"),
labeling = labeling_border(
rot_labels = c(90, 0, 0, 0),
just_labels = c("left", "right",
"center", "right"),
offset_varnames = unit(c(3, 1, 1, 4), "lines")
),
keep_aspect_ratio = FALSE
)

ca_obj <- CA(dd_tab, graph = FALSE)
ca_drugs <- as.data.frame(ca_obj$row$coord)
ca_drugs$label <- gsub("_", " ", rownames(ca_drugs))
ca_drugs$Variable <- "Drugs"
ca_drinks <- as.data.frame(ca_obj$col$coord)
ca_drinks$label <- gsub("_", " ", rownames(ca_drinks))
ca_drinks$Variable <- "Alcohol"
ca_rng <- extendrange(c(ca_drinks$`Dim 1`, ca_drinks$`Dim 2`))
ca_x <- paste0("Dimension #1 (",
round(ca_obj$eig["dim 1", "percentage of variance"], 0),
"%)")
ca_y <- paste0("Dimension #2 (",
round(ca_obj$eig["dim 2", "percentage of variance"], 0),
"%)")
ca_coord <- rbind(ca_drugs, ca_drinks)
# https://bookdown.org/max/FES/visualizations-for-categorical-data-exploring-the-okcupid-data.html#fig:eda-ca
ca_plot <-
ggplot(ca_coord, aes(x = `Dim 1`, y = `Dim 2`, col = Variable)) +
geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +
geom_text(aes(label = label)) +
xlim(ca_rng) + ylim(ca_rng) +
xlab(ca_x) + ylab(ca_y) +
coord_equal()
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_Indonesia.utf8 LC_CTYPE=English_Indonesia.utf8
## [3] LC_MONETARY=English_Indonesia.utf8 LC_NUMERIC=C
## [5] LC_TIME=English_Indonesia.utf8
##
## time zone: Asia/Jakarta
## tzcode source: internal
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] colorspace_2.1-1 vcd_1.4-13 FactoMineR_2.12 mgcv_1.9-1
## [5] nlme_3.1-164 gridExtra_2.3 yardstick_1.3.2 workflowsets_1.1.1
## [9] workflows_1.3.0 tune_2.0.0 tidyr_1.3.1 tailor_0.1.0
## [13] rsample_1.3.1 recipes_1.3.1 purrr_1.1.0 parsnip_1.3.3
## [17] modeldata_1.5.1 infer_1.0.9 ggplot2_4.0.0 dplyr_1.1.4
## [21] dials_1.4.2 scales_1.4.0 broom_1.0.10 tidymodels_1.4.1
##
## loaded via a namespace (and not attached):
## [1] rlang_1.1.6 magrittr_2.0.3 furrr_0.3.1
## [4] compiler_4.4.1 vctrs_0.6.5 lhs_1.2.0
## [7] pkgconfig_2.0.3 fastmap_1.2.0 backports_1.5.0
## [10] rmarkdown_2.29 prodlim_2025.04.28 xfun_0.49
## [13] cachem_1.1.0 jsonlite_1.8.9 flashClust_1.01-2
## [16] parallel_4.4.1 cluster_2.1.6 R6_2.5.1
## [19] bslib_0.8.0 RColorBrewer_1.1-3 parallelly_1.45.1
## [22] rpart_4.1.23 lmtest_0.9-40 lubridate_1.9.4
## [25] jquerylib_0.1.4 estimability_1.5.1 Rcpp_1.0.13-1
## [28] knitr_1.49 future.apply_1.20.0 zoo_1.8-12
## [31] Matrix_1.7-0 splines_4.4.1 nnet_7.3-19
## [34] timechange_0.3.0 tidyselect_1.2.1 rstudioapi_0.17.1
## [37] yaml_2.3.10 timeDate_4041.110 codetools_0.2-20
## [40] listenv_0.9.1 lattice_0.22-6 tibble_3.3.0
## [43] withr_3.0.2 S7_0.2.0 evaluate_1.0.1
## [46] future_1.67.0 survival_3.6-4 pillar_1.11.0
## [49] DT_0.34.0 generics_0.1.3 globals_0.18.0
## [52] xtable_1.8-4 leaps_3.2 class_7.3-22
## [55] glue_1.8.0 emmeans_1.11.2-8 scatterplot3d_0.3-44
## [58] tools_4.4.1 data.table_1.16.2 gower_1.0.2
## [61] mvtnorm_1.3-3 ipred_0.9-15 cli_3.6.5
## [64] DiceDesign_1.10 lava_1.8.1 gtable_0.3.6
## [67] GPfit_1.0-9 sass_0.4.9 digest_0.6.37
## [70] ggrepel_0.9.6 htmlwidgets_1.6.4 farver_2.1.2
## [73] htmltools_0.5.8.1 lifecycle_1.0.4 hardhat_1.4.2
## [76] multcompView_0.1-10 MASS_7.3-60.2