Postingan ini diadaptasi dari postingan Alex Hayes yang tersedia di sini[1]. Di dalamnya dia meninjau cara seseorang dapat menggunakan Stan untuk meninjau distribusi tanggapan untuk item survei kategoris dan kemudian menggunakannya untuk memprediksi probabilitas kelas. Ini adalah cara yang rapi untuk melihat item survei daripada mengubah respons menjadi skala numerik dan memperlakukannya sebagai berkelanjutan. Jauh lebih benar untuk data dan banyak yang telah ditulis tentang bahaya menggunakan pendekatan berkelanjutan untuk data katgorikal.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
set.seed(336)
Q <- 3 # Three Question (items)
A <- 5 # 5 Possible responses
alpha <- rep(4, A)
theta_0 <- gtools::rdirichlet(Q, alpha)
theta_1 <- gtools::rdirichlet(Q, alpha)
sample_one <- function(theta) {
R <- numeric(Q)
for (q in 1:Q)
R[q] <- sample(1:A, 1, prob = theta[q, ])
names(R) <- paste0("q", 1:Q)
as.list(R)
}
sample_n <- function(theta, n, id) {
samples <- map_dfr(1:n, ~sample_one(theta))
samples <- add_column(samples, y = id, .before = TRUE)
mutate_all(samples, as.integer)
}
(df <- sample_n(theta_0, 35, id = 0) %>%
bind_rows(sample_n(theta_1, 35, id = 1)))
## # A tibble: 70 x 4
## y q1 q2 q3
## <int> <int> <int> <int>
## 1 0 2 5 2
## 2 0 3 5 3
## 3 0 1 3 1
## 4 0 2 4 3
## 5 0 4 1 4
## 6 0 1 2 1
## 7 0 5 2 1
## 8 0 5 2 3
## 9 0 2 5 3
## 10 0 2 2 1
## # … with 60 more rows
new_df <- sample_n(theta_0, 35, id = 0) %>%
bind_rows(sample_n(theta_1, 35, id = 1))
new_df <- sample_n(theta_0, 35, id = 0)
new_df
## # A tibble: 35 x 4
## y q1 q2 q3
## <int> <int> <int> <int>
## 1 0 1 1 1
## 2 0 5 5 1
## 3 0 1 4 2
## 4 0 5 2 2
## 5 0 5 4 2
## 6 0 4 2 2
## 7 0 5 5 5
## 8 0 1 3 3
## 9 0 1 2 2
## 10 0 4 3 1
## # … with 25 more rows
bind_rows(sample_n(theta_1, 35, id = 1))
## # A tibble: 35 x 4
## y q1 q2 q3
## <int> <int> <int> <int>
## 1 1 2 1 1
## 2 1 2 1 5
## 3 1 4 1 3
## 4 1 1 4 4
## 5 1 2 1 4
## 6 1 2 1 2
## 7 1 4 1 4
## 8 1 4 1 5
## 9 1 3 1 2
## 10 1 1 4 5
## # … with 25 more rows
library(rstan)
## Loading required package: StanHeaders
## rstan (Version 2.21.2, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
##
## Attaching package: 'rstan'
## The following object is masked from 'package:tidyr':
##
## extract
rstan_options(auto_write = TRUE)
new_data <- list(
R = as.matrix(dplyr::select(df, -y)),
N = nrow(df),
y = df$y,
new_R = as.matrix(dplyr::select(new_df, -y)),
new_N = nrow(new_df),
Q = Q,
A = A,
alpha = alpha
)
new_data
## $R
## q1 q2 q3
## [1,] 2 5 2
## [2,] 3 5 3
## [3,] 1 3 1
## [4,] 2 4 3
## [5,] 4 1 4
## [6,] 1 2 1
## [7,] 5 2 1
## [8,] 5 2 3
## [9,] 2 5 3
## [10,] 2 2 1
## [11,] 1 2 2
## [12,] 4 5 2
## [13,] 2 3 2
## [14,] 2 3 3
## [15,] 1 3 3
## [16,] 1 4 5
## [17,] 2 3 2
## [18,] 2 3 2
## [19,] 3 1 3
## [20,] 1 1 1
## [21,] 3 2 3
## [22,] 1 3 4
## [23,] 1 3 3
## [24,] 1 5 1
## [25,] 1 3 5
## [26,] 5 1 2
## [27,] 1 2 5
## [28,] 4 1 1
## [29,] 1 3 4
## [30,] 1 3 3
## [31,] 3 3 5
## [32,] 5 5 5
## [33,] 4 5 5
## [34,] 2 3 1
## [35,] 1 5 2
## [36,] 4 1 1
## [37,] 1 1 1
## [38,] 5 1 1
## [39,] 5 1 4
## [40,] 5 1 2
## [41,] 5 1 5
## [42,] 1 1 1
## [43,] 4 1 5
## [44,] 3 1 3
## [45,] 3 1 5
## [46,] 4 4 2
## [47,] 1 2 2
## [48,] 2 1 2
## [49,] 1 1 2
## [50,] 2 4 4
## [51,] 1 1 4
## [52,] 1 5 3
## [53,] 5 1 5
## [54,] 1 3 5
## [55,] 4 1 5
## [56,] 4 4 3
## [57,] 1 1 3
## [58,] 4 3 3
## [59,] 1 3 4
## [60,] 5 1 1
## [61,] 4 1 3
## [62,] 1 1 2
## [63,] 1 1 5
## [64,] 4 1 2
## [65,] 3 1 4
## [66,] 5 1 2
## [67,] 1 4 2
## [68,] 2 1 2
## [69,] 1 1 4
## [70,] 1 1 1
##
## $N
## [1] 70
##
## $y
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## $new_R
## q1 q2 q3
## [1,] 1 1 1
## [2,] 5 5 1
## [3,] 1 4 2
## [4,] 5 2 2
## [5,] 5 4 2
## [6,] 4 2 2
## [7,] 5 5 5
## [8,] 1 3 3
## [9,] 1 2 2
## [10,] 4 3 1
## [11,] 2 3 1
## [12,] 2 5 3
## [13,] 1 2 3
## [14,] 1 3 4
## [15,] 2 1 5
## [16,] 5 3 2
## [17,] 5 3 1
## [18,] 3 5 3
## [19,] 1 1 2
## [20,] 1 2 2
## [21,] 5 1 3
## [22,] 2 5 2
## [23,] 2 3 2
## [24,] 5 5 2
## [25,] 1 2 3
## [26,] 1 2 2
## [27,] 1 2 4
## [28,] 1 3 1
## [29,] 1 5 3
## [30,] 1 5 2
## [31,] 2 2 5
## [32,] 2 1 2
## [33,] 5 5 2
## [34,] 5 2 3
## [35,] 2 3 3
##
## $new_N
## [1] 35
##
## $Q
## [1] 3
##
## $A
## [1] 5
##
## $alpha
## [1] 4 4 4 4 4
model <- stan_model("stan_categorical_responses.stan")
fit <- sampling(model, new_data, cores = 3, iter = 2000, chains = 3, refresh=0)
traceplot(fit)
## 'pars' not specified. Showing first 10 parameters by default.
library(bayesplot)
## This is bayesplot version 1.8.0
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
library(tidybayes)
print(fit, pars = "theta_0")
## Inference for Stan model: stan_categorical_responses.
## 3 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=3000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## theta_0[1,1] 0.33 0 0.07 0.21 0.28 0.33 0.37 0.46 7652 1
## theta_0[1,2] 0.24 0 0.05 0.14 0.20 0.23 0.27 0.35 6203 1
## theta_0[1,3] 0.14 0 0.05 0.07 0.11 0.14 0.17 0.25 6652 1
## theta_0[1,4] 0.15 0 0.05 0.07 0.11 0.14 0.18 0.25 6473 1
## theta_0[1,5] 0.14 0 0.05 0.07 0.11 0.14 0.17 0.24 7147 1
## theta_0[2,1] 0.16 0 0.05 0.08 0.13 0.16 0.20 0.28 6240 1
## theta_0[2,2] 0.20 0 0.06 0.10 0.16 0.20 0.24 0.31 7657 1
## theta_0[2,3] 0.31 0 0.06 0.20 0.27 0.31 0.35 0.44 6419 1
## theta_0[2,4] 0.11 0 0.04 0.04 0.08 0.10 0.13 0.20 6198 1
## theta_0[2,5] 0.22 0 0.06 0.12 0.18 0.22 0.25 0.34 6271 1
## theta_0[3,1] 0.22 0 0.06 0.12 0.18 0.21 0.25 0.33 5744 1
## theta_0[3,2] 0.22 0 0.06 0.12 0.18 0.22 0.25 0.33 7443 1
## theta_0[3,3] 0.25 0 0.06 0.15 0.21 0.25 0.29 0.38 5932 1
## theta_0[3,4] 0.13 0 0.04 0.06 0.10 0.12 0.15 0.22 7133 1
## theta_0[3,5] 0.18 0 0.05 0.09 0.14 0.18 0.22 0.29 6049 1
##
## Samples were drawn using NUTS(diag_e) at Thu Jan 21 17:59:33 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
print(fit, pars = "theta_1")
## Inference for Stan model: stan_categorical_responses.
## 3 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=3000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## theta_1[1,1] 0.33 0 0.06 0.21 0.28 0.33 0.37 0.46 7663 1
## theta_1[1,2] 0.13 0 0.04 0.05 0.09 0.12 0.16 0.23 6568 1
## theta_1[1,3] 0.13 0 0.05 0.05 0.10 0.12 0.15 0.23 7304 1
## theta_1[1,4] 0.22 0 0.05 0.12 0.18 0.21 0.25 0.33 6568 1
## theta_1[1,5] 0.20 0 0.05 0.11 0.16 0.20 0.23 0.31 6066 1
## theta_1[2,1] 0.55 0 0.07 0.41 0.50 0.55 0.59 0.67 7562 1
## theta_1[2,2] 0.09 0 0.04 0.03 0.06 0.08 0.11 0.18 5638 1
## theta_1[2,3] 0.13 0 0.04 0.05 0.10 0.12 0.16 0.23 5607 1
## theta_1[2,4] 0.15 0 0.05 0.07 0.11 0.14 0.18 0.25 6089 1
## theta_1[2,5] 0.09 0 0.04 0.03 0.06 0.09 0.11 0.18 5452 1
## theta_1[3,1] 0.18 0 0.05 0.09 0.14 0.18 0.22 0.29 7821 1
## theta_1[3,2] 0.26 0 0.06 0.15 0.21 0.25 0.30 0.38 6759 1
## theta_1[3,3] 0.18 0 0.05 0.09 0.15 0.18 0.21 0.29 6943 1
## theta_1[3,4] 0.18 0 0.05 0.09 0.14 0.18 0.21 0.29 7639 1
## theta_1[3,5] 0.20 0 0.05 0.11 0.16 0.19 0.23 0.31 5857 1
##
## Samples were drawn using NUTS(diag_e) at Thu Jan 21 17:59:33 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
pred <- fit %>%
spread_draws(new_pred[i]) %>%
median_qi(new_pred[i]) %>%
mutate(.pred = if_else(`new_pred[i]` > 0.5, 0, 1))
acc <- round(mean(pred$.pred == new_df$y), 2)
cat("The classification accuracy is:", acc)
## The classification accuracy is: 0.03
theta_0_draws <- fit %>%
spread_draws(theta_0[i, j])
theta_1_draws <- fit %>%
spread_draws(theta_1[i, j])
theta_draws <- theta_0_draws %>%
left_join(theta_1_draws)
## Joining, by = c("i", "j", ".chain", ".iteration", ".draw")
theta_draws %>%
gather(group, theta, theta_0, theta_1) %>%
mutate(
group = if_else(group == "theta_0", "Group 0", "Group 1"),
question = i,
response = j
) %>%
ggplot(aes(theta, fill = group, color = group)) +
geom_density(alpha = 0.5) +
facet_grid(
rows = vars(question),
cols = vars(response)
) +
labs(
title = "Estimated probability of each response by question and group",
subtitle = "Columns correspond to response, rows to questions",
x = "Probability of response") +
theme(
legend.position = "none",
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
theme(panel.grid = element_blank(),
panel.background = element_rect(fill="white"))
Daftar pustka :