Distribusi Tanggapan Kategoris

Postingan ini diadaptasi dari postingan Alex Hayes yang tersedia di sini[1]. Di dalamnya dia meninjau cara seseorang dapat menggunakan Stan untuk meninjau distribusi tanggapan untuk item survei kategoris dan kemudian menggunakannya untuk memprediksi probabilitas kelas. Ini adalah cara yang rapi untuk melihat item survei daripada mengubah respons menjadi skala numerik dan memperlakukannya sebagai berkelanjutan. Jauh lebih benar untuk data dan banyak yang telah ditulis tentang bahaya menggunakan pendekatan berkelanjutan untuk data katgorikal.

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

set.seed(336)

Q <- 3 # Three Question (items)
A <- 5 # 5 Possible responses

alpha <- rep(4, A)

theta_0 <- gtools::rdirichlet(Q, alpha)
theta_1 <- gtools::rdirichlet(Q, alpha)

sample_one <- function(theta) {

  R <- numeric(Q)

  for (q in 1:Q)
    R[q] <- sample(1:A, 1, prob = theta[q, ])

  names(R) <- paste0("q", 1:Q)
  as.list(R)
}

sample_n <- function(theta, n, id) {
  samples <- map_dfr(1:n, ~sample_one(theta))
  samples <- add_column(samples, y = id, .before = TRUE)
  mutate_all(samples, as.integer)
}

(df <- sample_n(theta_0, 35, id = 0) %>%
  bind_rows(sample_n(theta_1, 35, id = 1)))

## # A tibble: 70 x 4
##        y    q1    q2    q3
##    <int> <int> <int> <int>
##  1     0     2     5     2
##  2     0     3     5     3
##  3     0     1     3     1
##  4     0     2     4     3
##  5     0     4     1     4
##  6     0     1     2     1
##  7     0     5     2     1
##  8     0     5     2     3
##  9     0     2     5     3
## 10     0     2     2     1
## # … with 60 more rows

new_df <- sample_n(theta_0, 35, id = 0) %>%
  bind_rows(sample_n(theta_1, 35, id = 1))

new_df <- sample_n(theta_0, 35, id = 0) 
new_df

## # A tibble: 35 x 4
##        y    q1    q2    q3
##    <int> <int> <int> <int>
##  1     0     1     1     1
##  2     0     5     5     1
##  3     0     1     4     2
##  4     0     5     2     2
##  5     0     5     4     2
##  6     0     4     2     2
##  7     0     5     5     5
##  8     0     1     3     3
##  9     0     1     2     2
## 10     0     4     3     1
## # … with 25 more rows

bind_rows(sample_n(theta_1, 35, id = 1))

## # A tibble: 35 x 4
##        y    q1    q2    q3
##    <int> <int> <int> <int>
##  1     1     2     1     1
##  2     1     2     1     5
##  3     1     4     1     3
##  4     1     1     4     4
##  5     1     2     1     4
##  6     1     2     1     2
##  7     1     4     1     4
##  8     1     4     1     5
##  9     1     3     1     2
## 10     1     1     4     5
## # … with 25 more rows

library(rstan)

## Loading required package: StanHeaders

## rstan (Version 2.21.2, GitRev: 2e1f913d3ca3)

## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)

## 
## Attaching package: 'rstan'

## The following object is masked from 'package:tidyr':
## 
##     extract

rstan_options(auto_write = TRUE)

new_data <- list(
  R = as.matrix(dplyr::select(df, -y)),
  N = nrow(df),
  y = df$y,
  new_R = as.matrix(dplyr::select(new_df, -y)),
  new_N = nrow(new_df),
  Q = Q,
  A = A,
  alpha = alpha
)

new_data

## $R
##       q1 q2 q3
##  [1,]  2  5  2
##  [2,]  3  5  3
##  [3,]  1  3  1
##  [4,]  2  4  3
##  [5,]  4  1  4
##  [6,]  1  2  1
##  [7,]  5  2  1
##  [8,]  5  2  3
##  [9,]  2  5  3
## [10,]  2  2  1
## [11,]  1  2  2
## [12,]  4  5  2
## [13,]  2  3  2
## [14,]  2  3  3
## [15,]  1  3  3
## [16,]  1  4  5
## [17,]  2  3  2
## [18,]  2  3  2
## [19,]  3  1  3
## [20,]  1  1  1
## [21,]  3  2  3
## [22,]  1  3  4
## [23,]  1  3  3
## [24,]  1  5  1
## [25,]  1  3  5
## [26,]  5  1  2
## [27,]  1  2  5
## [28,]  4  1  1
## [29,]  1  3  4
## [30,]  1  3  3
## [31,]  3  3  5
## [32,]  5  5  5
## [33,]  4  5  5
## [34,]  2  3  1
## [35,]  1  5  2
## [36,]  4  1  1
## [37,]  1  1  1
## [38,]  5  1  1
## [39,]  5  1  4
## [40,]  5  1  2
## [41,]  5  1  5
## [42,]  1  1  1
## [43,]  4  1  5
## [44,]  3  1  3
## [45,]  3  1  5
## [46,]  4  4  2
## [47,]  1  2  2
## [48,]  2  1  2
## [49,]  1  1  2
## [50,]  2  4  4
## [51,]  1  1  4
## [52,]  1  5  3
## [53,]  5  1  5
## [54,]  1  3  5
## [55,]  4  1  5
## [56,]  4  4  3
## [57,]  1  1  3
## [58,]  4  3  3
## [59,]  1  3  4
## [60,]  5  1  1
## [61,]  4  1  3
## [62,]  1  1  2
## [63,]  1  1  5
## [64,]  4  1  2
## [65,]  3  1  4
## [66,]  5  1  2
## [67,]  1  4  2
## [68,]  2  1  2
## [69,]  1  1  4
## [70,]  1  1  1
## 
## $N
## [1] 70
## 
## $y
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## $new_R
##       q1 q2 q3
##  [1,]  1  1  1
##  [2,]  5  5  1
##  [3,]  1  4  2
##  [4,]  5  2  2
##  [5,]  5  4  2
##  [6,]  4  2  2
##  [7,]  5  5  5
##  [8,]  1  3  3
##  [9,]  1  2  2
## [10,]  4  3  1
## [11,]  2  3  1
## [12,]  2  5  3
## [13,]  1  2  3
## [14,]  1  3  4
## [15,]  2  1  5
## [16,]  5  3  2
## [17,]  5  3  1
## [18,]  3  5  3
## [19,]  1  1  2
## [20,]  1  2  2
## [21,]  5  1  3
## [22,]  2  5  2
## [23,]  2  3  2
## [24,]  5  5  2
## [25,]  1  2  3
## [26,]  1  2  2
## [27,]  1  2  4
## [28,]  1  3  1
## [29,]  1  5  3
## [30,]  1  5  2
## [31,]  2  2  5
## [32,]  2  1  2
## [33,]  5  5  2
## [34,]  5  2  3
## [35,]  2  3  3
## 
## $new_N
## [1] 35
## 
## $Q
## [1] 3
## 
## $A
## [1] 5
## 
## $alpha
## [1] 4 4 4 4 4

model <- stan_model("stan_categorical_responses.stan")

fit <- sampling(model, new_data, cores = 3, iter = 2000, chains = 3, refresh=0)

traceplot(fit)

## 'pars' not specified. Showing first 10 parameters by default.

library(bayesplot)

## This is bayesplot version 1.8.0

## - Online documentation and vignettes at mc-stan.org/bayesplot

## - bayesplot theme set to bayesplot::theme_default()

##    * Does _not_ affect other ggplot2 plots

##    * See ?bayesplot_theme_set for details on theme setting

library(tidybayes)

print(fit, pars = "theta_0")

## Inference for Stan model: stan_categorical_responses.
## 3 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=3000.
## 
##              mean se_mean   sd 2.5%  25%  50%  75% 97.5% n_eff Rhat
## theta_0[1,1] 0.33       0 0.07 0.21 0.28 0.33 0.37  0.46  7652    1
## theta_0[1,2] 0.24       0 0.05 0.14 0.20 0.23 0.27  0.35  6203    1
## theta_0[1,3] 0.14       0 0.05 0.07 0.11 0.14 0.17  0.25  6652    1
## theta_0[1,4] 0.15       0 0.05 0.07 0.11 0.14 0.18  0.25  6473    1
## theta_0[1,5] 0.14       0 0.05 0.07 0.11 0.14 0.17  0.24  7147    1
## theta_0[2,1] 0.16       0 0.05 0.08 0.13 0.16 0.20  0.28  6240    1
## theta_0[2,2] 0.20       0 0.06 0.10 0.16 0.20 0.24  0.31  7657    1
## theta_0[2,3] 0.31       0 0.06 0.20 0.27 0.31 0.35  0.44  6419    1
## theta_0[2,4] 0.11       0 0.04 0.04 0.08 0.10 0.13  0.20  6198    1
## theta_0[2,5] 0.22       0 0.06 0.12 0.18 0.22 0.25  0.34  6271    1
## theta_0[3,1] 0.22       0 0.06 0.12 0.18 0.21 0.25  0.33  5744    1
## theta_0[3,2] 0.22       0 0.06 0.12 0.18 0.22 0.25  0.33  7443    1
## theta_0[3,3] 0.25       0 0.06 0.15 0.21 0.25 0.29  0.38  5932    1
## theta_0[3,4] 0.13       0 0.04 0.06 0.10 0.12 0.15  0.22  7133    1
## theta_0[3,5] 0.18       0 0.05 0.09 0.14 0.18 0.22  0.29  6049    1
## 
## Samples were drawn using NUTS(diag_e) at Thu Jan 21 17:59:33 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

print(fit, pars = "theta_1")

## Inference for Stan model: stan_categorical_responses.
## 3 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=3000.
## 
##              mean se_mean   sd 2.5%  25%  50%  75% 97.5% n_eff Rhat
## theta_1[1,1] 0.33       0 0.06 0.21 0.28 0.33 0.37  0.46  7663    1
## theta_1[1,2] 0.13       0 0.04 0.05 0.09 0.12 0.16  0.23  6568    1
## theta_1[1,3] 0.13       0 0.05 0.05 0.10 0.12 0.15  0.23  7304    1
## theta_1[1,4] 0.22       0 0.05 0.12 0.18 0.21 0.25  0.33  6568    1
## theta_1[1,5] 0.20       0 0.05 0.11 0.16 0.20 0.23  0.31  6066    1
## theta_1[2,1] 0.55       0 0.07 0.41 0.50 0.55 0.59  0.67  7562    1
## theta_1[2,2] 0.09       0 0.04 0.03 0.06 0.08 0.11  0.18  5638    1
## theta_1[2,3] 0.13       0 0.04 0.05 0.10 0.12 0.16  0.23  5607    1
## theta_1[2,4] 0.15       0 0.05 0.07 0.11 0.14 0.18  0.25  6089    1
## theta_1[2,5] 0.09       0 0.04 0.03 0.06 0.09 0.11  0.18  5452    1
## theta_1[3,1] 0.18       0 0.05 0.09 0.14 0.18 0.22  0.29  7821    1
## theta_1[3,2] 0.26       0 0.06 0.15 0.21 0.25 0.30  0.38  6759    1
## theta_1[3,3] 0.18       0 0.05 0.09 0.15 0.18 0.21  0.29  6943    1
## theta_1[3,4] 0.18       0 0.05 0.09 0.14 0.18 0.21  0.29  7639    1
## theta_1[3,5] 0.20       0 0.05 0.11 0.16 0.19 0.23  0.31  5857    1
## 
## Samples were drawn using NUTS(diag_e) at Thu Jan 21 17:59:33 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

pred <- fit %>%
  spread_draws(new_pred[i]) %>%
  median_qi(new_pred[i]) %>% 
  mutate(.pred = if_else(`new_pred[i]` > 0.5, 0, 1))

acc <- round(mean(pred$.pred == new_df$y), 2)
cat("The classification accuracy is:", acc)

## The classification accuracy is: 0.03

theta_0_draws <- fit %>%  
  spread_draws(theta_0[i, j])

theta_1_draws <- fit %>%  
  spread_draws(theta_1[i, j])

theta_draws <- theta_0_draws %>% 
  left_join(theta_1_draws)

## Joining, by = c("i", "j", ".chain", ".iteration", ".draw")

theta_draws %>% 
  gather(group, theta, theta_0, theta_1) %>% 
  mutate(
    group = if_else(group == "theta_0", "Group 0", "Group 1"),
    question = i,
    response = j
  ) %>% 
  ggplot(aes(theta, fill = group, color = group)) +
  geom_density(alpha = 0.5) +
  facet_grid(
    rows = vars(question),
    cols = vars(response)
  ) +
  labs(
    title = "Estimated probability of each response by question and group",
    subtitle = "Columns correspond to response, rows to questions",
    x = "Probability of response") +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
  theme(panel.grid = element_blank(), 
        panel.background = element_rect(fill="white"))

Daftar pustka :

Distribusi Tanggapan Kategoris

Copy Right @ Prof Dr Suhartono M.Kom @ Magister Informatika UIN Maulana Malik Ibrahim@ (http://suhartono.lecturer.uin-malang.ac.id/riwayat-hidup/)

19 Januari 2021