Using Ordered, Hierarchical Logistic Regression

Pretend this is the scenario:

The fake data looks like:

head(data)
## Source: local data frame [6 x 4]
## 
##      title performance response.2 response.1
## 1 Engineer           C     -1.385     -1.312
## 2 Engineer           A     -0.501     -1.667
## 3 Engineer           A     -0.485     -1.373
## 4 Engineer           B     -0.462     -0.733
## 5 Engineer           B     -1.846      0.279
## 6 Engineer           B     -1.297     -0.602

And the fake data is created so response.2 is a random number and response.1 is ACTUALLY related to performance.

Response.1 is related in a different way for each employee title though, but in all cases the performance goes up as that response goes up (perhaps the question was like “How hard do you work?” – so it should be indicative of performance). It just goes up more quickly for some jobs more than others:

# Average response values by title and performance:
# note that performance goes up as response.1 goes up
library(reshape2)
data %>% 
  group_by(title, performance) %>% 
  summarise(
    avg_response.1=round(mean(response.1), 2), 
    avg_response.2=round(mean(response.2), 2)
  ) %>% melt(id.vars=c('title', 'performance')) %>%
  ggplot(aes(x=performance, y=value)) + geom_bar(stat='identity', position='dodge') + 
  facet_grid(variable~title)

Again response.2 is just nonsense and response.1 has an actual relationship with performance defined like:

# These are the values the model should be able to recover
engineer.slope <- .9
lunch.lady.slope <- 2.1
sales.guy.slope <- 5.1

Creating a Model

The point then will be to determine how as the response values change, the performance changes as well PER job title.

One way to do this with stan is to use an ordered logistic model where the slope per job title varies:

### Create a 'Performance' model based on employee survey responses, 
### with a hierarchical separation for each job title

stan.model <- '
  data {
    int<lower=2> N_PERF_VALS;                // # of possible performance measures like "A" or "B", etc.
    int<lower=0> N_OBS;                      // # of employees in question (i.e. number of observations)
    int<lower=1> N_VARS;                     // # of survey responses
    int<lower=1> N_JOBS;                     // # of different titles possible
    int<lower=1,upper=N_PERF_VALS> y[N_OBS]; // Vector of performance evaluations
    int<lower=1> g[N_OBS];                   // "Group" or rather "Job title" for each employee
    row_vector[N_VARS] x[N_OBS];             // Matrix of survey responses for each employee
  }
  parameters {
    vector[N_VARS] beta[N_JOBS];   // Slope estimates by job title
    ordered[N_PERF_VALS-1] cutpoints; 
  }
  model {
    // Assign normal priors to coefficients to estimate
    for (i in 1:N_JOBS)
      beta[i]  ~ normal(0,10);

    // Create model where the output is predicted by the per-job-title 
    // intercept plus the per-job-title slope times each survey response value
    for (i in 1:N_OBS)
      y[i] ~ ordered_logistic(x[i] * beta[g[i]], cutpoints);
  }
'

Now feed the data to Stan and run the model:

# Convert the raw data created into Stan data
d <- list(
  N_PERF_VALS = length(levels(data$performance)), # One for each performance grade (A, B, or C)
  N_OBS = nrow(data),
  N_VARS = 2,  # One for each of the two survey responses
  N_JOBS = 3,  # One for each of the job titles
  y = as.integer(data$performance),         # These are the performance grades
  x = data[,c('response.1', 'response.2')], # These are the survey responses
  g = as.integer(data$title)                # And these are integers corresponding to each job title
)

# Run the sampler
fit <- stan(model_code = stan.model, data = d, warmup = 100, iter = 1000, thin = 5, chains = 1, verbose = FALSE)
## 
## TRANSLATING MODEL 'stan.model' FROM Stan CODE TO C++ CODE NOW.
## COMPILING THE C++ CODE FOR MODEL 'stan.model' NOW.
## In file included from file11286111bec8d.cpp:8:
## In file included from /Users/eczech/Library/R/3.1/library/rstan/include//stansrc/stan/model/model_header.hpp:17:
## In file included from /Users/eczech/Library/R/3.1/library/rstan/include//stansrc/stan/agrad/rev.hpp:5:
## /Users/eczech/Library/R/3.1/library/rstan/include//stansrc/stan/agrad/rev/chainable.hpp:87:17: warning: 'static' function 'set_zero_all_adjoints' declared in header file should be declared 'static inline' [-Wunneeded-internal-declaration]
##     static void set_zero_all_adjoints() {
##                 ^
## In file included from file11286111bec8d.cpp:8:
## In file included from /Users/eczech/Library/R/3.1/library/rstan/include//stansrc/stan/model/model_header.hpp:21:
## /Users/eczech/Library/R/3.1/library/rstan/include//stansrc/stan/io/dump.hpp:26:14: warning: function 'product' is not needed and will not be emitted [-Wunneeded-internal-declaration]
##       size_t product(std::vector<size_t> dims) {
##              ^
## 2 warnings generated.
## 
## SAMPLING FOR MODEL 'stan.model' NOW (CHAIN 1).
## 
## Iteration:   1 / 1000 [  0%]  (Warmup)
## Iteration: 100 / 1000 [ 10%]  (Warmup)
## Iteration: 101 / 1000 [ 10%]  (Sampling)
## Iteration: 200 / 1000 [ 20%]  (Sampling)
## Iteration: 300 / 1000 [ 30%]  (Sampling)
## Iteration: 400 / 1000 [ 40%]  (Sampling)
## Iteration: 500 / 1000 [ 50%]  (Sampling)
## Iteration: 600 / 1000 [ 60%]  (Sampling)
## Iteration: 700 / 1000 [ 70%]  (Sampling)
## Iteration: 800 / 1000 [ 80%]  (Sampling)
## Iteration: 900 / 1000 [ 90%]  (Sampling)
## Iteration: 1000 / 1000 [100%]  (Sampling)
## #  Elapsed Time: 0.260178 seconds (Warm-up)
## #                1.89812 seconds (Sampling)
## #                2.1583 seconds (Total)

Take the sampled parameter estimates out of the stan model and plot the posterior estimates.

Also show the ACTUAL value for each parameter used to generate the data in the first place as a red line:

samples <- extract.samples(fit)

vlines <- samples %>% group_by(title, param, response) %>% summarise(actual=actual[1])

ggplot(samples, aes(x=sample)) + geom_density() + 
  facet_grid(title~response) + 
  geom_vline(aes(xintercept=actual), color='red', data = vlines) + theme_bw()