Loading data and setting up

Only need to do this first time - can delete or comment using “#” after.

#install.packages('tidyverse') 
#install.packages('abind') # also need to install abind
#install.packages('MASS') # same
#install.packages('readr') # last one - this one reads CSV files faster and has some useful defaults, though it is not necessary

Will only use one function from abind, readr, and MASS, so don’t have to load them using library:

library(tidyverse) 
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'ggplot2' was built under R version 3.3.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
setwd('~/dropbox/1_research/generality') # change this to folder that has the data
data <- readr::read_csv("all_observations.csv") 
## Parsed with column specification:
## cols(
##   id = col_integer(),
##   `7C_8-3` = col_double(),
##   `7C_12-2` = col_double(),
##   `8B_6-2` = col_double()
## )
data <- rbind(data, c(NA, '3.1', '3.1', NA)) # this is because `7C_8-3` and `7C_12-2` have no 3.1 obs - will remove this later
data[, 2:4] <- sapply(data[, 2:4], function(x) floor(as.numeric(x))) # this recodes our 10 variables to just six (0, 1, 2, 3, 4, and 5)
data
## # A tibble: 578 × 4
##        id `7C_8-3` `7C_12-2` `8B_6-2`
## *   <chr>    <dbl>     <dbl>    <dbl>
## 1  101307        0        NA       NA
## 2  101404        0         1       NA
## 3  101405        0        NA       NA
## 4  101466        0         0       NA
## 5  101477        0         1        3
## 6  101577        0         1        1
## 7  101597        0         0       NA
## 8  101607        0         2       NA
## 9  101609        0        NA        3
## 10 103232        0         1       NA
## # ... with 568 more rows

Look at likelihood of profile (using all data & six codes)

Prepare data for plot:

to_prep <- tidyr::gather(data, key, val, -id)

to_plot <- to_prep %>% 
    group_by(key, val) %>% 
    summarize(n = n()) %>% 
    filter(!is.na(val)) %>% 
    tidyr::spread(val, n, fill = 0) %>% 
    tidyr::gather(Code, the_val, -key) %>% 
    group_by(key) %>% 
    mutate(the_val = as.numeric(the_val),
           the_val = the_val / sum(the_val))

to_plot$key <- factor(to_plot$key,
                      levels = c("7C_8-3", "7C_12-2", "8B_6-2"))

to_plot$the_key <- factor(to_plot$Code)

to_plot <- to_plot %>% arrange(key, Code)

t1_freq <- as.vector(table(data$`7C_8-3`))
t2_freq <- as.vector(table(data$`7C_12-2`))
t3_freq <- as.vector(table(data$`8B_6-2`))
the_mat <- cbind(t1_freq, t2_freq, t3_freq)
row.names(the_mat) <- sort(unique(data$`8B_6-2`))
colnames(the_mat) <- names(data)[2:4] 

This throws a warning (not an error) because some cells have few (< 5) cases; it’s probably okay as long as we interpret those cells with caution.

chi_sq_output <- chisq.test(the_mat) 
## Warning in chisq.test(the_mat): Chi-squared approximation may be incorrect

This finds cells (i.e., shifts) which are more (+) or less (-) expected, or as frequent as expected (=). Here are the z-scores (> 1.96 means cell is more likely than expected; < 1.96 means cell is less likely than expected):

round(chi_sq_output$stdres, 3)  
##   7C_8-3 7C_12-2 8B_6-2
## 0  4.323  -3.272 -1.489
## 1 -4.690   5.014 -0.247
## 2  5.569  -3.308 -3.072
## 3  1.822  -3.904  2.584
## 4 -4.044   2.540  2.055
## 5 -0.316  -0.187  0.651

Here is the plot:

to_plot$sig <- ifelse(as.vector(chi_sq_output$stdres) >= 1.96, "+",
                      ifelse(as.vector(chi_sq_output$stdres) <= -1.96, "-",
                             "="))

ggplot(to_plot, aes(x = key, y = the_val, color = Code, group = Code, label = sig)) +
    geom_point(size = 1.5) +
    geom_line() +
    ggrepel::geom_label_repel(show.legend = F) +
    ylab("Proportion of Responses") +
    xlab(NULL) +
    theme(text = element_text(size = 14)) +
    theme(legend.title = element_blank()) +
    labs(
        title = "Number of Responses by Code for All Observations",
        caption = "Note. +, -, and = labels indicate code is more likely than expected evaluated using a chi-square test of proportions."
    ) +
    theme(text = element_text(family = "Times")) +
    theme(plot.caption = element_text(size = 8))

# save figure in working directory
ggsave("bar_chart_all_observations.png", width = 8, height = 6) 

Look at likelihood of shifts

Prepare frequency tables to examine shifts:

tab1 <- table(data$`7C_8-3`, data$`7C_12-2`)
tab2 <- table(data$`7C_12-2`, data$`8B_6-2`)

arr <- abind::abind(tab1, tab2, along = 3)
names(dimnames(arr)) <- c("first_code", "second_code", "shift")
dimnames(arr)[[3]] = c("shift_1", "shift_2")
arr
## , , shift = shift_1
## 
##           second_code
## first_code 0  1  2 3  4 5
##          0 2  5  1 0  2 0
##          1 1 43  2 2 26 0
##          2 4 39 37 7 33 2
##          3 1 21 10 7  7 1
##          4 1 22 15 4 42 1
##          5 0  0  0 0  3 0
## 
## , , shift = shift_2
## 
##           second_code
## first_code 0  1 2 3  4 5
##          0 0  0 0 0  0 0
##          1 0 15 5 5 14 0
##          2 0  3 1 1  6 0
##          3 0  1 0 3  1 0
##          4 1  8 1 1 15 1
##          5 0  1 0 0  1 0

Use log linear models:

m.sat <- MASS::loglm( ~ first_code + second_code + shift, arr)
m.sat_out <- as.data.frame(resid(m.sat))
## Re-fitting to get frequencies and fitted values
df1 <- data.frame(tab1)
df2 <- data.frame(tab2)

df1 <- mutate(df1, shift = "shift_1")
df2 <- mutate(df2, shift = "shift_2")

df <- bind_rows(df1, df2)
names(df) <- c('first_code', 'second_code', "n", "shift")
df_out <- df %>% arrange(shift, second_code)

out_out <- m.sat_out %>% 
    gather(key, val) %>% 
    mutate(code_1 = rep(c(0:5), 12)) %>% 
    unite(united, code_1, key)

df <- bind_cols(df_out, out_out)
df <- select(df, first_code, second_code, val, shift)
df <- unite(df, code, first_code, second_code, sep = "-")

out <- df %>% 
    filter(val >= 1.96 | val <= -1.96) %>% 
    mutate(sig = ifelse(val > 1.96, "+",
                        ifelse(val < -1.96, "-", NA))) %>% 
    select(-val) %>% 
    arrange(shift, code)

out
##    code   shift sig
## 1   0-0 shift_1   +
## 2   1-2 shift_1   -
## 3   1-3 shift_1   -
## 4   2-2 shift_1   +
## 5   3-3 shift_1   +
## 6   3-4 shift_1   -
## 7   4-1 shift_1   -
## 8   1-1 shift_2   +
## 9   1-3 shift_2   +
## 10  1-4 shift_2   +
## 11  2-1 shift_2   -
## 12  2-2 shift_2   -
## 13  3-3 shift_2   +
## 14  4-4 shift_2   +

Prepare plot:

to_plot <- data %>% 
    gather(key, val, -id) %>% 
    mutate(val = factor(val)) %>% 
    filter(!is.na(val)) %>% 
    count(key, val) %>% 
    mutate(prop = round(n / sum(n), 3))

to_plot$key <- ifelse(to_plot$key == "7C_8-3", "Time 1",
                      ifelse(to_plot$key == "7C_12-2", "Time 2", "Time 3"))

to_plot$key <- factor(to_plot$key,
                       levels = c("Time 1", "Time 2", "Time 3"))
                       
to_plot$val <- factor(to_plot$val,
                      levels = 
                      c("5", "4", "3", "2", "1", "0"))

Found likelihoods using this:

to_plot %>%
    arrange(key) %>%
    group_by(key) %>%
    mutate(new_prop = cumsum(prop),
           new_loc = (new_prop - (prop / 2)))
## Source: local data frame [18 x 6]
## Groups: key [3]
## 
##       key    val     n  prop new_prop new_loc
##    <fctr> <fctr> <int> <dbl>    <dbl>   <dbl>
## 1  Time 1      0    46 0.099    0.099  0.0495
## 2  Time 1      1   100 0.216    0.315  0.2070
## 3  Time 1      2   151 0.326    0.641  0.4780
## 4  Time 1      3    56 0.121    0.762  0.7015
## 5  Time 1      4   103 0.222    0.984  0.8730
## 6  Time 1      5     7 0.015    0.999  0.9915
## 7  Time 2      0    12 0.031    0.031  0.0155
## 8  Time 2      1   147 0.381    0.412  0.2215
## 9  Time 2      2    72 0.187    0.599  0.5055
## 10 Time 2      3    21 0.054    0.653  0.6260
## 11 Time 2      4   128 0.332    0.985  0.8190
## 12 Time 2      5     6 0.016    1.001  0.9930
## 13 Time 3      0     7 0.039    0.039  0.0195
## 14 Time 3      1    51 0.282    0.321  0.1800
## 15 Time 3      2    28 0.155    0.476  0.3985
## 16 Time 3      3    28 0.155    0.631  0.5535
## 17 Time 3      4    63 0.348    0.979  0.8050
## 18 Time 3      5     4 0.022    1.001  0.9900

Make plot:

ggplot(to_plot, aes(x = key, y = prop, fill = val, width = .625)) +
    geom_col(position = 'stack') +
    xlab(NULL) +
    ylab("Proportion of Responses") +
    xlab(NULL) +
    theme(text = element_text(size = 12)) +
    theme(legend.title = element_blank()) +
    theme(plot.caption = element_text(size = 7.5, family = "Times")) +
    annotate("segment", x = 1, xend = 2, y = .049, yend = .015, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    annotate("segment", linetype = "dashed", x = 1, xend = 2, y = .207, yend = .505, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    annotate("segment", linetype = "dashed", x = 1, xend = 2, y = .207, yend = .626, arrow=arrow(ends = "last", length=unit(.2,"cm"))) + 
    annotate("segment", x = 1, xend = 2, y = .478, yend = .505, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    annotate("segment", x = 1, xend = 2, y = .701, yend = .626, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    annotate("segment", linetype = "dashed", x = 1, xend = 2, y = .705, yend = .819, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    annotate("segment", linetype = "dashed", x = 1, xend = 2, y = .873, yend = .221, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    annotate("segment", x = 2, xend = 3, y = .221, yend = .180, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    annotate("segment", x = 2, xend = 3, y = .221, yend = .553, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    annotate("segment",  x = 2, xend = 3, y = .221, yend = .805, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    annotate("segment", linetype = "dashed", x = 2, xend = 3, y = .505, yend = .180, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    annotate("segment", linetype = "dashed", x = 2, xend = 3, y = .505, yend = .398, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    annotate("segment", x = 2, xend = 3, y = .626, yend = .553, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    annotate("segment", x = 2, xend = 3, y = .819, yend = .805, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
    labs(
        title = "Shifts Between Codes for All Observations",
        caption = "Note. Solid lines indicate shift is more likely than (and dashed lines indicate shift is less likely) than expected as evaluated using log linear models"
    )

ggsave("shift.png", width = 9, height = 9.25)