Only need to do this first time - can delete or comment using “#” after.
#install.packages('tidyverse')
#install.packages('abind') # also need to install abind
#install.packages('MASS') # same
#install.packages('readr') # last one - this one reads CSV files faster and has some useful defaults, though it is not necessary
Will only use one function from abind, readr, and MASS, so don’t have to load them using library:
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'ggplot2' was built under R version 3.3.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
setwd('~/dropbox/1_research/generality') # change this to folder that has the data
data <- readr::read_csv("all_observations.csv")
## Parsed with column specification:
## cols(
## id = col_integer(),
## `7C_8-3` = col_double(),
## `7C_12-2` = col_double(),
## `8B_6-2` = col_double()
## )
data <- rbind(data, c(NA, '3.1', '3.1', NA)) # this is because `7C_8-3` and `7C_12-2` have no 3.1 obs - will remove this later
data[, 2:4] <- sapply(data[, 2:4], function(x) floor(as.numeric(x))) # this recodes our 10 variables to just six (0, 1, 2, 3, 4, and 5)
data
## # A tibble: 578 × 4
## id `7C_8-3` `7C_12-2` `8B_6-2`
## * <chr> <dbl> <dbl> <dbl>
## 1 101307 0 NA NA
## 2 101404 0 1 NA
## 3 101405 0 NA NA
## 4 101466 0 0 NA
## 5 101477 0 1 3
## 6 101577 0 1 1
## 7 101597 0 0 NA
## 8 101607 0 2 NA
## 9 101609 0 NA 3
## 10 103232 0 1 NA
## # ... with 568 more rows
Prepare data for plot:
to_prep <- tidyr::gather(data, key, val, -id)
to_plot <- to_prep %>%
group_by(key, val) %>%
summarize(n = n()) %>%
filter(!is.na(val)) %>%
tidyr::spread(val, n, fill = 0) %>%
tidyr::gather(Code, the_val, -key) %>%
group_by(key) %>%
mutate(the_val = as.numeric(the_val),
the_val = the_val / sum(the_val))
to_plot$key <- factor(to_plot$key,
levels = c("7C_8-3", "7C_12-2", "8B_6-2"))
to_plot$the_key <- factor(to_plot$Code)
to_plot <- to_plot %>% arrange(key, Code)
t1_freq <- as.vector(table(data$`7C_8-3`))
t2_freq <- as.vector(table(data$`7C_12-2`))
t3_freq <- as.vector(table(data$`8B_6-2`))
the_mat <- cbind(t1_freq, t2_freq, t3_freq)
row.names(the_mat) <- sort(unique(data$`8B_6-2`))
colnames(the_mat) <- names(data)[2:4]
This throws a warning (not an error) because some cells have few (< 5) cases; it’s probably okay as long as we interpret those cells with caution.
chi_sq_output <- chisq.test(the_mat)
## Warning in chisq.test(the_mat): Chi-squared approximation may be incorrect
This finds cells (i.e., shifts) which are more (+) or less (-) expected, or as frequent as expected (=). Here are the z-scores (> 1.96 means cell is more likely than expected; < 1.96 means cell is less likely than expected):
round(chi_sq_output$stdres, 3)
## 7C_8-3 7C_12-2 8B_6-2
## 0 4.323 -3.272 -1.489
## 1 -4.690 5.014 -0.247
## 2 5.569 -3.308 -3.072
## 3 1.822 -3.904 2.584
## 4 -4.044 2.540 2.055
## 5 -0.316 -0.187 0.651
Here is the plot:
to_plot$sig <- ifelse(as.vector(chi_sq_output$stdres) >= 1.96, "+",
ifelse(as.vector(chi_sq_output$stdres) <= -1.96, "-",
"="))
ggplot(to_plot, aes(x = key, y = the_val, color = Code, group = Code, label = sig)) +
geom_point(size = 1.5) +
geom_line() +
ggrepel::geom_label_repel(show.legend = F) +
ylab("Proportion of Responses") +
xlab(NULL) +
theme(text = element_text(size = 14)) +
theme(legend.title = element_blank()) +
labs(
title = "Number of Responses by Code for All Observations",
caption = "Note. +, -, and = labels indicate code is more likely than expected evaluated using a chi-square test of proportions."
) +
theme(text = element_text(family = "Times")) +
theme(plot.caption = element_text(size = 8))
# save figure in working directory
ggsave("bar_chart_all_observations.png", width = 8, height = 6)
Prepare frequency tables to examine shifts:
tab1 <- table(data$`7C_8-3`, data$`7C_12-2`)
tab2 <- table(data$`7C_12-2`, data$`8B_6-2`)
arr <- abind::abind(tab1, tab2, along = 3)
names(dimnames(arr)) <- c("first_code", "second_code", "shift")
dimnames(arr)[[3]] = c("shift_1", "shift_2")
arr
## , , shift = shift_1
##
## second_code
## first_code 0 1 2 3 4 5
## 0 2 5 1 0 2 0
## 1 1 43 2 2 26 0
## 2 4 39 37 7 33 2
## 3 1 21 10 7 7 1
## 4 1 22 15 4 42 1
## 5 0 0 0 0 3 0
##
## , , shift = shift_2
##
## second_code
## first_code 0 1 2 3 4 5
## 0 0 0 0 0 0 0
## 1 0 15 5 5 14 0
## 2 0 3 1 1 6 0
## 3 0 1 0 3 1 0
## 4 1 8 1 1 15 1
## 5 0 1 0 0 1 0
Use log linear models:
m.sat <- MASS::loglm( ~ first_code + second_code + shift, arr)
m.sat_out <- as.data.frame(resid(m.sat))
## Re-fitting to get frequencies and fitted values
df1 <- data.frame(tab1)
df2 <- data.frame(tab2)
df1 <- mutate(df1, shift = "shift_1")
df2 <- mutate(df2, shift = "shift_2")
df <- bind_rows(df1, df2)
names(df) <- c('first_code', 'second_code', "n", "shift")
df_out <- df %>% arrange(shift, second_code)
out_out <- m.sat_out %>%
gather(key, val) %>%
mutate(code_1 = rep(c(0:5), 12)) %>%
unite(united, code_1, key)
df <- bind_cols(df_out, out_out)
df <- select(df, first_code, second_code, val, shift)
df <- unite(df, code, first_code, second_code, sep = "-")
out <- df %>%
filter(val >= 1.96 | val <= -1.96) %>%
mutate(sig = ifelse(val > 1.96, "+",
ifelse(val < -1.96, "-", NA))) %>%
select(-val) %>%
arrange(shift, code)
out
## code shift sig
## 1 0-0 shift_1 +
## 2 1-2 shift_1 -
## 3 1-3 shift_1 -
## 4 2-2 shift_1 +
## 5 3-3 shift_1 +
## 6 3-4 shift_1 -
## 7 4-1 shift_1 -
## 8 1-1 shift_2 +
## 9 1-3 shift_2 +
## 10 1-4 shift_2 +
## 11 2-1 shift_2 -
## 12 2-2 shift_2 -
## 13 3-3 shift_2 +
## 14 4-4 shift_2 +
Prepare plot:
to_plot <- data %>%
gather(key, val, -id) %>%
mutate(val = factor(val)) %>%
filter(!is.na(val)) %>%
count(key, val) %>%
mutate(prop = round(n / sum(n), 3))
to_plot$key <- ifelse(to_plot$key == "7C_8-3", "Time 1",
ifelse(to_plot$key == "7C_12-2", "Time 2", "Time 3"))
to_plot$key <- factor(to_plot$key,
levels = c("Time 1", "Time 2", "Time 3"))
to_plot$val <- factor(to_plot$val,
levels =
c("5", "4", "3", "2", "1", "0"))
Found likelihoods using this:
to_plot %>%
arrange(key) %>%
group_by(key) %>%
mutate(new_prop = cumsum(prop),
new_loc = (new_prop - (prop / 2)))
## Source: local data frame [18 x 6]
## Groups: key [3]
##
## key val n prop new_prop new_loc
## <fctr> <fctr> <int> <dbl> <dbl> <dbl>
## 1 Time 1 0 46 0.099 0.099 0.0495
## 2 Time 1 1 100 0.216 0.315 0.2070
## 3 Time 1 2 151 0.326 0.641 0.4780
## 4 Time 1 3 56 0.121 0.762 0.7015
## 5 Time 1 4 103 0.222 0.984 0.8730
## 6 Time 1 5 7 0.015 0.999 0.9915
## 7 Time 2 0 12 0.031 0.031 0.0155
## 8 Time 2 1 147 0.381 0.412 0.2215
## 9 Time 2 2 72 0.187 0.599 0.5055
## 10 Time 2 3 21 0.054 0.653 0.6260
## 11 Time 2 4 128 0.332 0.985 0.8190
## 12 Time 2 5 6 0.016 1.001 0.9930
## 13 Time 3 0 7 0.039 0.039 0.0195
## 14 Time 3 1 51 0.282 0.321 0.1800
## 15 Time 3 2 28 0.155 0.476 0.3985
## 16 Time 3 3 28 0.155 0.631 0.5535
## 17 Time 3 4 63 0.348 0.979 0.8050
## 18 Time 3 5 4 0.022 1.001 0.9900
Make plot:
ggplot(to_plot, aes(x = key, y = prop, fill = val, width = .625)) +
geom_col(position = 'stack') +
xlab(NULL) +
ylab("Proportion of Responses") +
xlab(NULL) +
theme(text = element_text(size = 12)) +
theme(legend.title = element_blank()) +
theme(plot.caption = element_text(size = 7.5, family = "Times")) +
annotate("segment", x = 1, xend = 2, y = .049, yend = .015, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", linetype = "dashed", x = 1, xend = 2, y = .207, yend = .505, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", linetype = "dashed", x = 1, xend = 2, y = .207, yend = .626, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", x = 1, xend = 2, y = .478, yend = .505, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", x = 1, xend = 2, y = .701, yend = .626, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", linetype = "dashed", x = 1, xend = 2, y = .705, yend = .819, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", linetype = "dashed", x = 1, xend = 2, y = .873, yend = .221, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", x = 2, xend = 3, y = .221, yend = .180, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", x = 2, xend = 3, y = .221, yend = .553, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", x = 2, xend = 3, y = .221, yend = .805, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", linetype = "dashed", x = 2, xend = 3, y = .505, yend = .180, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", linetype = "dashed", x = 2, xend = 3, y = .505, yend = .398, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", x = 2, xend = 3, y = .626, yend = .553, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
annotate("segment", x = 2, xend = 3, y = .819, yend = .805, arrow=arrow(ends = "last", length=unit(.2,"cm"))) +
labs(
title = "Shifts Between Codes for All Observations",
caption = "Note. Solid lines indicate shift is more likely than (and dashed lines indicate shift is less likely) than expected as evaluated using log linear models"
)
ggsave("shift.png", width = 9, height = 9.25)