Tidy Tuesday week 34: Star Trek Voice Commands (data from SpeechInteraction.org).

# Load libraries
library(tidyverse)
library(colorspace)
library(patchwork)
library(ggmosaic)
library(ggpubr)
library(ggparallel)
library(ggdist)
library(glue)
library(ggsci)
library(ggparallel)
library(scales)

# Turn off dplyr's informational message about summarise() output grouping
options(dplyr.summarise.inform = FALSE)

# Import data: Tidy Tuesday 2021-08-17 `computer.csv`
computer_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-08-17/computer.csv"
computer <- readr::read_csv(computer_url)

── Column specification ───────────────────────────────────────────────────────────────────────────────
cols(
  name = col_double(),
  char = col_character(),
  line = col_character(),
  direction = col_character(),
  type = col_character(),
  pri_type = col_character(),
  domain = col_character(),
  sub_domain = col_character(),
  nv_resp = col_logical(),
  interaction = col_character(),
  char_type = col_character(),
  is_fed = col_logical(),
  error = col_logical(),
  value_id = col_double()
)
# Number of missing (NA) values in each column
computer %>%
  is.na() %>%
  colSums()
       name        char        line   direction        type    pri_type      domain  sub_domain 
          0           0           0        1167           0           0         252         754 
    nv_resp interaction   char_type      is_fed       error    value_id 
          0           0           0           0           0           0 
# Number of distinct values in each column.
# across(everything(), ...) replaces the superseded summarise_all().
computer %>%
  summarise(across(everything(), n_distinct))

Character type, interaction type, domain

# Combine plots: stack p4 above p5 in a single column.
# NOTE(review): p4 and p5 are not defined in the visible part of this file;
# they must be created before this call.
ggpubr::ggarrange(p4, p5, ncol = 1)

ALT text: Bar plot showing the count of each interaction type by character type (person and computer); the most frequent interaction type is response for the computer and command for a person. Mosaic plot showing the domain of interaction by character type; persons account for a higher proportion than the computer in every domain except the emergency domain.

Word count

# Global theme for the plots that follow: minimal base theme, uniform 0.5 cm
# margins, title aligned to the whole plot, thin grid lines.
theme_set(theme_minimal(base_size = 10))
theme_update(
  plot.margin = unit(c(.5, .5, .5, .5), "cm"),
  plot.title.position = "plot",
  panel.grid = element_line(size = .2)
)

# Top 4 domains by number of interactions (rows with a missing domain excluded)
d <- computer %>%
  filter(!is.na(domain)) %>%
  count(domain, sort = TRUE) %>%
  slice(1:4)

# Interactions belonging to the top 4 domains, with the number of words in each
# interaction. `d$domain` holds no NA, so `%in%` already drops rows whose
# domain is missing and no separate NA filter is needed.
c1 <- computer %>%
  filter(domain %in% d$domain) %>%
  mutate(word_count = str_count(interaction, "\\w+"))

# Per domain / character type summary used for the text annotations:
# median word count, maximum word count and number of interactions.
stat <- c1 %>%
  group_by(domain, char_type) %>%
  summarise(
    median = median(word_count),
    max = max(word_count),
    n = n()
  )

# Half-eye distribution of word count by character type, one panel per domain.
# Reuses `c1` instead of rebuilding the same filtered data a second time.
c1 %>%
  ggplot(aes(x = char_type, y = word_count, fill = char_type)) +
  #stat_interval(show.legend=F)+
  stat_halfeye(alpha = 0.8) +
  # Median value printed in white, nudged onto the density slab
  geom_text(
    data = stat,
    aes(y = median, x = char_type, label = median),
    color = "white", fontface = "bold", size = 3, nudge_x = .2
  ) +
  # Group size printed at the position of the maximum word count
  geom_text(
    data = stat,
    aes(y = max, x = char_type, label = glue::glue("n = {n}"), color = char_type),
    size = 2.5, nudge_x = .2
  ) +
  coord_flip() +
  scale_fill_aaas() +
  scale_color_aaas() +
  facet_wrap(~domain, ncol = 2, labeller = label_both) +
  scale_y_continuous(breaks = seq(0, 50, 10)) +
  theme(
    legend.position = "none",
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.title = element_blank(),
    axis.text.x = element_text(size = 7)
  ) +
  labs(title = "Word count by interaction domain and character type")

Character type and domain parallel plot

# Parallel plot of interaction domain against character type.
# Rows without a domain are dropped, domain names are title-cased and only the
# 4 most frequent domains are kept as separate levels (the rest are lumped
# together by fct_lump()). The result is converted to a plain data frame
# before being handed to ggparallel().
dfc <- computer %>%
  drop_na(domain) %>%
  mutate(
    domain = str_to_title(domain),
    domain = fct_lump(domain, n = 4)
  ) %>%
  as.data.frame()

ggparallel(
  list("domain", "char_type"),
  data = dfc,
  label = TRUE,
  text.angle = 0,
  label.size = 3.2
) +
  #geom_text(aes(label=dfc$domain)) +
  scale_fill_aaas() +
  scale_color_aaas() +
  # `labels` spelled out in full rather than relying on partial matching
  scale_x_discrete(labels = c("Domain", "Character Type"), expand = c(0, 0)) +
  theme(
    axis.text = element_blank(),
    # Only the (flipped) variable-name axis keeps its text, pulled towards the panel
    axis.text.y = element_text(size = 9, margin = margin(r = -10)),
    axis.title = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "none"
  ) +
  coord_flip() +
  labs(title = "Character type and interaction domain\n")

Proportion of character type by sub domain

# Interactions with a known sub domain; the uncertain "Holodeck?" entries are
# merged into "Holodeck". Shared by the axis labels and the plot so that both
# are built from exactly the same rows.
sub_dom <- computer %>%
  drop_na(sub_domain) %>%
  mutate(sub_domain = if_else(sub_domain == "Holodeck?", "Holodeck", sub_domain))

# y axis labels: bold sub domain name followed by its total number of
# interactions, e.g. "<b>Holodeck</b> (n=12)" (markdown, rendered by ggtext)
lab1 <- sub_dom %>%
  count(sub_domain, sort = TRUE) %>%
  mutate(
    sub_domain = fct_rev(fct_inorder(sub_domain)),
    label = paste0("<b>", sub_domain, "</b>", " ", "(n=", n, ")")
  )

# Lookup from sub domain name to its label. Labels are matched by name rather
# than by position, so a label can never end up on the wrong sub domain when
# the axis ordering breaks ties differently from count(sort = TRUE).
sub_domain_labels <- setNames(lab1$label, as.character(lab1$sub_domain))

# plot
sub_dom %>%
  count(sub_domain, char_type) %>%
  group_by(sub_domain) %>%
  mutate(
    prop = n / sum(n),
    # Which character type has more interactions in this sub domain.
    # Rows within a sub domain are sorted by char_type, so first(n) is the
    # computer count and last(n) the person count when both are present.
    # NOTE(review): assumes char_type has exactly the two values
    # "Computer" and "Person" - confirm against the data.
    col = if_else(last(n) >= first(n), "Person > Computer", "Computer > Person")
  ) %>%
  ungroup() %>%
  # Order sub domains by their total count (the n shown in the labels)
  ggplot(aes(y = reorder(sub_domain, n, FUN = sum), x = prop)) +
  geom_line(aes(group = sub_domain, color = col)) +
  geom_point(aes(shape = char_type), size = 2.2) +
  scale_y_discrete(labels = function(x) unname(sub_domain_labels[x])) +
  scale_shape_manual(values = c(1, 19)) +
  scale_color_manual(values = c("#3366CC", "#AD722C")) +
  scale_x_continuous(
    limits = c(0, 1.05),
    labels = scales::percent_format(),
    expand = c(0, 0)
  ) +
  theme(
    legend.position = "top",
    legend.justification = "left",
    panel.grid.minor = element_blank(),
    axis.title = element_text(size = 8.5),
    # element_markdown() lives in ggtext, which is not attached by this file
    axis.text.y = ggtext::element_markdown(),
    plot.margin = unit(c(.5, 1.5, .5, .5), "cm")
  ) +
  guides(colour = guide_legend(order = 2), shape = guide_legend(order = 1)) +
  labs(
    color = "", shape = "",
    x = "Proportion", y = "Sub Domain",
    title = "Proportion of character type by interaction sub domain"
  )

