# load packages
library(tidyverse)
# source('correlation_table_plot.R') # plot_cor_table

# Data ----
# Path of the quiz dataset (relative to the working directory); the whole CSV
# is read into `raw_data` and left untouched by the steps below.
DATA_PATH <- "learning_quiz_dataset.csv"
raw_data <- read_csv(DATA_PATH)

# Plot setup ----
# Make the classic theme with a 10 pt base font the default for every plot.
theme_set(theme_classic(base_size = 10))

# Working copy of the raw data with every character column turned into a
# factor; all other columns are passed through unchanged.
tidy_data <- mutate_if(raw_data, is.character, as.factor)

# Histogram of the number of rows per (user_id, tag, item_type) combination,
# with one panel and one fill colour per item type.
# NOTE(review): the title mentions "section", but the counts are not grouped
# by section -- confirm which of the two is intended.
tidy_data %>%
  count(user_id, tag, item_type) %>%
  ggplot(aes(x = n, fill = item_type)) +
  geom_histogram() +
  facet_grid(~ item_type) +
  ggtitle("N datapoints per user and section")

0.1 Change in performance across time

From section 1 to 2, there is no improvement in vocabulary or grammar. There appears to be a trend toward WORSE performance in grammar.

# Accuracy by item type and section.
# Step 1: average `correct` within each (item_type, user_id, section) cell so
#         that every user contributes one value per cell.
# Step 2: summarise those per-user means within each (item_type, section)
#         group via multi_boot_standard().
# NOTE(review): multi_boot_standard() is neither defined nor imported in the
# visible part of this file (presumably langcog::multi_boot_standard --
# confirm). The plot below relies on it returning the columns `mean`,
# `ci_lower` and `ci_upper`.
prop_correct_by_user <- tidy_data %>%
  group_by(item_type, user_id, section) %>%
  summarise(mean_correct = mean(correct)) %>%
  group_by(item_type, section) %>%
  multi_boot_standard(col = "mean_correct") %>%
  ungroup() %>%
  mutate(section = as.factor(section))

# One line per item type: group mean with its interval at each section.
prop_correct_by_user %>%
  ggplot(
    aes(x = section, y = mean, group = item_type, color = item_type)
  ) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_line()

Let’s try to see if this is due to particular question types. Plotted below is the change in performance from section 1 to 2 as a function of grammar question type. It looks like the overall decrease we see above may be due to a few question types: futuro-proximo, oraciones-subordinadas, preterito-ar. In general, these seem to be related to tense. These are cases where there was no data from the first section, suggesting that the decrease could be an artefact of missing data.

# Grammar items only: accuracy by section and tag.
# Per-user means are computed first (one value per user, section and tag) and
# are then summarised within each (section, tag) group.
# NOTE(review): multi_boot_standard() is neither defined nor imported in the
# visible part of this file (presumably langcog::multi_boot_standard --
# confirm). The plot below relies on it returning the columns `mean`,
# `ci_lower` and `ci_upper`.
performance_by_grammar_tag <- tidy_data %>%
  filter(item_type == "grammar") %>%
  group_by(item_type, user_id, section, tag) %>%
  summarise(mean_correct = mean(correct)) %>%
  group_by(section, tag) %>%
  multi_boot_standard(col = "mean_correct") %>%
  ungroup() %>%
  mutate(section = as.factor(section))

# One panel per tag; the legend is hidden because the panel titles already
# identify the tag.
performance_by_grammar_tag %>%
  ggplot(aes(x = section, y = mean, group = tag, color = tag)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_line() +
  facet_wrap(~ tag) +
  theme(legend.position = "none")

# Interactive table of the per-tag change in mean accuracy between sections.
# The means are spread to one column per section (`1`, `2`), so a tag that is
# missing from either section ends up with an NA delta.
# Sign convention: learning_delta is section 1 minus section 2, i.e. a
# positive value means accuracy was LOWER in section 2.
performance_by_grammar_tag %>%
  select(section, tag, mean) %>%
  spread(section, mean) %>%
  # Disabled alternative: treat a missing section mean as 0 instead of NA.
  # mutate(`1` = case_when(is.na(`1`) ~ 0, TRUE ~ `1`),
  #        `2` = case_when(is.na(`2`) ~ 0, TRUE ~ `2`)) %>%
  mutate(learning_delta = round(`1` - `2`, 2)) %>%
  arrange(learning_delta) %>%
  select(tag, learning_delta) %>%
  DT::datatable()

Let’s filter to cases where we have data for both section 1 and 2. If we exclude the missing data, there’s no longer a decrease in performance between the two sections, but there’s also no improvement. There’s a lot of variability across the sections, as well.

# Section-level grammar accuracy restricted to tags observed in every section.
# The per-tag means are spread to one column per section, rows with any NA
# are dropped, and the table is gathered back to long form before the
# remaining tag means are summarised within each section.
# NOTE(review): multi_boot_standard() is neither defined nor imported in the
# visible part of this file (presumably langcog::multi_boot_standard --
# confirm). The plot below relies on it returning the columns `mean`,
# `ci_lower` and `ci_upper`.
prop_correct_by_user_no_na <- performance_by_grammar_tag %>%
  select(section, tag, mean) %>%
  spread(section, mean) %>%
  drop_na() %>%
  gather("section", "value", -tag) %>%
  group_by(section) %>%
  multi_boot_standard(col = "value") %>%
  ungroup() %>%
  # The constant `group` column gives geom_line() one series to connect.
  mutate(
    section = as.factor(section),
    group = "grammar"
  )

prop_correct_by_user_no_na %>%
  ggplot(aes(x = section, y = mean, group = group)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_line()

0.2 Within-user correlation in performance between item types

Are users that are good at vocabulary questions also good at grammar questions?

# Per-user mean accuracy for each item type, reshaped to one row per user
# with one column per item type (the code below uses the `grammar` and
# `vocabulary` columns).
by_user_data <- tidy_data %>%
  group_by(item_type, user_id) %>%
  # With no columns selected, drop_na() removes rows with an NA in ANY column.
  drop_na() %>%
  summarize(mean_performance = mean(correct)) %>%
  spread(item_type, mean_performance)

# Grammar vs. vocabulary accuracy, one point per user.
# Blue line: loess fit. The method is named explicitly because the default
# (`method = NULL`) only uses loess below 1000 observations and switches to a
# GAM above that, which is the case for this many users.
# Red line: ordinary linear fit.
ggplot(by_user_data, aes(x = grammar, y = vocabulary)) +
  geom_point() +
  geom_smooth(method = "loess") +
  geom_smooth(method = "lm", color = "red")

# Pearson correlation between per-user grammar and vocabulary accuracy,
# converted to a one-row data frame and rendered as a table.
# broom and knitr are not attached by library(tidyverse), so both calls are
# namespace-qualified (same style as DT::datatable() elsewhere in this file).
perofrmance_cor <- cor.test(by_user_data$grammar, by_user_data$vocabulary) %>%
  broom::tidy()

# Separate statement: kable() is not part of the pipe above.
knitr::kable(perofrmance_cor)
estimate statistic p.value parameter conf.low conf.high method alternative
0.5658013 40.5033 0 3484 0.5427989 0.5879554 Pearson’s product-moment correlation two.sided

There’s a pretty systematic relationship between performance on grammar and vocabulary: users that are good at one type of question are good at the other. The correlation is 0.57 across all users, and the relationship looks linear. The red line shows a linear fit, and the blue line shows a loess fit.