Moodle search durations

library(tidyverse)

Import data

  1. Get list of csv files in data directory.
# Directory containing the input CSV files (relative to the working directory).
datadir <- "data"

# Collect the full paths of the CSV files in `datadir`. `pattern` is a regular
# expression, so it is anchored to the ".csv" extension; a bare "csv" would
# also match names such as "csv_notes.txt" or "logs.csv.bak".
csv_files <- datadir |>
    list.files(pattern = "\\.csv$", full.names = TRUE)
csv_files
[1] "data/logs_modified_nologin-func.csv"   
[2] "data/logs_modified_nologin-overall.csv"
[3] "data/logs_modified_user-func.csv"      
[4] "data/logs_modified_user-overall.csv"   
[5] "data/logs_orig_nologin-func.csv"       
[6] "data/logs_orig_nologin-overall.csv"    
[7] "data/logs_orig_user-func.csv"          
[8] "data/logs_orig_user-overall.csv"       
  1. Import each file and add two columns: the file name (without file extension) and the row number within the file (obs).
# Read one CSV file without a header row and tag its rows.
#
# x: path to a CSV file.
# Returns the tibble produced by read_csv() with two extra columns:
#   file - base name of `x` with a trailing ".csv" removed
#   obs  - row number within the file
import_file <- function(x) {
    label <- gsub(pattern = "\\.csv$", "", basename(x))
    contents <- read_csv(x, col_names = FALSE)
    mutate(contents, file = label, obs = row_number())
}
# Import every CSV file, stack the per-file tibbles row-wise, and give the
# data column (auto-named X1 by read_csv) a meaningful name.
df <- purrr::list_rbind(map(csv_files, import_file)) |>
    rename(duration = X1)
df
# A tibble: 48 × 3
   duration file                            obs
      <dbl> <chr>                         <int>
 1   25.2   logs_modified_nologin-func        1
 2    0.400 logs_modified_nologin-func        2
 3    8.52  logs_modified_nologin-func        3
 4    0.344 logs_modified_nologin-func        4
 5    0.782 logs_modified_nologin-func        5
 6    0.108 logs_modified_nologin-func        6
 7    0.434 logs_modified_nologin-func        7
 8    0.108 logs_modified_nologin-func        8
 9   26.7   logs_modified_nologin-overall     1
10    8.90  logs_modified_nologin-overall     2
# ℹ 38 more rows

Data wrangling

  1. Rename columns (the imported X1 column was already renamed to duration during import).

Next, remove the logs_ prefix from the file name.

# Strip the leading "logs_" from each file label. The pattern is anchored
# with "^" so that only a true prefix is removed, never a "logs_" that
# occurs later in the name.
df <- df |>
    mutate(file = str_remove(file, "^logs_"))
  1. Separate the file name into 3 columns.
# Split the file label into three columns following the layout
# <type>_<user>-<overall>. The unnamed pattern pieces ("_" and "-") act as
# separators and do not produce columns.
df <- separate_wider_regex(
    df,
    cols = file,
    patterns = c(type = ".*", "_", user = ".*", "-", overall = ".*")
)
df
# A tibble: 48 × 5
   duration type     user    overall   obs
      <dbl> <chr>    <chr>   <chr>   <int>
 1   25.2   modified nologin func        1
 2    0.400 modified nologin func        2
 3    8.52  modified nologin func        3
 4    0.344 modified nologin func        4
 5    0.782 modified nologin func        5
 6    0.108 modified nologin func        6
 7    0.434 modified nologin func        7
 8    0.108 modified nologin func        8
 9   26.7   modified nologin overall     1
10    8.90  modified nologin overall     2
# ℹ 38 more rows
  1. Convert character columns to factors.
# Turn every character column of df into a factor.
df <- mutate(df, across(where(is.character), as_factor))

Plotting

  1. Plot duration by query number, facetted by user and overall.
# Line plot of duration against query number, one coloured line per `type`,
# laid out as a grid of panels: `user` in rows, `overall` in columns.
ggplot(df, aes(x = obs, y = duration, colour = type)) +
    geom_line() +
    facet_grid(rows = vars(user), cols = vars(overall)) +
    scale_colour_viridis_d(begin = 0.2, end = 0.8) +
    theme_minimal(base_size = 12) +
    labs(x = "Query number", y = "Duration", title = "")

  1. Plot duration by query number, facetted by overall, with linetype denoting user type (login vs no login).
# Lines and points of duration against query number. Colour encodes `type`,
# line type encodes `user`, and each level of `overall` gets its own row panel.
ggplot(df, aes(x = obs, y = duration, linetype = user, colour = type)) +
    geom_line() +
    geom_point() +
    facet_grid(rows = vars(overall)) +
    # One axis break per query number (1 through 8).
    scale_x_continuous(breaks = seq_len(8)) +
    scale_colour_viridis_d(begin = 0.2, end = 0.8) +
    theme_minimal(base_size = 12) +
    labs(
        x = "Query number",
        y = "Duration",
        title = "Initial and subsequent queries"
    )

  1. Plot duration by query number, same as above but showing a fitted exponential-decay curve (a linear model of duration on exp(-x)) instead of the raw lines and points.
# Same aesthetics and panels as the line/point plot, but instead of the raw
# observations it draws a fitted curve per group: a linear model of
# duration on exp(-x), without a confidence band.
ggplot(df, aes(x = obs, y = duration, linetype = user, colour = type)) +
    geom_smooth(method = "lm", formula = y ~ exp(-x), se = FALSE) +
    facet_grid(rows = vars(overall)) +
    # One axis break per query number (1 through 8).
    scale_x_continuous(breaks = seq_len(8)) +
    scale_colour_viridis_d(begin = 0.2, end = 0.8) +
    theme_minimal(base_size = 12) +
    labs(
        x = "Query number",
        y = "Duration",
        title = "Initial and subsequent queries"
    )