library(tidyverse)
Moodle search durations
Import data
- Get list of csv files in data directory.
# Directory that holds the log files to import.
datadir <- "data"
# Collect the full paths of the CSV files in `datadir`.
# `pattern` is a regular expression, so it is anchored to the file extension:
# a bare "csv" would also pick up names such as "csv_notes.txt" or
# "logs.csv.bak".
csv_files <- datadir |>
  list.files(pattern = "\\.csv$", full.names = TRUE)
csv_files
[1] "data/logs_modified_nologin-func.csv"
[2] "data/logs_modified_nologin-overall.csv"
[3] "data/logs_modified_user-func.csv"
[4] "data/logs_modified_user-overall.csv"
[5] "data/logs_orig_nologin-func.csv"
[6] "data/logs_orig_nologin-overall.csv"
[7] "data/logs_orig_user-func.csv"
[8] "data/logs_orig_user-overall.csv"
- Import each file and add a column with the file name (without file extension).
# Read one headerless CSV file and tag every row with its origin.
#
# x: path to a CSV file that has no header row.
# Returns the tibble read from `x` with two extra columns:
#   file - base name of `x` with a trailing ".csv" removed
#   obs  - row number within that file
import_file <- function(x) {
  label <- sub("\\.csv$", "", basename(x))
  parsed <- read_csv(x, col_names = FALSE)
  mutate(parsed, file = label, obs = row_number())
}

# Import every file, stack the per-file tibbles into one and give the
# auto-named first column (X1) a descriptive name.
df <- csv_files |>
  map(\(path) import_file(path)) |>
  list_rbind() |>
  rename(duration = X1)
df
# A tibble: 48 × 3
duration file obs
<dbl> <chr> <int>
1 25.2 logs_modified_nologin-func 1
2 0.400 logs_modified_nologin-func 2
3 8.52 logs_modified_nologin-func 3
4 0.344 logs_modified_nologin-func 4
5 0.782 logs_modified_nologin-func 5
6 0.108 logs_modified_nologin-func 6
7 0.434 logs_modified_nologin-func 7
8 0.108 logs_modified_nologin-func 8
9 26.7 logs_modified_nologin-overall 1
10 8.90 logs_modified_nologin-overall 2
# ℹ 38 more rows
Data wrangling
- Rename columns.
Next remove the logs_ prefix from the file name.
# Drop the leading "logs_" from each file label. The pattern is anchored with
# "^" so that only a true prefix is removed, never a "logs_" occurring later
# in the name.
df <- df |>
  mutate(file = str_remove(file, "^logs_"))
- Separate the file name into 3 columns.
# Split `file` into the columns `type`, `user` and `overall`, following the
# layout <type>_<user>-<overall>. The unnamed pattern pieces ("_" and "-")
# are matched as delimiters and do not become columns.
df <- separate_wider_regex(
  df,
  cols = file,
  patterns = c(type = ".*", "_", user = ".*", "-", overall = ".*")
)
df
# A tibble: 48 × 5
duration type user overall obs
<dbl> <chr> <chr> <chr> <int>
1 25.2 modified nologin func 1
2 0.400 modified nologin func 2
3 8.52 modified nologin func 3
4 0.344 modified nologin func 4
5 0.782 modified nologin func 5
6 0.108 modified nologin func 6
7 0.434 modified nologin func 7
8 0.108 modified nologin func 8
9 26.7 modified nologin overall 1
10 8.90 modified nologin overall 2
# ℹ 38 more rows
- Convert character columns to factors.
# Replace every character column of `df` with its factor equivalent.
df <- mutate(df, across(where(is.character), \(column) as_factor(column)))
Plotting
- Plot duration by query number, facetted by user and overall.
# Duration against query number, one coloured line per `type`, drawn in a
# panel grid with `user` on the rows and `overall` on the columns.
ggplot(df, aes(obs, duration, colour = type)) +
  geom_line() +
  facet_grid(rows = vars(user), cols = vars(overall)) +
  scale_colour_viridis_d(begin = 0.2, end = 0.8) +
  labs(x = "Query number", y = "Duration", title = "") +
  theme_minimal(base_size = 12)
- Plot duration by query number, facetted by
overall, with linetype denoting user type (login vs no login).
# Duration against query number with lines and points: colour encodes `type`,
# line type encodes `user`, and each level of `overall` gets its own row
# panel. The x axis is labelled at every query number from 1 to 8.
ggplot(df, aes(obs, duration, linetype = user, colour = type)) +
  geom_line() +
  geom_point() +
  facet_grid(rows = vars(overall)) +
  scale_x_continuous(breaks = seq_len(8)) +
  scale_colour_viridis_d(begin = 0.2, end = 0.8) +
  labs(
    title = "Initial and subsequent queries",
    x = "Query number",
    y = "Duration"
  ) +
  theme_minimal(base_size = 12)
- Plot duration by query number, same as above but with exponential smoothing.
# Same layout as the line/point plot, but instead of the raw series each
# group shows a fitted curve: a linear model of duration on exp(-obs), drawn
# without a confidence band.
ggplot(df, aes(obs, duration, linetype = user, colour = type)) +
  geom_smooth(method = "lm", formula = y ~ exp(-x), se = FALSE) +
  facet_grid(rows = vars(overall)) +
  scale_x_continuous(breaks = seq_len(8)) +
  scale_colour_viridis_d(begin = 0.2, end = 0.8) +
  labs(
    title = "Initial and subsequent queries",
    x = "Query number",
    y = "Duration"
  ) +
  theme_minimal(base_size = 12)