library(tidyverse)
Moodle search durations
Import data
- Get list of `csv` files in `data` directory.
# Directory that holds the exported log files.
datadir <- "data"

# Full paths of the CSV files in `datadir`. The pattern is a regular
# expression, so it is anchored to match only names ending in ".csv"
# rather than any name that merely contains "csv".
csv_files <- datadir |>
  list.files(pattern = "\\.csv$", full.names = TRUE)
csv_files
[1] "data/logs_modified_nologin-func.csv"
[2] "data/logs_modified_nologin-overall.csv"
[3] "data/logs_modified_user-func.csv"
[4] "data/logs_modified_user-overall.csv"
[5] "data/logs_orig_nologin-func.csv"
[6] "data/logs_orig_nologin-overall.csv"
[7] "data/logs_orig_user-func.csv"
[8] "data/logs_orig_user-overall.csv"
- Import each file and add a column with the file name (without file extension).
# Read one headerless CSV file and tag every row with its origin.
#
# x: path to a CSV file that has no header row.
# Returns the tibble produced by read_csv() with two extra columns:
#   file - base name of `x` with the trailing ".csv" removed
#   obs  - row number within that file
import_file <- function(x) {
  read_csv(x, col_names = FALSE) |>
    mutate(
      file = gsub(pattern = "\\.csv$", "", basename(x)),
      obs = row_number()
    )
}
# Import every file, stack the per-file tibbles into one, and rename the
# auto-named column X1 (the files have no header) to `duration`.
df <- csv_files |>
  map(import_file) |>
  purrr::list_rbind() |>
  rename("duration" = "X1")
df
# A tibble: 48 × 3
duration file obs
<dbl> <chr> <int>
1 25.2 logs_modified_nologin-func 1
2 0.400 logs_modified_nologin-func 2
3 8.52 logs_modified_nologin-func 3
4 0.344 logs_modified_nologin-func 4
5 0.782 logs_modified_nologin-func 5
6 0.108 logs_modified_nologin-func 6
7 0.434 logs_modified_nologin-func 7
8 0.108 logs_modified_nologin-func 8
9 26.7 logs_modified_nologin-overall 1
10 8.90 logs_modified_nologin-overall 2
# ℹ 38 more rows
Data wrangling
- Rename columns.
Next remove the `logs_` prefix from the file name.
# Drop the first "logs_" occurrence from each file name.
df <- df |>
  mutate(file = str_remove(file, "logs_"))
- Separate the file name into 3 columns.
# Split `file` into `type`, `user` and `overall`; the unnamed "_" and "-"
# patterns are matched as separators and dropped from the result.
df <- df |>
  separate_wider_regex(
    file,
    c(type = ".*", "_", user = ".*", "-", overall = ".*")
  )
df
# A tibble: 48 × 5
duration type user overall obs
<dbl> <chr> <chr> <chr> <int>
1 25.2 modified nologin func 1
2 0.400 modified nologin func 2
3 8.52 modified nologin func 3
4 0.344 modified nologin func 4
5 0.782 modified nologin func 5
6 0.108 modified nologin func 6
7 0.434 modified nologin func 7
8 0.108 modified nologin func 8
9 26.7 modified nologin overall 1
10 8.90 modified nologin overall 2
# ℹ 38 more rows
- Convert character columns to factors.
# Turn every character column into a factor.
df <- df |>
  mutate(across(where(is.character), as_factor))
Plotting
- Plot duration by query number, facetted by user and overall.
# Line plot of duration against query number, coloured by `type`, with
# one panel per `user` (rows) and `overall` (columns) combination.
df |>
  ggplot(aes(x = obs, y = duration, color = type)) +
  geom_line() +
  facet_grid(user ~ overall) +
  scale_color_viridis_d(begin = 0.2, end = 0.8) +
  theme_minimal(base_size = 12) +
  labs(
    x = "Query number",
    y = "Duration",
    title = ""
  )
- Plot duration by query number, facetted by `overall`, with linetype denoting `user` type (`login` vs `no login`).
# Lines and points of duration against query number: colour encodes
# `type`, linetype encodes `user`, and each `overall` level gets its own
# row of panels. The x axis is labelled at every integer from 1 to 8.
df |>
  ggplot(aes(
    x = obs, y = duration,
    linetype = user,
    color = type
  )) +
  geom_line() +
  geom_point() +
  facet_grid(overall ~ .) +
  scale_x_continuous(breaks = seq(1, 8)) +
  scale_color_viridis_d(begin = 0.2, end = 0.8) +
  theme_minimal(base_size = 12) +
  labs(
    x = "Query number",
    y = "Duration",
    title = "Initial and subsequent queries"
  )
- Plot duration by query number, same as above but with exponential smoothing.
# Same aesthetics and facets as the line/point plot, but the raw data are
# replaced by a fitted curve: a linear model of duration on exp(-x),
# drawn without a confidence band (se = FALSE).
df |>
  ggplot(aes(
    x = obs, y = duration,
    linetype = user,
    color = type
  )) +
  geom_smooth(method = "lm", formula = (y ~ exp(-x)), se = FALSE) +
  facet_grid(overall ~ .) +
  scale_x_continuous(breaks = seq(1, 8)) +
  scale_color_viridis_d(begin = 0.2, end = 0.8) +
  theme_minimal(base_size = 12) +
  labs(
    x = "Query number",
    y = "Duration",
    title = "Initial and subsequent queries"
  )