Code
library(readxl)
library(tidyverse)
library(forcats)
library(lme4)
library(broom.mixed)
library(glue)
set.seed(123)Kort oppsummering: Vi leser inn data, gjør EDA på eksponering og engasjement, og estimerer en lineær mixed modell med kryssede tilfeldige intercept for student og item. Vi rangerer items på shrinkage-justert skår og visualiserer resultatene. (En ordinalsmodul er skissert som alternativ.)
library(readxl)
library(tidyverse)
library(forcats)
library(lme4)
library(broom.mixed)
library(glue)
set.seed(123)# Filstier (endre ved behov)
path_feedback <- "2025-10-30_feedback.xlsx"
path_sjanger <- "sjanger.csv"
# Enkel sjekk slik at rapporten feiler tydelig om filer mangler
stopifnot(file.exists(path_feedback))
stopifnot(file.exists(path_sjanger))feedback <- read_excel(path_feedback)
sjanger <- read.csv(path_sjanger, sep = ";", check.names = FALSE)
# harmoniser kolonnenavn brukt videre
sjanger$item <- sjanger$Tekstnavn
# Rå dimensjoner og unike verdier per kolonne
list(
feedback_dim = dim(feedback),
unique_counts = sapply(feedback, \(x) length(unique(x)))
)$feedback_dim
[1] 26945 8
$unique_counts
feedback_id sanity_text_id sanity_question_key student_answer
26945 159 156 6
student_id scale serialNumber type
2372 2 159 2
mydata <- feedback %>%
mutate(
item = serialNumber,
engagement = suppressWarnings(as.numeric(student_answer))
) %>%
filter(!is.na(item))
# Hurtigsjekk: eksponering/score pr item
exposed_items <- mydata %>%
group_by(item) %>%
summarise(score = mean(engagement, na.rm = TRUE),
exp = dplyr::n(), .groups = "drop")
# Oppgaver per student
items_perstudent <- mydata %>%
group_by(student_id) %>%
summarise(score = mean(engagement, na.rm = TRUE),
exp = dplyr::n(), .groups = "drop")# Fordeling av antall eksponeringer per item
exposed_items %>% ggplot(aes(x = exp)) +
geom_histogram(bins = 30) +
labs(x = "Eksponeringer per item", y = "Antall items",
title = "Ujevn eksponering på tvers av items") +
theme_minimal(base_size = 12)# Tabell: hvor mange oppgaver fikk studentene?
table(items_perstudent$exp)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
145 179 258 290 278 245 198 166 127 99 94 60 44 41 32 13 17 15 9 9
21 22 23 24 25 26 27 28 29 30 31 32 33 34 36 38 40 41 52 53
5 2 3 6 6 7 1 1 2 3 1 4 2 1 1 2 1 1 1 1
158 312
1 1
We explicitly show the setdiff() results so they appear in the knitted report, and we compute a few quick diagnostics to make the merges transparent.
# Hvilke items mangler i hver fil?
missing_in_sjanger <- setdiff(unique(mydata$item), unique(sjanger$item))
missing_in_feedback <- setdiff(unique(sjanger$item), unique(mydata$item))
cat(glue::glue("Items i feedback men mangler i sjanger: {length(missing_in_sjanger)}
"))Items i feedback men mangler i sjanger: 14
if (length(missing_in_sjanger) > 0) {
print(tibble(item = missing_in_sjanger) |> slice_head(n = 20))
}# A tibble: 14 × 1
item
<chr>
1 HT_147
2 HT_165_Festivaler_juli
3 HT_160_Blekksputer
4 HT_137
5 HT_145
6 HT_158_Skikk-og-Bruk_1
7 HT_56_Sjokolademaskinen
8 HT_138
9 HT_161_TekniskMuseum
10 HT_146
11 HT_150
12 HT_168_Festivaler_mars
13 HT_163_Skikk og bruk 2
14 HT_14
cat(glue::glue("
Items i sjanger men mangler i feedback: {length(missing_in_feedback)}
"))Items i sjanger men mangler i feedback: 14
if (length(missing_in_feedback) > 0) {
print(tibble(item = missing_in_feedback) |> slice_head(n = 20))
}# A tibble: 14 × 1
item
<chr>
1 HT_14_Vulkaner
2 HT_56_Sjokolademaskin
3 HT_137_Biden
4 HT_138_Trump
5 HT_145_Kostsirkel
6 HT_146_Elementærpartikler
7 HT_147_Tregrense
8 HT_150_Oddetall
9 HT_158_Skikk-og-bruk
10 HT_160_Blekkspruter
11 HT_161_Tekniskmuseum
12 HT_163_Skikk-og-bruk2
13 HT_165_Festivaler-juli
14 HT_168_Festivaler-mars
# Sammendrag antall items per sjanger (vises i output)
sjanger |> count(Genre, name = "n_items") |> arrange(desc(n_items)) Genre n_items
1 Non-fiction 84
2 Narrative non-fiction 46
3 Fiction 28
# Merk: right_join() beholder rader som finnes i 'sjanger' selv om de mangler i 'mydata'.
# Hvis du ønsker KUN overlapp ("det de har felles"), bruk inner_join().
# Vi lager begge og viser hvor mange som faller utenfor.
all_right <- right_join(mydata, sjanger, by = "item")
all_inner <- inner_join(mydata, sjanger, by = "item")
cat(glue::glue("
Antall unike items (right_join): {dplyr::n_distinct(all_right$item)}
"))Antall unike items (right_join): 158
cat(glue::glue("Antall unike items (inner_join): {dplyr::n_distinct(all_inner$item)}
"))Antall unike items (inner_join): 144
# Hvor mange rader i right_join mangler Genre eller mangler felter fra mydata?
rows_missing_genre <- all_right |> filter(is.na(Genre)) |> nrow()
rows_missing_eng <- all_right |> filter(is.na(student_answer)) |> nrow()
cat(glue::glue("Rader i right_join uten Genre: {rows_missing_genre}
"))Rader i right_join uten Genre: 0
cat(glue::glue("Rader i right_join med manglende svar: {rows_missing_eng}
"))Rader i right_join med manglende svar: 470
# Velg hvilken tabell du vil bruke videre:
all <- all_inner # <— strengt overlapp, mest ryddig for analyse
# all <- all_right # <— hvis du VIL beholde sjanger-rader uten feedbackexposed_items <- all %>%
group_by(item, Genre) %>%
summarise(eng = mean(engagement, na.rm = TRUE),
exp = dplyr::n(), .groups = "drop")
# Mean per sjanger
exposed_items %>%
group_by(Genre) %>%
summarise(n = dplyr::n(), score = mean(eng, na.rm = TRUE)) %>%
arrange(desc(score))# A tibble: 3 × 3
Genre n score
<chr> <int> <dbl>
1 Non-fiction 77 3.29
2 Fiction 26 3.22
3 Narrative non-fiction 41 3.18
# Hjelpefunksjon: velg hver k'te label gitt levels-rekkefølge
every_k <- function(x, k = 5, start = 1) {
stopifnot(is.factor(x) || is.character(x))
if (is.character(x)) x <- factor(x, levels = x)
lvls <- levels(x)
idx <- seq(start, length(lvls), by = k)
lvls[idx]
}
# Sortér etter naive engasjement
df <- exposed_items %>%
arrange(eng) %>%
mutate(item = factor(item, levels = item), idx = row_number())
lab_every5 <- every_k(df$item, k = 5, start = 1)
ggplot(df, aes(x = item, y = eng, fill = exp)) +
geom_col() +
scale_fill_gradient(name = "exp", low = "grey85", high = "steelblue") +
scale_x_discrete(breaks = lab_every5) +
labs(x = "Item (sortert på gj.snitt ↑)", y = "Gj.snittlig engasjement") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))Linear mixed model fit by REML ['lmerMod']
Formula: answer_num ~ 1 + (1 | student_id) + (1 | item)
Data: dat
REML criterion at convergence: 45398
Scaled residuals:
Min 1Q Median 3Q Max
-3.9539 -0.5907 0.0467 0.6492 3.6809
Random effects:
Groups Name Variance Std.Dev.
student_id (Intercept) 0.4016 0.6337
item (Intercept) 0.1408 0.3753
Residual 1.0488 1.0241
Number of obs: 14707, groups: student_id, 2355; item, 144
Fixed effects:
Estimate Std. Error t value
(Intercept) 3.29117 0.03535 93.1
# A tibble: 6 × 7
item re se_re adj_score lo hi Genre
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 HT_116_Forskeporske1 -1.20 0.101 2.09 1.89 2.29 Narrative non-fiction
2 HT_26_Gresstrå -0.979 0.0886 2.31 2.14 2.49 Non-fiction
3 HT_118_Forskeporske3 -0.712 0.122 2.58 2.34 2.82 Narrative non-fiction
4 HT_151_Burton -0.696 0.134 2.59 2.33 2.86 Narrative non-fiction
5 HT_149_Antibiotika -0.676 0.118 2.61 2.38 2.85 Non-fiction
6 HT_102_v2_Solenergi -0.589 0.131 2.70 2.45 2.96 Non-fiction
top_items <- item_scores %>% slice_max(adj_score, n = 15, with_ties = FALSE)
bottom_items <- item_scores %>% slice_min(adj_score, n = 15, with_ties = FALSE)
top_items# A tibble: 15 × 7
item re se_re adj_score lo hi Genre
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 HT_52_v2_Minecraft 0.818 0.0929 4.11 3.93 4.29 Fiction
2 HT_29_Jaktkatt 0.676 0.0944 3.97 3.78 4.15 Non-fiction
3 HT_40_Hund 0.658 0.0911 3.95 3.77 4.13 Narrative non-f…
4 HT_45_Kattespråk 0.625 0.0934 3.92 3.73 4.10 Narrative non-f…
5 HT_139_Haier 0.617 0.0908 3.91 3.73 4.09 Non-fiction
6 HT_52_Minecraft 0.611 0.0928 3.90 3.72 4.08 Fiction
7 HT_88_Tiger 0.594 0.101 3.88 3.69 4.08 Non-fiction
8 HT_86_Ulv 0.573 0.0946 3.86 3.68 4.05 Non-fiction
9 HT_127_Eliteserien 0.564 0.0873 3.86 3.68 4.03 Non-fiction
10 HT_28_Ulvhund 0.549 0.101 3.84 3.64 4.04 Non-fiction
11 HT_30_Delfinkommunikasjon 0.519 0.100 3.81 3.61 4.01 Non-fiction
12 HT_82_Gepard 0.516 0.0892 3.81 3.63 3.98 Non-fiction
13 HT_74_Elg 0.507 0.102 3.80 3.60 4.00 Non-fiction
14 HT_73_Gaupe 0.489 0.101 3.78 3.58 3.98 Non-fiction
15 HT_114_Ku 0.468 0.102 3.76 3.56 3.96 Fiction
bottom_items# A tibble: 15 × 7
item re se_re adj_score lo hi Genre
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 HT_116_Forskeporske1 -1.20 0.101 2.09 1.89 2.29 Narrative non-fict…
2 HT_26_Gresstrå -0.979 0.0886 2.31 2.14 2.49 Non-fiction
3 HT_118_Forskeporske3 -0.712 0.122 2.58 2.34 2.82 Narrative non-fict…
4 HT_151_Burton -0.696 0.134 2.59 2.33 2.86 Narrative non-fict…
5 HT_149_Antibiotika -0.676 0.118 2.61 2.38 2.85 Non-fiction
6 HT_102_v2_Solenergi -0.589 0.131 2.70 2.45 2.96 Non-fiction
7 HT_64_Rein -0.575 0.105 2.72 2.51 2.92 Narrative non-fict…
8 HT_11 Demokrati -0.513 0.116 2.78 2.55 3.00 Non-fiction
9 HT_31_Årstider -0.504 0.101 2.79 2.59 2.99 Non-fiction
10 HT_102_Solenergi -0.503 0.123 2.79 2.55 3.03 Non-fiction
11 HT_54_Chuchichäschtli -0.494 0.118 2.80 2.56 3.03 Narrative non-fict…
12 HT_01_Promp -0.476 0.113 2.81 2.59 3.04 Fiction
13 HT_51_Vannpumpe -0.458 0.121 2.83 2.59 3.07 Narrative non-fict…
14 HT_76_Beite -0.453 0.108 2.84 2.63 3.05 Non-fiction
15 HT_58_Fattig -0.449 0.0950 2.84 2.66 3.03 Fiction
lvl <- item_scores$item
lab_every5 <- every_k(lvl, k = 5, start = 1)
item_scores %>%
mutate(item_f = factor(item, levels = lvl)) %>%
ggplot(aes(x = item_f, y = adj_score)) +
geom_col(aes(fill = Genre), width = 0.85) +
geom_errorbar(aes(ymin = lo, ymax = hi), width = 0.2, alpha = 0.5) +
scale_x_discrete(breaks = lab_every5) +
labs(
x = "Item (justert; sortert etter modellestimat ↑)",
y = "Justert gj.snitt (student_answer)",
fill = "Genre"
) +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+ylab("Engagement")