Safety <- haven::read_sav(file = "Data/DrivingSimData/safety_data.sav")
Online <- haven::read_sav(file = "Data/DrivingSimData/online_lesson_data.sav")
Reading and renaming
Report <-
haven::read_sav(file = "Data/DrivingSimData/report_data.sav") %>%
select(-FirstName, -LastName, -EMail ) %>%
rename(courses_taken = CoursesTaken,
fuel_used = FuelUsed,
fuel_wasted = FuelWasted,
score = TaskScore) %>%
mutate(Part = as_factor(StudentID),
Course = as_factor(CourseID),
Gender = if_else(Gender == 0, "Female", "Male"),
Learning_style = as_factor(LearningStyle),
feedback_level = 1 - mascutils::rescale_unit(LearningLevel),
Task_group = as_factor(TaskGroup),
Task = as_factor(TaskID))
Report %>% sample_n(8)
| StudentID | BirthDate | Gender | Language | OverallScore | LearningStyle | courses_taken | CourseID | CourseScore | DateTime | CourseModID | fuel_used | fuel_wasted | ModuleID | TaskGroup | TaskID | NrRight | NrTotal | score | LearningLevel | TaskReason | SecondTestValid | Part | Course | Learning_style | feedback_level | Task_group | Task |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2003-01-01 | Female | NL | 7.550541 | 1 | 2 | L0-32 | 7.6 | 2020-12-29 11:47:06 | 2 | 0.24 | 0.165 | 4 | 4 | 200 | NA | NA | NA | 0 | NA | 1 | 1 | L0-32 | Beslisser | 1.0000000 | Snelwegen | Oefening: een rijstrook naar rechts |
| 15 | 2003-01-01 | Female | NL | 7.435118 | 1 | 3 | L0-32 | 7.6 | 2021-01-02 12:13:03 | 2 | 0.24 | 0.019 | 2 | 2 | 167 | 2 | 2 | 10 | 2 | NA | 1 | 15 | L0-32 | Beslisser | 0.3333333 | Kruisingen (basis) | Linksaf slaan op voorrangskruispunt (met voorrang) |
| 8 | 2003-01-01 | Female | NL | 8.160268 | 1 | 2 | L0-32 | 7.0 | 2020-12-23 12:45:31 | 2 | 0.18 | 0.017 | 0 | 1 | 71 | NA | NA | NA | 0 | NA | 1 | 8 | L0-32 | Beslisser | 1.0000000 | Voertuigbeheersing | Het nemen van een flauwe bocht |
| 3 | 2003-01-01 | Female | NL | 7.416213 | 1 | 3 | L0-32 | 9.1 | 2021-01-09 11:53:56 | 2 | 0.19 | 0.047 | 0 | 1 | 62 | 8 | 8 | 10 | 3 | NA | 0 | 3 | L0-32 | Beslisser | 0.0000000 | Voertuigbeheersing | Positie binnen de rijbaan |
| 8 | 2003-01-01 | Female | NL | 8.160268 | 1 | 3 | L0-32 | 7.6 | 2021-01-02 14:51:46 | 2 | 0.19 | 0.009 | 2 | 2 | 173 | 1 | 1 | 10 | 3 | NA | 1 | 8 | L0-32 | Beslisser | 0.0000000 | Kruisingen (basis) | Linksaf slaan op kruispunt met verkeerslichten |
| 4 | 2003-01-01 | Male | NL | 9.261662 | 1 | NA | L0-32 | 8.9 | 2020-12-22 12:50:20 | 2 | 0.24 | 0.037 | 2 | 2 | 171 | NA | NA | NA | 0 | NA | 0 | 4 | L0-32 | Beslisser | 1.0000000 | Kruisingen (basis) | Rechtsaf slaan op voorrangskruispunt (zonder voorrang) |
| 8 | 2003-01-01 | Female | NL | 8.160268 | 1 | 3 | L0-32 | 7.6 | 2021-01-02 14:51:46 | 2 | 0.19 | 0.009 | 6 | 0 | 76 | NA | NA | NA | 0 | NA | 1 | 8 | L0-32 | Beslisser | 1.0000000 | Geen | Het benaderen van een haakse bocht |
| 15 | 2003-01-01 | Female | NL | 7.435118 | 1 | 3 | L0-32 | 7.6 | 2021-01-02 12:13:03 | 2 | 0.24 | 0.019 | 1 | 1 | 109 | NA | NA | NA | 0 | NA | 1 | 15 | L0-32 | Beslisser | 1.0000000 | Voertuigbeheersing | Gebruik van het rempedaal |
A measures can be described as an encounter of entities, here it is the encounter of a participant and a driving task. In addition, these encounters happen in a sequence of length between two and four.
The data set already is already in long format and needs only minimal finishing. Still, there are some additional measures on participant level. The following simply extracts the participant-level table, i.e. one row per participant.
Part <-
Report %>%
mutate(age = lubridate::as_date(DateTime) - BirthDate) %>%
select(Course, Part, Gender, Learning_style) %>%
distinct()
Part %>% sample_n(8)
| Course | Part | Gender | Learning_style |
|---|---|---|---|
| L0-32 | 15 | Female | Beslisser |
| L0-32 | 16 | Male | Beslisser |
| L0-32 | 4 | Male | Beslisser |
| L0-32 | 8 | Female | Beslisser |
| L0-32 | 13 | Female | Doener |
| L0-32 | 5 | Male | Beslisser |
| L0-32 | 2 | Female | Doener |
| L0-32 | 6 | Female | Beslisser |
Part %>%
mutate() %>%
ggplot(aes(x = Learning_style, fill = Gender)) +
geom_bar()
After some polishing (and adding an age variable), we join the participant table with the measures.
PreTrain <-
Report %>%
select(Part, courses_taken, feedback_level, Task_group, Task, score, fuel_wasted) %>%
right_join(Part, by = "Part") %>%
print()
## # A tibble: 1,755 x 10
## Part courses_taken feedback_level Task_group Task score fuel_wasted Course
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <dbl> <fct>
## 1 1 1 1 Voertuigbe~ Het ~ NA 0.079 L0-32
## 2 1 1 0.667 Voertuigbe~ Het ~ 10 0.079 L0-32
## 3 1 1 0.667 Voertuigbe~ Het ~ 6.3 0.079 L0-32
## 4 1 1 0 Voertuigbe~ Posi~ 10 0.079 L0-32
## 5 1 1 0.667 Voertuigbe~ Wegr~ 5.4 0.079 L0-32
## 6 1 1 1 Voertuigbe~ Door~ NA 0.079 L0-32
## 7 1 1 1 Voertuigbe~ Teru~ NA 0.079 L0-32
## 8 1 1 0.667 Voertuigbe~ Houd~ 10 0.079 L0-32
## 9 1 1 1 Voertuigbe~ Afst~ NA 0.079 L0-32
## 10 1 1 1 Voertuigbe~ Gebr~ NA 0.079 L0-32
## # ... with 1,745 more rows, and 2 more variables: Gender <chr>,
## # Learning_style <fct>
Because the encounter is Task by Part, we can do any analysis on both levels, in addition to the population level. We explore the learning curve over the sequence of courses.
PreTrain %>%
group_by(courses_taken) %>%
summarize(avg_score = mean(score, na.rm = T)) %>%
ungroup() %>%
ggplot(aes(x = courses_taken,
y = avg_score)) +
geom_point() +
geom_line()
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
PreTrain %>%
group_by(courses_taken) %>%
summarize(avg_fuel_wasted = mean(fuel_wasted, na.rm = T)) %>%
ungroup() %>%
ggplot(aes(x = courses_taken,
y = avg_fuel_wasted)) +
geom_point() +
geom_line() +
ylim(0, 0.2)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
The Gender effect is in the plot to create an example, despite the stereotype.
PreTrain %>%
group_by(Part, Gender, courses_taken) %>%
summarize(avg_score = mean(score, na.rm = T),
avg_fuel_wasted = mean(fuel_wasted, na.rm = T)) %>%
ungroup() %>%
ggplot(aes(x = courses_taken,
y = avg_score,
col = Gender,
group = Part)) +
geom_point() +
geom_line()
## `summarise()` has grouped output by 'Part', 'Gender'. You can override using the `.groups` argument.
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 row(s) containing missing values (geom_path).
PreTrain %>%
group_by(Part, Gender, courses_taken) %>%
summarize(avg_score = mean(score, na.rm = T)) %>%
ungroup() %>%
ggplot(aes(x = courses_taken,
y = avg_score,
col = Gender,
group = Part)) +
geom_point() +
geom_line()
## `summarise()` has grouped output by 'Part', 'Gender'. You can override using the `.groups` argument.
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 row(s) containing missing values (geom_path).
PreTrain %>%
group_by(Task_group, Task, courses_taken) %>%
summarize(avg_score = mean(score, na.rm = T)) %>%
ungroup() %>%
ggplot(aes(x = courses_taken,
y = avg_score,
color = Task_group,
group = Task)) +
geom_point() +
geom_line()
## `summarise()` has grouped output by 'Task_group', 'Task'. You can override using the `.groups` argument.
## Warning: Removed 150 rows containing missing values (geom_point).
## Warning: Removed 150 row(s) containing missing values (geom_path).
Some task groups are under-represented and there is no sharp difference between the groups. Finding subtle differences is useless case. We regard all tasks as members of a population.