Libraries Loading
#clean the environment
rm(list = ls())
#upload libraries
library(readr)
library(stringr)
library(plyr)
library(dplyr)
library(tidyverse)
library(kableExtra)
library(lubridate)
library(plotly)
library(shiny)

The packages required for data processing are loaded.
#clean the environment
rm(list = ls())
#upload libraries
library(readr)
library(stringr)
library(plyr)
library(dplyr)
library(tidyverse)
library(kableExtra)
library(lubridate)
library(plotly)
library(shiny)

The raw data matrix is loaded.
# Read the raw application export. `day` is parsed as a date-time from a
# year-month-day string and `hour` as a time of day; the remaining columns
# are left to readr's type guessing.
df <- read_csv(
  file = "../data/data_appli_avril24.csv",
  col_types = cols(
    day = col_datetime(format = "%Y-%m-%d"),
    hour = col_time(format = "%H:%M:%S")
  )
)

The column related to activity type is recoded for simplicity.
# Collapse the verbose questionnaire labels into short activity codes.
# Missing answers become "INCONNUE"; a label that matches none of the four
# known answers falls through every branch and is returned as NA.
df <- mutate(
  df,
  activity_type = case_when(
    is.na(activity_type) ~ "INCONNUE",
    activity_type == "A - Je lis avec ou pour mon enfant." ~ "PARTAGE",
    activity_type == "B - Mon enfant lit seul pour son plaisir." ~ "INDEPENDANT",
    activity_type == "C - Mon enfant lit pour ses devoirs." ~ "DEVOIRS",
    activity_type == "D - Mon enfant fait des jeux pédagogiques sur écran." ~ "JEUX"
  )
)

The raw data contains 3533 data points from 178 unique users. We aim to identify the users who regularly used the application.
# Identify regular users: count the ISO weeks in which each user was active
# and find runs of consecutive active weeks.
#
# `weekly_presence` holds one row per user and active ISO week.
#   week_seq: continuous week counter (Monday-based weeks since 1970-01-01),
#             used to detect consecutive weeks, including across ISO years.
#   week_id : year * 100 + week, kept as a readable label only. It jumps at
#             every new year (e.g. 202352 -> 202401), so it must not be used
#             to test whether two weeks follow each other.
# NOTE(review): runs of weeks spanning a new year are now counted as
# consecutive; re-knit the report so the counts quoted in the text are
# refreshed.
weekly_presence <- df %>%
  mutate(
    date = as.Date(day),
    year = isoyear(day), # ISO year
    week = isoweek(day), # ISO week
    # Monday of the ISO week, expressed as a whole number of weeks
    week_seq = as.integer(
      floor_date(date, unit = "week", week_start = 1)
    ) %/% 7L
  ) %>%
  distinct(uid, year, week, week_seq) %>% # keep one row per user/week
  mutate(week_id = year * 100 + week)

# Number of distinct active weeks per user (weekly_presence is already
# de-duplicated, so a plain count is enough).
pp_weeks <- weekly_presence %>%
  count(uid, name = "nb_weeks")

# Users active during at least 3 (resp. 6) weeks, consecutive or not.
pp_3weeks <- pp_weeks %>% filter(nb_weeks >= 3)
pp_6weeks <- pp_weeks %>% filter(nb_weeks >= 6)

# Identify runs of successive weeks: within a user, sorted weeks that follow
# each other share the same value of `week_seq - row_number()`.
result <- weekly_presence %>%
  arrange(uid, week_seq) %>%
  group_by(uid) %>%
  mutate(diff = week_seq - row_number()) %>%
  group_by(uid, diff) %>%
  summarise(
    n_weeks = n(), # length of the run
    .groups = "drop"
  )

# Users with at least one run of 3 (resp. 6) successive weeks.
pp_3weeks_successive <- result %>% filter(n_weeks >= 3) %>% distinct(uid)
pp_6weeks_successive <- result %>% filter(n_weeks >= 6) %>% distinct(uid)
#-----------------------------------------------
# Keep only participants who used the application for at least three
# consecutive weeks.
df2 <- filter(df, uid %in% pull(pp_3weeks_successive, uid))
# Number of rows dropped by this filter.
nrow(df) - nrow(df2)
## [1] 266
# One row per distinct (user, grade) pair among the retained participants.
info.by.grade <- distinct(df2, uid, grade)

We found that:
We decided to keep only participants who used the application for at least three consecutive weeks. Consequently, we have 22 first-grade children, 29 second-grade children, 30 third-grade children, 15 fourth-grade children, and 18 fifth-grade children.
# Attach the ISO year and ISO week number to every retained activity.
df2 <- mutate(df2, year = isoyear(day), week = isoweek(day))
# Drop activities that lasted less than one minute.
df3 <- filter(df2, duration_min >= 1)
# Percentage of rows removed by the duration filter.
(nrow(df2) - nrow(df3)) / nrow(df2) * 100
## [1] 1.285583
#1.29%
# Replace outliers: a duration above 60 minutes is replaced by that user's
# mean computed over their durations of at most 60 minutes.
df4 <- df3 %>%
  group_by(uid) %>%
  mutate(
    # Per-user mean of the valid durations (<= 60 min)
    user_mean = mean(duration_min[duration_min <= 60], na.rm = TRUE),
    # Flag the rows whose duration gets replaced
    corrected = duration_min > 60,
    # Flagged rows take the user mean, the others keep their own duration
    duration_corrected = replace(duration_min, corrected, user_mean[corrected])
  ) %>%
  ungroup()
# Percentage of activities whose duration was replaced.
sum(df4$corrected) / nrow(df4) * 100
## [1] 5.178295
# Plot the distribution of the corrected durations.
df4 %>%
  ggplot(aes(x = duration_corrected)) +
  geom_histogram(fill = "lightblue") +
  labs(
    title = "Distribution des durées",
    x = "Durée (en min)",
    y = "Fréquence"
  )

We decided to exclude all activities lasting less than 1 minute, as they correspond to timing errors (1.29%). We also considered that all activities lasting more than 60 minutes corresponded to timing omissions. Therefore, we replaced these durations with the user’s average duration (5.18%).
# Per-user minimum, maximum and mean of the corrected durations,
# sorted by user id.
sum.by.user <- df4 %>%
  group_by(uid) %>%
  summarize(
    min = min(duration_corrected),
    max = max(duration_corrected),
    mean = mean(duration_corrected)
  ) %>%
  arrange(uid)
# Long format: one row per (user, statistic) pair.
sum.by.user.long <- pivot_longer(
  sum.by.user,
  cols = c(min, max, mean),
  names_to = "type_valeur",
  values_to = "valeur"
)
# Dodged bars, one group per user; the `text` aesthetic is only there to feed
# the plotly tooltip requested below.
p <- ggplot(
  sum.by.user.long,
  aes(
    x = uid, y = valeur, fill = type_valeur,
    text = paste(
      "Utilisateur:", uid,
      "<br>Type:", type_valeur,
      "<br>Valeur:", round(valeur, 2)
    )
  )
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(
    title = "Valeurs minimum, maximum et moyenne des activités par utilisateur",
    x = "Utilisateur",
    y = "Durée de l'activité (en min)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, size = 6),
    legend.position = "top"
  ) +
  scale_fill_manual(
    values = c("min" = "skyblue", "mean" = "orange", "max" = "tomato")
  )
# Convert to an interactive plot.
ggplotly(p, tooltip = "text")

The graph above shows the minimum, average, and maximum durations of activities per user. For all users, the minimum duration is 1 minute, the average duration is 18 minutes, and the maximum duration is 59 minutes.
# Number of activities per user and ISO week. The ISO year is part of the key
# so that the same week number from two different years is never merged into
# a single "week" (same user/year/week key as the weekly-presence step).
act.per.week <- df4 %>%
  group_by(uid, year, week) %>%
  summarise(nb_act = n(), .groups = "drop")
# Per-user minimum, maximum and mean of the weekly activity counts. Only weeks
# with at least one recorded activity are present, so the minimum is never 0.
freq.per.user <- act.per.week %>%
  group_by(uid) %>%
  summarise(
    min = min(nb_act),
    max = max(nb_act),
    mean = mean(nb_act),
    .groups = "drop"
  )
# Long format: one row per (user, statistic) pair.
freq.per.user.long <- freq.per.user %>%
  pivot_longer(
    cols = c(min, max, mean),
    names_to = "type_valeur",
    values_to = "valeur"
  )
# Dodged bars, one group per user; the `text` aesthetic is only there to feed
# the plotly tooltip requested below.
p <- ggplot(
  freq.per.user.long,
  aes(
    x = uid, y = valeur, fill = type_valeur,
    text = paste(
      "Utilisateur:", uid,
      "<br>Type:", type_valeur,
      "<br>Valeur:", round(valeur, 2)
    )
  )
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(
    title = "Fréquence minimum, maximum et moyenne des activités hebdomadaire par utilisateur",
    x = "Utilisateur",
    y = "Fréquence de l'activité"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, size = 6),
    legend.position = "top"
  ) +
  scale_fill_manual(
    values = c("min" = "skyblue", "mean" = "orange", "max" = "tomato")
  )
# Convert to an interactive plot.
ggplotly(p, tooltip = "text")

The graph above shows the minimum, average, and maximum number of weekly activities per participant. Across all users, the minimum number of activities per week is 1, the average is 4, and the maximum is 22.