Quality Check of Mobile Application Data.

Author

Brossette B.

1 Description of mobile app data.

The packages required for data processing are loaded.

Libraries Loading

# Remove every object from the global environment before the analysis starts.
# NOTE(review): presumably a no-op when the document is rendered in a fresh R
# session; in an interactive session it wipes the user's workspace -- confirm
# it is still wanted.
rm(list = ls())

# Attach the packages used by the analysis.
# plyr is attached before dplyr, so dplyr functions that share a name with a
# plyr function (mutate, summarise, count, ...) mask the plyr versions.
# Keep this order.
library(readr)
library(stringr)
library(plyr)
library(dplyr)
library(tidyverse)
library(kableExtra)
library(lubridate)
library(plotly)
library(shiny)

The raw data matrix is loaded.

Data Loading

# Read the raw application export.
# `day` is parsed as a date-time from a "YYYY-MM-DD" string and `hour` as a
# time of day; the types of all other columns are left to readr.
df <- read_csv(
  file = "../data/data_appli_avril24.csv",
  col_types = cols(
    day = col_datetime(format = "%Y-%m-%d"),
    hour = col_time(format = "%H:%M:%S")
  )
)

The column related to activity type is recoded for simplicity.

Data recoding

# Replace the long activity labels by short codes.
# Missing labels become "INCONNUE"; any other label that is not one of the
# four listed below is turned into NA.
df <- local({
  # Long label shown in the application -> short code.
  activity_codes <- c(
    "A - Je lis avec ou pour mon enfant." = "PARTAGE",
    "B - Mon enfant lit seul pour son plaisir." = "INDEPENDANT",
    "C - Mon enfant lit pour ses devoirs." = "DEVOIRS",
    "D - Mon enfant fait des jeux pédagogiques sur écran." = "JEUX"
  )

  df %>%
    mutate(
      activity_type = if_else(
        is.na(activity_type),
        "INCONNUE",
        unname(activity_codes[as.character(activity_type)])
      )
    )
})

The raw data contains 3533 data points from 178 unique users. We aim to identify the users who regularly used the application.

week-usage

# Weekly usage ----
# One row per user and ISO week in which at least one activity was recorded.
#
# `week_index` numbers Monday-based weeks continuously, so two successive
# weeks always differ by exactly 1. The former key `year * 100 + week` is kept
# for reference but must not be used to detect successive weeks: it jumps at
# every change of ISO year (e.g. from 202352 to 202401), which split runs
# spanning the New Year.
weekly_presence <- df %>%
  mutate(
    date = as.Date(day),
    year = isoyear(day),   # ISO year
    week = isoweek(day),   # ISO week
    # Day 0 of a Date (1970-01-01) is a Thursday; adding 3 days aligns the
    # integer division on Mondays, like ISO weeks.
    week_index = (as.integer(date) + 3L) %/% 7L
  ) %>%
  distinct(uid, year, week, week_index) %>%
  mutate(week_id = year * 100 + week)

# Number of distinct active weeks per user (weekly_presence already holds one
# row per user and week).
pp_weeks <- weekly_presence %>%
  count(uid, name = "nb_weeks")

# Users active during at least 3 / at least 6 weeks, successive or not.
pp_3weeks <- pp_weeks %>% filter(nb_weeks >= 3)
pp_6weeks <- pp_weeks %>% filter(nb_weeks >= 6)

# Identify runs of successive weeks.
# Within a user, once weeks are sorted, `week_index - row_number()` stays
# constant along a run of successive weeks and changes as soon as a week is
# skipped, so it identifies each run.
result <- weekly_presence %>%
  arrange(uid, week_index) %>%
  group_by(uid) %>%
  mutate(diff = week_index - row_number()) %>%
  group_by(uid, diff) %>%
  summarise(n_weeks = n(), .groups = "drop")

# Users having at least one run of 3 / 6 successive weeks.
pp_3weeks_successive <- result %>% filter(n_weeks >= 3) %>% distinct(uid)
pp_6weeks_successive <- result %>% filter(n_weeks >= 6) %>% distinct(uid)

#-----------------------------------------------
# Keep only participants who used the application for at least three
# consecutive weeks, and print the number of rows removed.
df2 <- df %>% filter(uid %in% pp_3weeks_successive$uid)
(nrow(df) - nrow(df2))
# NOTE(review): the value recorded with the former year * 100 + week key was
# 266; re-run to confirm the figure with the continuous week index.

# One row per user and grade among the retained participants.
info.by.grade <- df2 %>% distinct(uid, grade)

We found that:

133 participants used the application for at least three weeks.
118 participants used the application for at least three successive weeks.
69 participants used the application for at least six weeks.
46 participants used the application for at least six successive weeks.

We decided to keep only participants who used the application for at least three consecutive weeks. Consequently, we have 22 first-grade children, 29 second-grade children, 30 third-grade children, 15 fourth-grade children, and 18 fifth-grade children.

prepare-row-data

# Add the ISO year and ISO week of every activity.
df2 <- mutate(df2, year = isoyear(day), week = isoweek(day))

# Remove activities shorter than 1 minute and print the share removed (in %).
df3 <- filter(df2, duration_min >= 1)
(nrow(df2) - nrow(df3)) / nrow(df2) * 100
## [1] 1.285583
# 1.29%

# Replace outliers.
# Durations above 60 minutes are replaced by the mean of the same user's
# durations of at most 60 minutes; `corrected` flags the replaced rows.
df4 <- df3 %>%
  group_by(uid) %>%
  mutate(
    # Per-user mean of the valid durations (<= 60 min)
    user_mean = mean(duration_min[duration_min <= 60], na.rm = TRUE),
    # TRUE when the duration was replaced
    corrected = duration_min > 60,
    # Original duration when valid, per-user mean otherwise
    duration_corrected = ifelse(!corrected, duration_min, user_mean)
  ) %>%
  ungroup()

# Share of corrected durations (in %).
mean(df4$corrected) * 100
## [1] 5.178295

# Plot the distribution of the corrected durations.
df4 %>%
  ggplot(aes(x = duration_corrected)) +
  geom_histogram(fill = "lightblue") +
  labs(
    title = "Distribution des durées",
    x = "Durée (en min)",
    y = "Fréquence"
  )

We decided to exclude all activities lasting less than 1 minute, as they correspond to timing errors (1.29%). We also considered that all activities lasting more than 60 minutes corresponded to timing omissions. Therefore, we replaced these durations with the user’s average duration (5.18%).

activity-duration

# Minimum, maximum and mean of the corrected durations for each user,
# ordered by user id.
sum.by.user <- df4 %>%
  group_by(uid) %>%
  summarize(
    min = min(duration_corrected),
    max = max(duration_corrected),
    mean = mean(duration_corrected)
  ) %>%
  arrange(uid)

# Long format: one row per user and statistic, as needed for dodged bars.
sum.by.user.long <- pivot_longer(
  sum.by.user,
  cols = c(min, max, mean),
  names_to = "type_valeur",
  values_to = "valeur"
)

# Dodged bar chart; the `text` aesthetic holds the tooltip shown by plotly.
p <- ggplot(
  sum.by.user.long,
  aes(
    x = uid, y = valeur, fill = type_valeur,
    text = paste0(
      "Utilisateur: ", uid,
      " <br>Type: ", type_valeur,
      " <br>Valeur: ", round(valeur, 2)
    )
  )
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  scale_fill_manual(
    values = c("min" = "skyblue", "mean" = "orange", "max" = "tomato")
  ) +
  labs(
    title = "Valeurs minimum, maximum et moyenne des activités par utilisateur",
    x = "Utilisateur",
    y = "Durée de l'activité (en min)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, size = 6),
    legend.position = "top"
  )

# Convert to an interactive plot.
ggplotly(p, tooltip = "text")

The graph above shows the minimum, average, and maximum durations of activities per user. Across all users, the minimum duration is 1 minute, the average duration is 18 minutes, and the maximum duration is 59 minutes.

activity-frequency


# Number of activities per user and calendar week.
# The ISO year is part of the grouping key: an ISO week number alone does not
# identify a week, so grouping by `week` only would add up the activities of
# the same week number from two different years.
# Only weeks with at least one recorded activity appear here.
act.per.week <- df4 %>%
  group_by(uid, year, week) %>%
  summarise(nb_act = n(), .groups = "drop")

# Minimum, maximum and mean number of activities per active week, per user.
freq.per.user <- act.per.week %>%
  group_by(uid) %>%
  summarise(
    min = min(nb_act),
    max = max(nb_act),
    mean = mean(nb_act),
    .groups = "drop"
  )

# Long format: one row per user and statistic, as needed for dodged bars.
freq.per.user.long <- freq.per.user %>%
  pivot_longer(
    cols = c(min, max, mean),
    names_to = "type_valeur",
    values_to = "valeur"
  )

# Dodged bar chart; the `text` aesthetic holds the tooltip shown by plotly.
p <- ggplot(
  freq.per.user.long,
  aes(
    x = uid, y = valeur, fill = type_valeur,
    text = paste(
      "Utilisateur:", uid,
      "<br>Type:", type_valeur,
      "<br>Valeur:", round(valeur, 2)
    )
  )
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(
    title = "Fréquence minimum, maximum et moyenne des activités hebdomadaire par utilisateur",
    x = "Utilisateur",
    y = "Fréquence de l'activité"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, size = 6),
    legend.position = "top"
  ) +
  scale_fill_manual(
    values = c("min" = "skyblue", "mean" = "orange", "max" = "tomato")
  )

# Convert to an interactive plot.
ggplotly(p, tooltip = "text")

The graph above shows the minimum, average, and maximum number of weekly activities per participant, computed over the weeks in which the participant recorded at least one activity. Across all users, the minimum number of activities per week is 1, the average is 4, and the maximum is 22.

---
title: "Quality Check of Mobile Application Data."
author: "Brossette B."
editor: visual
toc: true
number-sections: true
theme: cosmo
highlight-style: github
format:
  html:
    code-fold: true
    code-tools: true
knitr:
  opts_chunk:
    collapse: true
---

## Description of mobile app data.

The packages required for data processing are loaded.

```{r}
#| label: upload-libraries
#| code-summary: "Libraries Loading"
#| message: false

# Clean the environment.
# NOTE(review): presumably a no-op when rendering in a fresh R session.
rm(list = ls())

# Load libraries (plyr before dplyr so that the dplyr verbs take precedence)
library(readr)
library(stringr)
library(plyr)
library(dplyr)
library(tidyverse)
library(kableExtra)
library(lubridate)
library(plotly)
library(shiny)
```

The raw data matrix is loaded.

```{r}
#| label: upload-data
#| code-summary: "Data Loading"
#| message: false

# `day` is parsed as a date-time and `hour` as a time of day.
df <- read_csv(
  "../data/data_appli_avril24.csv",
  col_types = cols(
    day = col_datetime(format = "%Y-%m-%d"),
    hour = col_time(format = "%H:%M:%S")
  )
)
```

The column related to activity type is recoded for simplicity.

```{r}
#| label: recoding
#| code-summary: "Data recoding"
#| message: false

# Missing labels become "INCONNUE"; labels not listed below become NA.
df <- df %>% mutate(activity_type = case_when(
  is.na(activity_type) ~ "INCONNUE",
  activity_type == "A - Je lis avec ou pour mon enfant." ~ "PARTAGE",
  activity_type == "B - Mon enfant lit seul pour son plaisir." ~ "INDEPENDANT",
  activity_type == "C - Mon enfant lit pour ses devoirs." ~ "DEVOIRS",
  activity_type == "D - Mon enfant fait des jeux pédagogiques sur écran." ~ "JEUX"
))
```

The raw data contains **`r length(df$uid)` data points** from **`r length(unique(df$uid))` unique users.** We aim to identify the users who regularly used the application.

```{r}
#| label: week-usage
#| code-summary: "week-usage"
#| message: false

# One row per user and ISO week in which at least one activity was recorded.
# `week_index` numbers Monday-based weeks continuously, so two successive
# weeks always differ by exactly 1. The former key `year * 100 + week` is kept
# for reference but jumps at every change of ISO year (e.g. from 202352 to
# 202401), which split runs of successive weeks spanning the New Year.
weekly_presence <- df %>%
  mutate(
    date = as.Date(day),
    year = isoyear(day),   # ISO year
    week = isoweek(day),   # ISO week
    # Day 0 of a Date (1970-01-01) is a Thursday; adding 3 days aligns the
    # integer division on Mondays, like ISO weeks.
    week_index = (as.integer(date) + 3L) %/% 7L
  ) %>%
  distinct(uid, year, week, week_index) %>%
  mutate(week_id = year * 100 + week)

# Number of distinct active weeks per user.
pp_weeks <- weekly_presence %>%
  count(uid, name = "nb_weeks")

# Users active during at least 3 / at least 6 weeks, successive or not.
pp_3weeks <- pp_weeks %>% filter(nb_weeks >= 3)
pp_6weeks <- pp_weeks %>% filter(nb_weeks >= 6)

# Identify runs of successive weeks: within a user, once weeks are sorted,
# `week_index - row_number()` is constant along a run of successive weeks.
result <- weekly_presence %>%
  arrange(uid, week_index) %>%
  group_by(uid) %>%
  mutate(diff = week_index - row_number()) %>%
  group_by(uid, diff) %>%
  summarise(n_weeks = n(), .groups = "drop")

pp_3weeks_successive <- result %>% filter(n_weeks >= 3) %>% distinct(uid)
pp_6weeks_successive <- result %>% filter(n_weeks >= 6) %>% distinct(uid)

#-----------------------------------------------
# Keep only participants who used the application for at least three
# consecutive weeks, and print the number of rows removed.
df2 <- df %>% filter(uid %in% pp_3weeks_successive$uid)
(nrow(df) - nrow(df2))

# One row per user and grade among the retained participants.
info.by.grade <- df2 %>% distinct(uid, grade)
```

We found that:

-   `r length(pp_3weeks$uid)` participants used the application for at least three weeks.
-   `r length(pp_3weeks_successive$uid)` participants used the application for at least three successive weeks.
-   `r length(pp_6weeks$uid)` participants used the application for at least six weeks.
-   `r length(pp_6weeks_successive$uid)` participants used the application for at least six successive weeks.

We decided to keep only participants who used the application for at least three consecutive weeks.
Consequently, we have `r sum(info.by.grade$grade == "CP", na.rm = TRUE)` first-grade children, `r sum(info.by.grade$grade == "CE1", na.rm = TRUE)` second-grade children, `r sum(info.by.grade$grade == "CE2", na.rm = TRUE)` third-grade children, `r sum(info.by.grade$grade == "CM1", na.rm = TRUE)` fourth-grade children, and `r sum(info.by.grade$grade == "CM2", na.rm = TRUE)` fifth-grade children.

```{r}
#| label: prepare-row-data
#| code-summary: "prepare-row-data"
#| message: false

# Add the ISO year and ISO week of every activity.
df2 <- df2 %>% mutate(year = isoyear(day), week = isoweek(day))

# Remove activities shorter than 1 minute and print the share removed (in %).
df3 <- df2 %>% filter(duration_min >= 1)
(nrow(df2) - nrow(df3)) / nrow(df2) * 100
# 1.29%

# Replace outliers.
# Durations above 60 minutes are replaced by the mean of the same user's
# durations of at most 60 minutes; `corrected` flags the replaced rows.
df4 <- df3 %>%
  group_by(uid) %>%
  mutate(
    # Per-user mean of the valid durations (<= 60 min)
    user_mean = mean(duration_min[duration_min <= 60], na.rm = TRUE),
    # TRUE when the duration was replaced
    corrected = duration_min > 60,
    # Per-user mean when corrected, original duration otherwise
    duration_corrected = ifelse(corrected, user_mean, duration_min)
  ) %>%
  ungroup()

# Share of corrected durations (in %).
mean(df4$corrected) * 100

# Plot the distribution of the corrected durations.
ggplot(df4, aes(x = duration_corrected)) +
  geom_histogram(fill = "lightblue") +
  labs(title = "Distribution des durées", x = "Durée (en min)", y = "Fréquence")
```

We decided to exclude all activities lasting less than 1 minute, as they correspond to timing errors (1.29%). We also considered that all activities lasting more than 60 minutes corresponded to timing omissions. Therefore, we replaced these durations with the user's average duration (5.18%).

```{r}
#| label: activity-duration
#| code-summary: "activity-duration"
#| message: false

# Minimum, maximum and mean of the corrected durations for each user.
sum.by.user <- df4 %>%
  group_by(uid) %>%
  summarize(
    min = min(duration_corrected),
    max = max(duration_corrected),
    mean = mean(duration_corrected)
  )

# Sort by user id.
sum.by.user <- sum.by.user %>% arrange(uid)

# Long format: one row per user and statistic.
sum.by.user.long <- sum.by.user %>%
  pivot_longer(
    cols = c(min, max, mean),
    names_to = "type_valeur",
    values_to = "valeur"
  )

# Dodged bar chart; the `text` aesthetic holds the tooltip shown by plotly.
p <- ggplot(sum.by.user.long, aes(x = uid, y = valeur, fill = type_valeur,
                        text = paste("Utilisateur:", uid,
                                      "<br>Type:", type_valeur,
                                      "<br>Valeur:", round(valeur, 2)))) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(title = "Valeurs minimum, maximum et moyenne des activités par utilisateur",
       x = "Utilisateur", y = "Durée de l'activité (en min)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 6),
        legend.position = "top") +
  scale_fill_manual(values = c("min" = "skyblue", "mean" = "orange", "max" = "tomato"))

# Convert to an interactive plot.
ggplotly(p, tooltip = "text")
```

The graph above shows the minimum, average, and maximum durations of activities per user. For all users, the minimum duration is `r round(min(sum.by.user$min),0)` minutes, the average duration is `r round(mean(sum.by.user$mean),0)` minutes, and the maximum duration is `r round(max(sum.by.user$max),0)` minutes.

```{r}
#| label: activity-frequency
#| code-summary: "activity-frequency"
#| message: false

# Number of activities per user and calendar week.
# The ISO year is part of the grouping key: an ISO week number alone does not
# identify a week, so grouping by `week` only would add up the activities of
# the same week number from two different years.
act.per.week <- df4 %>%
  group_by(uid, year, week) %>%
  summarise(nb_act = n(), .groups = "drop")

# Minimum, maximum and mean number of activities per active week, per user.
freq.per.user <- act.per.week %>%
  group_by(uid) %>%
  summarise(
    min = min(nb_act),
    max = max(nb_act),
    mean = mean(nb_act),
    .groups = "drop"
  )

# Long format: one row per user and statistic.
freq.per.user.long <- freq.per.user %>%
  pivot_longer(cols = c(min, max, mean),
               names_to = "type_valeur",
               values_to = "valeur")

# Dodged bar chart; the `text` aesthetic holds the tooltip shown by plotly.
p <- ggplot(freq.per.user.long, aes(x = uid, y = valeur, fill = type_valeur,
                        text = paste("Utilisateur:", uid,
                                      "<br>Type:", type_valeur,
                                      "<br>Valeur:", round(valeur, 2)))) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(title = "Fréquence minimum, maximum et moyenne des activités hebdomadaire par utilisateur",
       x = "Utilisateur", y = "Fréquence de l'activité") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 6),
        legend.position = "top") +
  scale_fill_manual(values = c("min" = "skyblue", "mean" = "orange", "max" = "tomato"))

# Convert to an interactive plot.
ggplotly(p, tooltip = "text")
```

The graph above shows the minimum, average, and maximum number of weekly activities per participant. Across all users, the minimum number of activities per week is `r round(min(freq.per.user$min),0)`, the average is `r round(mean(freq.per.user$mean),0)`, and the maximum is `r round(max(freq.per.user$max),0)`.