Yogurt

Importation des données.

Les données sont disponibles dans le package Ecdat.

library(Ecdat)
Loading required package: Ecfun

Attaching package: 'Ecfun'
The following object is masked from 'package:base':

    sign

Attaching package: 'Ecdat'
The following object is masked from 'package:datasets':

    Orange
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# `data(package = "Ecdat")` lists the data sets shipped with Ecdat; it does not
# load the package. `Yogurt` is usable here because `library(Ecdat)` was called.
# Convert it to a tibble with `as_tibble()`: `as.tibble()` has been deprecated
# since tibble 2.0.0 and emits a warning.
df <- as_tibble(Yogurt)
# Older approach: tabulate the categorical variable `choice`, turn the counts
# into proportions with prop.table(), and round them to 5 decimal places.
Yogurt |>
  select(choice) |>
  table() |>
  prop.table() |>
  round(digits = 5)
choice
yoplait  dannon  hiland  weight 
0.33914 0.40216 0.02944 0.22927 
# Long-hand approach: one explicit mean() call per price column.
Yogurt |>
  summarise(
    mean_price.yoplait = mean(price.yoplait),
    mean_price.dannon = mean(price.dannon)
  )
  mean_price.yoplait mean_price.dannon
1           10.68213          8.163474
# Mean and standard deviation of every numeric column except `id`.
# - where(is.numeric) keeps the numeric columns;
# - `&` combines selections, and `!all_of("id")` excludes the column named "id"
#   (the `!` does the excluding; all_of() only selects by name);
# - each statistic is an anonymous function so that `na.rm = TRUE` (drop
#   missing values before computing) is passed explicitly: supplying extra
#   arguments through across()'s `...` is deprecated as of dplyr 1.1.0.
# Output columns are named <column>_Mean and <column>_Sd.
Yogurt %>%
  summarise(
    across(
      where(is.numeric) & !all_of("id"),
      list(
        Mean = \(x) mean(x, na.rm = TRUE),
        Sd = \(x) sd(x, na.rm = TRUE)
      )
    )
  )
  feat.yoplait_Mean feat.yoplait_Sd feat.dannon_Mean feat.dannon_Sd
1        0.05597015       0.2299117       0.03772803      0.1905772
  feat.hiland_Mean feat.hiland_Sd feat.weight_Mean feat.weight_Sd
1       0.03689884      0.1885525       0.03772803      0.1905772
  price.yoplait_Mean price.yoplait_Sd price.dannon_Mean price.dannon_Sd
1           10.68213         1.906265          8.163474        1.062886
  price.hiland_Mean price.hiland_Sd price.weight_Mean price.weight_Sd
1          5.362935        0.805391          7.949088       0.7735004
# Fix the random seed so the simulated data below are reproducible.
set.seed(123)
# Simulated example data: 100 rows with a group label and two numeric variables.
data <- tibble(
 # Group label drawn with replacement from "A", "B", "C".
 groupe = sample(c("A", "B", "C"), 100, replace = TRUE),
 x = rnorm(100, mean = 50, sd = 10),  # Quantitative variable (normal, mean 50, sd 10)
 y = rnorm(100, mean = 30, sd = 5)    # Quantitative variable (normal, mean 30, sd 5)
)
# Print the tibble.
data
# A tibble: 100 × 3
   groupe     x     y
   <chr>  <dbl> <dbl>
 1 C       46.7  23.7
 2 C       39.8  46.2
 3 C       39.3  27.9
 4 B       53.0  31.5
 5 C       54.5  33.2
 6 B       50.5  27.6
 7 B       59.2  32.6
 8 B       70.5  31.8
 9 C       45.1  28.9
10 A       26.9  30.3
# ℹ 90 more rows
# Descriptive statistics over the whole of `data`. No grouping is applied, so
# n() counts every row and the result is a single-row tibble.
# `.names = "{.fn}_{.col}"` yields column names such as mean_x and sd_y.
summarise(
  data,
  count = n(),
  across(
    x,
    list(
      mean = \(v) mean(v, na.rm = TRUE),
      sd = \(v) sd(v, na.rm = TRUE),
      min = \(v) min(v, na.rm = TRUE),
      max = \(v) max(v, na.rm = TRUE)
    ),
    .names = "{.fn}_{.col}"
  ),
  across(
    y,
    list(
      mean = \(v) mean(v, na.rm = TRUE),
      sd = \(v) sd(v, na.rm = TRUE)
    ),
    .names = "{.fn}_{.col}"
  )
)
# A tibble: 1 × 7
  count mean_x  sd_x min_x max_x mean_y  sd_y
  <int>  <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>
1   100   49.4  9.57  26.9  71.9   30.0  4.71

Reproduire table1.

Un exemple pédagogique pour comprendre `pivot_longer()`.

Partons d’un exemple simple.

# Toy wide-format table: one row per student (`id`), one column per subject.
df <- tribble(
  ~id, ~math, ~science, ~english,
    1,    80,       75,       78,
    2,    90,       95,       85,
    3,    85,       88,       82
)
df
# A tibble: 3 × 4
     id  math science english
  <dbl> <dbl>   <dbl>   <dbl>
1     1    80      75      78
2     2    90      95      85
3     3    85      88      82

En réalité, il n’y a qu’une seule variable, c’est la note mais déclinée par élève et par matière. On pourrait créer 3 colonnes: id, matière, note.

# Reshape to long format: the three subject columns collapse into a pair of
# columns, `matiere` (the former column name) and `note` (the value), giving
# one row per student and subject.
df_long <- pivot_longer(
  df,
  cols = all_of(c("math", "science", "english")),
  names_to = "matiere",
  values_to = "note"
)
df_long
# A tibble: 9 × 3
     id matiere  note
  <dbl> <chr>   <dbl>
1     1 math       80
2     1 science    75
3     1 english    78
4     2 math       90
5     2 science    95
6     2 english    85
7     3 math       85
8     3 science    88
9     3 english    82

Pivoter le dataframe.

Le data frame est dans un format peu commode, appelé format wide : il contient beaucoup de colonnes (1 + 1 + 4 + 4 = 10 colonnes). Il est plus commode et plus parlant statistiquement de créer un format long avec des colonnes id, choice, feat, price, mais on sera obligé d’ajouter une autre colonne qui identifie la marque en question.

# Build the long-format table: one row per (wide-table row, brand).
Yogurt_long <- Yogurt |>
  mutate(
    # Row number of the wide table, used as an identifier for each original row
    # (column name spelling kept as in the original output).
    purshase = as.integer(row_number()),
    # Coerce `id` to integer.
    id = as.integer(id)
  ) |>
  pivot_longer(
    # Pivot every column whose name starts with "price" or "feat".
    cols = c(starts_with("price"), starts_with("feat")),
    # Names such as price.yoplait are split at the dot: the first part names
    # the output column (".value"), the second part goes into `brand`.
    names_to = c(".value", "brand"),
    names_sep = "\\."
  ) |>
  # Move `choice` to the last column.
  relocate(choice, .after = last_col())
Yogurt_long
# A tibble: 9,648 × 6
      id purshase brand   price  feat choice
   <int>    <int> <chr>   <dbl> <dbl> <fct> 
 1     1        1 yoplait 10.8      0 weight
 2     1        1 dannon   8.1      0 weight
 3     1        1 hiland   6.10     0 weight
 4     1        1 weight   7.90     0 weight
 5     1        2 yoplait 10.8      0 dannon
 6     1        2 dannon   9.80     0 dannon
 7     1        2 hiland   6.40     0 dannon
 8     1        2 weight   7.50     0 dannon
 9     1        3 yoplait 10.8      0 dannon
10     1        3 dannon   9.80     0 dannon
# ℹ 9,638 more rows
library(glue)
# Print the installed version of the glue package.
packageVersion("glue")
[1] '1.8.0'
library("gtsummary")
# Summary table by brand: mean (SD) of market share, feature, and price.
Yogurt_long |>
  mutate(
    # Rescale price by a factor of 100.
    price = price / 100,
    # 1 when the row's brand is the chosen brand, 0 otherwise; its mean per
    # brand is reported as the market share.
    ms = as.numeric(brand == choice)
  ) |>
  tbl_summary(
    by = brand,
    include = c(ms, feat, price),
    # Force the 0/1 variables to be summarised as continuous so that a mean
    # and SD are reported for them.
    type = list(c(feat, ms) ~ "continuous"),
    statistic = list(c(ms, feat, price) ~ "{mean}({sd})"),
    digits = list(c(ms, feat, price) ~ 3),
    label = list(
      ms ~ "Market Share",
      feat ~ "Feature",
      price ~ "Price"
    ),
    missing = "no"
  ) |>
  # Show only the brand level in the statistic column headers (no "N = ...").
  modify_header(all_stat_cols() ~ "**{level}**") |>
  modify_header(label = "**Variable**") |>
  bold_labels() |>
  modify_caption("Summary statistics for Yogurt data")
Summary statistics for Yogurt data
Variable dannon1 hiland1 weight1 yoplait1
Market Share 0.402(0.490) 0.029(0.169) 0.229(0.420) 0.339(0.474)
Feature 0.038(0.191) 0.037(0.189) 0.038(0.191) 0.056(0.230)
Price 0.082(0.011) 0.054(0.008) 0.079(0.008) 0.107(0.019)
1 Mean(SD)