Projet données Yogurt

Author

Karim Kilani

Introduction

Chargement des packages

library(tidyverse)
library(mlogit)
library(stargazer)
library(Ecdat)

Chargement des data et statistiques descriptives

data("Yogurt")

On peut calculer

Yogurt%>%select(choice)%>%table()%>%
prop.table()%>%round(5)

choice
yoplait  dannon  hiland  weight 
0.33914 0.40216 0.02944 0.22927

# Moyenne et écart-type pour les variables numériques
Yogurt %>%
  summarise(across(where(is.numeric) & !all_of("id"), list(mean = mean, sd = sd), na.rm = TRUE))

Warning: There was 1 warning in `summarise()`.
ℹ In argument: `across(...)`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))

  feat.yoplait_mean feat.yoplait_sd feat.dannon_mean feat.dannon_sd
1        0.05597015       0.2299117       0.03772803      0.1905772
  feat.hiland_mean feat.hiland_sd feat.weight_mean feat.weight_sd
1       0.03689884      0.1885525       0.03772803      0.1905772
  price.yoplait_mean price.yoplait_sd price.dannon_mean price.dannon_sd
1           10.68213         1.906265          8.163474        1.062886
  price.hiland_mean price.hiland_sd price.weight_mean price.weight_sd
1          5.362935        0.805391          7.949088       0.7735004

# Calcul de proportions pour la variable qualititative
Yogurt %>%
  group_by(choice) %>%
  summarise(proportion = n())

# A tibble: 4 × 2
  choice  proportion
  <fct>        <int>
1 yoplait        818
2 dannon         970
3 hiland          71
4 weight         553

set.seed(123)
data <- tibble(
  groupe = sample(c("A", "B", "C"), 100, replace = TRUE),
  x = rnorm(100, mean = 50, sd = 10),  # Variable quantitative
  y = rnorm(100, mean = 30, sd = 5)    # Variable quantitative
)

data %>%
  summarise(
    mean_x = mean(x, na.rm = TRUE),
    sd_x = sd(x, na.rm = TRUE),
    min_x = min(x, na.rm = TRUE),
    max_x = max(x, na.rm = TRUE),
    mean_y = mean(y, na.rm = TRUE),
    sd_y = sd(y, na.rm = TRUE)
    )

# A tibble: 1 × 6
  mean_x  sd_x min_x max_x mean_y  sd_y
   <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>
1   49.4  9.57  26.9  71.9   30.0  4.71

Reproduire Table 1

Un exemple pedagogique pour comprendre pivot longer

Partonds d’un exemple simple

df <- tibble(
  id = c(1, 2, 3),
  math = c(80, 90, 85),
  science = c(75, 95, 88),
  english = c(78, 85, 82)
)

En réalité il n’y a qu’une seule variable, c’esl la note mais decilinée par élève et par matière. On pourrait créer 3 colonne id une colonne note et une colonne matière

df_long <- df %>%
  pivot_longer(cols = c(math, science, english), 
               names_to = "matiere", 
               values_to = "note")

df_long

# A tibble: 9 × 3
     id matiere  note
  <dbl> <chr>   <dbl>
1     1 math       80
2     1 science    75
3     1 english    78
4     2 math       90
5     2 science    95
6     2 english    85
7     3 math       85
8     3 science    88
9     3 english    82

Pivoter le dataframe

Le data frame est dans un format peu commode qui s’appelle un format wide. Il y a beaucoup de colonne (1+4+4+1=10) colonnes. Il est plus commode et plus parlant statistiquement de créer un format long avec des colonne . id, feat, price, choice mais on sera obligé d’jouter une autre colonne qui identifie la marque en question

Yogurt |> 
  mutate(purshase = as.integer(row_number()),id=as.integer(id)) |>  
  pivot_longer(cols = starts_with("price") | starts_with("feat"), 
               names_to = c(".value", "brand"), 
               names_sep = "\\.") |>  
  relocate( choice, .after = last_col())->Yogurt_long
Yogurt_long

# A tibble: 9,648 × 6
      id purshase brand   price  feat choice
   <int>    <int> <chr>   <dbl> <dbl> <fct> 
 1     1        1 yoplait 10.8      0 weight
 2     1        1 dannon   8.1      0 weight
 3     1        1 hiland   6.10     0 weight
 4     1        1 weight   7.90     0 weight
 5     1        2 yoplait 10.8      0 dannon
 6     1        2 dannon   9.80     0 dannon
 7     1        2 hiland   6.40     0 dannon
 8     1        2 weight   7.50     0 dannon
 9     1        3 yoplait 10.8      0 dannon
10     1        3 dannon   9.80     0 dannon
# ℹ 9,638 more rows

Utiliser gtsummary

library(glue)
packageVersion("glue")

[1] '1.8.0'

library("gtsummary")

Yogurt_long %>%
  mutate(price = price / 100, ms = as.numeric(brand == choice)) %>%
  tbl_summary(
    by = brand,  
    statistic = list(
      ms ~ "{mean}({sd})",
      feat ~ "{mean}({sd})",
      price ~ "{mean}({sd})"
    ),
    type = list(feat ~ "continuous", ms ~ "continuous"),
    digits = list(ms ~ 3, feat ~ 3, price ~ 3),
    include = c(ms, feat, price),
    label = list(ms = "Market Share", feat = "Feature", price = "Price"),
    missing = "no"
  ) %>%
  modify_header(
    all_stat_cols() ~ "**{level}**"  # Modifie les en-têtes pour supprimer "N = ..."
  ) %>%
  modify_header(
    label = "**Variable**"
  ) %>% 
  bold_labels() %>%modify_caption("Summary statistics for Yogurt data")

Summary statistics for Yogurt data
Variable	dannon¹	hiland¹	weight¹	yoplait¹
Market Share	0.402(0.490)	0.029(0.169)	0.229(0.420)	0.339(0.474)
Feature	0.038(0.191)	0.037(0.189)	0.038(0.191)	0.056(0.230)
Price	0.082(0.011)	0.054(0.008)	0.079(0.008)	0.107(0.019)
¹ Mean(SD)

log(00.9)

[1] -0.1053605

Estimer un modèle logit avec le package mlogit

library(mlogit)
data(ModeCanada)

Apprentissage avec des données simples

library(stargazer)
MC <- ModeCanada |>
  dfidx(subset = noalt == 4, alt.levels = levels(factor(unique(ModeCanada$alt)))
)
  

MNL<-mlogit(choice ~ cost+freq+ovt|income ,reflevel = "bus" ,MC)
MNL%>%summary()


Call:
mlogit(formula = choice ~ cost + freq + ovt | income, data = MC, 
    reflevel = "bus", method = "nr")

Frequencies of alternatives:choice
      bus     train       air       car 
0.0035984 0.1666067 0.3738755 0.4559194 

nr method
9 iterations, 0h:0m:1s 
g'(-H)^-1g = 9.24E-05 
successive function values within tolerance limits 

Coefficients :
                    Estimate Std. Error  z-value  Pr(>|z|)    
(Intercept):train  4.8074276  0.6747002   7.1253 1.039e-12 ***
(Intercept):air    7.7020867  0.7861126   9.7977 < 2.2e-16 ***
(Intercept):car    3.1611780  0.6990468   4.5221 6.122e-06 ***
cost              -0.0634143  0.0033165 -19.1208 < 2.2e-16 ***
freq               0.1163068  0.0043995  26.4362 < 2.2e-16 ***
ovt               -0.0369923  0.0026639 -13.8867 < 2.2e-16 ***
income:train       0.0519669  0.0179975   2.8875 0.0038837 ** 
income:air         0.0889521  0.0181069   4.9126 8.988e-07 ***
income:car         0.0636173  0.0179400   3.5461 0.0003909 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Log-Likelihood: -2031.4
McFadden R^2:  0.30034 
Likelihood ratio test : chisq = 1744 (p.value = < 2.22e-16)

MNL%>%stargazer(type = "text", title = "Résultats de la régression pour le MNL")


Résultats de la régression pour le MNL
=============================================
                      Dependent variable:    
                  ---------------------------
                            choice           
---------------------------------------------
(Intercept):train          4.807***          
                            (0.675)          
                                             
(Intercept):air            7.702***          
                            (0.786)          
                                             
(Intercept):car            3.161***          
                            (0.699)          
                                             
cost                       -0.063***         
                            (0.003)          
                                             
freq                       0.116***          
                            (0.004)          
                                             
ovt                        -0.037***         
                            (0.003)          
                                             
income:train               0.052***          
                            (0.018)          
                                             
income:air                 0.089***          
                            (0.018)          
                                             
income:car                 0.064***          
                            (0.018)          
                                             
---------------------------------------------
Observations                 2,779           
R2                           0.300           
Log Likelihood            -2,031.390         
LR Test              1,743.974*** (df = 9)   
=============================================
Note:             *p<0.1; **p<0.05; ***p<0.01

?stargazer