Statistiques descriptives
2027-04-04
Cette présentation couvre les notions de base en statistiques descriptives avec R :
- proc freq
- proc means
- proc summary
Présentation avec une base de l’enquête EFE : Base_efe
# A tibble: 6 × 1,010
sirus_id strate2023 taille7_bds secteur12 sirus_id_2 ech2020 expl2020
<chr> <chr> <chr> <chr> <chr> <chr> <dbl>
1 423690874 2_02_NA_PME 3 02 423690874 1 1
2 423712934 5_10_AKTO_ETI 7 10 423712934 1 1
3 423728773 2_02_OPCOEP_PME 2 02 423728773 1 1
4 423748276 2_04_OpcoMobilite… 2 04 423748276 1 1
5 423754241 5_07_AFDAS_ETI 7 07 423754241 1 NA
6 423765684 5_10_AKTO_ETI 7 10 423765684 1 1
# ℹ 1,003 more variables: sirus_id_3 <chr>, ech2021 <chr>, expl2021 <dbl>,
# sirus_id_4 <chr>, ech2022 <chr>, expl2022 <dbl>, sirus_id_5 <chr>,
# poids_2023 <dbl>, sirus_id_6 <chr>, ech2023 <chr>, num_groupe <dbl>,
# sirus_id_7 <chr>, sirus_mere <chr>, procedure <chr>, categorie <chr>,
# questionnaire <chr>, cle_a2tot <dbl>, cle_c0tot <dbl>, cle_c1tot <dbl>,
# cle_c3tot <dbl>, cle_rec1 <chr>, cle_rec6 <chr>, commentaire <chr>,
# a2tot <dbl>, c0tot <dbl>, d2f5 <dbl>, d2f6 <dbl>, d2f7 <dbl>, …
⚠️ Attention : ici, on n’utilise pas dplyr
01 02 03 04 05 06 07 08 09 10 11 12
1 391 674 709 1091 253 397 300 468 924 368 1076 861
2 299 448 275 616 199 225 189 333 385 316 350 375
3 115 431 212 367 195 141 192 181 265 302 207 478
4 62 502 498 372 467 249 431 125 538 535 396 596
5 3 326 76 255 110 25 85 51 172 153 52 303
6 3 163 40 121 55 19 46 43 94 85 25 188
7 1 273 60 220 75 37 97 160 115 138 32 265
✅ Ecriture en Dplyr
⚠️ Attention : le résultat est peu lisible, car les deux variables sont en colonnes ; il faut pivoter la base
✅ Ecriture en Dplyr
✅ Format tableau croisé
➡️ Utilisation de pivot_wider
# Cross-tabulation: one row per size class, one column per sector code.
Base_efe |>
  # One row per (size class, sector) pair, with its frequency in `n`
  count(taille7_bds, secteur12, name = "n") |>
  pivot_wider(
    names_from = secteur12,  # sector codes become the new column names
    values_from = n,         # frequencies fill those columns
    values_fill = 0          # pairs absent from the data are shown as 0
  )
# A tibble: 7 × 13
taille7_bds `01` `02` `03` `04` `05` `06` `07` `08` `09` `10` `11`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 391 674 709 1091 253 397 300 468 924 368 1076
2 2 299 448 275 616 199 225 189 333 385 316 350
3 3 115 431 212 367 195 141 192 181 265 302 207
4 4 62 502 498 372 467 249 431 125 538 535 396
5 5 3 326 76 255 110 25 85 51 172 153 52
6 6 3 163 40 121 55 19 46 43 94 85 25
7 7 1 273 60 220 75 37 97 160 115 138 32
# ℹ 1 more variable: `12` <int>
✅ Pourcentage colonne
# Column percentages: within each sector, the size classes add up to 100.
Base_efe |>
  count(taille7_bds, secteur12) |>  # frequency of each size x sector pair
  group_by(secteur12) |>            # denominator = total of the sector
  mutate(pct = round(100 * n / sum(n), 1)) |>
  select(-n) |>                     # keep the percentages only
  pivot_wider(
    names_from = secteur12,         # sectors spread into columns
    values_from = pct,
    values_fill = 0                 # pairs absent from the data are shown as 0
  )
# A tibble: 7 × 13
taille7_bds `01` `02` `03` `04` `05` `06` `07` `08` `09` `10` `11`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 44.7 23.9 37.9 35.9 18.7 36.3 22.4 34.4 37.1 19.4 50.3
2 2 34.2 15.9 14.7 20.2 14.7 20.6 14.1 24.5 15.4 16.7 16.4
3 3 13.2 15.3 11.3 12.1 14.4 12.9 14.3 13.3 10.6 15.9 9.7
4 4 7.1 17.8 26.6 12.2 34.5 22.8 32.2 9.2 21.6 28.2 18.5
5 5 0.3 11.6 4.1 8.4 8.1 2.3 6.3 3.7 6.9 8.1 2.4
6 6 0.3 5.8 2.1 4 4.1 1.7 3.4 3.2 3.8 4.5 1.2
7 7 0.1 9.7 3.2 7.2 5.5 3.4 7.2 11.8 4.6 7.3 1.5
# ℹ 1 more variable: `12` <dbl>
✅ Pourcentage ligne
# Row percentages: within each size class, the sectors add up to 100.
Base_efe |>
  count(taille7_bds, secteur12) |>  # frequency of each size x sector pair
  group_by(taille7_bds) |>          # denominator = total of the size class
  mutate(pct = round(100 * n / sum(n), 1)) |>
  # Drop the grouping once the percentages are computed, so the returned
  # tibble is not silently grouped for whatever is piped next.
  ungroup() |>
  select(-n) |>                     # keep the percentages only
  pivot_wider(
    names_from = secteur12,         # sectors spread into columns
    values_from = pct,
    values_fill = 0                 # pairs absent from the data are shown as 0
  )
# A tibble: 7 × 13
taille7_bds `01` `02` `03` `04` `05` `06` `07` `08` `09` `10` `11`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 5.2 9 9.4 14.5 3.4 5.3 4 6.2 12.3 4.9 14.3
2 2 7.5 11.2 6.9 15.4 5 5.6 4.7 8.3 9.6 7.9 8.7
3 3 3.7 14 6.9 11.9 6.3 4.6 6.2 5.9 8.6 9.8 6.7
4 4 1.3 10.5 10.4 7.8 9.8 5.2 9 2.6 11.3 11.2 8.3
5 5 0.2 20.2 4.7 15.8 6.8 1.6 5.3 3.2 10.7 9.5 3.2
6 6 0.3 18.5 4.5 13.7 6.2 2.2 5.2 4.9 10.7 9.6 2.8
7 7 0.1 18.5 4.1 14.9 5.1 2.5 6.6 10.9 7.8 9.4 2.2
# ℹ 1 more variable: `12` <dbl>
✅ Pourcentage total
# Overall percentages: every cell is a share of the whole table.
Base_efe |>
  # No grouping here: the denominator is the grand total
  count(taille7_bds, secteur12) |>
  # `.keep = "unused"` drops `n`, the only column consumed by the formula
  mutate(pct = round(100 * n / sum(n), 1), .keep = "unused") |>
  pivot_wider(
    names_from = secteur12,  # sectors spread into columns
    values_from = pct,
    values_fill = 0          # pairs absent from the data are shown as 0
  )
# A tibble: 7 × 13
taille7_bds `01` `02` `03` `04` `05` `06` `07` `08` `09` `10` `11`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1.7 2.9 3 4.7 1.1 1.7 1.3 2 4 1.6 4.6
2 2 1.3 1.9 1.2 2.6 0.9 1 0.8 1.4 1.6 1.4 1.5
3 3 0.5 1.8 0.9 1.6 0.8 0.6 0.8 0.8 1.1 1.3 0.9
4 4 0.3 2.2 2.1 1.6 2 1.1 1.8 0.5 2.3 2.3 1.7
5 5 0 1.4 0.3 1.1 0.5 0.1 0.4 0.2 0.7 0.7 0.2
6 6 0 0.7 0.2 0.5 0.2 0.1 0.2 0.2 0.4 0.4 0.1
7 7 0 1.2 0.3 0.9 0.3 0.2 0.4 0.7 0.5 0.6 0.1
# ℹ 1 more variable: `12` <dbl>
🏋️ Avec Pondération
On ajoute l’argument wt = poids_2023 dans count()
# Weighted row percentages: within each size class, the sectors add up to 100.
Base_efe |>
  # `wt = poids_2023` makes `n` the sum of the weights instead of a row count
  count(taille7_bds, secteur12, wt = poids_2023) |>
  group_by(taille7_bds) |>          # denominator = weighted total of the size class
  mutate(pct = round(100 * n / sum(n), 1)) |>
  # Drop the grouping once the percentages are computed, so the returned
  # tibble is not silently grouped for whatever is piped next.
  ungroup() |>
  select(-n) |>                     # keep the percentages only
  pivot_wider(
    names_from = secteur12,         # sectors spread into columns
    values_from = pct,
    values_fill = 0                 # pairs absent from the data are shown as 0
  )
# A tibble: 7 × 13
taille7_bds `01` `02` `03` `04` `05` `06` `07` `08` `09` `10` `11`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 6.1 6.8 12 17.6 3 6.8 3.9 6.6 13.8 6.1 10.4
2 2 1.7 10 11.4 18 4.7 12 3.7 2.6 13 8.7 6.8
3 3 1.2 12.9 10.5 13.8 6.7 9.1 5.7 2.3 9.1 10.9 4.6
4 4 0.7 18.2 5.5 17.1 7.7 5.6 4.5 1.3 8.1 10.9 4.5
5 5 0.1 24.9 5.4 15.7 5.2 1 5.7 2.6 9.5 7.6 2.9
6 6 0.3 23 4.2 12 6.1 2 4.5 4.7 9.9 12.2 2
7 7 0.1 18.5 4.1 14.9 5.1 2.5 6.6 10.9 7.8 9.4 2.2
# ℹ 1 more variable: `12` <dbl>
questionr
# Weighted two-way table: sectors in rows, size classes in columns,
# weighted by poids_2023.
Table_questionR <- wtd.table(
  Base_efe$secteur12,
  Base_efe$taille7_bds,
  weights = Base_efe$poids_2023
)
print(Table_questionR)
1 2 3 4 5
01 4.706185e+04 1.648329e+03 6.664322e+02 2.434912e+02 3.153846e+00
02 5.197706e+04 9.847583e+03 7.338618e+03 6.111454e+03 1.006553e+03
03 9.230302e+04 1.122976e+04 5.960385e+03 1.843610e+03 2.202504e+02
04 1.353282e+05 1.767356e+04 7.860906e+03 5.750496e+03 6.346673e+02
05 2.336567e+04 4.645477e+03 3.834528e+03 2.573192e+03 2.111989e+02
06 5.223819e+04 1.179397e+04 5.193111e+03 1.895287e+03 4.140476e+01
07 2.981856e+04 3.664893e+03 3.255340e+03 1.524095e+03 2.314540e+02
08 5.044481e+04 2.601404e+03 1.286382e+03 4.506342e+02 1.054844e+02
09 1.060156e+05 1.274408e+04 5.203057e+03 2.714119e+03 3.846817e+02
10 4.690996e+04 8.596791e+03 6.182807e+03 3.642339e+03 3.061103e+02
11 7.980305e+04 6.694437e+03 2.608386e+03 1.499829e+03 1.188944e+02
12 5.469811e+04 7.210291e+03 7.530242e+03 5.304700e+03 7.851645e+02
6 7
01 4.333333e+00 1.000000e+00
02 3.724319e+02 2.730000e+02
03 6.856190e+01 6.000000e+01
04 1.950900e+02 2.200000e+02
05 9.942216e+01 7.500000e+01
06 3.300000e+01 3.700000e+01
07 7.337601e+01 9.700000e+01
08 7.685118e+01 1.600000e+02
09 1.601157e+02 1.150000e+02
10 1.971721e+02 1.380000e+02
11 3.290000e+01 3.200000e+01
12 3.061639e+02 2.650000e+02
⚠️ Attention on obtient une table et non un dataframe
Si on veut repasser en dataframe il faut transformer la table et la pivoter
Pourcentage ligne
# Rebuild the weighted sector-by-size table used below
Table_questionR <- wtd.table(
  Base_efe$secteur12,
  Base_efe$taille7_bds,
  weights = Base_efe$poids_2023
)
# Row percentages: margin = 1 divides each cell by its row total
Table_questionR_pct_ligne <- 100 * prop.table(Table_questionR, margin = 1)
# Display the percentage table
print(Table_questionR_pct_ligne)
1 2 3 4 5
01 94.828103633 3.321328903 1.342839192 0.490626882 0.006354898
02 67.566994499 12.801255370 9.539753851 7.944516477 1.308457855
03 82.645414209 10.054801031 5.336753963 1.650714499 0.197205738
04 80.714445301 10.541126927 4.688519052 3.429796789 0.378537739
05 67.134075098 13.347349879 11.017335666 7.393275916 0.606815005
06 73.335318902 16.557135726 7.290422643 2.660725580 0.058126660
07 77.120852578 9.478649059 8.419406733 3.941823757 0.598618056
08 91.508920644 4.719051097 2.333548874 0.817468736 0.191353031
09 83.256161480 10.008172700 4.086062676 2.131451334 0.302098070
10 71.104594283 13.030735215 9.371697059 5.520938433 0.463992065
11 87.898988774 7.373581567 2.873004134 1.651985416 0.130956067
12 71.876934091 9.474798680 9.895235514 6.970726703 1.031757957
6 7
01 0.008731526 0.002014968
02 0.484138667 0.354883281
03 0.061388319 0.053722240
04 0.116358509 0.131215683
05 0.285659005 0.215489430
06 0.046327516 0.051942973
07 0.189775102 0.250874715
08 0.139411152 0.290246467
09 0.125741979 0.090311761
10 0.298867047 0.209175898
11 0.036237673 0.035246369
12 0.402319554 0.348227502
Pourcentage colonne
# Column percentages: margin = 2 divides each cell by its column total
Table_questionR_pct_col <- 100 * prop.table(Table_questionR, margin = 2)
print(Table_questionR_pct_col)
1 2 3 4 5 6
01 6.11221389 1.67597266 1.17081852 0.72568595 0.07789163 0.26758582
02 6.75058237 10.01273605 12.89281962 18.21419598 24.85920036 22.99788342
03 11.98796392 11.41809664 10.47147706 5.49458034 5.43960030 4.23373689
04 17.57590654 17.96995865 13.81039980 17.13841887 15.67459859 12.04692136
05 3.03464458 4.72338597 6.73667355 7.66898049 5.21605198 6.13937556
06 6.78449676 11.99176834 9.12349542 5.64859497 1.02258778 2.03776890
07 3.87272087 3.72635636 5.71912992 4.54231734 5.71629933 4.53101047
08 6.55157936 2.64503133 2.25997467 1.34304208 2.60518576 4.74560456
09 13.76890628 12.95780434 9.14096786 8.08899146 9.50061649 9.88723392
10 6.09248740 8.74096666 10.86223805 10.85539863 7.56011334 12.17549053
11 10.36451555 6.80670901 4.58253170 4.46999702 2.93637508 2.03159385
12 7.10398249 7.33121399 13.22947384 15.80979687 19.39147934 18.90579472
7
01 0.06788866
02 18.53360489
03 4.07331976
04 14.93550577
05 5.09164969
06 2.51188052
07 6.58520027
08 10.86218601
09 7.80719620
10 9.36863544
11 2.17243720
12 17.99049559
Pourcentage total
1 2 3 4 5
01 4.872183e+00 1.706469e-01 6.899387e-02 2.520797e-02 3.265089e-04
02 5.381041e+00 1.019493e+00 7.597467e-01 6.327019e-01 1.042057e-01
03 9.555875e+00 1.162586e+00 6.170621e-01 1.908638e-01 2.280191e-02
04 1.401015e+01 1.829694e+00 8.138178e-01 5.953329e-01 6.570534e-02
05 2.418983e+00 4.809333e-01 3.969780e-01 2.663954e-01 2.186483e-02
06 5.408074e+00 1.220997e+00 5.376284e-01 1.962138e-01 4.286520e-03
07 3.087033e+00 3.794162e-01 3.370163e-01 1.577853e-01 2.396179e-02
08 5.222411e+00 2.693161e-01 1.331755e-01 4.665291e-02 1.092051e-02
09 1.097550e+01 1.319359e+00 5.386580e-01 2.809852e-01 3.982502e-02
10 4.856458e+00 8.900019e-01 6.400888e-01 3.770812e-01 3.169075e-02
11 8.261788e+00 6.930565e-01 2.700389e-01 1.552731e-01 1.230880e-02
12 5.662744e+00 7.464614e-01 7.795850e-01 5.491809e-01 8.128589e-02
6 7
01 4.486180e-04 1.035272e-04
02 3.855684e-02 2.826293e-02
03 7.098023e-03 6.211633e-03
04 2.019713e-02 2.277599e-02
05 1.029290e-02 7.764542e-03
06 3.416398e-03 3.830507e-03
07 7.596414e-03 1.004214e-02
08 7.956189e-03 1.656436e-02
09 1.657633e-02 1.190563e-02
10 2.041268e-02 1.428676e-02
11 3.406046e-03 3.312871e-03
12 3.169630e-02 2.743471e-02
Avec somme ligne colonne
# Append a "Sum" row and a "Sum" column to the column-percentage table.
# NOTE(review): the input holds column percentages, so the "Sum" row is 100
# for every column, but the "Sum" column (and the 700 grand total) adds
# percentages that have different denominators and has no direct
# interpretation. Consider restricting the margins or applying addmargins()
# to the weighted counts instead; confirm the intent.
Table_questionR_with_margins <- addmargins(Table_questionR_pct_col)
print(Table_questionR_with_margins)
1 2 3 4 5
01 6.11221389 1.67597266 1.17081852 0.72568595 0.07789163
02 6.75058237 10.01273605 12.89281962 18.21419598 24.85920036
03 11.98796392 11.41809664 10.47147706 5.49458034 5.43960030
04 17.57590654 17.96995865 13.81039980 17.13841887 15.67459859
05 3.03464458 4.72338597 6.73667355 7.66898049 5.21605198
06 6.78449676 11.99176834 9.12349542 5.64859497 1.02258778
07 3.87272087 3.72635636 5.71912992 4.54231734 5.71629933
08 6.55157936 2.64503133 2.25997467 1.34304208 2.60518576
09 13.76890628 12.95780434 9.14096786 8.08899146 9.50061649
10 6.09248740 8.74096666 10.86223805 10.85539863 7.56011334
11 10.36451555 6.80670901 4.58253170 4.46999702 2.93637508
12 7.10398249 7.33121399 13.22947384 15.80979687 19.39147934
Sum 100.00000000 100.00000000 100.00000000 100.00000000 100.00000000
6 7 Sum
01 0.26758582 0.06788866 10.09805713
02 22.99788342 18.53360489 114.26102268
03 4.23373689 4.07331976 53.11877491
04 12.04692136 14.93550577 109.15170960
05 6.13937556 5.09164969 38.61076182
06 2.03776890 2.51188052 39.12059269
07 4.53101047 6.58520027 34.69303456
08 4.74560456 10.86218601 31.01260378
09 9.88723392 7.80719620 71.15171656
10 12.17549053 9.36863544 65.65533004
11 2.03159385 2.17243720 33.36415939
12 18.90579472 17.99049559 99.76223684
Sum 100.00000000 100.00000000 700.00000000
janitor
Pourcentage ligne
Pourcentage colonne
Pourcentage colonne avec somme
Pourcentage ligne avec somme
gmodels : le plus proche de SAS
# Unweighted two-way table of naf_interim (rows) by naf_asso (columns),
# with row, column and table proportions plus a chi-square test.
CrossTable(
  x = Base_efe$naf_interim,
  y = Base_efe$naf_asso,
  prop.r = TRUE,  # N / row total
  prop.c = TRUE,  # N / column total
  prop.t = TRUE,  # N / table total
  chisq = TRUE    # print the chi-square test
)
Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|
Total Observations in Table: 23345
| Base_efe$naf_asso
Base_efe$naf_interim | 0 | 1 | Row Total |
---------------------|-----------|-----------|-----------|
0 | 19448 | 3723 | 23171 |
| 0.034 | 0.180 | |
| 0.839 | 0.161 | 0.993 |
| 0.991 | 0.999 | |
| 0.833 | 0.159 | |
---------------------|-----------|-----------|-----------|
1 | 172 | 2 | 174 |
| 4.539 | 23.908 | |
| 0.989 | 0.011 | 0.007 |
| 0.009 | 0.001 | |
| 0.007 | 0.000 | |
---------------------|-----------|-----------|-----------|
Column Total | 19620 | 3725 | 23345 |
| 0.840 | 0.160 | |
---------------------|-----------|-----------|-----------|
Statistics for All Table Factors
Pearson's Chi-squared test
------------------------------------------------------------
Chi^2 = 28.66078 d.f. = 1 p = 8.623242e-08
Pearson's Chi-squared test with Yates' continuity correction
------------------------------------------------------------
Chi^2 = 27.55914 d.f. = 1 p = 1.523635e-07
Avec Pondération
# Weighted table: each cell is the sum of poids_2023 for that
# naf_interim x naf_asso combination
table_pond <- xtabs(poids_2023 ~ naf_interim + naf_asso, data = Base_efe)
# Run CrossTable on the weighted table
# NOTE(review): with chisq = TRUE the test is computed on the weighted
# totals (965929.5 reported as "observations", against 23345 rows in the
# unweighted table), so the statistic and p-value are presumably inflated.
# A design-based test such as survey::svychisq() looks more appropriate; confirm.
CrossTable(table_pond,
prop.r = TRUE, prop.c = TRUE, prop.t = TRUE, chisq = TRUE)
Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|
Total Observations in Table: 965929.5
| naf_asso
naf_interim | 0 | 1 | Row Total |
-------------|------------------|------------------|------------------|
0 | 873275 | 88725 | 962000 |
| 0.143 | 1.418 | |
| 0.908 | 0.092 | 0.996 |
| 0.996 | 1.000 | |
| 0.904 | 0.092 | |
-------------|------------------|------------------|------------------|
1 | 3922 | 6 | 3928 |
| 35.116 | 347.152 | |
| 0.998 | 0.002 | 0.004 |
| 0.004 | 0.000 | |
| 0.004 | 0.000 | |
-------------|------------------|------------------|------------------|
Column Total | 877197 | 88732 | 965929 |
| 0.908 | 0.092 | |
-------------|------------------|------------------|------------------|
Statistics for All Table Factors
Pearson's Chi-squared test
------------------------------------------------------------
Chi^2 = 383.8294 d.f. = 1 p = 1.824954e-85
Pearson's Chi-squared test with Yates' continuity correction
------------------------------------------------------------
Chi^2 = 382.7458 d.f. = 1 p = 3.141672e-85
survey
# Survey design object carrying the sampling weights
des <- svydesign(
  ids = ~1,              # no clusters, hence ~1
  data = Base_efe,       # source data frame
  weights = ~poids_2023  # one weight per observation
)
# Weighted cross-tabulation of naf_interim by naf_asso
freq_table <- svytable(~ naf_interim + naf_asso, design = des)
# Unweighted summary statistics of a2tot for each value of naf_interim;
# missing values are ignored in every statistic (na.rm = TRUE).
Base_efe |>
  group_by(naf_interim) |>
  summarise(
    moyenne = mean(a2tot, na.rm = TRUE),
    mediane = median(a2tot, na.rm = TRUE),
    q1 = quantile(a2tot, 0.25, na.rm = TRUE),
    q3 = quantile(a2tot, 0.75, na.rm = TRUE),
    min = min(a2tot, na.rm = TRUE),
    max = max(a2tot, na.rm = TRUE)
  )
# A tibble: 2 × 7
naf_interim moyenne mediane q1 q3 min max
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 468. 19 5 91 1 4798211
2 1 2633. 91 48.8 253. 1 121336
survey
mean SE
a2tot 49.49 32.417
$a2tot
quantile ci.2.5 ci.97.5 se
0.1 1 1 2 0.2550935
0.5 3 3 4 0.2550935
0.9 19 19 20 0.2550935
attr(,"hasci")
[1] TRUE
attr(,"class")
[1] "newsvyquantile"
# A tibble: 2 × 9
naf_asso moyenne moyenne_se mediane_q10 mediane_q50 mediane_q75 mediane_q10_se
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 52.6 35.7 1 3 7 0.255
2 1 18.8 0.862 1 3 10 0.255
# ℹ 2 more variables: mediane_q50_se <dbl>, mediane_q75_se <dbl>
Statistiques descriptives