Formation R

Edmond Noack

Les Fenêtres

Opérations simples :

1 + 1
[1] 2
4 - 1
[1] 3
4/2
[1] 2
2*1
[1] 2

Les Variables :

var1 <- 3

print(var1)
[1] 3
2 * var1
[1] 6

Les vecteurs :

vect <- c(-2, 12, 5,999, 7)

print(vect)
[1]  -2  12   5 999   7
vect * 2
[1]   -4   24   10 1998   14

On peut voir les variables et vecteurs ici :

Les packages

L’installation par défaut du logiciel R contient le cœur du programme ainsi qu’un ensemble de fonctions de base fournissant un grand nombre d’outils de traitement de données et d’analyse statistique.

R étant un logiciel libre, il bénéficie d’une forte communauté d’utilisateurs qui peuvent librement contribuer au développement du logiciel en lui ajoutant des fonctionnalités supplémentaires. Ces contributions prennent la forme d’extensions (packages en anglais) pouvant être installées par l’utilisateur et fournissant alors diverses fonctionnalités supplémentaires.

install.packages("NOM DU PACKAGE")

Le Package Tidyverse

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Le Package Tidyverse

Cette commande va en fait charger plusieurs extensions qui constituent le “cœur” du tidyverse, à savoir :

  • ggplot2 (visualisation)

  • dplyr (manipulation des données)

  • tidyr (remise en forme des données)

  • purrr (programmation)

  • readr (importation de données)

  • tibble (tableaux de données)

  • forcats (variables qualitatives)

  • stringr (chaînes de caractères)

  • lubridate (manipulation de dates)

Le Package Tidyverse

Lire une base

library(readxl)
base_efe <- read_xlsx("P:\\Formation_R_ppt\\EFE_1.xlsx")

Visualiser une base

library(dplyr)
View(base_efe)

head(base_efe)
# A tibble: 6 × 30
  tx_acc ns_txacc tx_form ns_txform tx_courses ns_txcourses tx_autres
   <dbl>    <dbl>   <dbl>     <dbl>      <dbl>        <dbl>     <dbl>
1   39.6        0    30.2         0       28.3            0      1.91
2   14.3        0    19.0         0       17.7            0      1.30
3   52.6        0    36.8         0       32.8            0      3.98
4   27.3        0    27.2         0       25.6            0      1.65
5   38.6        0    30.6         0       28.8            0      1.78
6   39.5        0    36.5         0       34.2            0      2.30
# ℹ 23 more variables: ns_txautres <dbl>, heurstag <dbl>, heurstag_sal <dbl>,
#   top1_c5 <chr>, top2_c5 <chr>, top3_c5 <chr>, top1_c5_tx <dbl>,
#   top2_c5_tx <dbl>, top3_c5_tx <dbl>, top1_d3 <chr>, top2_d3 <chr>,
#   top3_d3 <chr>, top1_d3_tx <dbl>, top2_d3_tx <dbl>, top3_d3_tx <dbl>,
#   top1_e1 <chr>, top2_e1 <chr>, top3_e1 <chr>, top1_e1_tx <dbl>,
#   top2_e1_tx <dbl>, top3_e1_tx <dbl>, taille <chr>, secteur <chr>

Visualiser une base

glimpse(base_efe)
Rows: 35
Columns: 30
$ tx_acc       <dbl> 39.65, 14.29, 52.59, 27.31, 38.57, 39.53, 14.89, 10.48, 9…
$ ns_txacc     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
$ tx_form      <dbl> 30.23904, 18.99544, 36.78412, 27.23459, 30.57052, 36.4895…
$ ns_txform    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
$ tx_courses   <dbl> 28.32985, 17.69419, 32.80376, 25.58107, 28.79003, 34.1912…
$ ns_txcourses <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
$ tx_autres    <dbl> 1.9091827, 1.3012493, 3.9803580, 1.6535162, 1.7804855, 2.…
$ ns_txautres  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
$ heurstag     <dbl> 28.28457, 30.95564, 20.13409, 54.98455, 29.13603, 26.5185…
$ heurstag_sal <dbl> 14.661037, 13.135202, 12.066899, 26.411764, 14.902905, 12…
$ top1_c5      <chr> "C5i", "C5m", "C5i", "C5i", "C5i", "C5i", "C5i", "C5m", "…
$ top2_c5      <chr> "C5m", "C5i", "C5m", "C5m", "C5m", "C5m", "C5m", "C5i", "…
$ top3_c5      <chr> "C5c", "C5g", "C5c", "C5g", "C5c", "C5c", "C5g", "C5g", "…
$ top1_c5_tx   <dbl> 51.18931, 47.47342, 69.48573, 60.66320, 47.84556, 59.9406…
$ top2_c5_tx   <dbl> 20.83284, 26.97161, 27.03221, 21.39208, 18.16681, 24.0300…
$ top3_c5_tx   <dbl> 11.186783, 11.274131, 15.771133, 12.938600, 11.510267, 14…
$ top1_d3      <chr> "d3i", "d3i", "d3i", "d3i", "d3i", "d3i", "d3i", "d3h", "…
$ top2_d3      <chr> "d3h", "d3h", "d3h", "d3h", "d3h", "d3h", "d3h", "d3i", "…
$ top3_d3      <chr> "d3b", "d3e", "d3b", "d3a", "d3b", "d3b", "d3b", "d3e", "…
$ top1_d3_tx   <dbl> 53.34838, 47.71013, 49.17692, 43.96847, 53.65347, 69.6482…
$ top2_d3_tx   <dbl> 40.78846, 47.20222, 41.08555, 43.64921, 39.22565, 45.9239…
$ top3_d3_tx   <dbl> 34.52397, 37.25828, 33.51655, 28.87762, 36.94359, 33.9122…
$ top1_e1      <chr> "e1a", "e1a", "e1a", "e1a", "e1a", "e1a", "e1a", "e1a", "…
$ top2_e1      <chr> "e1b", "e1b", "e1h", "e1b", "e1b", "e1i", "e1b", "e1b", "…
$ top3_e1      <chr> "e1i", "e1h", "e1b", "e1h", "e1i", "e1b", "e1i", "e1h", "…
$ top1_e1_tx   <dbl> 60.48851, 61.24143, 67.25265, 58.35803, 61.26543, 50.5072…
$ top2_e1_tx   <dbl> 32.98634, 32.65841, 36.18566, 30.74533, 33.57854, 33.2471…
$ top3_e1_tx   <dbl> 29.40886, 28.32106, 33.68433, 30.05647, 30.70871, 31.4062…
$ taille       <chr> "Ensemble", "Ensemble", "Ensemble", "Ensemble", "Ensemble…
$ secteur      <chr> "Ensemble", "Agriculture, sylviculture, pêche", "Industri…

Visualiser une base

# A tibble: 6 × 30
  tx_acc ns_txacc tx_form ns_txform tx_courses ns_txcourses tx_autres
   <dbl>    <dbl>   <dbl>     <dbl>      <dbl>        <dbl>     <dbl>
1   39.6        0    30.2         0       28.3            0     1.91 
2   14.9        0    22.4         0       21.9            0     0.456
3   25.1        0    73.0         0       60.7            0    12.3  
4   41.5        0    94.3         0       89.9            0     4.40 
5   54.3        0    99.2         0       97.6            0     1.60 
6   61.0        0    99.6         0       98.3            0     1.35 
# ℹ 23 more variables: ns_txautres <dbl>, heurstag <dbl>, heurstag_sal <dbl>,
#   top1_c5 <chr>, top2_c5 <chr>, top3_c5 <chr>, top1_c5_tx <dbl>,
#   top2_c5_tx <dbl>, top3_c5_tx <dbl>, top1_d3 <chr>, top2_d3 <chr>,
#   top3_d3 <chr>, top1_d3_tx <dbl>, top2_d3_tx <dbl>, top3_d3_tx <dbl>,
#   top1_e1 <chr>, top2_e1 <chr>, top3_e1 <chr>, top1_e1_tx <dbl>,
#   top2_e1_tx <dbl>, top3_e1_tx <dbl>, taille <chr>, secteur <chr>
# A tibble: 6 × 30
  tx_acc ns_txacc tx_form ns_txform tx_courses ns_txcourses tx_autres
   <dbl>    <dbl>   <dbl>     <dbl>      <dbl>        <dbl>     <dbl>
1   39.6        0    30.2         0       28.3            0     1.91 
2   14.9        0    22.4         0       21.9            0     0.456
3   25.1        0    73.0         0       60.7            0    12.3  
4   41.5        0    94.3         0       89.9            0     4.40 
5   54.3        0    99.2         0       97.6            0     1.60 
6   61.0        0    99.6         0       98.3            0     1.35 
# ℹ 23 more variables: ns_txautres <dbl>, heurstag <dbl>, heurstag_sal <dbl>,
#   top1_c5 <chr>, top2_c5 <chr>, top3_c5 <chr>, top1_c5_tx <dbl>,
#   top2_c5_tx <dbl>, top3_c5_tx <dbl>, top1_d3 <chr>, top2_d3 <chr>,
#   top3_d3 <chr>, top1_d3_tx <dbl>, top2_d3_tx <dbl>, top3_d3_tx <dbl>,
#   top1_e1 <chr>, top2_e1 <chr>, top3_e1 <chr>, top1_e1_tx <dbl>,
#   top2_e1_tx <dbl>, top3_e1_tx <dbl>, taille <chr>, secteur <chr>

Les stats

mean(base_efe$tx_acc)
[1] 38.11314
median(base_efe$tx_acc)
[1] 39.65
quantile(base_efe$tx_acc)
    0%    25%    50%    75%   100% 
 9.260 23.120 39.650 52.335 70.890 
table(base_efe$top1_c5)

C5i C5m 
 33   2 
table(base_efe$top1_c5, base_efe$top2_c5)
     
      C5c C5i C5m
  C5i  11   0  22
  C5m   0   2   0
prop.table(table(base_efe$top1_c5, base_efe$top2_c5))
     
             C5c        C5i        C5m
  C5i 0.31428571 0.00000000 0.62857143
  C5m 0.00000000 0.05714286 0.00000000

Les manipulations de données de Base

%>% PIPE

Le pipe permet d’enchaîner les opérations les unes après les autres.

head(summarise(group_by(filter(base_efe, tx_acc > 0.2),top1_c5),moyenne = mean(tx_acc)))
# A tibble: 2 × 2
  top1_c5 moyenne
  <chr>     <dbl>
1 C5i        39.7
2 C5m        12.4

%>% PIPE

# A tibble: 2 × 2
  top1_c5 moyenne
  <chr>     <dbl>
1 C5i        39.7
2 C5m        12.4

MUTATE

mutate() crée de nouvelles colonnes qui sont des fonctions de variables existantes. Il peut également modifier des colonnes (si le nom est le même qu’une colonne existante) et en supprimer (en leur affectant la valeur NULL).

base_new<- base_efe %>%
  mutate(tx_acc_plus_1=tx_acc+1)

mean(base_new$tx_acc)
[1] 38.11314
mean(base_new$tx_acc_plus_1)
[1] 39.11314

SELECT

base_new <- base_new %>% select(-tx_acc_plus_1)

mean(base_new$tx_acc_plus_1)
[1] NA
base_new <- base_new %>% select(taille)

head(base_new)
# A tibble: 6 × 1
  taille  
  <chr>   
1 Ensemble
2 Ensemble
3 Ensemble
4 Ensemble
5 Ensemble
6 Ensemble

CASEWHEN

base_efe <- base_efe %>%
  mutate(
    new_var = case_when(
      tx_form==100~ "1",
      tx_form<=30 ~ "0",
      TRUE ~ "9"
      )
  )

table(base_efe$new_var)

 0  1  9 
 8  5 22 

IFELSE

base_efe <- base_efe %>%
  mutate(
   new_var2 = ifelse(
      tx_acc <=30, "0", "1"
    )
  )


table(base_efe$new_var2)

 0  1 
13 22 

MERGE