Cette analyse exploite la puissance complète de R avec des packages modernes pour des visualisations de qualité professionnelle.
# Manipulation de données
library(tidyverse)
# Visualisation avancée
library(GGally)
library(ggforce)
library(ggrepel) # Labels intelligents sans chevauchement
library(patchwork) # Combinaison de graphiques
library(ggsci) # Palettes scientifiques
library(scales) # Formatage des axes
# ACP avancée
library(factoextra) # LE package pour l'ACP
library(FactoMineR) # Analyses multivariées
library(corrplot) # Matrices de corrélation
# Interactivité (optionnel)
library(plotly) # Graphiques interactifs
# Définir un thème personnalisé
theme_acp <- function() {
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5, color = "gray40"),
plot.caption = element_text(size = 9, color = "gray50"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray90"),
legend.position = "right"
)
}
# Palette de couleurs moderne
palette_moderne <- c("#E64B35FF", "#4DBBD5FF", "#00A087FF", "#3C5488FF",
"#F39B7FFF", "#8491B4FF", "#91D1C2FF", "#DC0000FF")
# Importation des données
data <- read_delim("Exemple_data_ACP.txt", delim = "\t")
# Aperçu élégant
head(data) %>%
knitr::kable(caption = "📊 Aperçu des données (notes des étudiants)")
| …1 | MATH | PHYS | SVT | HIST | GEOG | ARAB | FRAN | SPORT |
|---|---|---|---|---|---|---|---|---|
| Et1 | 9.5 | 8.5 | 9 | 15.5 | 16.0 | 14 | 17.0 | 15 |
| Et2 | 11.0 | 11.5 | 10 | 9.0 | 12.5 | 13 | 10.5 | 13 |
| Et3 | 14.5 | 14.5 | 16 | 16.5 | 14.0 | 14 | 13.5 | 15 |
| Et4 | 15.5 | 14.5 | 13 | 12.5 | 10.0 | 13 | 11.0 | 16 |
| Et5 | 8.0 | 7.5 | 9 | 14.5 | 15.0 | 16 | 14.5 | 14 |
| Et6 | 12.0 | 10.0 | 11 | 11.5 | 11.5 | 9 | 10.0 | 13 |
data_num <- data %>% select(where(is.numeric))
# Statistiques résumées
summary(data_num) %>%
knitr::kable(caption = "📈 Statistiques descriptives par matière")
| MATH | PHYS | SVT | HIST | GEOG | ARAB | FRAN | SPORT | |
|---|---|---|---|---|---|---|---|---|
| Min. : 4.00 | Min. : 6.000 | Min. : 6.50 | Min. : 4.50 | Min. : 4.50 | Min. : 4.000 | Min. : 5.00 | Min. : 7.00 | |
| 1st Qu.: 8.50 | 1st Qu.: 8.125 | 1st Qu.: 9.50 | 1st Qu.: 9.50 | 1st Qu.: 9.50 | 1st Qu.: 9.625 | 1st Qu.:10.00 | 1st Qu.:13.50 | |
| Median :11.50 | Median :11.500 | Median :12.00 | Median :13.00 | Median :12.75 | Median :13.000 | Median :12.50 | Median :15.50 | |
| Mean :11.89 | Mean :11.870 | Mean :12.09 | Mean :12.48 | Mean :12.06 | Mean :12.090 | Mean :12.01 | Mean :14.57 | |
| 3rd Qu.:15.50 | 3rd Qu.:15.000 | 3rd Qu.:15.00 | 3rd Qu.:15.50 | 3rd Qu.:14.88 | 3rd Qu.:15.000 | 3rd Qu.:14.50 | 3rd Qu.:16.50 | |
| Max. :18.00 | Max. :17.500 | Max. :18.00 | Max. :17.50 | Max. :18.00 | Max. :18.000 | Max. :17.00 | Max. :18.00 |
# Calcul de la matrice de corrélation
cor_matrix <- cor(data_num)
# Visualisation avec corrplot (beaucoup plus élégante)
corrplot(cor_matrix,
method = "color",
type = "upper",
order = "hclust",
addCoef.col = "black",
tl.col = "black",
tl.srt = 45,
number.cex = 0.7,
col = colorRampPalette(c("#6D9EC1", "white", "#E46726"))(200),
title = "Matrice de Corrélation des Matières",
mar = c(0, 0, 2, 0))
# Transformer en format long pour ggplot
data_long <- data_num %>%
pivot_longer(cols = everything(),
names_to = "Matiere",
values_to = "Note")
# Boxplot moderne avec violon
p1 <- ggplot(data_long, aes(x = reorder(Matiere, Note, median),
y = Note,
fill = Matiere)) +
geom_violin(alpha = 0.7, scale = "width") +
geom_boxplot(width = 0.2, alpha = 0.8, outlier.shape = 21) +
stat_summary(fun = mean, geom = "point",
shape = 23, size = 3, fill = "red") +
scale_fill_manual(values = palette_moderne) +
coord_flip() +
theme_acp() +
labs(
title = "Distribution des notes par matière",
subtitle = "Violin + Boxplot | ◆ = moyenne",
x = "Matière",
y = "Note"
) +
theme(legend.position = "none")
p1
# Sélection de quelques variables pour la lisibilité
vars_select <- c("MATH", "PHYS", "SVT", "HIST", "ARAB", "FRAN")
data_select <- data_num %>% select(all_of(vars_select))
# GGpairs avec personnalisation
ggpairs(data_select,
title = "Matrice de Scatter Plots des Matières Principales",
upper = list(continuous = wrap("cor", size = 4, color = "steelblue")),
diag = list(continuous = wrap("densityDiag", fill = "steelblue", alpha = 0.5)),
lower = list(continuous = wrap("smooth", alpha = 0.2, color = "coral"))) +
theme_minimal()
# Matrice X standardisée
X <- scale(data_num, center = TRUE, scale = TRUE)
# Matrice de variance-covariance (= matrice de corrélation car standardisé)
V <- cor(data_num)
# Affichage élégant
knitr::kable(round(V, 3),
caption = "Matrice de Corrélation (V)")
| MATH | PHYS | SVT | HIST | GEOG | ARAB | FRAN | SPORT | |
|---|---|---|---|---|---|---|---|---|
| MATH | 1.000 | 0.943 | 0.913 | 0.065 | -0.006 | -0.007 | 0.090 | -0.098 |
| PHYS | 0.943 | 1.000 | 0.931 | 0.075 | -0.010 | 0.037 | 0.097 | -0.200 |
| SVT | 0.913 | 0.931 | 1.000 | 0.048 | -0.030 | -0.014 | 0.090 | -0.139 |
| HIST | 0.065 | 0.075 | 0.048 | 1.000 | 0.871 | 0.879 | 0.815 | -0.043 |
| GEOG | -0.006 | -0.010 | -0.030 | 0.871 | 1.000 | 0.915 | 0.888 | -0.028 |
| ARAB | -0.007 | 0.037 | -0.014 | 0.879 | 0.915 | 1.000 | 0.877 | -0.062 |
| FRAN | 0.090 | 0.097 | 0.090 | 0.815 | 0.888 | 0.877 | 1.000 | -0.109 |
| SPORT | -0.098 | -0.200 | -0.139 | -0.043 | -0.028 | -0.062 | -0.109 | 1.000 |
# Décomposition spectrale
eig <- eigen(V)
# Valeurs propres
lambda <- eig$values
# Vecteurs propres
Q <- eig$vectors
# Tableau des valeurs propres
tibble(
Composante = paste0("Dim", 1:length(lambda)),
`Valeur propre` = round(lambda, 3),
`% Variance` = round(100 * lambda / sum(lambda), 2),
`% Variance cumulée` = round(100 * cumsum(lambda) / sum(lambda), 2)
) %>%
knitr::kable(caption = "📊 Valeurs propres et variance expliquée")
| Composante | Valeur propre | % Variance | % Variance cumulée |
|---|---|---|---|
| Dim1 | 3.653 | 45.66 | 45.66 |
| Dim2 | 2.875 | 35.93 | 81.60 |
| Dim3 | 0.971 | 12.13 | 93.73 |
| Dim4 | 0.186 | 2.32 | 96.05 |
| Dim5 | 0.107 | 1.34 | 97.39 |
| Dim6 | 0.094 | 1.17 | 98.57 |
| Dim7 | 0.079 | 0.99 | 99.55 |
| Dim8 | 0.036 | 0.45 | 100.00 |
# Données pour le scree plot
scree_data <- tibble(
Axe = factor(1:length(lambda)),
Valeur_propre = lambda,
Variance_expliquée = lambda / sum(lambda),
Variance_cumulée = cumsum(lambda) / sum(lambda)
)
# Scree plot avec deux axes
p_scree <- ggplot(scree_data, aes(x = Axe)) +
# Valeurs propres
geom_line(aes(y = Valeur_propre, group = 1, color = "Valeur propre"),
linewidth = 1.2) +
geom_point(aes(y = Valeur_propre, fill = "Valeur propre"),
size = 4, shape = 21, stroke = 1.5) +
# Variance cumulée
geom_line(aes(y = Variance_cumulée * max(lambda),
group = 1, color = "Variance cumulée"),
linewidth = 1.2, linetype = "dashed") +
geom_point(aes(y = Variance_cumulée * max(lambda),
fill = "Variance cumulée"),
size = 3, shape = 22, stroke = 1.5) +
# Ligne de Kaiser
geom_hline(yintercept = 1, linetype = "dotted",
color = "red", linewidth = 0.8) +
annotate("text", x = 7, y = 1.2, label = "Critère de Kaiser (λ = 1)",
color = "red", size = 3.5) +
# Échelle secondaire
scale_y_continuous(
name = "Valeur propre",
sec.axis = sec_axis(~ . / max(lambda),
name = "Variance cumulée (%)",
labels = percent_format())
) +
scale_color_manual(values = c("Valeur propre" = "#E64B35",
"Variance cumulée" = "#4DBBD5")) +
scale_fill_manual(values = c("Valeur propre" = "#E64B35",
"Variance cumulée" = "#4DBBD5")) +
theme_acp() +
labs(
title = "Scree Plot — Éboulis des valeurs propres",
subtitle = "Critère du coude et variance expliquée",
x = "Composante principale",
color = NULL,
fill = NULL
) +
theme(legend.position = "top")
p_scree
p_variance <- ggplot(scree_data, aes(x = Axe, y = Variance_expliquée)) +
geom_col(aes(fill = Axe), alpha = 0.8, color = "black") +
geom_text(aes(label = paste0(round(Variance_expliquée * 100, 1), "%")),
vjust = -0.5, size = 3.5, fontface = "bold") +
scale_fill_manual(values = palette_moderne) +
scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
theme_acp() +
labs(
title = "Contribution de chaque composante à la variance totale",
x = "Composante principale",
y = "Variance expliquée (%)"
) +
theme(legend.position = "none")
p_variance
# Matrice des composantes principales (scores)
C <- X %*% Q
# Conversion en tibble avec noms des étudiants
C_df <- as_tibble(C, .name_repair = ~ paste0("Dim", seq_along(.x))) %>%
mutate(Etudiant = paste0("Et", 1:nrow(.)))
# Variance expliquée
var_expl <- lambda / sum(lambda)
var1 <- round(var_expl[1] * 100, 2)
var2 <- round(var_expl[2] * 100, 2)
# Aperçu
head(C_df) %>%
knitr::kable(caption = "🎯 Scores des individus (premières lignes)")
| Dim1 | Dim2 | Dim3 | Dim4 | Dim5 | Dim6 | Dim7 | Dim8 | Etudiant |
|---|---|---|---|---|---|---|---|---|
| -1.7424901 | 1.7347590 | -0.0379169 | -0.5626094 | -0.5388337 | 0.3248806 | -0.2359421 | -0.0796506 | Et1 |
| 0.5581921 | 0.4194254 | 0.6546037 | -0.3110646 | 0.9276801 | 0.2995803 | 0.0591462 | 0.0416775 | Et2 |
| -1.5440460 | -1.1812939 | -0.4117822 | 0.4020182 | -0.2889418 | -0.2891585 | 0.2932279 | 0.0295352 | Et3 |
| 0.1535797 | -1.0542224 | -0.6335935 | 0.3175086 | 0.3750181 | -0.0993926 | -0.5546076 | 0.1998262 | Et4 |
| -1.3215797 | 2.0188693 | 0.3647639 | -0.1864374 | 0.1582612 | -0.0929566 | 0.0194858 | 0.3209984 | Et5 |
| 1.0094856 | 0.2241532 | 0.5949066 | 0.2170318 | -0.3240638 | 0.5827309 | 0.2001991 | 0.1286688 | Et6 |
# Coordonnées des variables dans le cercle
corvar <- Q %*% diag(sqrt(lambda))
colnames(corvar) <- paste0("Dim", 1:ncol(corvar))
# Conversion en tibble
corvar_df <- as_tibble(corvar) %>%
mutate(
Variable = colnames(data_num),
# Calcul de la qualité de représentation (cos²)
Cos2_Dim1 = Dim1^2,
Cos2_Dim2 = Dim2^2,
Cos2_Total = Cos2_Dim1 + Cos2_Dim2
)
# Tableau des coordonnées
corvar_df %>%
select(Variable, Dim1, Dim2, Cos2_Total) %>%
arrange(desc(Cos2_Total)) %>%
knitr::kable(
digits = 3,
caption = "📍 Coordonnées des variables (triées par qualité de représentation)"
)
| Variable | Dim1 | Dim2 | Cos2_Total |
|---|---|---|---|
| PHYS | -0.208 | -0.960 | 0.965 |
| MATH | -0.189 | -0.950 | 0.938 |
| SVT | -0.177 | -0.952 | 0.937 |
| GEOG | -0.944 | 0.210 | 0.935 |
| ARAB | -0.947 | 0.187 | 0.932 |
| FRAN | -0.939 | 0.088 | 0.889 |
| HIST | -0.927 | 0.123 | 0.875 |
| SPORT | 0.118 | 0.205 | 0.056 |
# Direction (ajustable)
s <- 1
# Normaliser pour le biplot
C_norm <- C_df %>%
mutate(
Dim1_norm = Dim1 * 0.8,
Dim2_norm = Dim2 * 0.8
)
corvar_norm <- corvar_df %>%
mutate(
Dim1_scale = Dim1 * 4,
Dim2_scale = Dim2 * 4
)
# Biplot ultime
p_biplot <- ggplot() +
# Grille
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
# Individus (points)
geom_point(
data = C_norm,
aes(x = s * Dim1_norm, y = s * Dim2_norm, color = Groupe),
size = 3, alpha = 0.5
) +
# Variables (flèches)
geom_segment(
data = corvar_norm,
aes(x = 0, y = 0, xend = s * Dim1_scale, yend = s * Dim2_scale),
arrow = arrow(length = unit(0.4, "cm"), type = "closed"),
color = "#E64B35", linewidth = 1.5, alpha = 0.7
) +
# Labels des variables
geom_label_repel(
data = corvar_norm,
aes(x = s * Dim1_scale, y = s * Dim2_scale, label = Variable),
fill = "#E64B35", color = "white",
fontface = "bold", size = 4,
box.padding = 0.5
) +
scale_color_manual(values = c("#4DBBD5", "#00A087", "#F39B7F", "#3C5488")) +
coord_equal() +
theme_acp() +
labs(
title = "Biplot : Individus et Variables",
subtitle = "Projection simultanée des étudiants (points) et des matières (flèches)",
x = paste0("Dimension 1 (", var1, "%)"),
y = paste0("Dimension 2 (", var2, "%)"),
color = "Profil étudiant"
)
p_biplot
Pour une validation, utilisons FactoMineR qui est
LA référence pour l’ACP en R.
# ACP avec FactoMineR
res_pca <- PCA(data_num, scale.unit = TRUE, graph = FALSE)
# Graphique des valeurs propres (factoextra)
p1_facto <- fviz_eig(res_pca, addlabels = TRUE, ylim = c(0, 50),
barfill = "#4DBBD5", barcolor = "#4DBBD5") +
theme_acp() +
labs(title = "Variance expliquée par composante (factoextra)")
# Graphique des individus
p2_facto <- fviz_pca_ind(
res_pca,
col.ind = "contrib",
gradient.cols = c("#00A087", "#4DBBD5", "#E64B35"),
repel = TRUE,
legend.title = "Contribution"
) +
theme_acp() +
labs(title = "Individus (colorés par contribution)")
# Graphique des variables
p3_facto <- fviz_pca_var(
res_pca,
col.var = "cos2",
gradient.cols = c("#F39B7F", "#4DBBD5", "#E64B35"),
repel = TRUE,
legend.title = "Cos²"
) +
theme_acp() +
labs(title = "Variables (colorées par cos²)")
# Biplot
p4_facto <- fviz_pca_biplot(
res_pca,
repel = TRUE,
col.var = "#E64B35",
col.ind = "#696969",
legend.title = "Variables"
) +
theme_acp() +
labs(title = "Biplot (FactoMineR)")
# Combiner avec patchwork
(p1_facto + p2_facto) / (p3_facto + p4_facto) +
plot_annotation(
title = "Analyse complète avec FactoMineR & factoextra",
theme = theme(plot.title = element_text(size = 18, face = "bold", hjust = 0.5))
)
La première dimension (variance expliquée : 45.66%) oppose clairement :
Interprétation : Cette dimension discrimine les étudiants selon leur profil scientifique vs littéraire.
La deuxième dimension (variance expliquée : 35.93%) semble représenter un gradient de performance globale, tous domaines confondus.
Les ellipses de confiance révèlent 4 profils distincts :
Analyse réalisée avec R version R version 4.4.2 (2024-10-31 ucrt)