1 1) Introduction

Ce notebook présente une étude complète de la régression :

Régression linéaire multiple (OLS/MCO)
Estimation et interprétation des coefficients
\(R^2\) et \(R^2\) ajusté
Tests de significativité (t et F)
Prévision
Méthode pas à pas (stepwise)
Multicolin��arité (VIF)
Points aberrants et influents
Homoscédasticité et normalité des résidus
Limites via train/test
Régression sur composantes (PCR/PLS)
Comparaison OLS vs PCR vs PLS

2 2) Packages

required_packages <- c(
  "AmesHousing", "ggplot2", "dplyr", "tibble", "tidyr",
  "caret", "car", "lmtest", "MASS", "pls", "broom", "patchwork"
)

installed <- rownames(installed.packages())
to_install <- required_packages[!(required_packages %in% installed)]
if(length(to_install) > 0) install.packages(to_install)

library(AmesHousing)
library(ggplot2)
library(dplyr)
library(tibble)
library(tidyr)
library(caret)
library(car)
library(lmtest)
library(MASS)
library(pls)
library(broom)
library(patchwork)

set.seed(2026)

3 3) Chargement et préparation des données

data_raw <- AmesHousing::make_ames()

# Sélection d'un sous-ensemble numérique robuste
df <- data.frame(
  SalePrice   = data_raw$Sale_Price,
  GrLivArea   = data_raw$Gr_Liv_Area,
  OverallQual = data_raw$Overall_Qual,
  YearBuilt   = data_raw$Year_Built,
  GarageArea  = data_raw$Garage_Area,
  TotalBsmtSF = data_raw$Total_Bsmt_SF,
  FullBath    = data_raw$Full_Bath,
  Bedroom     = data_raw$Bedroom_AbvGr,
  LotArea     = data_raw$Lot_Area
)

# Retrait NA
df <- df[stats::complete.cases(df), ]

str(df)

## 'data.frame':    2930 obs. of  9 variables:
##  $ SalePrice  : int  215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
##  $ GrLivArea  : int  1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
##  $ OverallQual: Factor w/ 10 levels "Very_Poor","Poor",..: 6 5 6 7 5 6 8 8 8 7 ...
##  $ YearBuilt  : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ GarageArea : num  528 730 312 522 482 470 582 506 608 442 ...
##  $ TotalBsmtSF: num  1080 882 1329 2110 928 ...
##  $ FullBath   : int  1 1 1 2 2 2 2 2 2 2 ...
##  $ Bedroom    : int  3 2 3 3 3 3 2 2 2 3 ...
##  $ LotArea    : int  31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...

summary(df)

##    SalePrice        GrLivArea           OverallQual    YearBuilt   
##  Min.   : 12789   Min.   : 334   Average      :825   Min.   :1872  
##  1st Qu.:129500   1st Qu.:1126   Above_Average:732   1st Qu.:1954  
##  Median :160000   Median :1442   Good         :602   Median :1973  
##  Mean   :180796   Mean   :1500   Very_Good    :350   Mean   :1971  
##  3rd Qu.:213500   3rd Qu.:1743   Below_Average:226   3rd Qu.:2001  
##  Max.   :755000   Max.   :5642   Excellent    :107   Max.   :2010  
##                                  (Other)      : 88                 
##    GarageArea      TotalBsmtSF      FullBath        Bedroom     
##  Min.   :   0.0   Min.   :   0   Min.   :0.000   Min.   :0.000  
##  1st Qu.: 320.0   1st Qu.: 793   1st Qu.:1.000   1st Qu.:2.000  
##  Median : 480.0   Median : 990   Median :2.000   Median :3.000  
##  Mean   : 472.7   Mean   :1051   Mean   :1.567   Mean   :2.854  
##  3rd Qu.: 576.0   3rd Qu.:1302   3rd Qu.:2.000   3rd Qu.:3.000  
##  Max.   :1488.0   Max.   :6110   Max.   :4.000   Max.   :8.000  
##                                                                 
##     LotArea      
##  Min.   :  1300  
##  1st Qu.:  7440  
##  Median :  9436  
##  Mean   : 10148  
##  3rd Qu.: 11555  
##  Max.   :215245  
##

4 4) Analyse exploratoire

4.1 4.1 Distribution de la cible

ggplot(df, aes(x = SalePrice)) +
  geom_histogram(bins = 40, fill = "steelblue", color = "white") +
  labs(title = "Distribution de SalePrice", x = "Prix", y = "Effectif")

4.2 4.2 Relations bivariées

p1 <- ggplot(df, aes(x = GrLivArea, y = SalePrice)) +
  geom_point(alpha = 0.35) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "SalePrice ~ GrLivArea")

p2 <- ggplot(df, aes(x = OverallQual, y = SalePrice)) +
  geom_point(alpha = 0.35) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "SalePrice ~ OverallQual")

p3 <- ggplot(df, aes(x = TotalBsmtSF, y = SalePrice)) +
  geom_point(alpha = 0.35) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "SalePrice ~ TotalBsmtSF")

p4 <- ggplot(df, aes(x = GarageArea, y = SalePrice)) +
  geom_point(alpha = 0.35) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "SalePrice ~ GarageArea")

(p1 | p2) / (p3 | p4)

4.3 4.3 Corrélations (sans `where()`)

# Sélection uniquement numérique, compatible toutes versions
df_num <- df[, sapply(df, is.numeric), drop = FALSE]

corr_mat <- stats::cor(df_num, use = "complete.obs")
round(corr_mat, 3)

##             SalePrice GrLivArea YearBuilt GarageArea TotalBsmtSF FullBath
## SalePrice       1.000     0.707     0.558      0.640       0.633    0.546
## GrLivArea       0.707     1.000     0.242      0.484       0.445    0.630
## YearBuilt       0.558     0.242     1.000      0.481       0.408    0.469
## GarageArea      0.640     0.484     0.481      1.000       0.486    0.406
## TotalBsmtSF     0.633     0.445     0.408      0.486       1.000    0.325
## FullBath        0.546     0.630     0.469      0.406       0.325    1.000
## Bedroom         0.144     0.517    -0.055      0.073       0.053    0.359
## LotArea         0.267     0.286     0.023      0.213       0.254    0.127
##             Bedroom LotArea
## SalePrice     0.144   0.267
## GrLivArea     0.517   0.286
## YearBuilt    -0.055   0.023
## GarageArea    0.073   0.213
## TotalBsmtSF   0.053   0.254
## FullBath      0.359   0.127
## Bedroom       1.000   0.137
## LotArea       0.137   1.000

5 5) Train / Test split

idx <- caret::createDataPartition(df$SalePrice, p = 0.80, list = FALSE)
train <- df[idx, , drop = FALSE]
test  <- df[-idx, , drop = FALSE]

c(n_train = nrow(train), n_test = nrow(test))

## n_train  n_test 
##    2346     584

6 6) Régression linéaire multiple (OLS)

mod_ols <- stats::lm(SalePrice ~ ., data = train)
summary(mod_ols)

## 
## Call:
## stats::lm(formula = SalePrice ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -547659  -14632    -304   12931  236427 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -8.393e+05  6.723e+04 -12.485  < 2e-16 ***
## GrLivArea                  5.449e+01  2.508e+00  21.732  < 2e-16 ***
## OverallQualPoor            1.955e+04  1.975e+04   0.990 0.322509    
## OverallQualFair            2.690e+04  1.811e+04   1.485 0.137592    
## OverallQualBelow_Average   3.679e+04  1.734e+04   2.122 0.033977 *  
## OverallQualAverage         5.139e+04  1.725e+04   2.979 0.002926 ** 
## OverallQualAbove_Average   6.029e+04  1.729e+04   3.488 0.000496 ***
## OverallQualGood            7.692e+04  1.738e+04   4.426 1.00e-05 ***
## OverallQualVery_Good       1.179e+05  1.756e+04   6.714 2.37e-11 ***
## OverallQualExcellent       1.926e+05  1.795e+04  10.726  < 2e-16 ***
## OverallQualVery_Excellent  2.077e+05  1.912e+04  10.862  < 2e-16 ***
## YearBuilt                  4.251e+02  3.345e+01  12.710  < 2e-16 ***
## GarageArea                 3.218e+01  4.299e+00   7.486 1.00e-13 ***
## TotalBsmtSF                1.563e+01  2.125e+00   7.354 2.65e-13 ***
## FullBath                   1.047e+03  1.908e+03   0.549 0.583179    
## Bedroom                   -4.219e+03  1.130e+03  -3.734 0.000193 ***
## LotArea                    7.452e-01  8.921e-02   8.353  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34140 on 2329 degrees of freedom
## Multiple R-squared:  0.8237, Adjusted R-squared:  0.8224 
## F-statistic: 679.9 on 16 and 2329 DF,  p-value: < 2.2e-16

6.1 6.1 Coefficients + IC 95%

coef_table <- broom::tidy(mod_ols, conf.int = TRUE)
coef_table

## # A tibble: 17 × 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)          -8.39e+5   6.72e+4   -12.5   1.13e-34 -9.71e+5  -7.07e+5
##  2 GrLivArea             5.45e+1   2.51e+0    21.7   1.69e-95  4.96e+1   5.94e+1
##  3 OverallQualPoor       1.95e+4   1.98e+4     0.990 3.23e- 1 -1.92e+4   5.83e+4
##  4 OverallQualFair       2.69e+4   1.81e+4     1.49  1.38e- 1 -8.62e+3   6.24e+4
##  5 OverallQualBelow_Av…  3.68e+4   1.73e+4     2.12  3.40e- 2  2.79e+3   7.08e+4
##  6 OverallQualAverage    5.14e+4   1.73e+4     2.98  2.93e- 3  1.76e+4   8.52e+4
##  7 OverallQualAbove_Av…  6.03e+4   1.73e+4     3.49  4.96e- 4  2.64e+4   9.42e+4
##  8 OverallQualGood       7.69e+4   1.74e+4     4.43  1.00e- 5  4.28e+4   1.11e+5
##  9 OverallQualVery_Good  1.18e+5   1.76e+4     6.71  2.37e-11  8.35e+4   1.52e+5
## 10 OverallQualExcellent  1.93e+5   1.80e+4    10.7   3.11e-26  1.57e+5   2.28e+5
## 11 OverallQualVery_Exc…  2.08e+5   1.91e+4    10.9   7.58e-27  1.70e+5   2.45e+5
## 12 YearBuilt             4.25e+2   3.34e+1    12.7   7.79e-36  3.60e+2   4.91e+2
## 13 GarageArea            3.22e+1   4.30e+0     7.49  1.00e-13  2.37e+1   4.06e+1
## 14 TotalBsmtSF           1.56e+1   2.12e+0     7.35  2.65e-13  1.15e+1   1.98e+1
## 15 FullBath              1.05e+3   1.91e+3     0.549 5.83e- 1 -2.69e+3   4.79e+3
## 16 Bedroom              -4.22e+3   1.13e+3    -3.73  1.93e- 4 -6.43e+3  -2.00e+3
## 17 LotArea               7.45e-1   8.92e-2     8.35  1.12e-16  5.70e-1   9.20e-1

6.2 6.2 Test global (ANOVA)

stats::anova(mod_ols)

## Analysis of Variance Table
## 
## Response: SalePrice
##               Df     Sum Sq    Mean Sq   F value    Pr(>F)    
## GrLivArea      1 7.7181e+12 7.7181e+12 6621.8077 < 2.2e-16 ***
## OverallQual    9 4.2440e+12 4.7156e+11  404.5768 < 2.2e-16 ***
## YearBuilt      1 3.9810e+11 3.9810e+11  341.5554 < 2.2e-16 ***
## GarageArea     1 1.1760e+11 1.1760e+11  100.8953 < 2.2e-16 ***
## TotalBsmtSF    1 1.0278e+11 1.0278e+11   88.1822 < 2.2e-16 ***
## FullBath       1 7.7809e+07 7.7809e+07    0.0668  0.796141    
## Bedroom        1 1.6867e+10 1.6867e+10   14.4716  0.000146 ***
## LotArea        1 8.1330e+10 8.1330e+10   69.7779 < 2.2e-16 ***
## Residuals   2329 2.7146e+12 1.1656e+09                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

7 7) Sélection pas à pas (méthode progressive)

mod_step <- MASS::stepAIC(mod_ols, direction = "both", trace = FALSE)
summary(mod_step)

## 
## Call:
## stats::lm(formula = SalePrice ~ GrLivArea + OverallQual + YearBuilt + 
##     GarageArea + TotalBsmtSF + Bedroom + LotArea, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -549113  -14593    -388   12876  236462 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -8.514e+05  6.352e+04 -13.404  < 2e-16 ***
## GrLivArea                  5.500e+01  2.334e+00  23.567  < 2e-16 ***
## OverallQualPoor            2.003e+04  1.973e+04   1.015 0.310122    
## OverallQualFair            2.733e+04  1.809e+04   1.510 0.131069    
## OverallQualBelow_Average   3.704e+04  1.733e+04   2.137 0.032685 *  
## OverallQualAverage         5.154e+04  1.725e+04   2.988 0.002834 ** 
## OverallQualAbove_Average   6.057e+04  1.728e+04   3.507 0.000463 ***
## OverallQualGood            7.735e+04  1.736e+04   4.456 8.74e-06 ***
## OverallQualVery_Good       1.182e+05  1.754e+04   6.739 2.00e-11 ***
## OverallQualExcellent       1.929e+05  1.794e+04  10.753  < 2e-16 ***
## OverallQualVery_Excellent  2.079e+05  1.912e+04  10.873  < 2e-16 ***
## YearBuilt                  4.314e+02  3.141e+01  13.737  < 2e-16 ***
## GarageArea                 3.216e+01  4.298e+00   7.482 1.03e-13 ***
## TotalBsmtSF                1.558e+01  2.123e+00   7.340 2.94e-13 ***
## Bedroom                   -4.113e+03  1.113e+03  -3.695 0.000225 ***
## LotArea                    7.443e-01  8.918e-02   8.346  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34140 on 2330 degrees of freedom
## Multiple R-squared:  0.8236, Adjusted R-squared:  0.8225 
## F-statistic: 725.4 on 15 and 2330 DF,  p-value: < 2.2e-16

AIC(mod_ols, mod_step)

##          df      AIC
## mod_ols  18 55652.78
## mod_step 17 55651.08

8 8) Diagnostics du modèle final

8.1 8.1 Multicolinéarité (VIF)

vif_vals <- car::vif(mod_step)
vif_vals

##                 GVIF Df GVIF^(1/(2*Df))
## GrLivArea   2.839653  1        1.685127
## OverallQual 3.324905  9        1.069025
## YearBuilt   1.806843  1        1.344189
## GarageArea  1.794109  1        1.339443
## TotalBsmtSF 1.776424  1        1.332825
## Bedroom     1.702767  1        1.304901
## LotArea     1.155861  1        1.075110

8.2 8.2 Graphiques diagnostics

oldpar <- par(no.readonly = TRUE)
par(mfrow = c(2,2))
plot(mod_step)

par(oldpar)

8.3 8.3 Homoscédasticité (Breusch-Pagan)

lmtest::bptest(mod_step)

## 
##  studentized Breusch-Pagan test
## 
## data:  mod_step
## BP = 762.35, df = 15, p-value < 2.2e-16

8.4 8.4 Normalité des résidus (Shapiro sur sous-échantillon)

res <- stats::residuals(mod_step)

set.seed(2026)
if(length(res) > 5000){
  res_sub <- sample(res, 5000)
} else {
  res_sub <- res
}

stats::shapiro.test(res_sub)

## 
##  Shapiro-Wilk normality test
## 
## data:  res_sub
## W = 0.76441, p-value < 2.2e-16

8.5 8.5 Points influents et aberrants

h     <- stats::hatvalues(mod_step)
rstud <- stats::rstudent(mod_step)
cook  <- stats::cooks.distance(mod_step)

n <- nrow(train)
k <- length(stats::coef(mod_step))

seuil_levier <- 2*k/n
seuil_cook   <- 4/n

suspects <- which(h > seuil_levier | abs(rstud) > 2 | cook > seuil_cook)

list(
  seuil_levier = seuil_levier,
  seuil_cook = seuil_cook,
  nb_points_suspects = length(suspects),
  points_suspects_head = head(suspects, 20)
)

## $seuil_levier
## [1] 0.01364024
## 
## $seuil_cook
## [1] 0.00170503
## 
## $nb_points_suspects
## [1] 198
## 
## $points_suspects_head
##  16  17  42  45  66  72  92 131 137 175 182 186 187 217 218 229 235 242 288 291 
##  13  14  38  40  56  62  80 107 111 135 141 144 145 172 173 183 186 192 229 232

9 9) Prévision OLS et limites train/test

rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae  <- function(y, yhat) mean(abs(y - yhat))
r2   <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)

pred_train <- stats::predict(mod_step, newdata = train)
pred_test  <- stats::predict(mod_step, newdata = test)

perf_ols <- tibble::tibble(
  Modele = "OLS_stepwise",
  Jeu = c("Train", "Test"),
  RMSE = c(rmse(train$SalePrice, pred_train), rmse(test$SalePrice, pred_test)),
  MAE  = c(mae(train$SalePrice, pred_train),  mae(test$SalePrice, pred_test)),
  R2   = c(r2(train$SalePrice, pred_train),   r2(test$SalePrice, pred_test))
)

perf_ols

## # A tibble: 2 × 5
##   Modele       Jeu     RMSE    MAE    R2
##   <chr>        <chr>  <dbl>  <dbl> <dbl>
## 1 OLS_stepwise Train 34018. 20429. 0.824
## 2 OLS_stepwise Test  27586. 19571. 0.865

10 10) PCR

set.seed(2026)

mod_pcr <- pls::pcr(
  SalePrice ~ .,
  data = train,
  scale = TRUE,
  validation = "CV",
  segments = 10
)

summary(mod_pcr)

## Data:    X dimension: 2346 16 
##  Y dimension: 2346 1
## Fit method: svdpc
## Number of components considered: 16
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           81038    41881    41647    39517    39070    39008    38842
## adjCV        81038    41869    41753    39490    39035    38958    38790
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       38867    38917    39093     38609     38580     38594     36652
## adjCV    38817    38864    39035     38556     38512     38538     36591
##        14 comps  15 comps  16 comps
## CV        35648     35515     35405
## adjCV     35586     35443     35331
## 
## TRAINING: % variance explained
##            1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X            22.98    32.21    41.15    49.36    56.51    63.51    70.05
## SalePrice    73.49    73.63    76.65    77.25    77.66    77.90    77.93
##            8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X            76.43    82.52     87.89     91.05     94.11     96.71     98.78
## SalePrice    78.05    78.06     78.48     78.71     78.71     80.83     81.88
##            15 comps  16 comps
## X             99.99    100.00
## SalePrice     82.23     82.37

pls::validationplot(mod_pcr, val.type = "MSEP")

11 11) PLS

set.seed(2026)

mod_pls <- pls::plsr(
  SalePrice ~ .,
  data = train,
  scale = TRUE,
  validation = "CV",
  segments = 10
)

summary(mod_pls)

## Data:    X dimension: 2346 16 
##  Y dimension: 2346 1
## Fit method: kernelpls
## Number of components considered: 16
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           81038    39186    36708    35743    35469    35496    35507
## adjCV        81038    39166    36645    35659    35403    35427    35435
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       35502    35496    35529     35547     35412     35394     35404
## adjCV    35430    35425    35454     35471     35347     35321     35330
##        14 comps  15 comps  16 comps
## CV        35405     35405     35405
## adjCV     35332     35331     35331
## 
## TRAINING: % variance explained
##            1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X            22.84    30.72    34.81    40.72    45.43    49.99    56.03
## SalePrice    77.17    80.78    82.04    82.18    82.21    82.23    82.23
##            8 comps  9 comps  10 comps  11 comps  12 comps  13 comps  14 comps
## X            60.21    67.26     71.17     75.23     77.37     84.12     90.39
## SalePrice    82.23    82.24     82.26     82.30     82.36     82.37     82.37
##            15 comps  16 comps
## X             93.86    100.00
## SalePrice     82.37     82.37

pls::validationplot(mod_pls, val.type = "MSEP")

12 12) Choix du nombre de composantes

rmsep_pcr <- pls::RMSEP(mod_pcr)$val[1,1,-1]
rmsep_pls <- pls::RMSEP(mod_pls)$val[1,1,-1]

ncomp_pcr <- which.min(rmsep_pcr)
ncomp_pls <- which.min(rmsep_pls)

list(ncomp_pcr = ncomp_pcr, ncomp_pls = ncomp_pls)

## $ncomp_pcr
## 16 comps 
##       16 
## 
## $ncomp_pls
## 12 comps 
##       12

13 13) Prédictions PCR/PLS et comparaison finale

pred_pcr_test <- as.numeric(stats::predict(mod_pcr, newdata = test, ncomp = ncomp_pcr))
pred_pls_test <- as.numeric(stats::predict(mod_pls, newdata = test, ncomp = ncomp_pls))

comparaison <- tibble::tibble(
  Modele = c("OLS_stepwise", "PCR", "PLS"),
  RMSE_test = c(
    rmse(test$SalePrice, pred_test),
    rmse(test$SalePrice, pred_pcr_test),
    rmse(test$SalePrice, pred_pls_test)
  ),
  MAE_test = c(
    mae(test$SalePrice, pred_test),
    mae(test$SalePrice, pred_pcr_test),
    mae(test$SalePrice, pred_pls_test)
  ),
  R2_test = c(
    r2(test$SalePrice, pred_test),
    r2(test$SalePrice, pred_pcr_test),
    r2(test$SalePrice, pred_pls_test)
  )
)

comparaison

## # A tibble: 3 × 4
##   Modele       RMSE_test MAE_test R2_test
##   <chr>            <dbl>    <dbl>   <dbl>
## 1 OLS_stepwise    27586.   19571.   0.865
## 2 PCR             27613.   19565.   0.865
## 3 PLS             27604.   19565.   0.865

comparaison_long <- tidyr::pivot_longer(
  comparaison,
  cols = c("RMSE_test", "MAE_test", "R2_test"),
  names_to = "Metric",
  values_to = "Value"
)

ggplot(comparaison_long, aes(x = Modele, y = Value, fill = Modele)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Metric, scales = "free_y") +
  labs(title = "Comparaison finale sur le jeu test")

14 14) Conclusion

OLS fournit une excellente interprétation économique.
Les diagnostics (VIF, résidus, influence) sont indispensables pour valider l’analyse.
La séparation train/test met en évidence les limites de généralisation.
PCR/PLS sont des alternatives robustes en cas de colinéarité, avec souvent un avantage prédictif pour PLS.

15 15) Rappels théoriques (formules clés)

\[ \hat a = (X^\top X)^{-1}X^\top Y,\quad \hat Y = X\hat a,\quad \hat\varepsilon = Y-\hat Y \]

\[ R^2 = 1-\frac{SCR}{SCT}, \qquad \bar R^2 = 1-\frac{n-1}{n-k-1}(1-R^2) \]

\[ T_j = \frac{\hat a_j-a_j^\star}{\widehat{se}(\hat a_j)} \sim t_{n-k-1} \]

\[ F = \frac{(R^2/k)}{(1-R^2)/(n-k-1)} \]

\[ VIF_j = \frac{1}{1-R_j^2}, \qquad h_i = x_i^\top (X^\top X)^{-1} x_i \]

\[ \hat y_0 = x_0^\top\hat a,\quad \mathrm{Var}(y_0-\hat y_0)=\sigma^2\left(1+x_0^\top(X^\top X)^{-1}x_0\right) \]

Projet de fin de module — Analyse de la régression

Étudiant INSEA (Data Science - Cycle Ingénieur)

2026-04-03

1 1) Introduction

2 2) Packages

3 3) Chargement et préparation des données

4 4) Analyse exploratoire

4.1 4.1 Distribution de la cible

4.2 4.2 Relations bivariées

4.3 4.3 Corrélations (sans `where()`)

5 5) Train / Test split

6 6) Régression linéaire multiple (OLS)

6.1 6.1 Coefficients + IC 95%

6.2 6.2 Test global (ANOVA)

7 7) Sélection pas à pas (méthode progressive)

8 8) Diagnostics du modèle final

8.1 8.1 Multicolinéarité (VIF)

8.2 8.2 Graphiques diagnostics

8.3 8.3 Homoscédasticité (Breusch-Pagan)

8.4 8.4 Normalité des résidus (Shapiro sur sous-échantillon)

8.5 8.5 Points influents et aberrants

9 9) Prévision OLS et limites train/test

10 10) PCR

11 11) PLS

12 12) Choix du nombre de composantes

13 13) Prédictions PCR/PLS et comparaison finale

14 14) Conclusion

15 15) Rappels théoriques (formules clés)

Projet de fin de module — Analyse de la régression

Étudiant INSEA (Data Science - Cycle Ingénieur)

2026-04-03

1 1) Introduction

2 2) Packages

3 3) Chargement et préparation des données

4 4) Analyse exploratoire

4.1 4.1 Distribution de la cible

4.2 4.2 Relations bivariées

4.3 4.3 Corrélations (sans where())

5 5) Train / Test split

6 6) Régression linéaire multiple (OLS)

6.1 6.1 Coefficients + IC 95%

6.2 6.2 Test global (ANOVA)

7 7) Sélection pas à pas (méthode progressive)

8 8) Diagnostics du modèle final

8.1 8.1 Multicolinéarité (VIF)

8.2 8.2 Graphiques diagnostics

8.3 8.3 Homoscédasticité (Breusch-Pagan)

8.4 8.4 Normalité des résidus (Shapiro sur sous-échantillon)

8.5 8.5 Points influents et aberrants

9 9) Prévision OLS et limites train/test

10 10) PCR

11 11) PLS

12 12) Choix du nombre de composantes

13 13) Prédictions PCR/PLS et comparaison finale

14 14) Conclusion

15 15) Rappels théoriques (formules clés)

4.3 4.3 Corrélations (sans `where()`)