Ce notebook présente une étude complète de la régression :
required_packages <- c(
"AmesHousing", "ggplot2", "dplyr", "tibble", "tidyr",
"caret", "car", "lmtest", "MASS", "pls", "broom", "patchwork"
)
installed <- rownames(installed.packages())
to_install <- required_packages[!(required_packages %in% installed)]
if(length(to_install) > 0) install.packages(to_install)
library(AmesHousing)
library(ggplot2)
library(dplyr)
library(tibble)
library(tidyr)
library(caret)
library(car)
library(lmtest)
library(MASS)
library(pls)
library(broom)
library(patchwork)
set.seed(2026)
data_raw <- AmesHousing::make_ames()
# Sélection d'un sous-ensemble numérique robuste
df <- data.frame(
SalePrice = data_raw$Sale_Price,
GrLivArea = data_raw$Gr_Liv_Area,
OverallQual = data_raw$Overall_Qual,
YearBuilt = data_raw$Year_Built,
GarageArea = data_raw$Garage_Area,
TotalBsmtSF = data_raw$Total_Bsmt_SF,
FullBath = data_raw$Full_Bath,
Bedroom = data_raw$Bedroom_AbvGr,
LotArea = data_raw$Lot_Area
)
# Retrait NA
df <- df[stats::complete.cases(df), ]
str(df)
## 'data.frame': 2930 obs. of 9 variables:
## $ SalePrice : int 215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
## $ GrLivArea : int 1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
## $ OverallQual: Factor w/ 10 levels "Very_Poor","Poor",..: 6 5 6 7 5 6 8 8 8 7 ...
## $ YearBuilt : int 1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
## $ GarageArea : num 528 730 312 522 482 470 582 506 608 442 ...
## $ TotalBsmtSF: num 1080 882 1329 2110 928 ...
## $ FullBath : int 1 1 1 2 2 2 2 2 2 2 ...
## $ Bedroom : int 3 2 3 3 3 3 2 2 2 3 ...
## $ LotArea : int 31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
summary(df)
## SalePrice GrLivArea OverallQual YearBuilt
## Min. : 12789 Min. : 334 Average :825 Min. :1872
## 1st Qu.:129500 1st Qu.:1126 Above_Average:732 1st Qu.:1954
## Median :160000 Median :1442 Good :602 Median :1973
## Mean :180796 Mean :1500 Very_Good :350 Mean :1971
## 3rd Qu.:213500 3rd Qu.:1743 Below_Average:226 3rd Qu.:2001
## Max. :755000 Max. :5642 Excellent :107 Max. :2010
## (Other) : 88
## GarageArea TotalBsmtSF FullBath Bedroom
## Min. : 0.0 Min. : 0 Min. :0.000 Min. :0.000
## 1st Qu.: 320.0 1st Qu.: 793 1st Qu.:1.000 1st Qu.:2.000
## Median : 480.0 Median : 990 Median :2.000 Median :3.000
## Mean : 472.7 Mean :1051 Mean :1.567 Mean :2.854
## 3rd Qu.: 576.0 3rd Qu.:1302 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :1488.0 Max. :6110 Max. :4.000 Max. :8.000
##
## LotArea
## Min. : 1300
## 1st Qu.: 7440
## Median : 9436
## Mean : 10148
## 3rd Qu.: 11555
## Max. :215245
##
ggplot(df, aes(x = SalePrice)) +
geom_histogram(bins = 40, fill = "steelblue", color = "white") +
labs(title = "Distribution de SalePrice", x = "Prix", y = "Effectif")
p1 <- ggplot(df, aes(x = GrLivArea, y = SalePrice)) +
geom_point(alpha = 0.35) +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = "SalePrice ~ GrLivArea")
p2 <- ggplot(df, aes(x = OverallQual, y = SalePrice)) +
geom_point(alpha = 0.35) +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = "SalePrice ~ OverallQual")
p3 <- ggplot(df, aes(x = TotalBsmtSF, y = SalePrice)) +
geom_point(alpha = 0.35) +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = "SalePrice ~ TotalBsmtSF")
p4 <- ggplot(df, aes(x = GarageArea, y = SalePrice)) +
geom_point(alpha = 0.35) +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = "SalePrice ~ GarageArea")
(p1 | p2) / (p3 | p4)
where())# Sélection uniquement numérique, compatible toutes versions
df_num <- df[, sapply(df, is.numeric), drop = FALSE]
corr_mat <- stats::cor(df_num, use = "complete.obs")
round(corr_mat, 3)
## SalePrice GrLivArea YearBuilt GarageArea TotalBsmtSF FullBath
## SalePrice 1.000 0.707 0.558 0.640 0.633 0.546
## GrLivArea 0.707 1.000 0.242 0.484 0.445 0.630
## YearBuilt 0.558 0.242 1.000 0.481 0.408 0.469
## GarageArea 0.640 0.484 0.481 1.000 0.486 0.406
## TotalBsmtSF 0.633 0.445 0.408 0.486 1.000 0.325
## FullBath 0.546 0.630 0.469 0.406 0.325 1.000
## Bedroom 0.144 0.517 -0.055 0.073 0.053 0.359
## LotArea 0.267 0.286 0.023 0.213 0.254 0.127
## Bedroom LotArea
## SalePrice 0.144 0.267
## GrLivArea 0.517 0.286
## YearBuilt -0.055 0.023
## GarageArea 0.073 0.213
## TotalBsmtSF 0.053 0.254
## FullBath 0.359 0.127
## Bedroom 1.000 0.137
## LotArea 0.137 1.000
idx <- caret::createDataPartition(df$SalePrice, p = 0.80, list = FALSE)
train <- df[idx, , drop = FALSE]
test <- df[-idx, , drop = FALSE]
c(n_train = nrow(train), n_test = nrow(test))
## n_train n_test
## 2346 584
mod_ols <- stats::lm(SalePrice ~ ., data = train)
summary(mod_ols)
##
## Call:
## stats::lm(formula = SalePrice ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -547659 -14632 -304 12931 236427
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.393e+05 6.723e+04 -12.485 < 2e-16 ***
## GrLivArea 5.449e+01 2.508e+00 21.732 < 2e-16 ***
## OverallQualPoor 1.955e+04 1.975e+04 0.990 0.322509
## OverallQualFair 2.690e+04 1.811e+04 1.485 0.137592
## OverallQualBelow_Average 3.679e+04 1.734e+04 2.122 0.033977 *
## OverallQualAverage 5.139e+04 1.725e+04 2.979 0.002926 **
## OverallQualAbove_Average 6.029e+04 1.729e+04 3.488 0.000496 ***
## OverallQualGood 7.692e+04 1.738e+04 4.426 1.00e-05 ***
## OverallQualVery_Good 1.179e+05 1.756e+04 6.714 2.37e-11 ***
## OverallQualExcellent 1.926e+05 1.795e+04 10.726 < 2e-16 ***
## OverallQualVery_Excellent 2.077e+05 1.912e+04 10.862 < 2e-16 ***
## YearBuilt 4.251e+02 3.345e+01 12.710 < 2e-16 ***
## GarageArea 3.218e+01 4.299e+00 7.486 1.00e-13 ***
## TotalBsmtSF 1.563e+01 2.125e+00 7.354 2.65e-13 ***
## FullBath 1.047e+03 1.908e+03 0.549 0.583179
## Bedroom -4.219e+03 1.130e+03 -3.734 0.000193 ***
## LotArea 7.452e-01 8.921e-02 8.353 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34140 on 2329 degrees of freedom
## Multiple R-squared: 0.8237, Adjusted R-squared: 0.8224
## F-statistic: 679.9 on 16 and 2329 DF, p-value: < 2.2e-16
coef_table <- broom::tidy(mod_ols, conf.int = TRUE)
coef_table
## # A tibble: 17 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -8.39e+5 6.72e+4 -12.5 1.13e-34 -9.71e+5 -7.07e+5
## 2 GrLivArea 5.45e+1 2.51e+0 21.7 1.69e-95 4.96e+1 5.94e+1
## 3 OverallQualPoor 1.95e+4 1.98e+4 0.990 3.23e- 1 -1.92e+4 5.83e+4
## 4 OverallQualFair 2.69e+4 1.81e+4 1.49 1.38e- 1 -8.62e+3 6.24e+4
## 5 OverallQualBelow_Av… 3.68e+4 1.73e+4 2.12 3.40e- 2 2.79e+3 7.08e+4
## 6 OverallQualAverage 5.14e+4 1.73e+4 2.98 2.93e- 3 1.76e+4 8.52e+4
## 7 OverallQualAbove_Av… 6.03e+4 1.73e+4 3.49 4.96e- 4 2.64e+4 9.42e+4
## 8 OverallQualGood 7.69e+4 1.74e+4 4.43 1.00e- 5 4.28e+4 1.11e+5
## 9 OverallQualVery_Good 1.18e+5 1.76e+4 6.71 2.37e-11 8.35e+4 1.52e+5
## 10 OverallQualExcellent 1.93e+5 1.80e+4 10.7 3.11e-26 1.57e+5 2.28e+5
## 11 OverallQualVery_Exc… 2.08e+5 1.91e+4 10.9 7.58e-27 1.70e+5 2.45e+5
## 12 YearBuilt 4.25e+2 3.34e+1 12.7 7.79e-36 3.60e+2 4.91e+2
## 13 GarageArea 3.22e+1 4.30e+0 7.49 1.00e-13 2.37e+1 4.06e+1
## 14 TotalBsmtSF 1.56e+1 2.12e+0 7.35 2.65e-13 1.15e+1 1.98e+1
## 15 FullBath 1.05e+3 1.91e+3 0.549 5.83e- 1 -2.69e+3 4.79e+3
## 16 Bedroom -4.22e+3 1.13e+3 -3.73 1.93e- 4 -6.43e+3 -2.00e+3
## 17 LotArea 7.45e-1 8.92e-2 8.35 1.12e-16 5.70e-1 9.20e-1
stats::anova(mod_ols)
## Analysis of Variance Table
##
## Response: SalePrice
## Df Sum Sq Mean Sq F value Pr(>F)
## GrLivArea 1 7.7181e+12 7.7181e+12 6621.8077 < 2.2e-16 ***
## OverallQual 9 4.2440e+12 4.7156e+11 404.5768 < 2.2e-16 ***
## YearBuilt 1 3.9810e+11 3.9810e+11 341.5554 < 2.2e-16 ***
## GarageArea 1 1.1760e+11 1.1760e+11 100.8953 < 2.2e-16 ***
## TotalBsmtSF 1 1.0278e+11 1.0278e+11 88.1822 < 2.2e-16 ***
## FullBath 1 7.7809e+07 7.7809e+07 0.0668 0.796141
## Bedroom 1 1.6867e+10 1.6867e+10 14.4716 0.000146 ***
## LotArea 1 8.1330e+10 8.1330e+10 69.7779 < 2.2e-16 ***
## Residuals 2329 2.7146e+12 1.1656e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod_step <- MASS::stepAIC(mod_ols, direction = "both", trace = FALSE)
summary(mod_step)
##
## Call:
## stats::lm(formula = SalePrice ~ GrLivArea + OverallQual + YearBuilt +
## GarageArea + TotalBsmtSF + Bedroom + LotArea, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -549113 -14593 -388 12876 236462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.514e+05 6.352e+04 -13.404 < 2e-16 ***
## GrLivArea 5.500e+01 2.334e+00 23.567 < 2e-16 ***
## OverallQualPoor 2.003e+04 1.973e+04 1.015 0.310122
## OverallQualFair 2.733e+04 1.809e+04 1.510 0.131069
## OverallQualBelow_Average 3.704e+04 1.733e+04 2.137 0.032685 *
## OverallQualAverage 5.154e+04 1.725e+04 2.988 0.002834 **
## OverallQualAbove_Average 6.057e+04 1.728e+04 3.507 0.000463 ***
## OverallQualGood 7.735e+04 1.736e+04 4.456 8.74e-06 ***
## OverallQualVery_Good 1.182e+05 1.754e+04 6.739 2.00e-11 ***
## OverallQualExcellent 1.929e+05 1.794e+04 10.753 < 2e-16 ***
## OverallQualVery_Excellent 2.079e+05 1.912e+04 10.873 < 2e-16 ***
## YearBuilt 4.314e+02 3.141e+01 13.737 < 2e-16 ***
## GarageArea 3.216e+01 4.298e+00 7.482 1.03e-13 ***
## TotalBsmtSF 1.558e+01 2.123e+00 7.340 2.94e-13 ***
## Bedroom -4.113e+03 1.113e+03 -3.695 0.000225 ***
## LotArea 7.443e-01 8.918e-02 8.346 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34140 on 2330 degrees of freedom
## Multiple R-squared: 0.8236, Adjusted R-squared: 0.8225
## F-statistic: 725.4 on 15 and 2330 DF, p-value: < 2.2e-16
AIC(mod_ols, mod_step)
## df AIC
## mod_ols 18 55652.78
## mod_step 17 55651.08
vif_vals <- car::vif(mod_step)
vif_vals
## GVIF Df GVIF^(1/(2*Df))
## GrLivArea 2.839653 1 1.685127
## OverallQual 3.324905 9 1.069025
## YearBuilt 1.806843 1 1.344189
## GarageArea 1.794109 1 1.339443
## TotalBsmtSF 1.776424 1 1.332825
## Bedroom 1.702767 1 1.304901
## LotArea 1.155861 1 1.075110
oldpar <- par(no.readonly = TRUE)
par(mfrow = c(2,2))
plot(mod_step)
par(oldpar)
lmtest::bptest(mod_step)
##
## studentized Breusch-Pagan test
##
## data: mod_step
## BP = 762.35, df = 15, p-value < 2.2e-16
res <- stats::residuals(mod_step)
set.seed(2026)
if(length(res) > 5000){
res_sub <- sample(res, 5000)
} else {
res_sub <- res
}
stats::shapiro.test(res_sub)
##
## Shapiro-Wilk normality test
##
## data: res_sub
## W = 0.76441, p-value < 2.2e-16
h <- stats::hatvalues(mod_step)
rstud <- stats::rstudent(mod_step)
cook <- stats::cooks.distance(mod_step)
n <- nrow(train)
k <- length(stats::coef(mod_step))
seuil_levier <- 2*k/n
seuil_cook <- 4/n
suspects <- which(h > seuil_levier | abs(rstud) > 2 | cook > seuil_cook)
list(
seuil_levier = seuil_levier,
seuil_cook = seuil_cook,
nb_points_suspects = length(suspects),
points_suspects_head = head(suspects, 20)
)
## $seuil_levier
## [1] 0.01364024
##
## $seuil_cook
## [1] 0.00170503
##
## $nb_points_suspects
## [1] 198
##
## $points_suspects_head
## 16 17 42 45 66 72 92 131 137 175 182 186 187 217 218 229 235 242 288 291
## 13 14 38 40 56 62 80 107 111 135 141 144 145 172 173 183 186 192 229 232
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae <- function(y, yhat) mean(abs(y - yhat))
r2 <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
pred_train <- stats::predict(mod_step, newdata = train)
pred_test <- stats::predict(mod_step, newdata = test)
perf_ols <- tibble::tibble(
Modele = "OLS_stepwise",
Jeu = c("Train", "Test"),
RMSE = c(rmse(train$SalePrice, pred_train), rmse(test$SalePrice, pred_test)),
MAE = c(mae(train$SalePrice, pred_train), mae(test$SalePrice, pred_test)),
R2 = c(r2(train$SalePrice, pred_train), r2(test$SalePrice, pred_test))
)
perf_ols
## # A tibble: 2 × 5
## Modele Jeu RMSE MAE R2
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 OLS_stepwise Train 34018. 20429. 0.824
## 2 OLS_stepwise Test 27586. 19571. 0.865
set.seed(2026)
mod_pcr <- pls::pcr(
SalePrice ~ .,
data = train,
scale = TRUE,
validation = "CV",
segments = 10
)
summary(mod_pcr)
## Data: X dimension: 2346 16
## Y dimension: 2346 1
## Fit method: svdpc
## Number of components considered: 16
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 81038 41881 41647 39517 39070 39008 38842
## adjCV 81038 41869 41753 39490 39035 38958 38790
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 38867 38917 39093 38609 38580 38594 36652
## adjCV 38817 38864 39035 38556 38512 38538 36591
## 14 comps 15 comps 16 comps
## CV 35648 35515 35405
## adjCV 35586 35443 35331
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 22.98 32.21 41.15 49.36 56.51 63.51 70.05
## SalePrice 73.49 73.63 76.65 77.25 77.66 77.90 77.93
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 76.43 82.52 87.89 91.05 94.11 96.71 98.78
## SalePrice 78.05 78.06 78.48 78.71 78.71 80.83 81.88
## 15 comps 16 comps
## X 99.99 100.00
## SalePrice 82.23 82.37
pls::validationplot(mod_pcr, val.type = "MSEP")
set.seed(2026)
mod_pls <- pls::plsr(
SalePrice ~ .,
data = train,
scale = TRUE,
validation = "CV",
segments = 10
)
summary(mod_pls)
## Data: X dimension: 2346 16
## Y dimension: 2346 1
## Fit method: kernelpls
## Number of components considered: 16
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 81038 39186 36708 35743 35469 35496 35507
## adjCV 81038 39166 36645 35659 35403 35427 35435
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 35502 35496 35529 35547 35412 35394 35404
## adjCV 35430 35425 35454 35471 35347 35321 35330
## 14 comps 15 comps 16 comps
## CV 35405 35405 35405
## adjCV 35332 35331 35331
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 22.84 30.72 34.81 40.72 45.43 49.99 56.03
## SalePrice 77.17 80.78 82.04 82.18 82.21 82.23 82.23
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps
## X 60.21 67.26 71.17 75.23 77.37 84.12 90.39
## SalePrice 82.23 82.24 82.26 82.30 82.36 82.37 82.37
## 15 comps 16 comps
## X 93.86 100.00
## SalePrice 82.37 82.37
pls::validationplot(mod_pls, val.type = "MSEP")
rmsep_pcr <- pls::RMSEP(mod_pcr)$val[1,1,-1]
rmsep_pls <- pls::RMSEP(mod_pls)$val[1,1,-1]
ncomp_pcr <- which.min(rmsep_pcr)
ncomp_pls <- which.min(rmsep_pls)
list(ncomp_pcr = ncomp_pcr, ncomp_pls = ncomp_pls)
## $ncomp_pcr
## 16 comps
## 16
##
## $ncomp_pls
## 12 comps
## 12
pred_pcr_test <- as.numeric(stats::predict(mod_pcr, newdata = test, ncomp = ncomp_pcr))
pred_pls_test <- as.numeric(stats::predict(mod_pls, newdata = test, ncomp = ncomp_pls))
comparaison <- tibble::tibble(
Modele = c("OLS_stepwise", "PCR", "PLS"),
RMSE_test = c(
rmse(test$SalePrice, pred_test),
rmse(test$SalePrice, pred_pcr_test),
rmse(test$SalePrice, pred_pls_test)
),
MAE_test = c(
mae(test$SalePrice, pred_test),
mae(test$SalePrice, pred_pcr_test),
mae(test$SalePrice, pred_pls_test)
),
R2_test = c(
r2(test$SalePrice, pred_test),
r2(test$SalePrice, pred_pcr_test),
r2(test$SalePrice, pred_pls_test)
)
)
comparaison
## # A tibble: 3 × 4
## Modele RMSE_test MAE_test R2_test
## <chr> <dbl> <dbl> <dbl>
## 1 OLS_stepwise 27586. 19571. 0.865
## 2 PCR 27613. 19565. 0.865
## 3 PLS 27604. 19565. 0.865
comparaison_long <- tidyr::pivot_longer(
comparaison,
cols = c("RMSE_test", "MAE_test", "R2_test"),
names_to = "Metric",
values_to = "Value"
)
ggplot(comparaison_long, aes(x = Modele, y = Value, fill = Modele)) +
geom_col(show.legend = FALSE) +
facet_wrap(~Metric, scales = "free_y") +
labs(title = "Comparaison finale sur le jeu test")
\[ \hat a = (X^\top X)^{-1}X^\top Y,\quad \hat Y = X\hat a,\quad \hat\varepsilon = Y-\hat Y \]
\[ R^2 = 1-\frac{SCR}{SCT}, \qquad \bar R^2 = 1-\frac{n-1}{n-k-1}(1-R^2) \]
\[ T_j = \frac{\hat a_j-a_j^\star}{\widehat{se}(\hat a_j)} \sim t_{n-k-1} \]
\[ F = \frac{(R^2/k)}{(1-R^2)/(n-k-1)} \]
\[ VIF_j = \frac{1}{1-R_j^2}, \qquad h_i = x_i^\top (X^\top X)^{-1} x_i \]
\[ \hat y_0 = x_0^\top\hat a,\quad \mathrm{Var}(y_0-\hat y_0)=\sigma^2\left(1+x_0^\top(X^\top X)^{-1}x_0\right) \]