#1. Faire une analyse descriptive de la base de données.
library(readr)
donnees <- read_csv("heart_cleveland.csv")
## Rows: 297 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(donnees)
## # A tibble: 6 × 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 69 1 0 160 234 1 2 131 0 0.1 1
## 2 69 0 0 140 239 0 0 151 0 1.8 0
## 3 66 0 0 150 226 0 0 114 0 2.6 2
## 4 65 1 0 138 282 1 2 174 0 1.4 1
## 5 64 1 0 110 211 0 2 144 1 1.8 1
## 6 64 1 0 170 227 0 2 155 0 0.6 1
## # ℹ 3 more variables: ca <dbl>, thal <dbl>, condition <dbl>
str(donnees)
## spc_tbl_ [297 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:297] 69 69 66 65 64 64 63 61 60 59 ...
## $ sex : num [1:297] 1 0 0 1 1 1 1 1 0 1 ...
## $ cp : num [1:297] 0 0 0 0 0 0 0 0 0 0 ...
## $ trestbps : num [1:297] 160 140 150 138 110 170 145 134 150 178 ...
## $ chol : num [1:297] 234 239 226 282 211 227 233 234 240 270 ...
## $ fbs : num [1:297] 1 0 0 1 0 0 1 0 0 0 ...
## $ restecg : num [1:297] 2 0 0 2 2 2 2 0 0 2 ...
## $ thalach : num [1:297] 131 151 114 174 144 155 150 145 171 145 ...
## $ exang : num [1:297] 0 0 0 0 1 0 0 0 0 0 ...
## $ oldpeak : num [1:297] 0.1 1.8 2.6 1.4 1.8 0.6 2.3 2.6 0.9 4.2 ...
## $ slope : num [1:297] 1 0 2 1 1 1 2 1 0 2 ...
## $ ca : num [1:297] 1 2 0 1 0 0 0 2 0 0 ...
## $ thal : num [1:297] 0 0 0 0 0 2 1 0 0 2 ...
## $ condition: num [1:297] 0 0 0 1 0 0 0 1 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. sex = col_double(),
## .. cp = col_double(),
## .. trestbps = col_double(),
## .. chol = col_double(),
## .. fbs = col_double(),
## .. restecg = col_double(),
## .. thalach = col_double(),
## .. exang = col_double(),
## .. oldpeak = col_double(),
## .. slope = col_double(),
## .. ca = col_double(),
## .. thal = col_double(),
## .. condition = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
colnames(donnees)
## [1] "age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "condition"
summary(donnees)
## age sex cp trestbps
## Min. :29.00 Min. :0.0000 Min. :0.000 Min. : 94.0
## 1st Qu.:48.00 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:120.0
## Median :56.00 Median :1.0000 Median :2.000 Median :130.0
## Mean :54.54 Mean :0.6768 Mean :2.158 Mean :131.7
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :3.000 Max. :200.0
## chol fbs restecg thalach
## Min. :126.0 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:133.0
## Median :243.0 Median :0.0000 Median :1.0000 Median :153.0
## Mean :247.4 Mean :0.1448 Mean :0.9966 Mean :149.6
## 3rd Qu.:276.0 3rd Qu.:0.0000 3rd Qu.:2.0000 3rd Qu.:166.0
## Max. :564.0 Max. :1.0000 Max. :2.0000 Max. :202.0
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.800 Median :1.0000 Median :0.0000
## Mean :0.3266 Mean :1.056 Mean :0.6027 Mean :0.6768
## 3rd Qu.:1.0000 3rd Qu.:1.600 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.200 Max. :2.0000 Max. :3.0000
## thal condition
## Min. :0.000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000
## Median :0.000 Median :0.0000
## Mean :0.835 Mean :0.4613
## 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :2.000 Max. :1.0000
table(donnees$sex)
##
## 0 1
## 96 201
hist(donnees$age, main="Histogramme de l'âge des patients")
# Comme pour l’exercice précédent, ajuster un modèle de régression logistique, interpréter les résultats et décrire les facteurs de risque pour la maladie cardiaque.
model <- glm(condition ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + exang + oldpeak + slope + ca + thal, data = donnees, family = binomial)
summary(model)
##
## Call:
## glm(formula = condition ~ age + sex + cp + trestbps + chol +
## fbs + restecg + thalach + exang + oldpeak + slope + ca +
## thal, family = binomial, data = donnees)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.118417 2.729300 -1.875 0.06074 .
## age -0.014057 0.024036 -0.585 0.55866
## sex 1.319688 0.486718 2.711 0.00670 **
## cp 0.578582 0.191335 3.024 0.00250 **
## trestbps 0.024182 0.010727 2.254 0.02418 *
## chol 0.004816 0.003775 1.276 0.20202
## fbs -0.991868 0.554947 -1.787 0.07389 .
## restecg 0.246117 0.185238 1.329 0.18396
## thalach -0.021183 0.010275 -2.062 0.03923 *
## exang 0.915651 0.414003 2.212 0.02699 *
## oldpeak 0.249909 0.212418 1.176 0.23940
## slope 0.582699 0.362317 1.608 0.10778
## ca 1.267008 0.265723 4.768 1.86e-06 ***
## thal 0.714003 0.202068 3.533 0.00041 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 409.95 on 296 degrees of freedom
## Residual deviance: 203.86 on 283 degrees of freedom
## AIC: 231.86
##
## Number of Fisher Scoring iterations: 6
#Si le coefficient est positif, cela signifie que l'augmentation de la variable augmente les chances de développer une maladie cardiaque.
#Si le coefficient est négatif, cela signifie que l'augmentation de la variable diminue les chances de développer une maladie cardiaque.
#Les variables avec des coefficients significativement différents de zéro sont les facteurs de risque les plus importants.
# Proposer une méthode de sélection de variables permettant de construire le meilleur sous-modèle au sens de la prédiction du diabète. Construire une règle de classification à partir d’un échantillon d’apprentissage et évaluer le taux de mauvaise classification sur des données test à partir des différents modèles étudiés précédemment.
library(glm2)
set.seed(123) # Pour la reproductibilité
train_indices <- sample(1:nrow(donnees), 0.7 * nrow(donnees)) # 70% des données pour l'apprentissage
train_data <- donnees[train_indices, ]
test_data <- donnees[-train_indices, ]
initial_model <- glm(condition ~ ., data = train_data, family = binomial)
final_model <- step(initial_model, direction = "both")
## Start: AIC=174.56
## condition ~ age + sex + cp + trestbps + chol + fbs + restecg +
## thalach + exang + oldpeak + slope + ca + thal
##
## Df Deviance AIC
## - trestbps 1 146.56 172.56
## - thalach 1 146.73 172.73
## - age 1 146.85 172.85
## - oldpeak 1 148.30 174.30
## <none> 146.56 174.56
## - restecg 1 148.92 174.92
## - fbs 1 148.95 174.95
## - slope 1 149.21 175.21
## - chol 1 150.18 176.18
## - thal 1 152.00 178.00
## - cp 1 153.21 179.21
## - exang 1 153.59 179.59
## - sex 1 155.49 181.49
## - ca 1 164.28 190.28
##
## Step: AIC=172.56
## condition ~ age + sex + cp + chol + fbs + restecg + thalach +
## exang + oldpeak + slope + ca + thal
##
## Df Deviance AIC
## - thalach 1 146.74 170.74
## - age 1 146.89 170.89
## - oldpeak 1 148.31 172.31
## <none> 146.56 172.56
## - restecg 1 148.93 172.93
## - fbs 1 148.98 172.98
## - slope 1 149.21 173.21
## - chol 1 150.19 174.19
## + trestbps 1 146.56 174.56
## - thal 1 152.09 176.09
## - cp 1 153.41 177.41
## - exang 1 153.67 177.67
## - sex 1 155.53 179.53
## - ca 1 164.35 188.35
##
## Step: AIC=170.74
## condition ~ age + sex + cp + chol + fbs + restecg + exang + oldpeak +
## slope + ca + thal
##
## Df Deviance AIC
## - age 1 147.40 169.40
## - oldpeak 1 148.60 170.60
## <none> 146.74 170.74
## - restecg 1 149.22 171.22
## - fbs 1 149.25 171.25
## - slope 1 149.81 171.81
## - chol 1 150.26 172.26
## + thalach 1 146.56 172.56
## + trestbps 1 146.73 172.73
## - thal 1 152.34 174.34
## - exang 1 154.12 176.12
## - cp 1 155.50 177.50
## - sex 1 155.62 177.62
## - ca 1 165.26 187.26
##
## Step: AIC=169.4
## condition ~ sex + cp + chol + fbs + restecg + exang + oldpeak +
## slope + ca + thal
##
## Df Deviance AIC
## - oldpeak 1 149.21 169.21
## <none> 147.40 169.40
## - fbs 1 149.59 169.59
## - restecg 1 150.01 170.01
## - slope 1 150.56 170.56
## + age 1 146.74 170.74
## + thalach 1 146.89 170.89
## - chol 1 151.11 171.11
## + trestbps 1 147.38 171.38
## - thal 1 153.51 173.51
## - exang 1 154.74 174.74
## - sex 1 155.70 175.70
## - cp 1 155.85 175.85
## - ca 1 168.82 188.82
##
## Step: AIC=169.21
## condition ~ sex + cp + chol + fbs + restecg + exang + slope +
## ca + thal
##
## Df Deviance AIC
## <none> 149.21 169.21
## + oldpeak 1 147.40 169.40
## - restecg 1 151.53 169.53
## - fbs 1 151.56 169.56
## + thalach 1 148.56 170.56
## + age 1 148.60 170.60
## + trestbps 1 149.14 171.14
## - chol 1 153.51 171.51
## - thal 1 155.99 173.99
## - exang 1 157.47 175.47
## - cp 1 157.69 175.69
## - slope 1 157.88 175.88
## - sex 1 159.25 177.25
## - ca 1 172.29 190.29
# Implémenter l’algorithme des k-plus proches voisins et proposer un choix de la valeur de k. A partir de cet algorithme des k-plus proches voisins, comparer le taux de mauvaise classification par rapport aux modèles de régression logistique utilisés précédemment.
library(class)
set.seed(123) # Pour la reproductibilité
train_indices <- sample(1:nrow(donnees), 0.7 * nrow(donnees)) # 70% des données pour l'apprentissage
train_data <- donnees[train_indices, ]
test_data <- donnees[-train_indices, ]
knn_cols <- c("age", "sex", "cp", "trestbps", "chol")
k_values <- c(1, 3, 5, 7, 9) # Vous pouvez ajuster ces valeurs selon votre choix
results <- data.frame(k = k_values, error = rep(NA, length(k_values)))
for (i in 1:length(k_values)) {
k <- k_values[i]
predictions <- knn(train_data[, knn_cols], test_data[, knn_cols], train_data$condition, k = k)
error_rate <- mean(predictions != test_data$condition)
results[i, "error"] <- error_rate
}
print(results)
## k error
## 1 1 0.5222222
## 2 3 0.4000000
## 3 5 0.4777778
## 4 7 0.4888889
## 5 9 0.4888889
# Conclure sur l’étude.