TP 2 : Régression logistique 2-2

# 1) Faire une analyse descriptive de la base de données.
library(readr)
donnees <- read_csv("heart_cleveland.csv")

## Rows: 297 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(donnees)

## # A tibble: 6 × 14
##     age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##   <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>
## 1    69     1     0      160   234     1       2     131     0     0.1     1
## 2    69     0     0      140   239     0       0     151     0     1.8     0
## 3    66     0     0      150   226     0       0     114     0     2.6     2
## 4    65     1     0      138   282     1       2     174     0     1.4     1
## 5    64     1     0      110   211     0       2     144     1     1.8     1
## 6    64     1     0      170   227     0       2     155     0     0.6     1
## # ℹ 3 more variables: ca <dbl>, thal <dbl>, condition <dbl>

str(donnees)

## spc_tbl_ [297 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age      : num [1:297] 69 69 66 65 64 64 63 61 60 59 ...
##  $ sex      : num [1:297] 1 0 0 1 1 1 1 1 0 1 ...
##  $ cp       : num [1:297] 0 0 0 0 0 0 0 0 0 0 ...
##  $ trestbps : num [1:297] 160 140 150 138 110 170 145 134 150 178 ...
##  $ chol     : num [1:297] 234 239 226 282 211 227 233 234 240 270 ...
##  $ fbs      : num [1:297] 1 0 0 1 0 0 1 0 0 0 ...
##  $ restecg  : num [1:297] 2 0 0 2 2 2 2 0 0 2 ...
##  $ thalach  : num [1:297] 131 151 114 174 144 155 150 145 171 145 ...
##  $ exang    : num [1:297] 0 0 0 0 1 0 0 0 0 0 ...
##  $ oldpeak  : num [1:297] 0.1 1.8 2.6 1.4 1.8 0.6 2.3 2.6 0.9 4.2 ...
##  $ slope    : num [1:297] 1 0 2 1 1 1 2 1 0 2 ...
##  $ ca       : num [1:297] 1 2 0 1 0 0 0 2 0 0 ...
##  $ thal     : num [1:297] 0 0 0 0 0 2 1 0 0 2 ...
##  $ condition: num [1:297] 0 0 0 1 0 0 0 1 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   sex = col_double(),
##   ..   cp = col_double(),
##   ..   trestbps = col_double(),
##   ..   chol = col_double(),
##   ..   fbs = col_double(),
##   ..   restecg = col_double(),
##   ..   thalach = col_double(),
##   ..   exang = col_double(),
##   ..   oldpeak = col_double(),
##   ..   slope = col_double(),
##   ..   ca = col_double(),
##   ..   thal = col_double(),
##   ..   condition = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

colnames(donnees)

##  [1] "age"       "sex"       "cp"        "trestbps"  "chol"      "fbs"      
##  [7] "restecg"   "thalach"   "exang"     "oldpeak"   "slope"     "ca"       
## [13] "thal"      "condition"

summary(donnees)

##       age             sex               cp           trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :0.000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :2.000   Median :130.0  
##  Mean   :54.54   Mean   :0.6768   Mean   :2.158   Mean   :131.7  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :3.000   Max.   :200.0  
##       chol            fbs            restecg          thalach     
##  Min.   :126.0   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:133.0  
##  Median :243.0   Median :0.0000   Median :1.0000   Median :153.0  
##  Mean   :247.4   Mean   :0.1448   Mean   :0.9966   Mean   :149.6  
##  3rd Qu.:276.0   3rd Qu.:0.0000   3rd Qu.:2.0000   3rd Qu.:166.0  
##  Max.   :564.0   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak          slope              ca        
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.800   Median :1.0000   Median :0.0000  
##  Mean   :0.3266   Mean   :1.056   Mean   :0.6027   Mean   :0.6768  
##  3rd Qu.:1.0000   3rd Qu.:1.600   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.200   Max.   :2.0000   Max.   :3.0000  
##       thal         condition     
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.0000  
##  Median :0.000   Median :0.0000  
##  Mean   :0.835   Mean   :0.4613  
##  3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :2.000   Max.   :1.0000

table(donnees$sex)

## 
##   0   1 
##  96 201

hist(donnees$age, main="Histogramme de l'âge des patients", col = "red")

# Créer un boxplot de l'âge en fonction de la condition cardiaque
boxplot(age ~ condition, data = donnees,
        main = "Boxplot de l'âge en fonction du sexe",
        xlab = "Sexe", ylab = "Âge",
        col = c("green", "blue"),
        names = c("Pas malade", " malade"))

#Donc, l'âge n'a pas d'effet sur la condition cardiaque.

boxplot(age ~ sex, data = donnees,
        main = "Boxplot de l'âge en fonction du sexe",
        xlab = "Sexe", ylab = "Âge",
        col = c("red","blue" ),
        names = c("Femme", "Homme"))

# 2)Comme pour l’exercice précédent, ajuster un modèle de régression logistique, interpréter les résultats et décrire les facteurs de risque pour la maladie cardiaque.

model <- glm(condition ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + exang + oldpeak + slope + ca + thal, data = donnees, family = binomial)

summary(model)

## 
## Call:
## glm(formula = condition ~ age + sex + cp + trestbps + chol + 
##     fbs + restecg + thalach + exang + oldpeak + slope + ca + 
##     thal, family = binomial, data = donnees)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.118417   2.729300  -1.875  0.06074 .  
## age         -0.014057   0.024036  -0.585  0.55866    
## sex          1.319688   0.486718   2.711  0.00670 ** 
## cp           0.578582   0.191335   3.024  0.00250 ** 
## trestbps     0.024182   0.010727   2.254  0.02418 *  
## chol         0.004816   0.003775   1.276  0.20202    
## fbs         -0.991868   0.554947  -1.787  0.07389 .  
## restecg      0.246117   0.185238   1.329  0.18396    
## thalach     -0.021183   0.010275  -2.062  0.03923 *  
## exang        0.915651   0.414003   2.212  0.02699 *  
## oldpeak      0.249909   0.212418   1.176  0.23940    
## slope        0.582699   0.362317   1.608  0.10778    
## ca           1.267008   0.265723   4.768 1.86e-06 ***
## thal         0.714003   0.202068   3.533  0.00041 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 409.95  on 296  degrees of freedom
## Residual deviance: 203.86  on 283  degrees of freedom
## AIC: 231.86
## 
## Number of Fisher Scoring iterations: 6

#Si le coefficient est positif, cela signifie que l'augmentation de la variable augmente les chances de développer une maladie cardiaque.

#Si le coefficient est négatif, cela signifie que l'augmentation de la variable diminue les chances de développer une maladie cardiaque.

#Les variables avec des coefficients significativement différents de zéro sont les facteurs de risque les plus importants.

# l'analyse de régression logistique menée sur les données de cette étude sur les maladies cardiaques a permis d'identifier plusieurs facteurs de risque significatifs associés à la condition cardiaque. 

#Ces facteurs de risque incluent le sexe (être de sexe masculin), la présence de douleur thoracique (cp), une pression artérielle au repos élevée (trestbps), une fréquence cardiaque maximale plus basse (thalach), la présence d'angine induite par l'exercice (exang), un plus grand nombre de vaisseaux sanguins principaux colorés par la fluoroscopie (ca) et les résultats de thallium (thal).

#Il est important de noter que d'autres variables, telles que l'âge, le cholestérol sérique, le taux de sucre dans le sang à jeun, les résultats électrocardiographiques au repos, la dépression de l'ECG à l'exercice et la pente du segment ST de l'ECG ,slope, oldpeak ne se sont pas avérées significatives dans ce modèle.

# p>5% on ne rejet pas HO

# 3) Proposer une méthode de sélection de variables permettant de construire le meilleur sous-modèle au sens de la prédiction du diabète. Construire une règle de classification à partir d’un échantillon d’apprentissage et évaluer le taux de mauvaise classification sur des données test à partir des différents modèles étudiés précédemment.
  
library(glm2)

initial_model <- glm(condition ~ ., data = donnees, family = "binomial")

final_model <- step(initial_model, direction = "both")

## Start:  AIC=231.86
## condition ~ age + sex + cp + trestbps + chol + fbs + restecg + 
##     thalach + exang + oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - age       1   204.20 230.20
## - oldpeak   1   205.27 231.27
## - chol      1   205.50 231.50
## - restecg   1   205.64 231.64
## <none>          203.86 231.86
## - slope     1   206.42 232.42
## - fbs       1   207.22 233.22
## - thalach   1   208.28 234.28
## - exang     1   208.71 234.71
## - trestbps  1   209.15 235.15
## - sex       1   211.62 237.62
## - cp        1   213.59 239.59
## - thal      1   216.76 242.76
## - ca        1   232.55 258.55
## 
## Step:  AIC=230.2
## condition ~ sex + cp + trestbps + chol + fbs + restecg + thalach + 
##     exang + oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - chol      1   205.67 229.67
## - oldpeak   1   205.78 229.78
## - restecg   1   205.96 229.96
## <none>          204.20 230.20
## - slope     1   206.70 230.70
## - fbs       1   207.67 231.67
## + age       1   203.86 231.86
## - thalach   1   208.34 232.34
## - trestbps  1   209.15 233.15
## - exang     1   209.29 233.29
## - sex       1   212.59 236.59
## - cp        1   214.30 238.30
## - thal      1   216.96 240.96
## - ca        1   233.41 257.41
## 
## Step:  AIC=229.67
## condition ~ sex + cp + trestbps + fbs + restecg + thalach + exang + 
##     oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - oldpeak   1   207.53 229.53
## <none>          205.67 229.67
## - slope     1   207.91 229.91
## + chol      1   204.20 230.20
## - restecg   1   208.23 230.23
## - fbs       1   209.07 231.07
## - thalach   1   209.46 231.46
## + age       1   205.50 231.50
## - exang     1   210.59 232.59
## - trestbps  1   210.68 232.68
## - sex       1   212.64 234.64
## - cp        1   215.91 237.91
## - thal      1   219.52 241.52
## - ca        1   235.35 257.35
## 
## Step:  AIC=229.53
## condition ~ sex + cp + trestbps + fbs + restecg + thalach + exang + 
##     slope + ca + thal
## 
##            Df Deviance    AIC
## <none>          207.53 229.53
## + oldpeak   1   205.67 229.67
## + chol      1   205.78 229.78
## - restecg   1   209.92 229.92
## + age       1   207.25 231.25
## - fbs       1   211.35 231.35
## - thalach   1   211.98 231.98
## - exang     1   213.30 233.30
## - trestbps  1   213.44 233.44
## - slope     1   213.76 233.76
## - sex       1   215.62 235.62
## - cp        1   217.40 237.40
## - thal      1   221.70 241.70
## - ca        1   241.39 261.39

# Utiliser step avec k = 2 pour calculer le BIC
final_model <- step(initial_model, direction = "backward", k = log(dim(donnees)[1]))

## Start:  AIC=283.57
## condition ~ age + sex + cp + trestbps + chol + fbs + restecg + 
##     thalach + exang + oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - age       1   204.20 278.22
## - oldpeak   1   205.27 279.29
## - chol      1   205.50 279.51
## - restecg   1   205.64 279.66
## - slope     1   206.42 280.44
## - fbs       1   207.22 281.24
## - thalach   1   208.28 282.30
## - exang     1   208.71 282.73
## - trestbps  1   209.15 283.16
## <none>          203.86 283.57
## - sex       1   211.62 285.64
## - cp        1   213.59 287.61
## - thal      1   216.76 290.78
## - ca        1   232.55 306.57
## 
## Step:  AIC=278.22
## condition ~ sex + cp + trestbps + chol + fbs + restecg + thalach + 
##     exang + oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - chol      1   205.67 274.00
## - oldpeak   1   205.78 274.10
## - restecg   1   205.96 274.28
## - slope     1   206.70 275.02
## - fbs       1   207.67 275.99
## - thalach   1   208.34 276.66
## - trestbps  1   209.15 277.47
## - exang     1   209.29 277.62
## <none>          204.20 278.22
## - sex       1   212.59 280.92
## - cp        1   214.30 282.63
## - thal      1   216.96 285.29
## - ca        1   233.41 301.73
## 
## Step:  AIC=274
## condition ~ sex + cp + trestbps + fbs + restecg + thalach + exang + 
##     oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - oldpeak   1   207.53 270.17
## - slope     1   207.91 270.54
## - restecg   1   208.23 270.86
## - fbs       1   209.07 271.70
## - thalach   1   209.46 272.09
## - exang     1   210.59 273.22
## - trestbps  1   210.68 273.31
## <none>          205.67 274.00
## - sex       1   212.64 275.27
## - cp        1   215.91 278.55
## - thal      1   219.52 282.15
## - ca        1   235.35 297.98
## 
## Step:  AIC=270.17
## condition ~ sex + cp + trestbps + fbs + restecg + thalach + exang + 
##     slope + ca + thal
## 
##            Df Deviance    AIC
## - restecg   1   209.92 266.86
## - fbs       1   211.35 268.28
## - thalach   1   211.98 268.92
## <none>          207.53 270.17
## - exang     1   213.30 270.23
## - trestbps  1   213.44 270.38
## - slope     1   213.76 270.70
## - sex       1   215.62 272.56
## - cp        1   217.40 274.34
## - thal      1   221.70 278.63
## - ca        1   241.39 298.33
## 
## Step:  AIC=266.86
## condition ~ sex + cp + trestbps + fbs + thalach + exang + slope + 
##     ca + thal
## 
##            Df Deviance    AIC
## - fbs       1   213.71 264.95
## - thalach   1   214.45 265.69
## <none>          209.92 266.86
## - exang     1   215.87 267.12
## - trestbps  1   216.76 268.01
## - slope     1   217.23 268.47
## - sex       1   218.64 269.88
## - cp        1   219.67 270.91
## - thal      1   223.46 274.70
## - ca        1   245.78 297.02
## 
## Step:  AIC=264.95
## condition ~ sex + cp + trestbps + thalach + exang + slope + ca + 
##     thal
## 
##            Df Deviance    AIC
## - thalach   1   218.80 264.35
## - trestbps  1   219.25 264.80
## - exang     1   219.36 264.91
## <none>          213.71 264.95
## - slope     1   219.97 265.52
## - sex       1   221.33 266.88
## - cp        1   224.22 269.77
## - thal      1   227.73 273.28
## - ca        1   246.38 291.93
## 
## Step:  AIC=264.35
## condition ~ sex + cp + trestbps + exang + slope + ca + thal
## 
##            Df Deviance    AIC
## - trestbps  1   223.74 263.60
## <none>          218.80 264.35
## - sex       1   225.58 265.44
## - exang     1   226.99 266.84
## - slope     1   229.31 269.17
## - cp        1   232.38 272.23
## - thal      1   234.31 274.17
## - ca        1   256.95 296.81
## 
## Step:  AIC=263.6
## condition ~ sex + cp + exang + slope + ca + thal
## 
##         Df Deviance    AIC
## - sex    1   229.06 263.22
## <none>       223.74 263.60
## - exang  1   232.55 266.71
## - slope  1   234.41 268.57
## - cp     1   235.07 269.24
## - thal   1   241.59 275.76
## - ca     1   261.75 295.92
## 
## Step:  AIC=263.22
## condition ~ cp + exang + slope + ca + thal
## 
##         Df Deviance    AIC
## <none>       229.06 263.22
## - exang  1   238.06 266.52
## - cp     1   238.81 267.28
## - slope  1   239.27 267.74
## - thal   1   260.69 289.15
## - ca     1   268.56 297.03

set.seed(123)  # Pour la reproductibilité
train_indices <- sample(1:nrow(donnees), 0.8 *nrow(donnees))  # 80% des données pour l'apprentissage
train_data <- donnees[train_indices, ]
test_data <- donnees[-train_indices, ]

#Taux d'erreur de la classification

# Prédire les valeurs de classe à partir du modèle final
predictions <- predict(final_model, newdata = test_data, type = "response")

# Définir un seuil de classification (par exemple, 0.5)
threshold <- 0.5

# Convertir les prédictions en classe (0 ou 1)
predicted_class <- (predictions > threshold)
predicted_class

##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
##  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE 
##    27    28    29    30    31    32    33    34    35    36    37    38    39 
## FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE 
##    40    41    42    43    44    45    46    47    48    49    50    51    52 
## FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE 
##    53    54    55    56    57    58    59    60 
## FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE

# Calculer le taux d'erreur
misclassification_rate <- mean(predicted_class != test_data$condition)
cat("Taux d'erreur de classification sur l'ensemble de test:", misclassification_rate, "\n")

## Taux d'erreur de classification sur l'ensemble de test: 0.1666667

# 4)Implémenter l’algorithme des k-plus proches voisins et proposer un choix de la valeur de k. A partir de cet algorithme des k-plus proches voisins, comparer le taux de mauvaise classification par rapport aux modèles de régression logistique utilisés précédemment.
library(class)

set.seed(123)  # Pour la reproductibilité
train_indices <- sample(1:nrow(donnees), 0.8 * nrow(donnees))  # 80% des données pour l'apprentissage
train_data <- donnees[train_indices, ]
test_data <- donnees[-train_indices, ]


knn_cols <- c("age", "sex", "cp", "trestbps", "chol")

k_values <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)  # Vous pouvez ajuster ces valeurs selon votre choix
results <- data.frame(k = k_values, error = rep(NA, length(k_values)))

for (i in 1:length(k_values)) {
  k <- k_values[i]
  predictions <- knn(train_data[, knn_cols], test_data[, knn_cols], train_data$condition, k = k)
  error_rate <- mean(predictions != test_data$condition)
  results[i, "error"] <- error_rate
}

print(results)

##   k     error
## 1 1 0.5500000
## 2 2 0.5333333
## 3 3 0.4166667
## 4 4 0.5000000
## 5 5 0.4500000
## 6 6 0.4666667
## 7 7 0.4500000
## 8 8 0.4166667
## 9 9 0.4500000

#  5)Conclure sur l’étude.
 # cette étude nous a permis de construire un modèle prédictif de régression logistique pour estimer la probabilité de présence d'une affection cardiaque en fonction des caractéristiques médicales des patients. Cependant, il est essentiel de poursuivre les recherches en utilisant des ensembles de données plus vastes et en explorant d'autres variables potentiellement pertinentes pour améliorer la précision du modèle. De plus, il est recommandé de consulter des professionnels de la santé pour interpréter correctement les résultats de ces analyses dans un contexte clinique.

TP 2 : Régression logistique 2-2

Sandeep-Singh NIRMAL

2023-10-13