TP 2 : Régression logistique 2-2

#1. Faire une analyse descriptive de la base de données.

library(readr)

donnees <- read_csv("heart_cleveland.csv")

## Rows: 297 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(donnees)

## # A tibble: 6 × 14
##     age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##   <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>
## 1    69     1     0      160   234     1       2     131     0     0.1     1
## 2    69     0     0      140   239     0       0     151     0     1.8     0
## 3    66     0     0      150   226     0       0     114     0     2.6     2
## 4    65     1     0      138   282     1       2     174     0     1.4     1
## 5    64     1     0      110   211     0       2     144     1     1.8     1
## 6    64     1     0      170   227     0       2     155     0     0.6     1
## # ℹ 3 more variables: ca <dbl>, thal <dbl>, condition <dbl>

str(donnees)

## spc_tbl_ [297 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age      : num [1:297] 69 69 66 65 64 64 63 61 60 59 ...
##  $ sex      : num [1:297] 1 0 0 1 1 1 1 1 0 1 ...
##  $ cp       : num [1:297] 0 0 0 0 0 0 0 0 0 0 ...
##  $ trestbps : num [1:297] 160 140 150 138 110 170 145 134 150 178 ...
##  $ chol     : num [1:297] 234 239 226 282 211 227 233 234 240 270 ...
##  $ fbs      : num [1:297] 1 0 0 1 0 0 1 0 0 0 ...
##  $ restecg  : num [1:297] 2 0 0 2 2 2 2 0 0 2 ...
##  $ thalach  : num [1:297] 131 151 114 174 144 155 150 145 171 145 ...
##  $ exang    : num [1:297] 0 0 0 0 1 0 0 0 0 0 ...
##  $ oldpeak  : num [1:297] 0.1 1.8 2.6 1.4 1.8 0.6 2.3 2.6 0.9 4.2 ...
##  $ slope    : num [1:297] 1 0 2 1 1 1 2 1 0 2 ...
##  $ ca       : num [1:297] 1 2 0 1 0 0 0 2 0 0 ...
##  $ thal     : num [1:297] 0 0 0 0 0 2 1 0 0 2 ...
##  $ condition: num [1:297] 0 0 0 1 0 0 0 1 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   sex = col_double(),
##   ..   cp = col_double(),
##   ..   trestbps = col_double(),
##   ..   chol = col_double(),
##   ..   fbs = col_double(),
##   ..   restecg = col_double(),
##   ..   thalach = col_double(),
##   ..   exang = col_double(),
##   ..   oldpeak = col_double(),
##   ..   slope = col_double(),
##   ..   ca = col_double(),
##   ..   thal = col_double(),
##   ..   condition = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

colnames(donnees)

##  [1] "age"       "sex"       "cp"        "trestbps"  "chol"      "fbs"      
##  [7] "restecg"   "thalach"   "exang"     "oldpeak"   "slope"     "ca"       
## [13] "thal"      "condition"

summary(donnees)

##       age             sex               cp           trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :0.000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :2.000   Median :130.0  
##  Mean   :54.54   Mean   :0.6768   Mean   :2.158   Mean   :131.7  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :3.000   Max.   :200.0  
##       chol            fbs            restecg          thalach     
##  Min.   :126.0   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:133.0  
##  Median :243.0   Median :0.0000   Median :1.0000   Median :153.0  
##  Mean   :247.4   Mean   :0.1448   Mean   :0.9966   Mean   :149.6  
##  3rd Qu.:276.0   3rd Qu.:0.0000   3rd Qu.:2.0000   3rd Qu.:166.0  
##  Max.   :564.0   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak          slope              ca        
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.800   Median :1.0000   Median :0.0000  
##  Mean   :0.3266   Mean   :1.056   Mean   :0.6027   Mean   :0.6768  
##  3rd Qu.:1.0000   3rd Qu.:1.600   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.200   Max.   :2.0000   Max.   :3.0000  
##       thal         condition     
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.0000  
##  Median :0.000   Median :0.0000  
##  Mean   :0.835   Mean   :0.4613  
##  3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :2.000   Max.   :1.0000

table(donnees$sex)

## 
##   0   1 
##  96 201

hist(donnees$age, main="Histogramme de l'âge des patients")

# Comme pour l’exercice précédent, ajuster un modèle de régression logistique, interpréter les résultats et décrire les facteurs de risque pour la maladie cardiaque.

model <- glm(condition ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + exang + oldpeak + slope + ca + thal, data = donnees, family = binomial)

summary(model)

## 
## Call:
## glm(formula = condition ~ age + sex + cp + trestbps + chol + 
##     fbs + restecg + thalach + exang + oldpeak + slope + ca + 
##     thal, family = binomial, data = donnees)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.118417   2.729300  -1.875  0.06074 .  
## age         -0.014057   0.024036  -0.585  0.55866    
## sex          1.319688   0.486718   2.711  0.00670 ** 
## cp           0.578582   0.191335   3.024  0.00250 ** 
## trestbps     0.024182   0.010727   2.254  0.02418 *  
## chol         0.004816   0.003775   1.276  0.20202    
## fbs         -0.991868   0.554947  -1.787  0.07389 .  
## restecg      0.246117   0.185238   1.329  0.18396    
## thalach     -0.021183   0.010275  -2.062  0.03923 *  
## exang        0.915651   0.414003   2.212  0.02699 *  
## oldpeak      0.249909   0.212418   1.176  0.23940    
## slope        0.582699   0.362317   1.608  0.10778    
## ca           1.267008   0.265723   4.768 1.86e-06 ***
## thal         0.714003   0.202068   3.533  0.00041 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 409.95  on 296  degrees of freedom
## Residual deviance: 203.86  on 283  degrees of freedom
## AIC: 231.86
## 
## Number of Fisher Scoring iterations: 6

#Si le coefficient est positif, cela signifie que l'augmentation de la variable augmente les chances de développer une maladie cardiaque.

#Si le coefficient est négatif, cela signifie que l'augmentation de la variable diminue les chances de développer une maladie cardiaque.

#Les variables avec des coefficients significativement différents de zéro sont les facteurs de risque les plus importants.

# Proposer une méthode de sélection de variables permettant de construire le meilleur sous-modèle au sens de la prédiction du diabète. Construire une règle de classification à partir d’un échantillon d’apprentissage et évaluer le taux de mauvaise classification sur des données test à partir des différents modèles étudiés précédemment.
  
library(glm2)

set.seed(123)  # Pour la reproductibilité
train_indices <- sample(1:nrow(donnees), 0.7 * nrow(donnees))  # 70% des données pour l'apprentissage
train_data <- donnees[train_indices, ]
test_data <- donnees[-train_indices, ]

initial_model <- glm(condition ~ ., data = train_data, family = binomial)

final_model <- step(initial_model, direction = "both")

## Start:  AIC=174.56
## condition ~ age + sex + cp + trestbps + chol + fbs + restecg + 
##     thalach + exang + oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - trestbps  1   146.56 172.56
## - thalach   1   146.73 172.73
## - age       1   146.85 172.85
## - oldpeak   1   148.30 174.30
## <none>          146.56 174.56
## - restecg   1   148.92 174.92
## - fbs       1   148.95 174.95
## - slope     1   149.21 175.21
## - chol      1   150.18 176.18
## - thal      1   152.00 178.00
## - cp        1   153.21 179.21
## - exang     1   153.59 179.59
## - sex       1   155.49 181.49
## - ca        1   164.28 190.28
## 
## Step:  AIC=172.56
## condition ~ age + sex + cp + chol + fbs + restecg + thalach + 
##     exang + oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - thalach   1   146.74 170.74
## - age       1   146.89 170.89
## - oldpeak   1   148.31 172.31
## <none>          146.56 172.56
## - restecg   1   148.93 172.93
## - fbs       1   148.98 172.98
## - slope     1   149.21 173.21
## - chol      1   150.19 174.19
## + trestbps  1   146.56 174.56
## - thal      1   152.09 176.09
## - cp        1   153.41 177.41
## - exang     1   153.67 177.67
## - sex       1   155.53 179.53
## - ca        1   164.35 188.35
## 
## Step:  AIC=170.74
## condition ~ age + sex + cp + chol + fbs + restecg + exang + oldpeak + 
##     slope + ca + thal
## 
##            Df Deviance    AIC
## - age       1   147.40 169.40
## - oldpeak   1   148.60 170.60
## <none>          146.74 170.74
## - restecg   1   149.22 171.22
## - fbs       1   149.25 171.25
## - slope     1   149.81 171.81
## - chol      1   150.26 172.26
## + thalach   1   146.56 172.56
## + trestbps  1   146.73 172.73
## - thal      1   152.34 174.34
## - exang     1   154.12 176.12
## - cp        1   155.50 177.50
## - sex       1   155.62 177.62
## - ca        1   165.26 187.26
## 
## Step:  AIC=169.4
## condition ~ sex + cp + chol + fbs + restecg + exang + oldpeak + 
##     slope + ca + thal
## 
##            Df Deviance    AIC
## - oldpeak   1   149.21 169.21
## <none>          147.40 169.40
## - fbs       1   149.59 169.59
## - restecg   1   150.01 170.01
## - slope     1   150.56 170.56
## + age       1   146.74 170.74
## + thalach   1   146.89 170.89
## - chol      1   151.11 171.11
## + trestbps  1   147.38 171.38
## - thal      1   153.51 173.51
## - exang     1   154.74 174.74
## - sex       1   155.70 175.70
## - cp        1   155.85 175.85
## - ca        1   168.82 188.82
## 
## Step:  AIC=169.21
## condition ~ sex + cp + chol + fbs + restecg + exang + slope + 
##     ca + thal
## 
##            Df Deviance    AIC
## <none>          149.21 169.21
## + oldpeak   1   147.40 169.40
## - restecg   1   151.53 169.53
## - fbs       1   151.56 169.56
## + thalach   1   148.56 170.56
## + age       1   148.60 170.60
## + trestbps  1   149.14 171.14
## - chol      1   153.51 171.51
## - thal      1   155.99 173.99
## - exang     1   157.47 175.47
## - cp        1   157.69 175.69
## - slope     1   157.88 175.88
## - sex       1   159.25 177.25
## - ca        1   172.29 190.29

# Implémenter l’algorithme des k-plus proches voisins et proposer un choix de la valeur de k. A partir de cet algorithme des k-plus proches voisins, comparer le taux de mauvaise classification par rapport aux modèles de régression logistique utilisés précédemment.
library(class)

set.seed(123)  # Pour la reproductibilité
train_indices <- sample(1:nrow(donnees), 0.7 * nrow(donnees))  # 70% des données pour l'apprentissage
train_data <- donnees[train_indices, ]
test_data <- donnees[-train_indices, ]


knn_cols <- c("age", "sex", "cp", "trestbps", "chol")

k_values <- c(1, 3, 5, 7, 9)  # Vous pouvez ajuster ces valeurs selon votre choix
results <- data.frame(k = k_values, error = rep(NA, length(k_values)))

for (i in 1:length(k_values)) {
  k <- k_values[i]
  predictions <- knn(train_data[, knn_cols], test_data[, knn_cols], train_data$condition, k = k)
  error_rate <- mean(predictions != test_data$condition)
  results[i, "error"] <- error_rate
}

print(results)

##   k     error
## 1 1 0.5222222
## 2 3 0.4000000
## 3 5 0.4777778
## 4 7 0.4888889
## 5 9 0.4888889

#  Conclure sur l’étude.

TP 2 : Régression logistique 2-2

Sandeep-Singh NIRMAL

2023-10-13