library(readxl)
library(caret)
## Warning: package 'caret' was built under R version 4.2.1
## Loading required package: ggplot2
## Loading required package: lattice

1. Reading and exploring data

# Import the Pima Indian Diabetes workbook and print the type of every column.
df <- read_excel(
  path = "C:/Users/Momo/Desktop/R - Learning/Dataset 4-2022/Pima Indian Diabetes Dta.xlsx"
)
str(df)
## tibble [768 × 10] (S3: tbl_df/tbl/data.frame)
##  $ id                      : num [1:768] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Pregnancies             : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
##  $ bmi                     : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
# Number of rows and columns in the imported data.
c(nrow(df), ncol(df))
## [1] 768  10
# Preview the first 20 observations.
head(df, n = 20)
## # A tibble: 20 × 10
##       id Pregnancies Glucose BloodPressure SkinThickness Insulin   bmi
##    <dbl>       <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1     1           6     148            72            35       0  33.6
##  2     2           1      85            66            29       0  26.6
##  3     3           8     183            64             0       0  23.3
##  4     4           1      89            66            23      94  28.1
##  5     5           0     137            40            35     168  43.1
##  6     6           5     116            74             0       0  25.6
##  7     7           3      78            50            32      88  31  
##  8     8          10     115             0             0       0  35.3
##  9     9           2     197            70            45     543  30.5
## 10    10           8     125            96             0       0   0  
## 11    11           4     110            92             0       0  37.6
## 12    12          10     168            74             0       0  38  
## 13    13          10     139            80             0       0  27.1
## 14    14           1     189            60            23     846  30.1
## 15    15           5     166            72            19     175  25.8
## 16    16           7     100             0             0       0  30  
## 17    17           0     118            84            47     230  45.8
## 18    18           7     107            74             0       0  29.6
## 19    19           1     103            30            38      83  43.3
## 20    20           1     115            70            30      96  34.6
## # … with 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>,
## #   Outcome <dbl>
# Preview the last 10 observations.
tail(df, n = 10)
## # A tibble: 10 × 10
##       id Pregnancies Glucose BloodPressure SkinThickness Insulin   bmi
##    <dbl>       <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1   759           1     106            76             0       0  37.5
##  2   760           6     190            92             0       0  35.5
##  3   761           2      88            58            26      16  28.4
##  4   762           9     170            74            31       0  44  
##  5   763           9      89            62             0       0  22.5
##  6   764          10     101            76            48     180  32.9
##  7   765           2     122            70            27       0  36.8
##  8   766           5     121            72            23     112  26.2
##  9   767           1     126            60             0       0  30.1
## 10   768           1      93            70            31       0  30.4
## # … with 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>,
## #   Outcome <dbl>
# List the column names of the data frame.
names(df)
##  [1] "id"                       "Pregnancies"             
##  [3] "Glucose"                  "BloodPressure"           
##  [5] "SkinThickness"            "Insulin"                 
##  [7] "bmi"                      "DiabetesPedigreeFunction"
##  [9] "Age"                      "Outcome"
# Derive a categorical response from the numeric 0/1 Outcome column and
# tabulate how many observations fall in each class.
df$Y <- as.factor(df$Outcome)
table(df$Y)
## 
##   0   1 
## 500 268
# Relabel the two factor levels (0 and 1, in that order) as "No" and "Yes",
# then re-tabulate to confirm the counts are unchanged.
levels(df$Y) <- c("No", "Yes")
table(df$Y)
## 
##  No Yes 
## 500 268
# Sanity check: cross-tabulate the original numeric codes against the labels.
table(df[["Outcome"]], df[["Y"]])
##    
##      No Yes
##   0 500   0
##   1   0 268

2. Cross-validation

# Resampling scheme: 10-fold cross-validation. twoClassSummary reports ROC,
# sensitivity and specificity, and classProbs = TRUE makes caret compute the
# class probabilities that summary relies on.
control1 <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)

# Logistic regression of diabetes status on seven clinical predictors,
# evaluated with the 10-fold cross-validation scheme in control1.
# metric = "ROC" is stated explicitly: twoClassSummary does not produce the
# default "Accuracy" metric, so without it train() warns and falls back to ROC.
fit1 <- train(
  form = Y ~ Age + bmi + Pregnancies + BloodPressure + Insulin + Glucose +
    SkinThickness,
  data = df,
  method = "glm",
  family = "binomial",
  metric = "ROC",
  trControl = control1
)
fit1
## Generalized Linear Model 
## 
## 768 samples
##   7 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 691, 692, 691, 691, 691, 691, ... 
## Resampling results:
## 
##   ROC        Sens   Spec     
##   0.8255128  0.872  0.5675214

3. Bootstrap

# Resampling scheme: 1000 bootstrap replicates, summarised with ROC,
# sensitivity and specificity (twoClassSummary); classProbs = TRUE makes caret
# compute the class probabilities that summary relies on.
control2 <- trainControl(
  method = "boot",
  number = 1000,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)

# Same logistic regression as before, now evaluated with the bootstrap scheme
# in control2. metric = "ROC" is stated explicitly: twoClassSummary does not
# produce the default "Accuracy" metric, so without it train() warns and falls
# back to ROC.
fit2 <- train(
  form = Y ~ Age + bmi + Pregnancies + BloodPressure + Insulin + Glucose +
    SkinThickness,
  data = df,
  method = "glm",
  family = "binomial",
  metric = "ROC",
  trControl = control2
)
fit2
## Generalized Linear Model 
## 
## 768 samples
##   7 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Bootstrapped (1000 reps) 
## Summary of sample sizes: 768, 768, 768, 768, 768, 768, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8212199  0.8693072  0.5676405

4. Splitting sample

# Split the data 70/30 into training and testing sets, partitioning on Y.
# The seed is fixed so the partition, and therefore every test-set figure
# derived from it, can be reproduced; figures recorded from earlier unseeded
# runs may differ slightly.
set.seed(123)
index <- createDataPartition(df$Y, p = 0.7, list = FALSE)
training <- df[index, ]
testing <- df[-index, ]
dim(training)
## [1] 538  11
dim(testing)
## [1] 230  11
# Fit the logistic regression on the training set only, using the bootstrap
# scheme in control2 for the resampled performance estimates.
# metric = "ROC" is stated explicitly: twoClassSummary does not produce the
# default "Accuracy" metric, so without it train() warns and falls back to ROC.
fit3 <- train(
  form = Y ~ Age + bmi + Pregnancies + BloodPressure + Insulin + Glucose +
    SkinThickness,
  data = training,
  method = "glm",
  family = "binomial",
  metric = "ROC",
  trControl = control2
)
fit3
## Generalized Linear Model 
## 
## 538 samples
##   7 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Bootstrapped (1000 reps) 
## Summary of sample sizes: 538, 538, 538, 538, 538, 538, ... 
## Resampling results:
## 
##   ROC        Sens       Spec    
##   0.8268958  0.8737254  0.575895

5. Model testing

# Predicted class probabilities for the held-out rows. The result has one
# column per class, so `predicted` is stored as a two-column data frame inside
# the tibble (head() shows it as <df[,2]>).
testing$predicted <- predict(fit3, testing, type = "prob")
head(testing)
## # A tibble: 6 × 12
##      id Pregnancies Glucose BloodPressure SkinThickness Insulin   bmi
##   <dbl>       <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
## 1     1           6     148            72            35       0  33.6
## 2     2           1      85            66            29       0  26.6
## 3     3           8     183            64             0       0  23.3
## 4     5           0     137            40            35     168  43.1
## 5     6           5     116            74             0       0  25.6
## 6     9           2     197            70            45     543  30.5
## # … with 5 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>,
## #   Outcome <dbl>, Y <fct>, predicted <df[,2]>
library(caTools)
## Warning: package 'caTools' was built under R version 4.2.2
# Test-set AUC, computed separately for the "No" and "Yes" probability columns.
colAUC(X = testing$predicted, y = testing$Y)
##                   No       Yes
## No vs. Yes 0.8079167 0.8079167
# Overwrite the probabilities with hard class labels and compare them with the
# observed classes. No `positive` argument is supplied, so confusionMatrix()
# reports "No" as the positive class: the Sensitivity it prints is the share
# of non-diabetic cases classified correctly, and Specificity the share of
# diabetic cases classified correctly.
testing$predicted <- predict(fit3, testing, type = "raw")
confusionMatrix(testing$predicted, testing$Y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  120  29
##        Yes  30  51
##                                           
##                Accuracy : 0.7435          
##                  95% CI : (0.6819, 0.7986)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 0.001872        
##                                           
##                   Kappa : 0.4362          
##                                           
##  Mcnemar's Test P-Value : 1.000000        
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.6375          
##          Pos Pred Value : 0.8054          
##          Neg Pred Value : 0.6296          
##              Prevalence : 0.6522          
##          Detection Rate : 0.5217          
##    Detection Prevalence : 0.6478          
##       Balanced Accuracy : 0.7188          
##                                           
##        'Positive' Class : No              
## 

Source: Prof. Tuan's online lectures — https://www.youtube.com/watch?v=3VEgrLX7Gs0&list=PLbRKZL7ww3qhPFZAIhusefzLbQFg6nyT4&index=8