library(readxl)
library(caret)
## Warning: package 'caret' was built under R version 4.2.1
## Loading required package: ggplot2
## Loading required package: lattice
# Load the Pima Indian Diabetes workbook.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running on another machine.
df <- read_excel(
  path = "C:/Users/Momo/Desktop/R - Learning/Dataset 4-2022/Pima Indian Diabetes Dta.xlsx"
)
# Compact overview: column names, types and the first few values of each.
str(object = df)
## tibble [768 × 10] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:768] 1 2 3 4 5 6 7 8 9 10 ...
## $ Pregnancies : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
## $ bmi : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
# Number of rows and columns.
dim(df)
## [1] 768 10
# First 20 rows.
head(df, n = 20)
## # A tibble: 20 × 10
## id Pregnancies Glucose BloodPressure SkinThickness Insulin bmi
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 6 148 72 35 0 33.6
## 2 2 1 85 66 29 0 26.6
## 3 3 8 183 64 0 0 23.3
## 4 4 1 89 66 23 94 28.1
## 5 5 0 137 40 35 168 43.1
## 6 6 5 116 74 0 0 25.6
## 7 7 3 78 50 32 88 31
## 8 8 10 115 0 0 0 35.3
## 9 9 2 197 70 45 543 30.5
## 10 10 8 125 96 0 0 0
## 11 11 4 110 92 0 0 37.6
## 12 12 10 168 74 0 0 38
## 13 13 10 139 80 0 0 27.1
## 14 14 1 189 60 23 846 30.1
## 15 15 5 166 72 19 175 25.8
## 16 16 7 100 0 0 0 30
## 17 17 0 118 84 47 230 45.8
## 18 18 7 107 74 0 0 29.6
## 19 19 1 103 30 38 83 43.3
## 20 20 1 115 70 30 96 34.6
## # … with 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>,
## # Outcome <dbl>
# Last 10 rows.
tail(df, n = 10)
## # A tibble: 10 × 10
## id Pregnancies Glucose BloodPressure SkinThickness Insulin bmi
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 759 1 106 76 0 0 37.5
## 2 760 6 190 92 0 0 35.5
## 3 761 2 88 58 26 16 28.4
## 4 762 9 170 74 31 0 44
## 5 763 9 89 62 0 0 22.5
## 6 764 10 101 76 48 180 32.9
## 7 765 2 122 70 27 0 36.8
## 8 766 5 121 72 23 112 26.2
## 9 767 1 126 60 0 0 30.1
## 10 768 1 93 70 31 0 30.4
## # … with 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>,
## # Outcome <dbl>
# Column names of the data set.
names(df)
## [1] "id" "Pregnancies"
## [3] "Glucose" "BloodPressure"
## [5] "SkinThickness" "Insulin"
## [7] "bmi" "DiabetesPedigreeFunction"
## [9] "Age" "Outcome"
# Counts of the raw numeric outcome codes.
table(df$Outcome)
##
## 0 1
## 500 268

# Fail early if Outcome holds anything other than 0, 1 or NA, so an unexpected
# code cannot slip through as a silently missing label.
stopifnot(all(df$Outcome %in% c(0, 1, NA)))

# Build the response factor with an explicit code-to-label mapping
# (0 -> "No", 1 -> "Yes"). Relabelling via `levels<-` depends on the sorted
# order of whatever values happen to be present, so it would mislabel the data
# if only one code occurred or the coding differed.
df$Y <- factor(df$Outcome, levels = c(0, 1), labels = c("No", "Yes"))
table(df$Y)
##
## No Yes
## 500 268

# Cross-tabulate codes against labels to confirm the mapping.
table(df$Outcome, df$Y)
##
## No Yes
## 0 500 0
## 1 0 268
# 10-fold cross-validation. twoClassSummary reports ROC, Sens and Spec and
# needs class probabilities, hence classProbs = TRUE.
control1 <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)

# Seed the RNG so the fold assignment is reproducible between runs.
set.seed(123)

# Logistic regression of Y on seven predictors, evaluated with control1.
# metric = "ROC" is given explicitly: the default metric ("Accuracy") is not
# produced by twoClassSummary, so without it train() warns and falls back to
# ROC.
fit1 <- train(
  form = Y ~ Age + bmi + Pregnancies + BloodPressure + Insulin + Glucose +
    SkinThickness,
  data = df,
  method = "glm",
  family = "binomial",
  metric = "ROC",
  trControl = control1
)
fit1
## Generalized Linear Model
##
## 768 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 691, 692, 691, 691, 691, 691, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8255128 0.872 0.5675214
# Bootstrap resampling with 1000 replicates. twoClassSummary reports ROC, Sens
# and Spec and needs class probabilities, hence classProbs = TRUE.
control2 <- trainControl(
  method = "boot",
  number = 1000,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)

# Seed the RNG so the bootstrap samples are reproducible between runs.
set.seed(123)

# Same logistic regression as before, now evaluated with control2.
# metric = "ROC" is given explicitly: the default metric ("Accuracy") is not
# produced by twoClassSummary, so without it train() warns and falls back to
# ROC.
fit2 <- train(
  form = Y ~ Age + bmi + Pregnancies + BloodPressure + Insulin + Glucose +
    SkinThickness,
  data = df,
  method = "glm",
  family = "binomial",
  metric = "ROC",
  trControl = control2
)
fit2
## Generalized Linear Model
##
## 768 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Bootstrapped (1000 reps)
## Summary of sample sizes: 768, 768, 768, 768, 768, 768, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8212199 0.8693072 0.5676405
# Seed the RNG so the same rows land in the training and hold-out sets on
# every run; createDataPartition samples at random.
set.seed(123)

# Split on Y with p = 0.7: about 70% of rows go to training, the rest to
# testing. list = FALSE returns the row indices as a matrix rather than a list.
index <- createDataPartition(df$Y, p = 0.7, list = FALSE)
training <- df[index, ]
testing <- df[-index, ]

# Check the sizes of the two subsets.
dim(training)
## [1] 538 11
dim(testing)
## [1] 230 11
# Seed the RNG so the resamples drawn during training are reproducible.
set.seed(123)

# Fit the logistic regression on the training subset only, using the
# resampling settings in control2. metric = "ROC" is given explicitly: the
# default metric ("Accuracy") is not produced by twoClassSummary, so without it
# train() warns and falls back to ROC.
fit3 <- train(
  form = Y ~ Age + bmi + Pregnancies + BloodPressure + Insulin + Glucose +
    SkinThickness,
  data = training,
  method = "glm",
  family = "binomial",
  metric = "ROC",
  trControl = control2
)
fit3
## Generalized Linear Model
##
## 538 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Bootstrapped (1000 reps)
## Summary of sample sizes: 538, 538, 538, 538, 538, 538, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8268958 0.8737254 0.575895
# Predicted class probabilities for the hold-out rows. type = "prob" yields one
# column per class, so `predicted` is stored as a data-frame column.
testing$predicted <- predict(fit3, newdata = testing, type = "prob")
head(testing)
## # A tibble: 6 × 12
## id Pregnancies Glucose BloodPressure SkinThickness Insulin bmi
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 6 148 72 35 0 33.6
## 2 2 1 85 66 29 0 26.6
## 3 3 8 183 64 0 0 23.3
## 4 5 0 137 40 35 168 43.1
## 5 6 5 116 74 0 0 25.6
## 6 9 2 197 70 45 543 30.5
## # … with 5 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>,
## # Outcome <dbl>, Y <fct>, predicted <df[,2]>
library(caTools)
## Warning: package 'caTools' was built under R version 4.2.2
# Area under the ROC curve on the hold-out set, computed for each probability
# column against the true class.
colAUC(X = testing$predicted, y = testing$Y)
## No Yes
## No vs. Yes 0.8079167 0.8079167
# Replace the probability column with hard class predictions; type = "raw"
# returns the predicted class label for each row.
testing$predicted <- predict(fit3, newdata = testing, type = "raw")

# Compare predicted and true classes on the hold-out set.
# NOTE(review): no `positive` argument is given, so confusionMatrix() treats the
# first factor level as the positive class. Sensitivity and Specificity
# therefore describe that level; confirm whether positive = "Yes" was intended.
confusionMatrix(data = testing$predicted, reference = testing$Y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 120 29
## Yes 30 51
##
## Accuracy : 0.7435
## 95% CI : (0.6819, 0.7986)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 0.001872
##
## Kappa : 0.4362
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.8000
## Specificity : 0.6375
## Pos Pred Value : 0.8054
## Neg Pred Value : 0.6296
## Prevalence : 0.6522
## Detection Rate : 0.5217
## Detection Prevalence : 0.6478
## Balanced Accuracy : 0.7188
##
## 'Positive' Class : No
##
# Reference: Prof. Tuan's online lectures, https://www.youtube.com/watch?v=3VEgrLX7Gs0&list=PLbRKZL7ww3qhPFZAIhusefzLbQFg6nyT4&index=8