Using some packages

library(readxl)
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()

library(caTools)

1. Importing data

# Load the Pima Indian diabetes workbook and preview its first rows.
# NOTE: the path points into one user's home directory, so it has to be
# adjusted before the script can run on another machine.
df <- read_excel(
  path = "~/Desktop/R-dir/R studying/dataset/Pima Indian Diabetes Dta.xlsx"
)
head(df)

## # A tibble: 6 × 10
##      id Pregnancies Glucose BloodP…¹ SkinT…² Insulin   bmi Diabe…³   Age Outcome
##   <dbl>       <dbl>   <dbl>    <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>   <dbl>
## 1     1           6     148       72      35       0  33.6   0.627    50       1
## 2     2           1      85       66      29       0  26.6   0.351    31       0
## 3     3           8     183       64       0       0  23.3   0.672    32       1
## 4     4           1      89       66      23      94  28.1   0.167    21       0
## 5     5           0     137       40      35     168  43.1   2.29     33       1
## 6     6           5     116       74       0       0  25.6   0.201    30       0
## # … with abbreviated variable names ¹BloodPressure, ²SkinThickness,
## #   ³DiabetesPedigreeFunction

# Preview the last six rows.
tail(df)

## # A tibble: 6 × 10
##      id Pregnancies Glucose BloodP…¹ SkinT…² Insulin   bmi Diabe…³   Age Outcome
##   <dbl>       <dbl>   <dbl>    <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>   <dbl>
## 1   763           9      89       62       0       0  22.5   0.142    33       0
## 2   764          10     101       76      48     180  32.9   0.171    63       0
## 3   765           2     122       70      27       0  36.8   0.34     27       0
## 4   766           5     121       72      23     112  26.2   0.245    30       0
## 5   767           1     126       60       0       0  30.1   0.349    47       1
## 6   768           1      93       70      31       0  30.4   0.315    23       0
## # … with abbreviated variable names ¹BloodPressure, ²SkinThickness,
## #   ³DiabetesPedigreeFunction

# Print as a base data.frame so the column names are shown unabbreviated.
head(as.data.frame(df))

##   id Pregnancies Glucose BloodPressure SkinThickness Insulin  bmi
## 1  1           6     148            72            35       0 33.6
## 2  2           1      85            66            29       0 26.6
## 3  3           8     183            64             0       0 23.3
## 4  4           1      89            66            23      94 28.1
## 5  5           0     137            40            35     168 43.1
## 6  6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

# Number of rows and columns.
dim(df)

## [1] 768  10

# Column types and leading values of every variable.
str(df)

## tibble [768 × 10] (S3: tbl_df/tbl/data.frame)
##  $ id                      : num [1:768] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Pregnancies             : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
##  $ bmi                     : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...

2. Relationship between glucose and diabetes

# Conventional analysis: derive two extra encodings of the 0/1 Outcome column.
#   Y  - factor version of Outcome (levels "0" and "1")
#   Db - character label, "Yes" where Outcome is 1 and "No" otherwise
df[["Y"]] <- factor(df[["Outcome"]])
df[["Db"]] <- ifelse(df[["Outcome"]] == 1, "Yes", "No")
head(as.data.frame(df))

##   id Pregnancies Glucose BloodPressure SkinThickness Insulin  bmi
## 1  1           6     148            72            35       0 33.6
## 2  2           1      85            66            29       0 26.6
## 3  3           8     183            64             0       0 23.3
## 4  4           1      89            66            23      94 28.1
## 5  5           0     137            40            35     168 43.1
## 6  6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome Y  Db
## 1                    0.627  50       1 1 Yes
## 2                    0.351  31       0 0  No
## 3                    0.672  32       1 1 Yes
## 4                    0.167  21       0 0  No
## 5                    2.288  33       1 1 Yes
## 6                    0.201  30       0 0  No

# Logistic regression of the 0/1 Outcome on Age and Glucose, fitted on the
# full data set; only the model summary is printed, the fit is not stored.
summary(
  glm(Outcome ~ Age + Glucose, family = binomial, data = df)
)

## 
## Call:
## glm(formula = Outcome ~ Age + Glucose, family = binomial, data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3367  -0.7775  -0.5087   0.8367   3.1630  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.912449   0.462620  -12.78  < 2e-16 ***
## Age          0.024778   0.007374    3.36 0.000778 ***
## Glucose      0.035644   0.003290   10.83  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 797.36  on 765  degrees of freedom
## AIC: 803.36
## 
## Number of Fisher Scoring iterations: 4

3. Machine Learning

3.1. K-fold validation

# 10-fold cross-validation summarised with twoClassSummary (ROC, sensitivity,
# specificity), which needs class probabilities to be computed.
# NOTE: no set.seed() call precedes this, so the fold assignment (and with it
# the resampled metrics) changes from run to run.
control <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)

# Logistic regression of the Yes/No label on Age, Pregnancies and Glucose.
# metric = "ROC" is stated explicitly: twoClassSummary does not report the
# default "Accuracy" metric, so train() would otherwise warn and fall back to
# ROC on its own.
fit <- train(
  form = Db ~ Age + Pregnancies + Glucose,
  data = df,
  method = "glm",
  family = "binomial",
  metric = "ROC",
  trControl = control
)

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
## in the result set. ROC will be used instead.

# Coefficient summary of the model that train() fitted on all 768 rows.
summary(fit)

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3087  -0.7654  -0.4897   0.8183   3.1788  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.888017   0.466245 -12.629  < 2e-16 ***
## Age          0.008107   0.008693   0.933 0.351035    
## Pregnancies  0.108410   0.030062   3.606 0.000311 ***
## Glucose      0.036445   0.003334  10.930  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 784.08  on 764  degrees of freedom
## AIC: 792.08
## 
## Number of Fisher Scoring iterations: 4

# Cross-validated ROC, sensitivity and specificity.
fit

## Generalized Linear Model 
## 
## 768 samples
##   3 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 691, 691, 691, 691, 692, 692, ... 
## Resampling results:
## 
##   ROC        Sens   Spec     
##   0.8015214  0.878  0.5032764

3.2. Splitting sample

# Split the rows 70/30 into training and test sets, sampling within the levels
# of the outcome factor Y; list = FALSE makes createDataPartition() return the
# row indices in a form that can be used directly for subsetting.
# NOTE: no set.seed() call precedes this, so the split differs between runs.
# NOTE: the data frame `train` shares its name with caret::train(). Calls to
# train() still resolve to the function; the name is kept because later code
# refers to it.
index <- createDataPartition(df$Y, p = 0.7, list = FALSE)
train <- df[index, ]
test <- df[-index, ]
dim(df)

## [1] 768  12

# Size of the training partition.
dim(train)

## [1] 538  12

# Size of the test partition.
dim(test)

## [1] 230  12

# Fit a logistic model to the training data set.
# `formula` is spelled out in full rather than abbreviated, so the call does
# not rely on partial argument matching.
# NOTE(review): str(df) shows zeros in BloodPressure, bmi and Insulin, which
# look like placeholders for missing measurements - confirm before relying on
# these coefficients.
logistic <- glm(
  formula = Y ~ Age + Pregnancies + Glucose + BloodPressure + bmi + Insulin,
  family = "binomial",
  data = train
)
logistic

## 
## Call:  glm(formula = Y ~ Age + Pregnancies + Glucose + BloodPressure + 
##     bmi + Insulin, family = "binomial", data = train)
## 
## Coefficients:
##   (Intercept)            Age    Pregnancies        Glucose  BloodPressure  
##    -8.2430295      0.0184821      0.1136256      0.0359087     -0.0149310  
##           bmi        Insulin  
##     0.0959414     -0.0007296  
## 
## Degrees of Freedom: 537 Total (i.e. Null);  531 Residual
## Null Deviance:       696.3 
## Residual Deviance: 508.3     AIC: 522.3

# Coefficient estimates, standard errors and significance tests.
summary(logistic)

## 
## Call:
## glm(formula = Y ~ Age + Pregnancies + Glucose + BloodPressure + 
##     bmi + Insulin, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2106  -0.7181  -0.4057   0.7627   2.8614  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -8.2430295  0.8376861  -9.840  < 2e-16 ***
## Age            0.0184821  0.0106565   1.734  0.08286 .  
## Pregnancies    0.1136256  0.0368388   3.084  0.00204 ** 
## Glucose        0.0359087  0.0043069   8.338  < 2e-16 ***
## BloodPressure -0.0149310  0.0061965  -2.410  0.01597 *  
## bmi            0.0959414  0.0179505   5.345 9.05e-08 ***
## Insulin       -0.0007296  0.0010061  -0.725  0.46831    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 696.28  on 537  degrees of freedom
## Residual deviance: 508.29  on 531  degrees of freedom
## AIC: 522.29
## 
## Number of Fisher Scoring iterations: 5

# Score each predictor of the logistic model; for this fit the scores equal
# the absolute z values reported by summary(logistic).
importances <- varImp(logistic)
print(importances)

##                 Overall
## Age           1.7343437
## Pregnancies   3.0844005
## Glucose       8.3375426
## BloodPressure 2.4095804
## bmi           5.3447596
## Insulin       0.7252267

# Predict the probability of Y = 1 for every row of the test set.
prob <- predict(logistic, newdata = test, type = "response")

# Classify at a 0.5 cut-off (1 = diabetes predicted, 0 = not predicted).
predictions <- ifelse(prob > 0.5, 1, 0)

# Confusion matrix with "1" as the positive class.
# The predicted factor is built with the same levels as the reference, so
# confusionMatrix() still works when the model predicts only one class.
confusionMatrix(
  factor(predictions, levels = levels(test$Y)),
  test$Y,
  positive = "1"
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 131  37
##          1  19  43
##                                           
##                Accuracy : 0.7565          
##                  95% CI : (0.6958, 0.8105)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 0.000421        
##                                           
##                   Kappa : 0.4336          
##                                           
##  Mcnemar's Test P-Value : 0.023103        
##                                           
##             Sensitivity : 0.5375          
##             Specificity : 0.8733          
##          Pos Pred Value : 0.6935          
##          Neg Pred Value : 0.7798          
##              Prevalence : 0.3478          
##          Detection Rate : 0.1870          
##    Detection Prevalence : 0.2696          
##       Balanced Accuracy : 0.7054          
##                                           
##        'Positive' Class : 1               
##

# Area under the ROC curve of the predicted probabilities on the test set,
# with the ROC curve plotted. The argument is named in full (`plotROC`) so the
# call does not depend on partial argument matching.
colAUC(prob, test$Y, plotROC = TRUE)

##              [,1]
## 0 vs. 1 0.8204167

ML - Logistic Regression cont.

Vu Thien

2023-03-09

Using some packages

1. Importing data

2. Relationship between glucose and diabetes

3. Machine Learning

3.1. K-fold validation

3.2. Splitting sample