library(readxl)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
library(caTools)
# Read the Pima Indian Diabetes workbook into a tibble and preview its
# first rows.
# NOTE(review): the path points at one user's Desktop; adjust it when running
# on another machine.
df <- read_excel(
  path = "~/Desktop/R-dir/R studying/dataset/Pima Indian Diabetes Dta.xlsx"
)
head(df)
## # A tibble: 6 × 10
## id Pregnancies Glucose BloodP…¹ SkinT…² Insulin bmi Diabe…³ Age Outcome
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 6 148 72 35 0 33.6 0.627 50 1
## 2 2 1 85 66 29 0 26.6 0.351 31 0
## 3 3 8 183 64 0 0 23.3 0.672 32 1
## 4 4 1 89 66 23 94 28.1 0.167 21 0
## 5 5 0 137 40 35 168 43.1 2.29 33 1
## 6 6 5 116 74 0 0 25.6 0.201 30 0
## # … with abbreviated variable names ¹​BloodPressure, ²​SkinThickness,
## # ³​DiabetesPedigreeFunction
# Preview the last rows of the data (the recorded output shows ids 763-768).
tail(df)
## # A tibble: 6 × 10
## id Pregnancies Glucose BloodP…¹ SkinT…² Insulin bmi Diabe…³ Age Outcome
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 763 9 89 62 0 0 22.5 0.142 33 0
## 2 764 10 101 76 48 180 32.9 0.171 63 0
## 3 765 2 122 70 27 0 36.8 0.34 27 0
## 4 766 5 121 72 23 112 26.2 0.245 30 0
## 5 767 1 126 60 0 0 30.1 0.349 47 1
## 6 768 1 93 70 31 0 30.4 0.315 23 0
## # … with abbreviated variable names ¹​BloodPressure, ²​SkinThickness,
## # ³​DiabetesPedigreeFunction
# Print the first rows as a base data.frame; the tibble printout abbreviates
# the longer column names, while this form shows them in full.
head(as.data.frame(df))
## id Pregnancies Glucose BloodPressure SkinThickness Insulin bmi
## 1 1 6 148 72 35 0 33.6
## 2 2 1 85 66 29 0 26.6
## 3 3 8 183 64 0 0 23.3
## 4 4 1 89 66 23 94 28.1
## 5 5 0 137 40 35 168 43.1
## 6 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
# Number of rows and columns in the raw data.
dim(df)
## [1] 768 10
# Column-by-column structure: every column is read in as numeric.
str(df)
## tibble [768 × 10] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:768] 1 2 3 4 5 6 7 8 9 10 ...
## $ Pregnancies : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
## $ bmi : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
# Derive two encodings of the 0/1 Outcome column:
#   Y  - factor version of Outcome (levels "0" and "1")
#   Db - character label, "Yes" when Outcome is 1 and "No" otherwise
df <- df %>%
  mutate(
    Y = as.factor(Outcome),
    Db = ifelse(Outcome == 1, "Yes", "No")
  )
# Show the first rows with the two new columns appended.
head(as.data.frame(df))
## id Pregnancies Glucose BloodPressure SkinThickness Insulin bmi
## 1 1 6 148 72 35 0 33.6
## 2 2 1 85 66 29 0 26.6
## 3 3 8 183 64 0 0 23.3
## 4 4 1 89 66 23 94 28.1
## 5 5 0 137 40 35 168 43.1
## 6 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome Y Db
## 1 0.627 50 1 1 Yes
## 2 0.351 31 0 0 No
## 3 0.672 32 1 1 Yes
## 4 0.167 21 0 0 No
## 5 2.288 33 1 1 Yes
## 6 0.201 30 0 0 No
# Plain logistic regression of Outcome on Age and Glucose using all rows,
# piped straight into summary() so only the coefficient report is printed.
glm(
  formula = Outcome ~ Age + Glucose,
  family = binomial,
  data = df
) %>%
  summary()
##
## Call:
## glm(formula = Outcome ~ Age + Glucose, family = binomial, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3367 -0.7775 -0.5087 0.8367 3.1630
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.912449 0.462620 -12.78 < 2e-16 ***
## Age 0.024778 0.007374 3.36 0.000778 ***
## Glucose 0.035644 0.003290 10.83 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 797.36 on 765 degrees of freedom
## AIC: 803.36
##
## Number of Fisher Scoring iterations: 4
# 10-fold cross-validated logistic regression through caret.
# twoClassSummary reports ROC / Sens / Spec instead of Accuracy and needs
# class probabilities, hence classProbs = TRUE.
control <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)

# Fix the RNG state so the fold assignment (and therefore the resampled
# ROC / Sens / Spec) is reproducible between runs.
set.seed(123)

# metric = "ROC" is stated explicitly: the default metric "Accuracy" is not
# produced by twoClassSummary, so leaving it unset makes train() warn and
# fall back to ROC on its own.
# Note: twoClassSummary treats the first class level as the event. The class
# levels of Db are 'No', 'Yes', so the reported Sens refers to "No".
fit <- train(
  form = Db ~ Age + Pregnancies + Glucose,
  data = df,
  method = "glm",
  family = "binomial",
  metric = "ROC",
  trControl = control
)
# Coefficient report for the final model held in `fit`. The recorded output
# shows 767 null degrees of freedom, i.e. it was estimated on all 768 rows.
summary(fit)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3087 -0.7654 -0.4897 0.8183 3.1788
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.888017 0.466245 -12.629 < 2e-16 ***
## Age 0.008107 0.008693 0.933 0.351035
## Pregnancies 0.108410 0.030062 3.606 0.000311 ***
## Glucose 0.036445 0.003334 10.930 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 784.08 on 764 degrees of freedom
## AIC: 792.08
##
## Number of Fisher Scoring iterations: 4
# Print the caret resampling summary: cross-validated ROC, Sens and Spec.
# NOTE(review): with class levels 'No', 'Yes', Sens is presumably computed for
# "No" (twoClassSummary uses the first level as the event) - confirm before
# reporting it as diabetes sensitivity.
fit
## Generalized Linear Model
##
## 768 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 691, 691, 691, 691, 692, 692, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8015214 0.878 0.5032764
# Split the data 70/30 into training and test sets. createDataPartition
# samples within the levels of the factor Y, so both parts keep a similar
# class balance.
# The seed makes the split (and everything fitted on it) reproducible; the
# outputs recorded further down came from an unseeded run, so re-running will
# give slightly different numbers than those shown.
set.seed(123)
index <- createDataPartition(df$Y, p = 0.7, list = FALSE)
# NOTE(review): `train` shares its name with caret::train(). Calls to the
# function still resolve correctly, but a distinct name would be clearer.
train <- df[index, ]
test <- df[-index, ]
# Sanity-check the split: full data, then training rows, then test rows.
# The column count is 12 because Y and Db were added to the original 10.
dim(df)
## [1] 768 12
dim(train)
## [1] 538 12
dim(test)
## [1] 230 12
# Fit a logistic regression to the training set only.
# The argument is spelled out as `formula`; the previous `form =` worked only
# through partial argument matching.
logistic <- glm(
  formula = Y ~ Age + Pregnancies + Glucose + BloodPressure + bmi + Insulin,
  data = train,
  family = "binomial"
)
# Print the call, coefficients and deviances.
logistic
##
## Call: glm(formula = Y ~ Age + Pregnancies + Glucose + BloodPressure +
## bmi + Insulin, family = "binomial", data = train)
##
## Coefficients:
## (Intercept) Age Pregnancies Glucose BloodPressure
## -8.2430295 0.0184821 0.1136256 0.0359087 -0.0149310
## bmi Insulin
## 0.0959414 -0.0007296
##
## Degrees of Freedom: 537 Total (i.e. Null); 531 Residual
## Null Deviance: 696.3
## Residual Deviance: 508.3 AIC: 522.3
# Full report for the training-set model: coefficient table with standard
# errors and p-values, deviances and AIC.
summary(logistic)
##
## Call:
## glm(formula = Y ~ Age + Pregnancies + Glucose + BloodPressure +
## bmi + Insulin, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2106 -0.7181 -0.4057 0.7627 2.8614
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.2430295 0.8376861 -9.840 < 2e-16 ***
## Age 0.0184821 0.0106565 1.734 0.08286 .
## Pregnancies 0.1136256 0.0368388 3.084 0.00204 **
## Glucose 0.0359087 0.0043069 8.338 < 2e-16 ***
## BloodPressure -0.0149310 0.0061965 -2.410 0.01597 *
## bmi 0.0959414 0.0179505 5.345 9.05e-08 ***
## Insulin -0.0007296 0.0010061 -0.725 0.46831
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 696.28 on 537 degrees of freedom
## Residual deviance: 508.29 on 531 degrees of freedom
## AIC: 522.29
##
## Number of Fisher Scoring iterations: 5
# Variable-importance scores for the fitted glm. In the recorded output these
# equal the absolute z values from summary(logistic) (e.g. Glucose 8.338,
# bmi 5.345), so a larger score means a stronger individual signal.
importances <- varImp(logistic)
importances
## Overall
## Age 1.7343437
## Pregnancies 3.0844005
## Glucose 8.3375426
## BloodPressure 2.4095804
## bmi 5.3447596
## Insulin 0.7252267
# Make predictions on the held-out test data.
# type = "response" returns fitted probabilities; for a binomial glm with a
# factor response the first level ("0") is failure, so these are P(Y == "1").
prob <- predict(logistic, newdata = test, type = "response")

# Classify at a 0.5 probability cut-off (1 = predicted diabetic).
predictions <- ifelse(prob > 0.5, 1, 0)

# Confusion matrix against the observed classes, with "1" as the positive
# class. The predicted factor is built with the same levels as test$Y so the
# table stays 2 x 2 (and confusionMatrix does not fail on mismatched levels)
# even if every prediction happens to fall in a single class.
confusionMatrix(
  data = factor(predictions, levels = levels(test$Y)),
  reference = test$Y,
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 131 37
## 1 19 43
##
## Accuracy : 0.7565
## 95% CI : (0.6958, 0.8105)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 0.000421
##
## Kappa : 0.4336
##
## Mcnemar's Test P-Value : 0.023103
##
## Sensitivity : 0.5375
## Specificity : 0.8733
## Pos Pred Value : 0.6935
## Neg Pred Value : 0.7798
## Prevalence : 0.3478
## Detection Rate : 0.1870
## Detection Prevalence : 0.2696
## Balanced Accuracy : 0.7054
##
## 'Positive' Class : 1
##
# Area under the ROC curve on the test set, with the ROC curve drawn.
# The argument is spelled out as `plotROC` (the previous `plot =` relied on
# partial matching) and uses TRUE rather than the reassignable shorthand T.
colAUC(prob, test$Y, plotROC = TRUE)
## [,1]
## 0 vs. 1 0.8204167