Diabetes Analysis & Prediction

Libraries

library(tidyverse)   # dplyr/tidyr/purrr for data manipulation (also attaches ggplot2)
library(xlsx)        # read Excel files (read.xlsx2)
library(ggplot2)     # plotting (already attached via tidyverse)
library(reshape2)    # melt() for reshaping to long format
library(data.table)  # copy() for backing up data frames
library(caret)       # data splitting, cross-validation and model training
library(e1071)       # misc. ML utilities (SVM, Naive Bayes implementations)
library(corrplot)    # correlation matrix plot
library(kernlab)     # kernel SVM back-end for caret's svmLinear/svmRadial/svmPoly

Load Data

Read the Excel File

# read.xlsx2() imports every column as character; data types are corrected during cleaning
df <- read.xlsx2("labW9.xlsx", 1, header=TRUE)


View Data

df %>% head(2)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
df %>% glimpse()
## Rows: 768
## Columns: 9
## $ Pregnancies              <chr> "6", "1", "8", "1", "0", "5", "3", "10", "2",~
## $ Glucose                  <chr> "148", "85", "183", "89", "137", "116", "78",~
## $ BloodPressure            <chr> "72", "66", "64", "66", "40", "74", "50", "0"~
## $ SkinThickness            <chr> "35", "29", "0", "23", "35", "0", "32", "0", ~
## $ Insulin                  <chr> "0", "0", "0", "94", "168", "0", "88", "0", "~
## $ BMI                      <chr> "33.6", "26.6", "23.3", "28.1", "43.1", "25.6~
## $ DiabetesPedigreeFunction <chr> "0.627", "0.351", "0.672", "0.167", "2.288", ~
## $ Age                      <chr> "50", "31", "32", "21", "33", "30", "26", "29~
## $ Outcome                  <chr> "1", "0", "1", "0", "1", "0", "1", "0", "1", ~

Data Cleaning

Check for Missing Values

colSums(is.na(df))
##              Pregnancies                  Glucose            BloodPressure 
##                        0                        0                        0 
##            SkinThickness                  Insulin                      BMI 
##                        0                        0                        0 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0

Remarks: No missing values in the dataset.


Check for Duplicates

sum(duplicated(df))
## [1] 0

Remarks: No duplicates in the dataset.


Check Variable Data Types

sapply(df, class)
##              Pregnancies                  Glucose            BloodPressure 
##              "character"              "character"              "character" 
##            SkinThickness                  Insulin                      BMI 
##              "character"              "character"              "character" 
## DiabetesPedigreeFunction                      Age                  Outcome 
##              "character"              "character"              "character"

Remarks: All variables are of character type; we need to convert them to numeric.


Correct Data Types

# Convert the decimal-valued columns to numeric
df %>% select(BMI, DiabetesPedigreeFunction) %>% sapply(as.numeric) -> df[, c("BMI", "DiabetesPedigreeFunction")]
# Convert the remaining whole-number columns to integer
df %>% select(1:5, 8, 9) %>% sapply(as.integer) -> df[, c(1:5, 8, 9)]
df2 <- copy(df) # back up the typed (but not yet cleaned) data for the EDA comparison later
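
An equivalent, more compact way to do the same conversion is dplyr's mutate(across()); a sketch that assumes the column layout shown above:

# Sketch: the same type conversion written with mutate(across())
df <- df %>%
  mutate(across(c(BMI, DiabetesPedigreeFunction), as.numeric),
         across(c(Pregnancies, Glucose, BloodPressure, SkinThickness,
                  Insulin, Age, Outcome), as.integer))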


Check for Outliers

# Boxplots of all variables except Insulin, reshaped to long format with melt()
df_long <- melt(df %>% select(-Insulin)) 
ggplot(df_long, aes(x = variable, y = value)) + geom_boxplot()

# Insulin is on a much larger scale (max 846), so it gets its own boxplot
df_long <- melt(df %>% select(Insulin)) 
ggplot(df_long, aes(x = variable, y = value)) + geom_boxplot()

summary(df)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

Remarks: Outliers detected in SkinThickness, Glucose, BloodPressure, BMI, Insulin.


Fixing Outliers
SkinThickness

max(df$SkinThickness)
## [1] 99
df <- df %>% filter(SkinThickness < 99)

Remarks: The mean and median of SkinThickness are around 20, yet there is a single extreme value of 99, so we remove that row.


Unusual “zero” Values

df_zero <- df %>% select(SkinThickness, Glucose, BloodPressure, BMI, Insulin)
colSums(df_zero == 0) %>% 
  as.data.frame() %>% 
  mutate("Missing %" = (./nrow(df)*100)) %>% 
  rename("Count of Zero Values" = ".")
##               Count of Zero Values  Missing %
## SkinThickness                  227 29.5958279
## Glucose                          5  0.6518905
## BloodPressure                   35  4.5632334
## BMI                             11  1.4341591
## Insulin                        373 48.6310300

Remarks: Values of zero are physiologically implausible for these variables and most likely indicate missing measurements. We cannot drop these rows because the dataset is small, so we mark them as missing and impute them instead.

# Replace the implausible zero values with NA so they can be imputed
zero_cols <- c("SkinThickness", "Glucose", "BloodPressure", "BMI", "Insulin")
df[, zero_cols][df[, zero_cols] == 0] <- NA
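
The same replacement can also be written with dplyr; a sketch of the equivalent zero-to-NA step:

# Sketch: replace implausible zeros with NA via na_if()
df <- df %>%
  mutate(across(c(SkinThickness, Glucose, BloodPressure, BMI, Insulin),
                ~ na_if(.x, 0)))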


Fix “zero” Values

df %>% group_by(Outcome) %>% summarise(SkinThicknessMedian = median(SkinThickness, na.rm=TRUE),
                                       GlucoseMedian = median(Glucose, na.rm=TRUE),
                                       BloodPressureMedian = median(BloodPressure, na.rm=TRUE),
                                       BMIMedian = median(BMI, na.rm=TRUE),
                                       InsulinMedian = median(Insulin, na.rm=TRUE))
## # A tibble: 2 x 6
##   Outcome SkinThicknessMedian GlucoseMedian BloodPressureMedian BMIMedian
##     <int>               <int>         <int>               <int>     <dbl>
## 1       0                  27           107                  70      30.1
## 2       1                  32           140                  75      34.3
## # ... with 1 more variable: InsulinMedian <dbl>

Remarks: When Outcome is 1, the group medians are consistently higher, so we impute the missing values with the median of each Outcome group.


#Impute by group median
df <- df %>% 
  group_by(Outcome) %>% 
  mutate(SkinThickness = ifelse(is.na(SkinThickness), 
                            median(SkinThickness, na.rm = TRUE), 
                            SkinThickness),
         Glucose = ifelse(is.na(Glucose), 
                            median(Glucose, na.rm = TRUE), 
                            Glucose),
         BloodPressure = ifelse(is.na(BloodPressure), 
                            median(BloodPressure, na.rm = TRUE), 
                            BloodPressure),
         BMI = ifelse(is.na(BMI), 
                            median(BMI, na.rm = TRUE), 
                            BMI),
         Insulin = ifelse(is.na(Insulin), 
                            median(Insulin, na.rm = TRUE), 
                            Insulin)
         )

df %>% ungroup() -> df
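
The five near-identical ifelse() blocks above can be collapsed into a single across() call; a sketch of the equivalent grouped imputation:

# Sketch: grouped median imputation written with across()
df <- df %>%
  group_by(Outcome) %>%
  mutate(across(c(SkinThickness, Glucose, BloodPressure, BMI, Insulin),
                ~ ifelse(is.na(.x), median(.x, na.rm = TRUE), .x))) %>%
  ungroup()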


Verifying Cleaning

#Checking for Missing Values
colSums(is.na(df))
##              Pregnancies                  Glucose            BloodPressure 
##                        0                        0                        0 
##            SkinThickness                  Insulin                      BMI 
##                        0                        0                        0 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0
#Checking for zero values
df_zero <- df %>% select(-Outcome, -Pregnancies)
colSums(df_zero==0)
##                  Glucose            BloodPressure            SkinThickness 
##                        0                        0                        0 
##                  Insulin                      BMI DiabetesPedigreeFunction 
##                        0                        0                        0 
##                      Age 
##                        0

Exploratory Data Analysis


Dataset Variable Distribution: before cleaning

df2 %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_histogram(bins = 15)


Dataset Variable Distribution: after cleaning

df %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_histogram(bins = 15)


Linear Correlation

cor.table = cor(df)
corrplot(cor.table)
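
To read off which predictors correlate most strongly with the target rather than eyeballing the plot, the Outcome column of the correlation matrix can be sorted; a quick sketch:

# Sketch: correlations of each variable with Outcome, strongest first
sort(cor.table[, "Outcome"], decreasing = TRUE)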


Distribution of Target Variable: Outcome

df_outcome <- as.data.frame(table(df$Outcome)) %>% rename("Outcome" = Var1)
ggplot(df_outcome, aes(x="", y=Freq, fill=Outcome)) +
  geom_bar(stat="identity", width=1, color="white") +
  coord_polar("y", start=0) + 
  theme_void() + 
  scale_fill_brewer(palette="Set1")


Glucose vs Age

dfplt <- copy(df)
dfplt$Outcome <- as.factor(df$Outcome)
ggplot(dfplt, aes(x = Age, y = Glucose)) + geom_point(aes(color = Outcome)) +
  labs(title = "Glucose Levels by Age", caption = "Remarks: When the outcome is 0, the distribution is \nconcentrated below 150 glucose level mark")


Glucose vs Insulin

ggplot(dfplt, aes(x = Glucose, y = Insulin)) + geom_point(aes(color = Outcome)) + geom_smooth(method = "lm") +
  labs(title = "Glucose vs Insulin", caption = "Remarks: There is a clear positive relationship between glucose and insulin.\nThe higher the glucose levels, the higher is insulin\nAlso, those with outcome of 0 have lower glucose levels")
## `geom_smooth()` using formula 'y ~ x'


Glucose vs BMI

ggplot(dfplt, aes(x = BMI, y = Glucose)) + geom_point(aes(color = Outcome))+ geom_smooth() +
  labs(title = "BMI vs Glucose", caption = "Remarks: There isn't really a strong linear relationship between Glucose and BMI")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'


BMI Distribution

df_bmi <- copy(df)
df_bmi$BMI <- ifelse(df$BMI < 19, "Underweight",
              ifelse(df$BMI >= 19 & df$BMI <= 25, "Normal",
              ifelse(df$BMI > 25 & df$BMI <= 30, "Overweight", "Obese"))) %>%
  factor(levels = c("Underweight", "Normal", "Overweight", "Obese"))
bmi <- as.data.frame(table(df_bmi$BMI)) %>% rename("BMI" = "Var1")
ggplot(bmi, aes(x = "", y = Freq, fill=BMI)) + 
  geom_bar(stat="identity", width=1, color="white") +
  coord_polar("y", start=0) + 
  theme_void() + 
  scale_fill_brewer(palette="Set1") +
  labs(title = "BMI Distribution", caption = "Remarks: A large proportion of observations are obese!")

Modeling

Preparing Data


Split Data

split <- 0.8
# NB: set.seed() is only called further below, so this particular split is not reproducible as written
trainIndex <- createDataPartition(df$Outcome, p = split, list = FALSE)
train_set <- df[trainIndex,]
test_set <- df[-trainIndex,]
train_set[["Outcome"]] <- factor(train_set[["Outcome"]])
test_set[["Outcome"]] <- factor(test_set[["Outcome"]])

Remarks: We split the data 80:20 into training and test sets; createDataPartition() samples within the outcome groups, so both sets keep roughly the same class balance.
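
A quick sanity check that the class balance is preserved in both splits (sketch; proportions not shown here):

# Sketch: Outcome proportions in the training and test sets
prop.table(table(train_set$Outcome))
prop.table(table(test_set$Outcome))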


View Training Data

summary(train_set)
##   Pregnancies       Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.00   Min.   : 44.0   Min.   : 30.00   Min.   : 7.00  
##  1st Qu.: 1.00   1st Qu.:100.0   1st Qu.: 64.00   1st Qu.:25.00  
##  Median : 3.00   Median :118.0   Median : 72.00   Median :28.00  
##  Mean   : 3.92   Mean   :122.1   Mean   : 72.85   Mean   :28.87  
##  3rd Qu.: 6.00   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.00   Max.   :199.0   Max.   :122.00   Max.   :54.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   : 15.0   Min.   :18.20   Min.   :0.0850           Min.   :21.00  
##  1st Qu.:102.5   1st Qu.:27.50   1st Qu.:0.2442           1st Qu.:24.00  
##  Median :105.0   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   :142.9   Mean   :32.33   Mean   :0.4749           Mean   :33.25  
##  3rd Qu.:169.5   3rd Qu.:36.67   3rd Qu.:0.6458           3rd Qu.:41.00  
##  Max.   :744.0   Max.   :67.10   Max.   :2.3290           Max.   :81.00  
##  Outcome
##  0:400  
##  1:214  
##         
##         
##         
## 
dim(train_set)
## [1] 614   9


View Test Data

summary(test_set)
##   Pregnancies        Glucose      BloodPressure   SkinThickness  
##  Min.   : 0.000   Min.   : 65.0   Min.   : 24.0   Min.   : 8.00  
##  1st Qu.: 1.000   1st Qu.: 98.0   1st Qu.: 62.0   1st Qu.:25.00  
##  Median : 2.000   Median :111.0   Median : 70.0   Median :27.00  
##  Mean   : 3.556   Mean   :119.3   Mean   : 70.6   Mean   :29.52  
##  3rd Qu.: 6.000   3rd Qu.:137.0   3rd Qu.: 78.0   3rd Qu.:32.00  
##  Max.   :15.000   Max.   :198.0   Max.   :108.0   Max.   :63.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   : 14.0   Min.   :19.90   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:102.5   1st Qu.:27.80   1st Qu.:0.2400           1st Qu.:25.00  
##  Median :102.5   Median :32.40   Median :0.3680           Median :29.00  
##  Mean   :136.8   Mean   :32.85   Mean   :0.4591           Mean   :33.03  
##  3rd Qu.:169.5   3rd Qu.:36.10   3rd Qu.:0.5910           3rd Qu.:38.00  
##  Max.   :846.0   Max.   :59.40   Max.   :2.4200           Max.   :68.00  
##  Outcome
##  0:100  
##  1: 53  
##         
##         
##         
## 
dim(test_set)
## [1] 153   9


Train Control

set.seed(333) #reproducibility
tc <- trainControl(method = "repeatedcv",
                              number = 10, repeats = 3)

Remarks: We use repeated k-fold cross-validation (10 folds, repeated 3 times).

Build the Model


1. Support Vector Machines (SVM)

1.1 Linear SVM

svm_Lin <- train(Outcome ~., data = train_set, method = "svmLinear",
                    trControl=tc,
                    preProcess = c("center", "scale"),
                    tuneLength = 10)


1.2 Linear SVM w/ Cost Values

svm_LinWCost <- train(Outcome ~., 
                     data = train_set, 
                     method = "svmLinear", 
                     trControl = tc,  
                     preProcess = c("center","scale"), 
                     tuneGrid = expand.grid(C = seq(0, 2, length = 20)))
plot(svm_LinWCost)


1.3 SVM Radial (Non-Linear Kernel)

svm_Rad <- train(Outcome ~., data = train_set, method = "svmRadial",
                 trControl = tc,
                 preProcess = c("center","scale"),
                 tuneLength = 10)
plot(svm_Rad)


1.4 SVM Polynomial (Non-Linear Kernel)

svm_Poly <- train(Outcome ~., data = train_set, method = "svmPoly",
                  trControl = tc,
                  preProcess = c("center","scale"),
                  tuneLength = 4)
plot(svm_Poly)


Comparing SVM Performances

# Best cross-validated accuracy for each SVM variant
svm_perf <- tibble(
  Models = c("Linear", "Linear w/ Cost Values", "Radial", "Polynomial"),
  Accuracy = c(
    max(svm_Lin$results$Accuracy),
    max(svm_LinWCost$results$Accuracy, na.rm = TRUE),  # na.rm in case the C = 0 candidate fails to fit
    max(svm_Rad$results$Accuracy),
    max(svm_Poly$results$Accuracy)))

svm_perf %>% arrange(Accuracy)
## # A tibble: 4 x 2
##   Models                Accuracy
##   <chr>                    <dbl>
## 1 Polynomial               0.652
## 2 Linear                   0.770
## 3 Linear w/ Cost Values    0.773
## 4 Radial                   0.812


2. Logistic Regression

log_reg <- glm(Outcome ~.,family=binomial(link='logit'),data=train_set)


3. Naive Bayes

naive <- train(Outcome ~., data = train_set, trControl = tc, method = "nb")
plot(naive)


4. KNN

knn <- train(Outcome ~., data = train_set, trControl = tc, method = "knn", tuneGrid = data.frame(k = seq(1, 25)))
plot(knn)


5. Random Forest

rf <- train(Outcome ~., data = train_set, trControl = tc, method = "rf")
plot(rf)
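
caret's default grid for method = "rf" tries only a handful of mtry values; if we wanted to search the full range of predictor counts explicitly, a sketch (rf_tuned is a hypothetical object not used elsewhere in this report):

# Sketch: tune mtry over all plausible predictor counts
rf_tuned <- train(Outcome ~., data = train_set, trControl = tc, method = "rf",
                  tuneGrid = expand.grid(mtry = 2:8))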

Model Evaluation

predictSVMLin <- predict(svm_Lin, newdata = test_set)
predictSVMLinWCost <- predict(svm_LinWCost, newdata = test_set)
predictSVMRad <- predict(svm_Rad, newdata = test_set)
predictSVMPoly <- predict(svm_Poly, newdata = test_set)
predictNaive <- predict(naive, newdata = test_set)
predictLogReg <- predict(log_reg, newdata = test_set, type="response")
predictKNN <- predict(knn, newdata = test_set)
predictRF <- predict(rf, newdata = test_set)


1. Support Vector Machines (SVM)

1.1 Linear SVM

confusionMatrix(predictSVMLin, test_set$Outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 87 19
##          1 13 34
##                                           
##                Accuracy : 0.7908          
##                  95% CI : (0.7178, 0.8523)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : 0.0001499       
##                                           
##                   Kappa : 0.5255          
##                                           
##  Mcnemar's Test P-Value : 0.3767591       
##                                           
##             Sensitivity : 0.8700          
##             Specificity : 0.6415          
##          Pos Pred Value : 0.8208          
##          Neg Pred Value : 0.7234          
##              Prevalence : 0.6536          
##          Detection Rate : 0.5686          
##    Detection Prevalence : 0.6928          
##       Balanced Accuracy : 0.7558          
##                                           
##        'Positive' Class : 0               
## 


1.2 Linear SVM w/ Cost Values

confusionMatrix(predictSVMLinWCost, test_set$Outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 87 19
##          1 13 34
##                                           
##                Accuracy : 0.7908          
##                  95% CI : (0.7178, 0.8523)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : 0.0001499       
##                                           
##                   Kappa : 0.5255          
##                                           
##  Mcnemar's Test P-Value : 0.3767591       
##                                           
##             Sensitivity : 0.8700          
##             Specificity : 0.6415          
##          Pos Pred Value : 0.8208          
##          Neg Pred Value : 0.7234          
##              Prevalence : 0.6536          
##          Detection Rate : 0.5686          
##    Detection Prevalence : 0.6928          
##       Balanced Accuracy : 0.7558          
##                                           
##        'Positive' Class : 0               
## 


1.3 SVM Radial

confusionMatrix(predictSVMRad, test_set$Outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 83 14
##          1 17 39
##                                          
##                Accuracy : 0.7974         
##                  95% CI : (0.7249, 0.858)
##     No Information Rate : 0.6536         
##     P-Value [Acc > NIR] : 7.169e-05      
##                                          
##                   Kappa : 0.5584         
##                                          
##  Mcnemar's Test P-Value : 0.7194         
##                                          
##             Sensitivity : 0.8300         
##             Specificity : 0.7358         
##          Pos Pred Value : 0.8557         
##          Neg Pred Value : 0.6964         
##              Prevalence : 0.6536         
##          Detection Rate : 0.5425         
##    Detection Prevalence : 0.6340         
##       Balanced Accuracy : 0.7829         
##                                          
##        'Positive' Class : 0              
## 


1.4 SVM Polynomial

confusionMatrix(predictSVMPoly, test_set$Outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 88 12
##          1 12 41
##                                           
##                Accuracy : 0.8431          
##                  95% CI : (0.7757, 0.8968)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : 1.313e-07       
##                                           
##                   Kappa : 0.6536          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8800          
##             Specificity : 0.7736          
##          Pos Pred Value : 0.8800          
##          Neg Pred Value : 0.7736          
##              Prevalence : 0.6536          
##          Detection Rate : 0.5752          
##    Detection Prevalence : 0.6536          
##       Balanced Accuracy : 0.8268          
##                                           
##        'Positive' Class : 0               
## 


2. Naive Bayes

confusionMatrix(predictNaive, test_set$Outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 87 11
##          1 13 42
##                                           
##                Accuracy : 0.8431          
##                  95% CI : (0.7757, 0.8968)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : 1.313e-07       
##                                           
##                   Kappa : 0.6566          
##                                           
##  Mcnemar's Test P-Value : 0.8383          
##                                           
##             Sensitivity : 0.8700          
##             Specificity : 0.7925          
##          Pos Pred Value : 0.8878          
##          Neg Pred Value : 0.7636          
##              Prevalence : 0.6536          
##          Detection Rate : 0.5686          
##    Detection Prevalence : 0.6405          
##       Balanced Accuracy : 0.8312          
##                                           
##        'Positive' Class : 0               
## 


3. Logistic Regression

confusionMatrix(data = factor(as.integer(predictLogReg>0.5)), reference = test_set$Outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 89 19
##          1 11 34
##                                           
##                Accuracy : 0.8039          
##                  95% CI : (0.7321, 0.8636)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : 3.3e-05         
##                                           
##                   Kappa : 0.5511          
##                                           
##  Mcnemar's Test P-Value : 0.2012          
##                                           
##             Sensitivity : 0.8900          
##             Specificity : 0.6415          
##          Pos Pred Value : 0.8241          
##          Neg Pred Value : 0.7556          
##              Prevalence : 0.6536          
##          Detection Rate : 0.5817          
##    Detection Prevalence : 0.7059          
##       Balanced Accuracy : 0.7658          
##                                           
##        'Positive' Class : 0               
## 


4. KNN

confusionMatrix(predictKNN, test_set$Outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 91 12
##          1  9 41
##                                          
##                Accuracy : 0.8627         
##                  95% CI : (0.7979, 0.913)
##     No Information Rate : 0.6536         
##     P-Value [Acc > NIR] : 4.438e-09      
##                                          
##                   Kappa : 0.6928         
##                                          
##  Mcnemar's Test P-Value : 0.6625         
##                                          
##             Sensitivity : 0.9100         
##             Specificity : 0.7736         
##          Pos Pred Value : 0.8835         
##          Neg Pred Value : 0.8200         
##              Prevalence : 0.6536         
##          Detection Rate : 0.5948         
##    Detection Prevalence : 0.6732         
##       Balanced Accuracy : 0.8418         
##                                          
##        'Positive' Class : 0              
## 


5. Random Forest

confusionMatrix(predictRF, test_set$Outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 94  9
##          1  6 44
##                                           
##                Accuracy : 0.902           
##                  95% CI : (0.8435, 0.9441)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : 1.12e-12        
##                                           
##                   Kappa : 0.7806          
##                                           
##  Mcnemar's Test P-Value : 0.6056          
##                                           
##             Sensitivity : 0.9400          
##             Specificity : 0.8302          
##          Pos Pred Value : 0.9126          
##          Neg Pred Value : 0.8800          
##              Prevalence : 0.6536          
##          Detection Rate : 0.6144          
##    Detection Prevalence : 0.6732          
##       Balanced Accuracy : 0.8851          
##                                           
##        'Positive' Class : 0               
## 


Comparison between Models

resamps <- resamples(list(SVM_Linear = svm_LinWCost, 
                          SVM_Poly = svm_Poly, 
                          SVM_Radial = svm_Rad, 
                          KNN = knn, 
                          NaiveBayes = naive, 
                          RandomForest = rf))
bwplot(resamps, metric = "Accuracy")


Remarks: Random Forest and KNN give the best hold-out performance, at roughly 90% and 86% accuracy respectively.
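
For completeness, the hold-out accuracies quoted above can be collected into one table from the confusion matrices; a sketch using the predict* objects created earlier:

# Sketch: hold-out (test-set) accuracy for every fitted model
test_acc <- tibble(
  Model = c("SVM Linear", "SVM Linear w/ Cost", "SVM Radial", "SVM Polynomial",
            "Naive Bayes", "Logistic Regression", "KNN", "Random Forest"),
  Accuracy = c(
    confusionMatrix(predictSVMLin, test_set$Outcome)$overall["Accuracy"],
    confusionMatrix(predictSVMLinWCost, test_set$Outcome)$overall["Accuracy"],
    confusionMatrix(predictSVMRad, test_set$Outcome)$overall["Accuracy"],
    confusionMatrix(predictSVMPoly, test_set$Outcome)$overall["Accuracy"],
    confusionMatrix(predictNaive, test_set$Outcome)$overall["Accuracy"],
    confusionMatrix(factor(as.integer(predictLogReg > 0.5)), test_set$Outcome)$overall["Accuracy"],
    confusionMatrix(predictKNN, test_set$Outcome)$overall["Accuracy"],
    confusionMatrix(predictRF, test_set$Outcome)$overall["Accuracy"]))
test_acc %>% arrange(desc(Accuracy))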

Model Selection

Remarks: We select the Random Forest model for predicting Outcome, as it achieves the highest hold-out accuracy (approximately 90%).
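
The draw_confusion_matrix() helper called below is not provided by any of the loaded packages; it is presumably a user-defined plotting function. A minimal sketch of such a helper, assuming it only needs to tile the 2x2 confusion table and report the overall accuracy:

# Sketch of an assumed confusion-matrix plotting helper (not part of caret)
draw_confusion_matrix <- function(cm) {
  tab <- as.data.frame(cm$table)  # columns: Prediction, Reference, Freq
  ggplot(tab, aes(x = Reference, y = Prediction, fill = Freq)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Freq), size = 6) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    labs(title = sprintf("Accuracy: %.3f", cm$overall["Accuracy"]))
}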

cm <- confusionMatrix(predictRF, test_set$Outcome)
draw_confusion_matrix(cm)


Evaluate Outcome using Own Data

# Three hypothetical patients to score with the chosen model
df_predict <- data.frame(
  Pregnancies = c(4, 0, 10),
  Glucose = c(100, 120, 200),
  BloodPressure = c(69, 99, 100),
  SkinThickness = c(20, 30, 50),
  Insulin = c(120, 100, 60),
  BMI = c(40.5, 38.5, 50),
  DiabetesPedigreeFunction = c(0.69, 0.5, 0.9),
  Age = c(21, 40, 60)
)


Predict Outcome using Own Data

predict(rf, newdata = df_predict)
## [1] 0 0 1
## Levels: 0 1

Remarks: Our model predicts that patients 1 and 2 do not have diabetes, while patient 3 does.
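
To see how confident the model is rather than only the predicted class, caret's predict() can also return class probabilities; a sketch (output not shown):

# Sketch: class probabilities for the three hypothetical patients
predict(rf, newdata = df_predict, type = "prob")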


END