Import the necessary libraries

library('tidyverse') #For data frame manipulation and plotting
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library('caret') #For machine learning
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library('readxl') #For Excel reading

Read the dataset and run a descriptive analysis

# Load the workbook into a tibble. NOTE(review): the path is absolute and
# machine-specific, so it must be adjusted when run elsewhere.
DF <- read_excel(path = "/Users/salahkaf/Downloads/labW9.xlsx")
# Preview the first six rows
head(DF, n = 6L)
## # A tibble: 6 × 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI DiabetesPedigre…
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>            <dbl>
## 1           6     148            72            35       0  33.6            0.627
## 2           1      85            66            29       0  26.6            0.351
## 3           8     183            64             0       0  23.3            0.672
## 4           1      89            66            23      94  28.1            0.167
## 5           0     137            40            35     168  43.1            2.29 
## 6           5     116            74             0       0  25.6            0.201
## # … with 2 more variables: Age <dbl>, Outcome <dbl>
# Preview the last six rows
tail(DF, n = 6L)
## # A tibble: 6 × 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI DiabetesPedigre…
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>            <dbl>
## 1           9      89            62             0       0  22.5            0.142
## 2          10     101            76            48     180  32.9            0.171
## 3           2     122            70            27       0  36.8            0.34 
## 4           5     121            72            23     112  26.2            0.245
## 5           1     126            60             0       0  30.1            0.349
## 6           1      93            70            31       0  30.4            0.315
## # … with 2 more variables: Age <dbl>, Outcome <dbl>
# Report the size of the data as (rows, columns)
c(nrow(DF), ncol(DF))
## [1] 768   9
# Display the column names and types along with the first few values
utils::str(DF)
## tibble [768 × 9] (S3: tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
# Per-column summary statistics (min, quartiles, mean, max)
summary(object = DF)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

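Checking for any missing values (explicit NA only; the zeros that the summary above shows as the minimum of Glucose, BloodPressure, SkinThickness, Insulin and BMI are not counted, although they may represent unrecorded measurements)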

# Count the NA cells across the whole data set and print the total
cat(
  "The total number of missing values in the dataset is",
  sum(is.na(DF)),
  sep = " "
)
## The total number of missing values in the dataset is 0
# Count the NA cells separately for each column (named numeric vector)
vapply(DF, function(column) sum(is.na(column)), numeric(1))
##              Pregnancies                  Glucose            BloodPressure 
##                        0                        0                        0 
##            SkinThickness                  Insulin                      BMI 
##                        0                        0                        0 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0

Converting the "Outcome" column (the target variable) into a categorical factor so that classification can be applied

# Recode the numeric 0/1 target into a labelled factor so the task is treated
# as classification. The levels are listed explicitly (1 = "diabetic" first)
# instead of relying on as.factor()'s alphabetical ordering of the labels,
# which depends on the locale and would otherwise decide which class caret
# reports as the "positive" one. A direct level/label mapping also avoids
# using regex substitution (gsub) for value recoding.
DF$Outcome <- factor(
  DF$Outcome,
  levels = c(1, 0),
  labels = c("diabetic", "Non-diabetic")
)

Checking the final form of the DF

# Preview the first six rows again to confirm Outcome is now a factor
head(DF, n = 6L)
## # A tibble: 6 × 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI DiabetesPedigre…
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>            <dbl>
## 1           6     148            72            35       0  33.6            0.627
## 2           1      85            66            29       0  26.6            0.351
## 3           8     183            64             0       0  23.3            0.672
## 4           1      89            66            23      94  28.1            0.167
## 5           0     137            40            35     168  43.1            2.29 
## 6           5     116            74             0       0  25.6            0.201
## # … with 2 more variables: Age <dbl>, Outcome <fct>

Splitting the dataset into a 70% training subset and a 30% testing subset

# Fix the RNG seed so the random partition (and everything downstream of it)
# is reproducible between runs.
set.seed(123)

# Proportion of rows assigned to the training subset
split <- 0.7

# createDataPartition samples within each Outcome class; list = FALSE returns
# the selected row indices as a matrix rather than a list.
trainIndex <- createDataPartition(DF$Outcome, p = split, list = FALSE)

# Rows selected by the partition form the training set, the rest the test set
data_train <- DF[trainIndex, ]
data_test <- DF[-trainIndex, ]

Checking the training and testing subsets

# Size of the training subset as (rows, columns)
c(nrow(data_train), ncol(data_train))
## [1] 538   9
# Size of the testing subset as (rows, columns)
c(nrow(data_test), ncol(data_test))
## [1] 230   9
#### Both subsets keep all 9 columns; the 768 rows are split roughly 70% / 30% (538 / 230)

Apply cross validation

# Resampling scheme used while fitting: 10-fold cross-validation
train_control <- trainControl(
  method = "cv",
  number = 10
)

Train the model on the training data using k-nearest neighbours

# Seed the RNG so the cross-validation fold assignment is reproducible.
set.seed(123)

# Fit a k-nearest-neighbours classifier on all predictors. KNN is based on
# distances between observations, so the predictors are centred and scaled
# first; otherwise wide-range columns (e.g. Insulin, 0-846) would dominate
# narrow-range ones (e.g. DiabetesPedigreeFunction, 0.078-2.42).
# caret applies the same preprocessing inside every resample and at predict
# time.
model <- train(
  Outcome ~ .,
  data = data_train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = train_control
)

Plotting the model

# Plot the resampled performance across the candidate tuning values
plot(model, plotType = "scatter")

Predict the values of test data

# Predict the class label for every row of the held-out test subset
predictions <- predict(
  object = model,
  newdata = data_test,
  type = "raw"
)

# Evaluate the predictions against the true test labels with a confusion
# matrix. The positive class is named explicitly so that sensitivity,
# specificity and the predictive values always refer to "diabetic",
# regardless of the order of the factor levels.

cm <- confusionMatrix(
  data = predictions,
  reference = data_test$Outcome,
  positive = "diabetic"
)
cm
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     diabetic Non-diabetic
##   diabetic           43           26
##   Non-diabetic       37          124
##                                           
##                Accuracy : 0.7261          
##                  95% CI : (0.6636, 0.7826)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 0.01019         
##                                           
##                   Kappa : 0.3762          
##                                           
##  Mcnemar's Test P-Value : 0.20771         
##                                           
##             Sensitivity : 0.5375          
##             Specificity : 0.8267          
##          Pos Pred Value : 0.6232          
##          Neg Pred Value : 0.7702          
##              Prevalence : 0.3478          
##          Detection Rate : 0.1870          
##    Detection Prevalence : 0.3000          
##       Balanced Accuracy : 0.6821          
##                                           
##        'Positive' Class : diabetic        
##