LAB 10 - S2108437

Import the necessary libraries

library('tidyverse') #For data frame manipulation and plotting

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library('caret') #For machine learning

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library('readxl') #For Excel reading

Read the Dataset and do descriptive analysis

# Load the lab workbook into a tibble, then preview its first six rows.
# NOTE(review): the absolute path is specific to one machine; confirm it
# before running elsewhere.
DF <- read_excel(path = "/Users/salahkaf/Downloads/labW9.xlsx")
head(DF, n = 6)

## # A tibble: 6 × 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI DiabetesPedigre…
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>            <dbl>
## 1           6     148            72            35       0  33.6            0.627
## 2           1      85            66            29       0  26.6            0.351
## 3           8     183            64             0       0  23.3            0.672
## 4           1      89            66            23      94  28.1            0.167
## 5           0     137            40            35     168  43.1            2.29 
## 6           5     116            74             0       0  25.6            0.201
## # … with 2 more variables: Age <dbl>, Outcome <dbl>

# Preview the final six rows of the dataset.
tail(DF, n = 6)

## # A tibble: 6 × 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI DiabetesPedigre…
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>            <dbl>
## 1           9      89            62             0       0  22.5            0.142
## 2          10     101            76            48     180  32.9            0.171
## 3           2     122            70            27       0  36.8            0.34 
## 4           5     121            72            23     112  26.2            0.245
## 5           1     126            60             0       0  30.1            0.349
## 6           1      93            70            31       0  30.4            0.315
## # … with 2 more variables: Age <dbl>, Outcome <dbl>

# Report the dataset size as (rows, columns).
c(nrow(DF), ncol(DF))

## [1] 768   9

# Display the column names, types and leading values of the dataset.
str(object = DF)

## tibble [768 × 9] (S3: tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...

# Per-column descriptive statistics (min, quartiles, mean, max).
summary(object = DF)

##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

Checking for any missing values

# Count every NA cell across the whole data frame and report the total.
cat(
  "The total number of missing values in the dataset is",
  sum(is.na(DF)),
  sep = " "
)

## The total number of missing values in the dataset is 0

# Break the NA count down by column name.
# NOTE(review): summary(DF) reports a minimum of 0 for Glucose, BloodPressure
# and BMI; is.na() does not flag those. Presumably the zeros encode missing
# measurements -- confirm against the data source.
colSums(x = is.na(DF), na.rm = FALSE)

##              Pregnancies                  Glucose            BloodPressure 
##                        0                        0                        0 
##            SkinThickness                  Insulin                      BMI 
##                        0                        0                        0 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0

Changing the “outcome” column (Target value) into categorical data to apply classification

# Recode the numeric target (1 = diabetic, 0 = Non-diabetic) as a labelled
# factor so that caret treats the problem as classification.
# The levels are listed explicitly rather than derived with gsub() +
# as.factor(): regex substitution on digits is fragile, and as.factor() sorts
# labels by locale collation, so the first level could change between
# machines. Here "diabetic" is always the first level.
DF$Outcome <- factor(
  DF$Outcome,
  levels = c(1, 0),
  labels = c("diabetic", "Non-diabetic")
)

Checking the final form of the DF

# Confirm that Outcome is now a factor by previewing the first six rows.
head(DF, n = 6)

## # A tibble: 6 × 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI DiabetesPedigre…
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>            <dbl>
## 1           6     148            72            35       0  33.6            0.627
## 2           1      85            66            29       0  26.6            0.351
## 3           8     183            64             0       0  23.3            0.672
## 4           1      89            66            23      94  28.1            0.167
## 5           0     137            40            35     168  43.1            2.29 
## 6           5     116            74             0       0  25.6            0.201
## # … with 2 more variables: Age <dbl>, Outcome <fct>

Splitting the dataset into 70/30

# Fix the RNG seed so the partition (and the resampling that follows) is
# reproducible from run to run.
set.seed(123)

# Proportion of rows assigned to the training subset.
split <- 0.7

# createDataPartition() samples within each Outcome class; list = FALSE
# returns the row indices as a matrix rather than a list.
trainIndex <- createDataPartition(DF$Outcome, p = split, list = FALSE)
data_train <- DF[trainIndex, ]  # rows selected for training
data_test <- DF[-trainIndex, ]  # remaining rows held out for testing

Checking the training and testing subsets

# Size of the training subset as (rows, columns).
c(nrow(data_train), ncol(data_train))

## [1] 538   9

# Size of the testing subset as (rows, columns).
c(nrow(data_test), ncol(data_test))

## [1] 230   9

#### Both subsets keep all 9 columns; the rows are divided roughly 70% / 30% (538 training, 230 testing)

Apply cross validation

# Resampling scheme for model tuning: 10-fold cross-validation.
train_control <- trainControl(
  method = "cv",
  number = 10
)

Train the model using k-nearest neighbors

# Fit a k-nearest-neighbours classifier on the training subset, tuning k with
# the cross-validation scheme in train_control.
# KNN is distance-based, so predictors are centred and scaled first;
# otherwise wide-range columns (e.g. Insulin, 0-846) would dominate
# narrow-range ones (e.g. DiabetesPedigreeFunction, 0.08-2.42).
model <- train(
  Outcome ~ .,
  data = data_train,
  method = "knn",
  trControl = train_control,
  preProcess = c("center", "scale")
)

Plotting the model

# Plot the resampled performance of the fitted model across tuning values.
plot(x = model)

Predict the values of test data

# Predict the class of each held-out test row with the trained model.
predictions <- predict(
  object = model,
  newdata = data_test
)

# Evaluate the test-set predictions with a confusion matrix.
# `positive` is set explicitly so that sensitivity, specificity, etc. are
# always reported for the "diabetic" class rather than for whichever level
# happens to come first in the Outcome factor.
cm <- confusionMatrix(
  data = predictions,
  reference = data_test$Outcome,
  positive = "diabetic"
)
cm

## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     diabetic Non-diabetic
##   diabetic           43           26
##   Non-diabetic       37          124
##                                           
##                Accuracy : 0.7261          
##                  95% CI : (0.6636, 0.7826)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 0.01019         
##                                           
##                   Kappa : 0.3762          
##                                           
##  Mcnemar's Test P-Value : 0.20771         
##                                           
##             Sensitivity : 0.5375          
##             Specificity : 0.8267          
##          Pos Pred Value : 0.6232          
##          Neg Pred Value : 0.7702          
##              Prevalence : 0.3478          
##          Detection Rate : 0.1870          
##    Detection Prevalence : 0.3000          
##       Balanced Accuracy : 0.6821          
##                                           
##        'Positive' Class : diabetic        
##

LAB 10 - S2108437

Salah

1/5/2022

Import the necessary libraries

Read the Dataset and do descriptive analysis

Checking for any missing values

Changing the “outcome” column (Target value) into categorical data to apply classification

Checking the final form of the DF

Splitting the dataset into 70/30

Checking the training and testing subsets

Apply cross validation

Train the model using k-nearest neighbors

Plotting the model

Predict the values of test data