library('tidyverse') #For data frame manipulation and plotting
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library('caret') #For machine learning
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library('readxl') #For Excel reading
# Load the lab workbook; the result is stored in DF (printed below as a tibble).
DF <- read_excel(path = "/Users/salahkaf/Downloads/labW9.xlsx")
# Preview the first six rows.
head(DF, n = 6L)
## # A tibble: 6 × 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigre…
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 148 72 35 0 33.6 0.627
## 2 1 85 66 29 0 26.6 0.351
## 3 8 183 64 0 0 23.3 0.672
## 4 1 89 66 23 94 28.1 0.167
## 5 0 137 40 35 168 43.1 2.29
## 6 5 116 74 0 0 25.6 0.201
## # … with 2 more variables: Age <dbl>, Outcome <dbl>
# Preview the last six rows.
tail(DF, n = 6L)
## # A tibble: 6 × 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigre…
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9 89 62 0 0 22.5 0.142
## 2 10 101 76 48 180 32.9 0.171
## 3 2 122 70 27 0 36.8 0.34
## 4 5 121 72 23 112 26.2 0.245
## 5 1 126 60 0 0 30.1 0.349
## 6 1 93 70 31 0 30.4 0.315
## # … with 2 more variables: Age <dbl>, Outcome <dbl>
# Number of rows, then number of columns.
c(nrow(DF), ncol(DF))
## [1] 768 9
# Compact overview of each column's type and leading values.
str(object = DF)
## tibble [768 × 9] (S3: tbl_df/tbl/data.frame)
## $ Pregnancies : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
# Per-column descriptive statistics (min, quartiles, mean, max).
summary(object = DF)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# Total number of missing (NA) values in the data set.
# NOTE(review): the summary() output in this file shows a minimum of 0 for
# Glucose, BloodPressure, SkinThickness, Insulin and BMI. Those zeros may be
# placeholders for unrecorded measurements, which is.na() cannot detect --
# confirm against the data source before trusting the counts below.
# The message ends with a newline so that the next output starts on its own
# line when the script is run outside of knitr.
cat(sprintf(
  "The total number of missing values in the dataset is %d\n",
  sum(is.na(DF))
))
## The total number of missing values in the dataset is 0
# Number of missing (NA) values per column.
colSums(is.na(DF))
## Pregnancies Glucose BloodPressure
## 0 0 0
## SkinThickness Insulin BMI
## 0 0 0
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
# Recode the numeric 0/1 Outcome column into a labelled factor so that caret
# treats the task as classification.
#
# factor() with explicit levels/labels is used instead of chained gsub() +
# as.factor():
#   * gsub() does regex substring replacement, so it would also rewrite digits
#     inside any other value;
#   * as.factor() orders levels by locale-dependent sorting, so the first
#     level (which caret uses as the default "positive" class) could differ
#     between machines. Listing 1 first pins "diabetic" as the first level.
# The is.factor() guard keeps the block safe to re-run once Outcome has
# already been converted.
if (!is.factor(DF$Outcome)) {
  # Fail loudly on unexpected codes rather than silently turning them into NA.
  stopifnot(all(DF$Outcome %in% c(0, 1, NA)))
  DF$Outcome <- factor(
    DF$Outcome,
    levels = c(1, 0),
    labels = c("diabetic", "Non-diabetic")
  )
}
head(DF)
## # A tibble: 6 × 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigre…
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 148 72 35 0 33.6 0.627
## 2 1 85 66 29 0 26.6 0.351
## 3 8 183 64 0 0 23.3 0.672
## 4 1 89 66 23 94 28.1 0.167
## 5 0 137 40 35 168 43.1 2.29
## 6 5 116 74 0 0 25.6 0.201
## # … with 2 more variables: Age <dbl>, Outcome <fct>
# Split DF into training and test partitions.
# The seed is fixed so that the random partition (and every result derived
# from it) is reproducible between runs.
set.seed(123)
# Proportion of rows assigned to the training partition.
split <- 0.7
# Randomly pick the training rows based on Outcome; with list = FALSE the
# indices come back as a one-column matrix rather than a list.
trainIndex <- createDataPartition(DF$Outcome, p = split, list = FALSE)
# Take the matrix column as a plain integer vector before subsetting the
# tibble, so row selection does not rely on matrix-index handling.
data_train <- DF[trainIndex[, 1], ]
data_test <- DF[-trainIndex[, 1], ]
dim(data_train)
## [1] 538 9
dim(data_test)
## [1] 230 9
#### Both partitions keep all 9 columns; the 768 rows are split roughly 70% / 30% (538 training, 230 test)
# Fit a k-nearest-neighbours classifier with 10-fold cross-validation.
train_control <- trainControl(method = "cv", number = 10) # 10 CV folds
# Fix the seed so the cross-validation fold assignment is reproducible.
set.seed(123)
# kNN is distance-based and the predictors are on very different scales
# (e.g. Insulin reaches 846 while DiabetesPedigreeFunction stays below 2.5 in
# the summary above), so every predictor is centred and scaled; otherwise the
# large-range columns dominate the distance. caret stores the preprocessing
# with the model and re-applies it inside predict().
model <- train(
  Outcome ~ .,
  data = data_train,
  method = "knn",
  trControl = train_control,
  preProcess = c("center", "scale")
)
# Cross-validated performance for each candidate tuning value.
plot(model)
# Predict the class of each row in the held-out test partition.
predictions <- predict(model, newdata = data_test)
# Evaluate the test-set predictions with a confusion matrix.
# The positive class is named explicitly so that sensitivity, specificity and
# the predictive values always refer to "diabetic", instead of depending on
# whichever level happens to come first in the Outcome factor.
cm <- confusionMatrix(
  data = predictions,
  reference = data_test$Outcome,
  positive = "diabetic"
)
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction diabetic Non-diabetic
## diabetic 43 26
## Non-diabetic 37 124
##
## Accuracy : 0.7261
## 95% CI : (0.6636, 0.7826)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 0.01019
##
## Kappa : 0.3762
##
## Mcnemar's Test P-Value : 0.20771
##
## Sensitivity : 0.5375
## Specificity : 0.8267
## Pos Pred Value : 0.6232
## Neg Pred Value : 0.7702
## Prevalence : 0.3478
## Detection Rate : 0.1870
## Detection Prevalence : 0.3000
## Balanced Accuracy : 0.6821
##
## 'Positive' Class : diabetic
##