Machine Learning for Data Science

Import the packages

# Excel file reading package
if (!require('readxl')) install.packages('readxl'); library('readxl')
# Data cleaning packages
if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
if (!require('tidyr')) install.packages('tidyr'); library('tidyr')
if (!require('lubridate')) install.packages('lubridate'); library('lubridate')
# Plotting packages
if (!require('GGally')) install.packages('GGally'); library('GGally')
if (!require('ggplot2')) install.packages('ggplot2'); library('ggplot2')
# Machine learning packages
if (!require('caret')) install.packages('caret'); library('caret')
if (!require('klaR')) install.packages('klaR'); library('klaR')
if (!require('randomForest')) install.packages('randomForest'); library('randomForest')

Load the data from the Excel workbook into an R data frame.

# Read the "diabetes" sheet of the workbook and convert the result to a base data.frame
df_diabetes <- read_excel(path = "./labW9.xlsx", sheet = "diabetes") %>%
  data.frame()

Data exploration & cleaning

Check underlying classes, structures, and dimensions

class(df_diabetes)
## [1] "data.frame"
str(df_diabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : num  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num  1 0 1 0 1 0 1 0 1 1 ...
dim(df_diabetes)
## [1] 768   9

Convert the Outcome column to a factor so it can be used as the classification target

# Turn the 0/1 Outcome column into a factor whose levels are labelled "Neg"/"Pos"
df_diabetes <- mutate(
  df_diabetes,
  Outcome = recode_factor(as.factor(Outcome), "0" = "Neg", "1" = "Pos")
)
str(df_diabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : num  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : Factor w/ 2 levels "Neg","Pos": 2 1 2 1 2 1 2 1 2 2 ...

Check first 5 rows of the dataset

head(df_diabetes, 5)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50     Pos
## 2                    0.351  31     Neg
## 3                    0.672  32     Pos
## 4                    0.167  21     Neg
## 5                    2.288  33     Pos

Check last 5 rows of the dataset

tail(df_diabetes, 5)
##     Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 764          10     101            76            48     180 32.9
## 765           2     122            70            27       0 36.8
## 766           5     121            72            23     112 26.2
## 767           1     126            60             0       0 30.1
## 768           1      93            70            31       0 30.4
##     DiabetesPedigreeFunction Age Outcome
## 764                    0.171  63     Neg
## 765                    0.340  27     Neg
## 766                    0.245  30     Neg
## 767                    0.349  47     Pos
## 768                    0.315  23     Neg

Check if there is any NA data in the dataset

sum(is.na(df_diabetes))
## [1] 0

Plotting the dataset with scatterplot matrix

Machine Learning

Set the train/test split proportion, with the partition targeted on the Outcome column

# Fix the RNG seed so that the random partition below (and the resampling done
# by later train() calls) can be reproduced from run to run
set.seed(2024)
# Proportion of rows assigned to the training subset
split <- 0.70
# Row indices of the training subset, drawn with respect to the Outcome column
train_index <- createDataPartition(df_diabetes$Outcome, p = split, list = FALSE)
# Training rows are the selected indices; test rows are everything else
data_train <- df_diabetes[train_index, ]
data_test <- df_diabetes[-train_index, ]

Checking and asserting the dimensions of the subsets

dim(data_train)
## [1] 538   9
print(dim(data_train)/dim(df_diabetes))
## [1] 0.7005208 1.0000000
dim(data_test)
## [1] 230   9
print(dim(data_test)/dim(df_diabetes))
## [1] 0.2994792 1.0000000

Cross-validating the candidate models against the dataset

# Fixed number of cross-validation folds. Drawing the count at random from
# 1:100 gave every run a different resampling scheme, and a draw of 1 would
# leave no rows to train on inside the single fold.
train_number <- 5
# Resampling specification shared by all train() calls below
train_control <- trainControl(method = "cv", number = train_number)

K-Nearest Neighbors Cross Validating

# Tune a k-nearest-neighbours classifier on the full dataset using the shared
# cross-validation setup, then print the resampling summary.
# NOTE(review): no preProcess is requested, so distances are computed on the
# raw predictor scales; consider centring/scaling before relying on this model.
knn_model <- train(
  Outcome ~ .,
  method = "knn",
  trControl = train_control,
  data = df_diabetes
)
knn_model
## k-Nearest Neighbors 
## 
## 768 samples
##   8 predictor
##   2 classes: 'Neg', 'Pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 615, 614, 615, 614, 614 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.7186232  0.3579085
##   7  0.7199813  0.3650231
##   9  0.7278245  0.3827447
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

Random Forest Cross Validating

# Tune a random forest on the full dataset using the shared cross-validation
# setup, then print the resampling summary.
rf_model <- train(
  Outcome ~ .,
  method = "rf",
  trControl = train_control,
  data = df_diabetes
)
rf_model
## Random Forest 
## 
## 768 samples
##   8 predictor
##   2 classes: 'Neg', 'Pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 614, 615, 615, 614, 614 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.7747390  0.4881509
##   5     0.7551481  0.4488195
##   8     0.7551736  0.4527435
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

Comparing the accuracy of cross validated models

plot(knn_model) 

plot(rf_model)

Picking the Random Forest model because of its higher cross-validated accuracy (0.775 at mtry = 2 vs. 0.728 for k-NN at k = 9)

Pre-checking Out of Bag (OOB) Score for the RF model

rf_model$finalModel
## 
## Call:
##  randomForest(x = x, y = y, mtry = min(param$mtry, ncol(x))) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 22.92%
## Confusion matrix:
##     Neg Pos class.error
## Neg 433  67   0.1340000
## Pos 109 159   0.4067164
# Draw the error-rate curves stored in the fitted forest
plot(rf_model$finalModel)
# Build one legend entry per err.rate column instead of hard-coding three, and
# use line keys (colour + line type) rather than filled boxes. The sequences
# follow matplot()'s default cycling (col = 1:6, lty = 1:5), which is what the
# curves above are drawn with.
err_labels <- colnames(rf_model$finalModel$err.rate)
legend(
  "right",
  legend = err_labels,
  col = rep_len(1:6, length(err_labels)),
  lty = rep_len(1:5, length(err_labels)),
  cex = 1
)

Training the final Random Forest model with data_train

# Fit the chosen random forest on the training subset only, reusing the shared
# cross-validation setup, then print the resampling summary.
rf_final_model <- train(
  Outcome ~ .,
  method = "rf",
  trControl = train_control,
  data = data_train
)
rf_final_model
## Random Forest 
## 
## 538 samples
##   8 predictor
##   2 classes: 'Neg', 'Pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 431, 430, 430, 431, 430 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.7675839  0.4692797
##   5     0.7564036  0.4473214
##   8     0.7507961  0.4370385
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

Plotting the Random Forest final model

plot(rf_final_model)

Using the Random Forest model to perform prediction

# Predicted class for every row of the held-out test subset
rf_prediction <- predict(rf_final_model, newdata = data_test)
# Number of test rows assigned to each class
table(rf_prediction)
## rf_prediction
## Neg Pos 
## 157  73

Evaluate model using confusion matrix

# Cross-tabulate predictions (rows) against the true test labels (columns)
rf_confusion_matrix <- table(rf_prediction, data_test$Outcome)
# Report the statistics with "Pos" as the class of interest. By default caret
# treats the first factor level ("Neg") as the positive class, so sensitivity
# and specificity would otherwise describe detection of the negative class.
# NOTE: the printed output that follows was generated with that default and
# needs to be regenerated to reflect positive = "Pos".
confusionMatrix(rf_confusion_matrix, positive = "Pos")
## Confusion Matrix and Statistics
## 
##              
## rf_prediction Neg Pos
##           Neg 126  31
##           Pos  24  49
##                                           
##                Accuracy : 0.7609          
##                  95% CI : (0.7004, 0.8145)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 0.000245        
##                                           
##                   Kappa : 0.4619          
##                                           
##  Mcnemar's Test P-Value : 0.418492        
##                                           
##             Sensitivity : 0.8400          
##             Specificity : 0.6125          
##          Pos Pred Value : 0.8025          
##          Neg Pred Value : 0.6712          
##              Prevalence : 0.6522          
##          Detection Rate : 0.5478          
##    Detection Prevalence : 0.6826          
##       Balanced Accuracy : 0.7263          
##                                           
##        'Positive' Class : Neg             
## 

End of Lab