Machine Learning for Data Science
Import the packages
# Excel file reading package
if (!require('readxl')) install.packages('readxl'); library('readxl')
# Data cleaning packages
if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
if (!require('tidyr')) install.packages('tidyr'); library('tidyr')
if (!require('lubridate')) install.packages('lubridate'); library('lubridate')
# Plotting packages
if (!require('GGally')) install.packages('GGally'); library('GGally')
if (!require('ggplot2')) install.packages('ggplot2'); library('ggplot2')
# Machine learning packages
if (!require('caret')) install.packages('caret'); library('caret')
if (!require('klaR')) install.packages('klaR'); library('klaR')
if (!require('randomForest')) install.packages('randomForest'); library('randomForest')
Loading the data from the Excel workbook into an R data frame.
df_diabetes <- data.frame(read_excel(path="./labW9.xlsx", sheet="diabetes"))
Data exploration & cleaning
Check underlying classes, structures, and dimensions
class(df_diabetes)## [1] "data.frame"
str(df_diabetes)## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : num 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : num 1 0 1 0 1 0 1 0 1 1 ...
dim(df_diabetes)## [1] 768 9
Convert the Outcome column to a factor so it can serve as the classification target
# Turn the numeric 0/1 Outcome column into a labelled factor ("Neg"/"Pos")
# with a single mutate() call.
df_diabetes <- mutate(
  df_diabetes,
  Outcome = recode_factor(as.factor(Outcome), "0" = "Neg", "1" = "Pos")
)
str(df_diabetes)## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : num 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : Factor w/ 2 levels "Neg","Pos": 2 1 2 1 2 1 2 1 2 2 ...
Check first 5 rows of the dataset
head(df_diabetes, 5)## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 Pos
## 2 0.351 31 Neg
## 3 0.672 32 Pos
## 4 0.167 21 Neg
## 5 2.288 33 Pos
Check last 5 rows of the dataset
tail(df_diabetes, 5)## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 764 10 101 76 48 180 32.9
## 765 2 122 70 27 0 36.8
## 766 5 121 72 23 112 26.2
## 767 1 126 60 0 0 30.1
## 768 1 93 70 31 0 30.4
## DiabetesPedigreeFunction Age Outcome
## 764 0.171 63 Neg
## 765 0.340 27 Neg
## 766 0.245 30 Neg
## 767 0.349 47 Pos
## 768 0.315 23 Neg
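Check if there is any NA data in the dataset (this only counts explicit NA values; the zeros visible in columns such as BloodPressure, SkinThickness, Insulin and BMI are not counted)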
sum(is.na(df_diabetes))## [1] 0
Plotting the dataset with scatterplot matrix
Machine Learning
Setting the split percentage for training & testing, targeted on the Outcome column
# Fix the RNG seed so the train/test partition (and every later random
# resampling step in this script) can be reproduced from run to run.
set.seed(42)
# Fraction of rows assigned to the training subset; the rest is held out.
split <- 0.70
# Partition on the Outcome column; list = FALSE returns the selected row
# indices in a form that can be used directly for row subsetting.
train_index <- createDataPartition(df_diabetes$Outcome, p = split, list = FALSE)
data_train <- df_diabetes[train_index, ]
data_test <- df_diabetes[-train_index,]
Checking and asserting the dimensions of the subsets
dim(data_train)## [1] 538 9
print(dim(data_train)/dim(df_diabetes))## [1] 0.7005208 1.0000000
dim(data_test)## [1] 230 9
print(dim(data_test)/dim(df_diabetes))## [1] 0.2994792 1.0000000
Cross-validating the models against the dataset
# Randomized resampling fold count.
# k-fold cross-validation needs at least two folds, so the draw starts at 2
# (the previous 1:100 range could select a single, unusable fold).
train_number <- sample(2:100, 1)
train_control <- trainControl(method="cv", number=train_number)
K-Nearest Neighbors cross-validation
# k-nearest-neighbours classifier for Outcome, using every other column as a
# predictor and resampled according to train_control.
# NOTE(review): this fits on the full df_diabetes, which still contains the
# rows held out in data_test -- confirm that is intended for the comparison.
knn_model <- train(
  Outcome ~ .,
  data = df_diabetes,
  method = "knn",
  trControl = train_control
)
knn_model## k-Nearest Neighbors
##
## 768 samples
## 8 predictor
## 2 classes: 'Neg', 'Pos'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 615, 614, 615, 614, 614
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7186232 0.3579085
## 7 0.7199813 0.3650231
## 9 0.7278245 0.3827447
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
Random Forest cross-validation
# Random forest classifier for Outcome, using every other column as a
# predictor and resampled according to train_control.
# NOTE(review): this fits on the full df_diabetes, which still contains the
# rows held out in data_test -- confirm that is intended for the comparison.
rf_model <- train(
  Outcome ~ .,
  data = df_diabetes,
  method = "rf",
  trControl = train_control
)
rf_model## Random Forest
##
## 768 samples
## 8 predictor
## 2 classes: 'Neg', 'Pos'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 614, 615, 615, 614, 614
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7747390 0.4881509
## 5 0.7551481 0.4488195
## 8 0.7551736 0.4527435
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
Comparing the accuracy of cross validated models
plot(knn_model)
plot(rf_model)
Picking the Random Forest model because of its higher accuracy
Pre-checking Out of Bag (OOB) Score for the RF model
rf_model$finalModel##
## Call:
## randomForest(x = x, y = y, mtry = min(param$mtry, ncol(x)))
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 22.92%
## Confusion matrix:
## Neg Pos class.error
## Neg 433 67 0.1340000
## Pos 109 159 0.4067164
plot(rf_model$finalModel)
legend("right", colnames(rf_model$finalModel$err.rate),col=1:3, cex=1, fill=1:3)
Training the final Random Forest model with data_train
# Final random forest, fitted on the training subset only so that data_test
# stays unseen; resampling follows the shared train_control settings.
rf_final_model <- train(
  Outcome ~ .,
  data = data_train,
  method = "rf",
  trControl = train_control
)
rf_final_model## Random Forest
##
## 538 samples
## 8 predictor
## 2 classes: 'Neg', 'Pos'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 431, 430, 430, 431, 430
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7675839 0.4692797
## 5 0.7564036 0.4473214
## 8 0.7507961 0.4370385
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
Plotting the Random Forest final model
plot(rf_final_model)
Using the Random Forest model to perform prediction
# Predict Outcome for the held-out rows with the final model.
rf_prediction <- predict(
  object = rf_final_model,
  newdata = data_test
)
table(rf_prediction)## rf_prediction
## Neg Pos
## 157 73
Evaluate model using confusion matrix
# Cross-tabulate predictions (rows) against the held-out labels (columns).
rf_confusion_matrix <- table(rf_prediction, data_test$Outcome)
# Report the class-wise statistics for the "Pos" class (Outcome == 1).
# Without `positive`, the first factor level ("Neg") is treated as the
# positive class, which swaps sensitivity/specificity and the predictive values.
confusionMatrix(rf_confusion_matrix, positive = "Pos")## Confusion Matrix and Statistics
##
##
## rf_prediction Neg Pos
## Neg 126 31
## Pos 24 49
##
## Accuracy : 0.7609
## 95% CI : (0.7004, 0.8145)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 0.000245
##
## Kappa : 0.4619
##
## Mcnemar's Test P-Value : 0.418492
##
## Sensitivity : 0.6125
## Specificity : 0.8400
## Pos Pred Value : 0.6712
## Neg Pred Value : 0.8025
## Prevalence : 0.3478
## Detection Rate : 0.2130
## Detection Prevalence : 0.3174
## Balanced Accuracy : 0.7263
##
## 'Positive' Class : Pos
##