# Library calls assumed from the loading messages below: corrplot, plus caret
# (which pulls in lattice and ggplot2); tidyr and dplyr supply drop_na() and %>%.
library(corrplot)
library(caret)
library(tidyr)
library(dplyr)
## corrplot 0.88 loaded
## Loading required package: lattice
## Loading required package: ggplot2
penguins <- read.csv('penguins.csv', fileEncoding = 'UTF-8-BOM', header = TRUE)
dim(penguins)
## [1] 344 7
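Before dropping anything, it helps to see where the missing values sit. A small check, not in the original, assuming missing entries were read in as NA:
colSums(is.na(penguins))  # count of missing values per column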
Because KNN cannot handle missing data in its modeling, we have to drop all rows with missing values. Also, sex should contain only two values, so we collapse what looks like a data-entry mistake, a spurious third category, down to two.
The island variable is not of interest, so we drop it as well.
data <- penguins
data2 <- data %>% drop_na()
data2$island <- NULL
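The collapse of the spurious third sex category mentioned above is not shown in the original code. A minimal sketch, assuming (hypothetically) that the stray entry is any value other than "female" or "male":
data2$sex[!(data2$sex %in% c("female", "male"))] <- NA  # treat the stray category as missing
data2 <- drop_na(data2)                                 # drop it with the other incomplete rows
data2$sex <- droplevels(factor(data2$sex))              # keep exactly two levels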
unique(data$species)
## [1] Adelie Chinstrap Gentoo
## Levels: Adelie Chinstrap Gentoo
colors <- c("#bfe8ff", "#72adcf", "#0075a4")
species_names <- unique(data$species)  # species labels to pair with the colors above; avoids masking base::names
sprintf("We have explored the data now lets start with the K-nearest neighbor")
## [1] "We have explored the data now lets start with the K-nearest neighbor"
data_knn <- data2
sprintf("Tranforming Gender data into 0 and 1")
## [1] "Tranforming Gender data into 0 and 1"
data_knn$sex <- as.numeric(data2$sex)-1
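A quick cross-tabulation, added here as a sanity check, confirms which label received which code (with alphabetical factor levels, female maps to 0 and male to 1):
table(original = data2$sex, encoded = data_knn$sex)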
sprintf("Diving dataset into Trainging and Test Dataset")
## [1] "Diving dataset into Trainging and Test Dataset"
set.seed(8042020)#seed for division
divison <- createDataPartition(data_knn$species, p = .8, list = FALSE)
data_training <- data_knn[divison,]
data_testing <- data_knn[-divison,]
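createDataPartition stratifies by the outcome, so the split should preserve the species proportions; a quick check (not in the original):
prop.table(table(data_training$species))  # class shares in the training set
prop.table(table(data_testing$species))   # should closely match the shares above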
sprintf("Lets perform 10-fold cross-validation")
## [1] "Lets perform 10-fold cross-validation"
knn.fit <- train(species ~ .,
                 method = "knn",
                 tuneGrid = expand.grid(k = 1:10),
                 trControl = trainControl(method = "cv", number = 10),
                 preProcess = c("center", "scale"),
                 metric = "Accuracy",
                 data = data_training)
knn.fit
## k-Nearest Neighbors
##
## 275 samples
## 5 predictor
## 3 classes: 'Adelie', 'Chinstrap', 'Gentoo'
##
## Pre-processing: centered (5), scaled (5)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 247, 248, 248, 247, 248, 248, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.9962963 0.9942184
## 2 0.9928571 0.9887550
## 3 0.9964286 0.9943775
## 4 0.9925926 0.9884368
## 5 0.9927249 0.9885959
## 6 0.9964286 0.9943775
## 7 0.9890212 0.9824570
## 8 0.9927249 0.9884172
## 9 0.9927249 0.9884172
## 10 0.9855729 0.9769749
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 6.
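The winning k and its resampled performance can also be extracted programmatically instead of read off the printout; a small sketch:
knn.fit$bestTune                                  # one-row data frame with the selected k
subset(knn.fit$results, k == knn.fit$bestTune$k)  # its cross-validated accuracy and kappa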
sprintf("Lets do the prediction")
## [1] "Lets do the prediction"
Prediction <- predict(knn.fit, newdata =data_testing)
sprintf("Confusion Matrix")
## [1] "Confusion Matrix"
confusionMatrix(Prediction, data_testing$species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 30 0 0
## Chinstrap 0 13 0
## Gentoo 0 0 24
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9464, 1)
## No Information Rate : 0.4478
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 1.0000 1.000 1.0000
## Specificity 1.0000 1.000 1.0000
## Pos Pred Value 1.0000 1.000 1.0000
## Neg Pred Value 1.0000 1.000 1.0000
## Prevalence 0.4478 0.194 0.3582
## Detection Rate 0.4478 0.194 0.3582
## Detection Prevalence 0.4478 0.194 0.3582
## Balanced Accuracy 1.0000 1.000 1.0000
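For reporting, individual numbers can be pulled from the object returned by confusionMatrix() rather than copied from the printout; a small sketch:
cm <- confusionMatrix(Prediction, data_testing$species)
cm$overall["Accuracy"]       # overall accuracy as a single number
cm$byClass[, "Sensitivity"]  # per-class sensitivity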
fitControl <- trainControl(method = "repeatedcv", # 10-fold CV, repeated 10 times
                           number = 10,
                           repeats = 10,
                           classProbs = TRUE,
                           summaryFunction = multiClassSummary)
knncv <- train(species ~ .,
               data = data_training,
               method = "knn",
               trControl = fitControl,
               metric = "Accuracy")
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
print(knncv)
## k-Nearest Neighbors
##
## 275 samples
## 5 predictor
## 3 classes: 'Adelie', 'Chinstrap', 'Gentoo'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 248, 248, 249, 247, 247, 247, ...
## Resampling results across tuning parameters:
##
## k logLoss AUC prAUC Accuracy Kappa Mean_F1
## 5 1.5057224 0.8748407 0.3770200 0.7535008 0.5977267 0.6912203
## 7 1.2005633 0.8723551 0.4320604 0.7636067 0.6096352 0.6958227
## 9 0.9110897 0.8672196 0.4353256 0.7654045 0.6095385 0.6939984
## Mean_Sensitivity Mean_Specificity Mean_Pos_Pred_Value Mean_Neg_Pred_Value
## 0.6749558 0.8641537 0.7216953 0.8835251
## 0.6748319 0.8666055 0.7650131 0.8915912
## 0.6691011 0.8656607 0.8033904 0.8955893
## Mean_Precision Mean_Recall Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.7216953 0.6749558 0.2511669 0.7695548
## 0.7650131 0.6748319 0.2545356 0.7707187
## 0.8033904 0.6691011 0.2551348 0.7673809
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
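This repeated-CV run scores far lower (about 0.76 versus 0.99), most plausibly because it omits the center/scale preprocessing: KNN is distance-based, so unscaled features with large ranges dominate the distance. A sketch of the same run with scaling restored (an assumption to test; results not shown):
knncv_scaled <- train(species ~ .,
                      data = data_training,
                      method = "knn",
                      preProcess = c("center", "scale"),  # restore the scaling step
                      trControl = fitControl,
                      metric = "Accuracy")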
# Plot the KNN accuracy across k from the 10-fold cross-validation
colors <- c("Accuracy" = "red")
knn.fit$results %>%
  ggplot(aes(k)) +
  geom_line(aes(y = Accuracy, col = "Accuracy")) +
  geom_point(aes(y = Accuracy)) +
  labs(title = "k-NN with 10-fold Cross-Validation",
       subtitle = "Accuracy",
       color = "Legend") +
  scale_color_manual(values = colors) +
  theme(legend.position = "top",
        axis.title.y = element_blank())
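caret also ships a one-line equivalent of this plot:
plot(knn.fit)  # built-in accuracy-vs-k plot for a train object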
The KNN model can predict the penguin species with moderate to high accuracy, depending on the setup. In the centered-and-scaled grid search, accuracy ranges from about 0.986 to 0.996 depending on K: it is highest at K = 6 (0.996, tied with K = 3) and tends to drop as K grows beyond 6.
Although the automatic grid search in the repeated-CV run concluded that the final value for the model was K = 9, K = 6 gives equally satisfying results, and that run's much lower overall accuracy most likely reflects its missing preprocessing rather than the choice of K. To save computation and avoid unexpected errors, I would settle on K = 6 and stop there.
More nearest neighbors do not turn out to be merrier (that is, more accurate).