library(cvms) #plot confusion matrix
library(broom) # tidy()
library(tibble) # tibble()
library(caret) #confusionMatrix
library(class) #classification algrithms, knn
# Load the data set from Kid.csv and print the type of every column.
df <- read.csv(file = "Kid.csv")
str(object = df)
## 'data.frame': 673 obs. of 17 variables:
## $ Buy : int 0 1 0 1 0 0 0 0 0 0 ...
## $ Income : int 24000 75000 46000 70000 43000 24000 26000 38000 39000 49000 ...
## $ Is.Female : int 1 1 1 0 1 1 1 1 1 0 ...
## $ Is.Married : int 0 1 1 1 0 1 1 1 0 1 ...
## $ Has.College : int 1 1 0 0 0 0 1 0 1 0 ...
## $ Is.Professional : int 1 1 0 1 0 0 0 0 1 0 ...
## $ Is.Retired : int 0 0 0 0 0 0 1 1 0 1 ...
## $ Unemployed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Residence.Length: int 26 15 36 55 27 41 20 8 17 31 ...
## $ Dual.Income : int 0 1 1 0 0 0 0 0 0 0 ...
## $ Minors : int 0 0 1 0 0 0 1 0 0 0 ...
## $ Own : int 0 1 1 1 0 1 1 1 0 1 ...
## $ House : int 1 1 1 1 0 1 1 1 0 1 ...
## $ Is.USBorn : int 0 1 1 1 1 0 1 1 1 1 ...
## $ English : int 0 1 1 1 1 0 1 1 1 1 ...
## $ Prev.Child.Mag : int 0 1 0 1 0 0 0 0 0 0 ...
## $ Prev.Parent.Mag : int 0 0 0 0 1 0 0 0 0 0 ...
# Experiment 1: kNN on the raw (unscaled) features with an 80/20 split.
# Fixed seed so the random split (and knn()'s random tie-breaking, which
# matters for an even k) is reproducible.
set.seed(42)
n_obs <- nrow(df)
train_idx <- sample(n_obs, floor(0.8 * n_obs))
train <- df[train_idx, ]
test <- df[-train_idx, ]
train.labels <- train$Buy
test.labels <- test$Buy
# Only the predictors go into knn(): leaving the Buy column in the feature
# matrix lets the classifier see the label it is supposed to predict.
feature_cols <- setdiff(names(df), "Buy")
knn.1 <- knn(
  train[, feature_cols], test[, feature_cols],
  cl = train.labels, k = 2
)
# NOTE: the `##` output recorded below was produced while Buy was still among
# the features and with an unseeded split; rerun to refresh the numbers.
summary(knn.1)
## 0 1
## 108 27
# Accuracy (%): share of test rows whose kNN prediction equals the true label.
acc <- (100 * sum(knn.1 == test.labels)) / length(test.labels)
acc
## [1] 86.66667
# Cross-tabulate predicted classes (rows) against the true test labels (columns).
table(knn.1, test.labels)
## test.labels
## knn.1 0 1
## 0 102 6
## 1 12 15
# Summarise the same table with caret's confusionMatrix(). The recorded output
# below reports class 0 as the 'Positive' class, so Sensitivity refers to
# class 0 and Specificity to class 1.
confusionMatrix(table(knn.1, test.labels))
## Confusion Matrix and Statistics
##
## test.labels
## knn.1 0 1
## 0 102 6
## 1 12 15
##
## Accuracy : 0.8667
## 95% CI : (0.7975, 0.919)
## No Information Rate : 0.8444
## P-Value [Acc > NIR] : 0.2825
##
## Kappa : 0.5455
##
## Mcnemar's Test P-Value : 0.2386
##
## Sensitivity : 0.8947
## Specificity : 0.7143
## Pos Pred Value : 0.9444
## Neg Pred Value : 0.5556
## Prevalence : 0.8444
## Detection Rate : 0.7556
## Detection Prevalence : 0.8000
## Balanced Accuracy : 0.8045
##
## 'Positive' Class : 0
##
In the dataset, the Income and Residence.Length variables have a very different range from the other variables. In this case you may normalise or standardise the variables. Let’s try this and see what difference it makes to the outcome and the accuracy of the classification model.
# Min-max rescale `x` onto [0, 1].
#
# x:     numeric vector (or all-numeric data frame, in which case a single
#        global min/max over all cells is used, as before).
# na.rm: ignore NA values when finding the range; NA entries stay NA in the
#        result instead of turning the whole output into NA.
#
# Returns an object of the same shape as `x`. A constant input (zero range)
# yields zeros rather than NaN from a 0 / 0 division; empty input is returned
# unchanged.
normalize <- function(x, na.rm = TRUE) {
  if (length(x) == 0L) {
    return(x)
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # isTRUE() keeps an NA span (na.rm = FALSE with missing values) from
  # breaking the condition; that case falls through and propagates NA.
  if (isTRUE(span == 0)) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Normalise only two variables:
# wide_cols <- c("Income", "Residence.Length")
# df[wide_cols] <- lapply(df[wide_cols], normalize)
# Normalise all feature variables (everything except the Buy label).
# normalize() must be applied column by column: calling it on the whole data
# frame takes one global min/max over every column, so all features end up
# divided by the Income range instead of being mapped to [0, 1] individually.
feature_cols <- setdiff(names(df), "Buy")
df[feature_cols] <- lapply(df[feature_cols], normalize)
# Experiment 2: kNN on the min-max normalised features with an 80/20 split.
# Fixed seed so the random split (and knn()'s random tie-breaking, which
# matters for an even k) is reproducible.
set.seed(42)
n_obs <- nrow(df)
train_idx <- sample(n_obs, floor(0.8 * n_obs))
train <- df[train_idx, ]
test <- df[-train_idx, ]
train.labels <- train$Buy
test.labels <- test$Buy
# Only the predictors go into knn(). With Buy left in the feature matrix the
# 0/1 label dominates the distance once the other columns are shrunk, which
# is what produced the "perfect" 100% accuracy recorded below.
feature_cols <- setdiff(names(df), "Buy")
knn.2 <- knn(
  train[, feature_cols], test[, feature_cols],
  cl = train.labels, k = 2
)
# NOTE: the `##` output recorded below was produced while Buy was still among
# the features and with an unseeded split; rerun to refresh the numbers.
summary(knn.2)
## 0 1
## 109 26
# Accuracy (%): share of test rows whose kNN prediction equals the true label.
acc <- (100 * sum(knn.2 == test.labels)) / length(test.labels)
acc
## [1] 100
# Cross-tabulate predicted classes (rows) against the true test labels (columns).
table(knn.2, test.labels)
## test.labels
## knn.2 0 1
## 0 109 0
## 1 0 26
# Summarise the same table with caret's confusionMatrix(). The recorded output
# below reports class 0 as the 'Positive' class, so Sensitivity refers to
# class 0 and Specificity to class 1.
confusionMatrix(table(knn.2, test.labels))
## Confusion Matrix and Statistics
##
## test.labels
## knn.2 0 1
## 0 109 0
## 1 0 26
##
## Accuracy : 1
## 95% CI : (0.973, 1)
## No Information Rate : 0.8074
## P-Value [Acc > NIR] : 2.868e-13
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8074
## Detection Rate : 0.8074
## Detection Prevalence : 0.8074
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
This time we want to see the impact of standardisation on the performance of our classification model. In this part, we use the scale() function to scale all features except the target variable Buy, as it indicates the two target classes.
# Experiment 3: kNN on standardised features (scale(): mean 0, sd 1 per
# column). Buy is excluded from scaling because it is the 0/1 class label.
# Columns are selected by name rather than by position 2:17.
feature_cols <- setdiff(names(df), "Buy")
df[, feature_cols] <- scale(df[, feature_cols])
# Fixed seed so the random split (and knn()'s random tie-breaking, which
# matters for an even k) is reproducible.
set.seed(42)
n_obs <- nrow(df)
train_idx <- sample(n_obs, floor(0.8 * n_obs))
train <- df[train_idx, ]
test <- df[-train_idx, ]
train.labels <- train$Buy
test.labels <- test$Buy
# Only the predictors go into knn(): leaving the Buy column in the feature
# matrix lets the classifier see the label it is supposed to predict.
knn.3 <- knn(
  train[, feature_cols], test[, feature_cols],
  cl = train.labels, k = 2
)
# NOTE: the `##` output recorded below was produced while Buy was still among
# the features and with an unseeded split; rerun to refresh the numbers.
summary(knn.3)
## 0 1
## 113 22
# Accuracy (%): share of test rows whose kNN prediction equals the true label.
acc <- (100 * sum(knn.3 == test.labels)) / length(test.labels)
acc
## [1] 85.18519
# Cross-tabulate predicted classes (rows) against the true test labels (columns).
table(knn.3, test.labels)
## test.labels
## knn.3 0 1
## 0 101 12
## 1 8 14
# Summarise the same table with caret's confusionMatrix(). The recorded output
# below reports class 0 as the 'Positive' class, so Sensitivity refers to
# class 0 and Specificity to class 1.
confusionMatrix(table(knn.3, test.labels))
## Confusion Matrix and Statistics
##
## test.labels
## knn.3 0 1
## 0 101 12
## 1 8 14
##
## Accuracy : 0.8519
## 95% CI : (0.7805, 0.9071)
## No Information Rate : 0.8074
## P-Value [Acc > NIR] : 0.1127
##
## Kappa : 0.494
##
## Mcnemar's Test P-Value : 0.5023
##
## Sensitivity : 0.9266
## Specificity : 0.5385
## Pos Pred Value : 0.8938
## Neg Pred Value : 0.6364
## Prevalence : 0.8074
## Detection Rate : 0.7481
## Detection Prevalence : 0.8370
## Balanced Accuracy : 0.7325
##
## 'Positive' Class : 0
##
There may be several classification models you could attempt in order to find the one that best fits your data. If you have several models and want to compare them, you can use the AIC (Akaike information criterion) to make the comparison and find the best model. See this article for more information on how you can use the AIC approach in R: https://www.scribbr.com/statistics/akaike-information-criterion/ aictab() is an R function from the AICcmodavg package that you can use for this.
KNN Algorithm Using R | KNN Algorithm Example | Data Science Training | Edureka - YouTube https://www.youtube.com/watch?v=XSoau_q0kz8&t=1222s
HOW TO NORMALIZE AND STANDARDIZE DATA IN R FOR GREAT HEATMAP VISUALIZATION https://www.datanovia.com/en/blog/how-to-normalize-and-standardize-data-in-r-for-great-heatmap-visualization/