To understand the differences between KNN and SVM.
To understand the statistics behind them.
suppressWarnings(suppressMessages(library(class))) #k-nearest neighbors
suppressWarnings(suppressMessages(library(e1071))) #svm
suppressWarnings(suppressMessages(library(kernlab))) #assist with SVM feature selection
suppressWarnings(suppressMessages(library(caret))) # select tuning parameters
suppressWarnings(suppressMessages(library(reshape2))) #assist in creating boxplots
suppressWarnings(suppressMessages(library(ggplot2))) #create boxplots
rawdata <- read.csv("Cryotherapy.csv")
str(rawdata)
## 'data.frame': 90 obs. of 7 variables:
## $ sex : int 1 1 1 1 1 1 1 1 1 2 ...
## $ age : int 35 29 50 32 67 41 36 59 20 34 ...
## $ Time : num 12 7 8 11.75 9.25 ...
## $ Number_of_Warts : int 5 5 1 7 1 2 2 3 12 3 ...
## $ Type : int 1 1 3 3 1 2 1 3 1 3 ...
## $ Area : int 100 96 132 750 42 20 8 20 6 150 ...
## $ Result_of_Treatment: int 0 1 0 0 0 1 0 0 1 0 ...
head(rawdata)
## sex age Time Number_of_Warts Type Area Result_of_Treatment
## 1 1 35 12.00 5 1 100 0
## 2 1 29 7.00 5 1 96 1
## 3 1 50 8.00 1 3 132 0
## 4 1 32 11.75 7 3 750 0
## 5 1 67 9.25 1 1 42 0
## 6 1 41 8.00 2 2 20 1
For KNN it is important to have the features on the same scale, with a mean of zero and a standard deviation of one. If not, the distance calculations used to find the nearest neighbors are distorted, because features with larger ranges dominate the distances.
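As a quick illustration (a toy sketch with made-up numbers, not taken from the dataset), a wide-range feature such as Area swamps the Euclidean distance until the columns are standardized:
toy <- data.frame(age = c(25, 60), Area = c(100, 750))
dist(toy) #distance is driven almost entirely by Area
dist(scale(toy)) #after standardizing, both features contribute comparably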
rawdata.scale <- data.frame(scale(rawdata[,-7]))
#add back the result_of_treatment factor
rawdata.scale$result_of_treatment <- rawdata$Result_of_Treatment
suppressWarnings(suppressMessages(library(dplyr)))
rawdata.scale <- rawdata.scale %>% mutate(result_of_treatment = ifelse(result_of_treatment == 0, "NO", "YES"))
rawdata.scale$result_of_treatment <- as.factor(as.character(rawdata.scale$result_of_treatment))
rawdata.scale.melt <- melt(rawdata.scale, id.var="result_of_treatment")
ggplot(data=rawdata.scale.melt, aes(x = result_of_treatment, y = value)) + geom_boxplot() + facet_wrap(~ variable, ncol = 2)
table(rawdata.scale$result_of_treatment)
##
## NO YES
## 42 48
set.seed(502)
ind <- sample(2, nrow(rawdata.scale), replace = TRUE, prob = c(0.7, 0.3))
train <- rawdata.scale[ind == 1, ]
test <- rawdata.scale[ind == 2, ]
str(train)
## 'data.frame': 67 obs. of 7 variables:
## $ sex : num -0.951 -0.951 -0.951 -0.951 -0.951 ...
## $ age : num 1.602 0.928 0.554 2.275 -0.644 ...
## $ Time : num 0.0978 0.0978 0.9785 -1.2231 -0.9296 ...
## $ Number_of_Warts : num -1.265 -0.984 -0.984 -0.704 1.819 ...
## $ Type : num 1.436 0.331 -0.773 1.436 -0.773 ...
## $ Area : num 0.35 -0.5 -0.591 -0.5 -0.606 ...
## $ result_of_treatment: Factor w/ 2 levels "NO","YES": 1 2 1 1 2 1 2 1 2 1 ...
str(test)
## 'data.frame': 23 obs. of 7 variables:
## $ sex : num -0.951 -0.951 -0.951 -0.951 1.04 ...
## $ age : num 0.479 0.0299 0.2545 2.8741 0.4042 ...
## $ Time : num 1.272 -0.196 1.199 0.465 1.052 ...
## $ Number_of_Warts : num -0.143 -0.143 0.417 -1.265 -0.704 ...
## $ Type : num -0.773 -0.773 1.436 -0.773 1.436 ...
## $ Area : num 0.1075 0.0772 5.0418 -0.3327 0.4871 ...
## $ result_of_treatment: Factor w/ 2 levels "NO","YES": 1 2 1 1 1 2 2 2 2 2 ...
#create a grid of inputs for the experiment, with k ranging from 2 to 20 by an increment of 1.
grid1 <- expand.grid(.k = seq(2, 20, by = 1))
control <- trainControl(method = "cv")
grid1
## .k
## 1 2
## 2 3
## 3 4
## 4 5
## 5 6
## 6 7
## 7 8
## 8 9
## 9 10
## 10 11
## 11 12
## 12 13
## 13 14
## 14 15
## 15 16
## 16 17
## 17 18
## 18 19
## 19 20
set.seed(123)
knn.train <- train(result_of_treatment ~ ., data = train, method = "knn", trControl = control, tuneGrid = grid1)
knn.train
## k-Nearest Neighbors
##
## 67 samples
## 6 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 60, 61, 60, 61, 60, 60, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 2 0.9000000 0.8054541
## 3 0.9119048 0.8305556
## 4 0.9238095 0.8470000
## 5 0.9404762 0.8803333
## 6 0.8976190 0.7864928
## 7 0.8833333 0.7560580
## 8 0.8976190 0.7837260
## 9 0.8976190 0.7890290
## 10 0.8809524 0.7556957
## 11 0.8666667 0.7224941
## 12 0.8833333 0.7558274
## 13 0.8833333 0.7558274
## 14 0.8833333 0.7558274
## 15 0.8976190 0.7837260
## 16 0.8809524 0.7503926
## 17 0.8809524 0.7503926
## 18 0.8809524 0.7503926
## 19 0.8976190 0.7837260
## 20 0.8666667 0.7200593
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
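caret can also visualize how cross-validated accuracy changes with k; this is just a plot of the resampling results shown above (a sketch):
plot(knn.train) #accuracy versus k from the 10-fold cross-validation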
pred <- knn(train[, -7], test[, -7], train[, 7], k = 5)
table(pred, test$result_of_treatment)
##
## pred NO YES
## NO 9 3
## YES 2 9
suppressWarnings(suppressMessages(library(Metrics)))
# accuracy = (TP + TN) / total = (9 + 9) / 23
accuracy(test$result_of_treatment, pred)
## [1] 0.7826087
# classification error = 1 - accuracy
ce(test$result_of_treatment, pred)
## [1] 0.2173913
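The same test-set predictions can also be generated from the caret object itself (a sketch; the object name is ours, and results may differ slightly from class::knn because ties are broken differently):
pred.caret <- predict(knn.train, newdata = test)
table(pred.caret, test$result_of_treatment)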
The formula for the statistic is Kappa = (percent of agreement - percent of chance agreement) / (1 - percent of chance agreement).
The percent of agreement is the rate at which the evaluators agreed on the class (the accuracy).
The percent of chance agreement is the rate at which the evaluators would be expected to agree purely by chance.
The higher the statistic, the better the performance, with a maximum agreement of one.
Kappa        Agreement
<0.20        Poor
0.21-0.40    Fair
0.41-0.60    Moderate
0.61-0.80    Good
0.81-1.00    Very good
prob.agree <- (9+9)/23 #observed agreement (the accuracy)
prob.chance <- ((9+9)/23) * ((9+2)/23) #rough estimate of chance agreement
prob.chance
## [1] 0.3742911
kappa <- (prob.agree - prob.chance) / (1 - prob.chance)
kappa #about 0.65 on the test set
## [1] 0.652568
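For comparison, here is a small helper (the function name is ours) that computes Cohen's kappa from a 2x2 table using the usual marginal-based chance agreement; this is the formulation caret's confusionMatrix reports later, which is why its Kappa of 0.566 differs from the shortcut above:
cohen_kappa <- function(tab) {
  n <- sum(tab)
  p.agree <- sum(diag(tab)) / n #observed agreement (accuracy)
  p.chance <- sum(rowSums(tab) * colSums(tab)) / n^2 #agreement expected by chance
  (p.agree - p.chance) / (1 - p.chance)
}
cohen_kappa(table(pred, test$result_of_treatment))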
Now let's see whether SVM can improve on the KNN results.
#tune the sigmoid kernel
set.seed(123)
sigmoid.tune <- tune.svm(result_of_treatment ~ ., data = train, kernel = "sigmoid", gamma = c(0.1, 0.5, 1, 2, 3, 4), coef0 = c(0.1, 0.5, 1, 2, 3, 4))
#summary(sigmoid.tune)
best.sigmoid <- sigmoid.tune$best.model
sigmoid.test <- predict(best.sigmoid, newdata = test)
#linear tune
set.seed(123)
linear.tune <- tune.svm(result_of_treatment ~ ., data = train, kernel = "linear", cost = c(0.001, 0.01, 0.1, 1, 5, 10))
#summary(linear.tune)
best.linear <- linear.tune$best.model
tune.test <- predict(best.linear, newdata = test)
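A radial basis kernel could be tuned in the same way if we wanted a third comparison (a sketch only; the gamma and cost grids are our choice and the results are not shown here):
set.seed(123)
radial.tune <- tune.svm(result_of_treatment ~ ., data = train, kernel = "radial", gamma = c(0.1, 0.5, 1, 2), cost = c(0.1, 1, 10))
#radial.test <- predict(radial.tune$best.model, newdata = test)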
# compare the confusion tables for the sigmoid-kernel and linear-kernel SVMs
table(sigmoid.test, test$result_of_treatment)
##
## sigmoid.test NO YES
## NO 9 3
## YES 2 9
table(tune.test, test$result_of_treatment)
##
## tune.test NO YES
## NO 10 3
## YES 1 9
confusionMatrix(sigmoid.test, test$result_of_treatment, positive = "YES")
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 9 3
## YES 2 9
##
## Accuracy : 0.7826
## 95% CI : (0.563, 0.9254)
## No Information Rate : 0.5217
## P-Value [Acc > NIR] : 0.009401
##
## Kappa : 0.566
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.7500
## Specificity : 0.8182
## Pos Pred Value : 0.8182
## Neg Pred Value : 0.7500
## Prevalence : 0.5217
## Detection Rate : 0.3913
## Detection Prevalence : 0.4783
## Balanced Accuracy : 0.7841
##
## 'Positive' Class : YES
##
confusionMatrix(tune.test, test$result_of_treatment, positive = "YES")
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 10 3
## YES 1 9
##
## Accuracy : 0.8261
## 95% CI : (0.6122, 0.9505)
## No Information Rate : 0.5217
## P-Value [Acc > NIR] : 0.002491
##
## Kappa : 0.6541
## Mcnemar's Test P-Value : 0.617075
##
## Sensitivity : 0.7500
## Specificity : 0.9091
## Pos Pred Value : 0.9000
## Neg Pred Value : 0.7692
## Prevalence : 0.5217
## Detection Rate : 0.3913
## Detection Prevalence : 0.4348
## Balanced Accuracy : 0.8295
##
## 'Positive' Class : YES
##
So far we have simply thrown all of the variables together as the feature input space and let the black-box SVM calculations give us a predicted classification.
One of the issues with SVMs is that the findings are very difficult to interpret.
Let's do some feature selection.
#feature selection
set.seed(123)
rfeCNTL <- rfeControl(functions = lrFuncs, method = "cv", number = 10)
svm.features <- rfe(train[, 1:6], train[, 7], sizes = c(6, 5, 4), rfeControl = rfeCNTL, method = "svmLinear")
svm.features
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 4 0.8690 0.7377 0.1256 0.2505
## 5 0.8690 0.7377 0.1256 0.2505
## 6 0.8976 0.7961 0.1188 0.2349 *
##
## The top 5 variables (out of 6):
## Time, age, Type, sex, Area
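The variables retained by the recursive feature elimination can also be pulled from the rfe object directly (a sketch):
predictors(svm.features) #character vector of the selected variables
#plot(svm.features, type = c("g", "o")) #optional: accuracy by subset size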
#The top 5 variables (out of 6):Time, age, Type, sex, Area
svm.5 <- svm(result_of_treatment ~ Time + age + Type + sex + Area, data = train, kernel = "linear")
svm.5.predict <- predict(svm.5, newdata = test[c(1, 2, 3, 5, 6)])
table(svm.5.predict, test$result_of_treatment)
##
## svm.5.predict NO YES
## NO 9 3
## YES 2 9
svm.6 <- svm(result_of_treatment ~ ., data = train, kernel = "linear")
svm.6.predict <- predict(svm.6, newdata = test[c(1, 2, 3, 4, 5, 6)])
table(svm.6.predict, test$result_of_treatment)
##
## svm.6.predict NO YES
## NO 9 3
## YES 2 9
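Because both of these models use a linear kernel, one rough interpretability aid (a sketch; this heuristic applies only to linear kernels) is to recover the weight vector from the support vectors, where larger absolute weights suggest more influential features:
w <- t(svm.5$coefs) %*% svm.5$SV #weight vector of the linear SVM
sort(abs(w[1, ]), decreasing = TRUE)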
Whether we use 5 variables or 6, the classification results on the test set remain the same, and choosing fewer variables keeps the model simpler.
Ideas for further work:
Changing the train/test split with different probabilities (see the sketch below).
Trial and error with different techniques.
Larger datasets.
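On the first point, one way to vary the split is caret's createDataPartition, which stratifies on the outcome so the class balance is preserved (a sketch only; the object names are ours and this split was not used above):
set.seed(502)
idx <- createDataPartition(rawdata.scale$result_of_treatment, p = 0.7, list = FALSE)
train.strat <- rawdata.scale[idx, ]
test.strat <- rawdata.scale[-idx, ]
table(train.strat$result_of_treatment) #roughly 70% of each class ends up in the training set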