Testing model accuracy. The data contain user information about online product purchases and are stored on local disk, consisting of the variables User ID, Gender, Age, EstimatedSalary, and Purchased (1 if the user bought, 0 if not). The models under test are Logistic Regression, Random Forest, Decision Tree, SVM, KNN, and Naive Bayes.

Preparation

1. Random Forest

Importing the raw dataset

library(caret)        # provides confusionMatrix(), used throughout
library(randomForest) # prints the startup messages below
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
dataset = read.csv('D:/SHINY-APP/SN_ad.csv')
head(dataset,5)
##    User.ID Gender Age EstimatedSalary Purchased
## 1 15624510   Male  19           19000         0
## 2 15810944   Male  35           20000         0
## 3 15668575 Female  26           43000         0
## 4 15603246 Female  27           57000         0
## 5 15804002   Male  19           76000         0
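
Before trimming the columns it is worth confirming what read.csv produced; a quick optional check, not part of the original write-up:

str(dataset)            # column types as imported
colSums(is.na(dataset)) # confirm there are no missing values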

Selecting the columns to analyze

dataset = dataset[3:5]
head(dataset,5)
##   Age EstimatedSalary Purchased
## 1  19           19000         0
## 2  35           20000         0
## 3  26           43000         0
## 4  27           57000         0
## 5  19           76000         0

Encoding the target feature as a factor

dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

Splitting the dataset into the Training set and Test set

set.seed(123) # for reproducibility
split = sample(nrow(dataset), nrow(dataset)*0.75) # row indices for a 75% training sample
training_set = dataset[split,]
test_set = dataset[-split,]
rf <- randomForest(Purchased~Age+EstimatedSalary, data=training_set)
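
As an optional sanity check (not in the original run), the class balance of the two partitions can be compared; a random 75/25 split should leave both roughly similar:

prop.table(table(training_set$Purchased)) # class proportions in the training set
prop.table(table(test_set$Purchased))     # class proportions in the test set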

Accuracy

pred_rf <- predict(rf, test_set)
confusionMatrix(pred_rf, test_set$Purchased)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 53  5
##          1  7 35
##                                           
##                Accuracy : 0.88            
##                  95% CI : (0.7998, 0.9364)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : 6.593e-10       
##                                           
##                   Kappa : 0.7521          
##                                           
##  Mcnemar's Test P-Value : 0.7728          
##                                           
##             Sensitivity : 0.8833          
##             Specificity : 0.8750          
##          Pos Pred Value : 0.9138          
##          Neg Pred Value : 0.8333          
##              Prevalence : 0.6000          
##          Detection Rate : 0.5300          
##    Detection Prevalence : 0.5800          
##       Balanced Accuracy : 0.8792          
##                                           
##        'Positive' Class : 0               
## 
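
The headline accuracy can also be recovered directly from the predictions, without caret; a one-line equivalent check:

mean(pred_rf == test_set$Purchased) # proportion correct: (53 + 35) / 100 = 0.88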

2. Decision Tree Classification

library(rpart)
classifier = rpart(formula = Purchased ~ .,
                   data = training_set)
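
To see which splits the tree actually learned, the fitted object can be plotted; a minimal sketch, assuming the rpart.plot package is installed (it is not used elsewhere in this write-up):

library(rpart.plot)
rpart.plot(classifier) # visualize the splits on Age and EstimatedSalary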

Predicting the Test set results

y_pred = predict(classifier, newdata = test_set[-3], type = 'class')

Accuracy

cm = table(test_set$Purchased, y_pred)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##    y_pred
##      0  1
##   0 52  8
##   1  1 39
##                                         
##                Accuracy : 0.91          
##                  95% CI : (0.836, 0.958)
##     No Information Rate : 0.53          
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.8178        
##                                         
##  Mcnemar's Test P-Value : 0.0455        
##                                         
##             Sensitivity : 0.9811        
##             Specificity : 0.8298        
##          Pos Pred Value : 0.8667        
##          Neg Pred Value : 0.9750        
##              Prevalence : 0.5300        
##          Detection Rate : 0.5200        
##    Detection Prevalence : 0.6000        
##       Balanced Accuracy : 0.9055        
##                                         
##        'Positive' Class : 0             
## 

3. Logistic Regression

classifier = glm(formula = Purchased ~ .,
                 family = binomial, # binomial family specifies logistic regression
                 data = training_set)
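
For interpretation, the fitted coefficients feed a linear predictor through the logistic function; a minimal sketch for one hypothetical user (Age 40 and salary 50000 are made-up illustration values):

summary(classifier)$coefficients # intercept and slopes for Age and EstimatedSalary
eta = sum(coef(classifier) * c(1, 40, 50000)) # 1 for the intercept, then Age, EstimatedSalary
plogis(eta) # logistic transform; matches predict(classifier, type = 'response') for that user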

Predicting the Test set results

prob_pred = predict(classifier, type = 'response', newdata = test_set[-3])
# type = 'response' returns the predicted probabilities as a single vector

y_pred = ifelse(prob_pred > 0.5, 1, 0) # classify as 1 when the probability exceeds 0.5
y_pred
##   2   4   5   6  12  13  14  15  16  19  22  23  24  30  32  35  37  38  42  45 
##   0   0   0   0   0   0   0   0   0   0   1   1   0   0   0   0   0   0   0   0 
##  54  60  63  64  74  76  77  87  91 101 105 111 112 117 118 135 138 142 144 147 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
## 153 158 159 169 171 186 188 189 193 205 218 219 223 226 228 229 234 235 237 239 
##   0   0   0   0   0   0   0   0   0   1   0   1   1   0   1   0   1   1   0   1 
## 241 242 251 253 260 261 265 275 282 288 293 295 297 300 301 303 306 311 313 319 
##   1   0   0   1   1   0   1   1   0   1   1   0   0   1   1   1   0   0   0   0 
## 321 323 325 328 330 335 337 342 343 348 355 368 370 376 377 389 390 394 398 399 
##   1   0   1   0   1   1   1   0   0   1   0   1   1   0   1   0   0   1   0   0

Accuracy

cm = table(test_set[,3], y_pred)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##    y_pred
##      0  1
##   0 56  4
##   1 15 25
##                                           
##                Accuracy : 0.81            
##                  95% CI : (0.7193, 0.8816)
##     No Information Rate : 0.71            
##     P-Value [Acc > NIR] : 0.01543         
##                                           
##                   Kappa : 0.5852          
##                                           
##  Mcnemar's Test P-Value : 0.02178         
##                                           
##             Sensitivity : 0.7887          
##             Specificity : 0.8621          
##          Pos Pred Value : 0.9333          
##          Neg Pred Value : 0.6250          
##              Prevalence : 0.7100          
##          Detection Rate : 0.5600          
##    Detection Prevalence : 0.6000          
##       Balanced Accuracy : 0.8254          
##                                           
##        'Positive' Class : 0               
## 

4. Support Vector Machine

library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
classifier = svm(formula = Purchased ~ .,
                 data = training_set,
                 type = 'C-classification', # classification (rather than regression) SVM
                 kernel = 'linear')         # linear decision boundary
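
Since the linear kernel ends up the weakest model in the final ranking, a nonlinear kernel is a natural follow-up; a sketch using e1071's RBF kernel, left untuned (default cost and gamma):

classifier_rbf = svm(formula = Purchased ~ .,
                     data = training_set,
                     type = 'C-classification',
                     kernel = 'radial') # RBF kernel allows a curved decision boundary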

Predicting the Test set results

y_pred = predict(classifier, newdata = test_set[-3])
y_pred
##   2   4   5   6  12  13  14  15  16  19  22  23  24  30  32  35  37  38  42  45 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  54  60  63  64  74  76  77  87  91 101 105 111 112 117 118 135 138 142 144 147 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
## 153 158 159 169 171 186 188 189 193 205 218 219 223 226 228 229 234 235 237 239 
##   0   0   0   0   0   0   0   0   0   1   0   1   1   0   1   0   1   1   0   1 
## 241 242 251 253 260 261 265 275 282 288 293 295 297 300 301 303 306 311 313 319 
##   1   0   0   1   1   0   1   1   0   1   1   0   0   1   1   1   0   0   0   0 
## 321 323 325 328 330 335 337 342 343 348 355 368 370 376 377 389 390 394 398 399 
##   1   0   1   0   1   1   1   0   0   1   0   1   1   0   1   0   0   1   0   0 
## Levels: 0 1

Accuracy

cm = table(test_set[, 3], y_pred)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##    y_pred
##      0  1
##   0 56  4
##   1 17 23
##                                           
##                Accuracy : 0.79            
##                  95% CI : (0.6971, 0.8651)
##     No Information Rate : 0.73            
##     P-Value [Acc > NIR] : 0.105687        
##                                           
##                   Kappa : 0.5374          
##                                           
##  Mcnemar's Test P-Value : 0.008829        
##                                           
##             Sensitivity : 0.7671          
##             Specificity : 0.8519          
##          Pos Pred Value : 0.9333          
##          Neg Pred Value : 0.5750          
##              Prevalence : 0.7300          
##          Detection Rate : 0.5600          
##    Detection Prevalence : 0.6000          
##       Balanced Accuracy : 0.8095          
##                                           
##        'Positive' Class : 0               
## 

5. KNN

library(class)
y_pred = knn(train = training_set[, -3], # predictor columns only
             test = test_set[, -3],
             cl = training_set[, 3],     # training labels
             k = 5)                      # five nearest neighbours
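
KNN is distance-based, so the raw EstimatedSalary values (tens of thousands) dominate Age (tens) in the Euclidean distance. Standardizing the predictors first is usually worth trying; a minimal sketch (the accuracy reported below was obtained without it):

train_sc = scale(training_set[, -3]) # center and scale the training predictors
test_sc = scale(test_set[, -3],
                center = attr(train_sc, 'scaled:center'),
                scale = attr(train_sc, 'scaled:scale')) # reuse the training parameters
y_pred_sc = knn(train = train_sc, test = test_sc, cl = training_set[, 3], k = 5)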

Accuracy

cm = table(test_set[, 3], y_pred)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##    y_pred
##      0  1
##   0 54  6
##   1 12 28
##                                           
##                Accuracy : 0.82            
##                  95% CI : (0.7305, 0.8897)
##     No Information Rate : 0.66            
##     P-Value [Acc > NIR] : 0.0003021       
##                                           
##                   Kappa : 0.6154          
##                                           
##  Mcnemar's Test P-Value : 0.2385928       
##                                           
##             Sensitivity : 0.8182          
##             Specificity : 0.8235          
##          Pos Pred Value : 0.9000          
##          Neg Pred Value : 0.7000          
##              Prevalence : 0.6600          
##          Detection Rate : 0.5400          
##    Detection Prevalence : 0.6000          
##       Balanced Accuracy : 0.8209          
##                                           
##        'Positive' Class : 0               
## 

6. Naive Bayes

Encoding the target feature as a factor

dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1)) # already a factor from the earlier step; repeated here as a no-op
classifier = naiveBayes(x = training_set[-3],
                        y = training_set$Purchased) # naiveBayes() comes from e1071, loaded above
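
The fitted naiveBayes object stores the per-class Gaussian parameters it estimated for each numeric predictor, which can be inspected directly:

classifier$tables$Age             # per-class mean and sd of Age
classifier$tables$EstimatedSalary # per-class mean and sd of EstimatedSalary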

Predicting the Test set results

y_pred = predict(classifier, newdata = test_set[-3])
y_pred
##   [1] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 1 0 1 1 0 1 1 0 0 1 1 0 1 1 0 1 1 0 0 1
##  [75] 1 1 0 0 0 0 1 0 1 0 1 1 1 0 0 1 0 1 1 0 1 1 1 1 1 0
## Levels: 0 1

Accuracy

cm = table(test_set[, 3], y_pred)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##    y_pred
##      0  1
##   0 56  4
##   1 11 29
##                                           
##                Accuracy : 0.85            
##                  95% CI : (0.7647, 0.9135)
##     No Information Rate : 0.67            
##     P-Value [Acc > NIR] : 3.791e-05       
##                                           
##                   Kappa : 0.6781          
##                                           
##  Mcnemar's Test P-Value : 0.1213          
##                                           
##             Sensitivity : 0.8358          
##             Specificity : 0.8788          
##          Pos Pred Value : 0.9333          
##          Neg Pred Value : 0.7250          
##              Prevalence : 0.6700          
##          Detection Rate : 0.5600          
##    Detection Prevalence : 0.6000          
##       Balanced Accuracy : 0.8573          
##                                           
##        'Positive' Class : 0               
## 

Conclusion

Ranking the six models above by test-set accuracy:
1. Decision Tree: 0.91
2. Random Forest: 0.88
3. Naive Bayes: 0.85
4. KNN: 0.82
5. Logistic Regression: 0.81
6. SVM: 0.79
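
The same ranking can be produced programmatically; a minimal sketch, with the accuracies copied by hand from the outputs above:

acc = c(DecisionTree = 0.91, RandomForest = 0.88, NaiveBayes = 0.85,
        KNN = 0.82, Logistic = 0.81, SVM = 0.79)
sort(acc, decreasing = TRUE) # models ordered from best to worst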