library('ggplot2'); library('lattice'); library('rpart')
library('rpart.plot'); library('knitr'); library('caret')
library('e1071'); library('AUC'); library('robustbase')
library('cvTools'); library('class'); library('gplots')
library('ROCR'); library('dplyr'); library('pROC')
library('rattle'); library('klaR'); library('randomForest')
prediction_classifier <- function(predicted, actual)
{
#Sort by predicted score in decreasing order, keeping actual aligned with predicted
ord <- order(predicted, decreasing = TRUE)
predicted <- predicted[ord]
actual <- actual[ord]
#Initialize Variables
L <- length(predicted); #number of predictions (equal to the length of actual)
P <- 0;
N <- 0;
TP <- 0;
TN <- 0;
FP <- 0;
FN <- 0;
TPR <- 0;
FPR <- 0;
Prec <- 0;
Acc <- 0;
Err <- 0;
tpn <- 0;
fpn <- 0;
tfp <- 0;
TP_V <- vector(mode = "numeric", length = 0);
FP_V <- vector(mode = "numeric", length = 0);
TN_V <- vector(mode = "numeric", length = 0);
FN_V <- vector(mode = "numeric", length = 0);
TPR_V <- vector(mode = "numeric", length = 0);
TNR_V <- vector(mode = "numeric", length = 0);
Prec_V <- vector(mode = "numeric", length = 0);
Acc_V <- vector(mode = "numeric", length = 0);
Err_V <- vector(mode = "numeric", length = 0);
#P, N, TP, FP
for(i in 1:L)
{
if(actual[i] == 1)
{
P <- P + 1;
TP <- TP + 1;
}
else
{
N <- N + 1;
FP <- FP + 1;
}
TP_V <- c(TP_V, TP)
FP_V <- c(FP_V, FP)
}
d <- P + N;
#TN, FN
for(i in 1:L)
{
TN <- N - FP_V[i]
FN <- P - TP_V[i]
TN_V <- c(TN_V, TN)
FN_V <- c(FN_V, FN)
}
#TPR (sensitivity, recall), TNR (specificity)
for(i in 1:L)
{
TPR <- TP_V[i]/P
TNR <- TN_V[i]/N
TPR_V <- c(TPR_V, TPR)
TNR_V <- c(TNR_V, TNR)
}
#Prec
for(i in 1:L)
{
tfp <- TP_V[i] + FP_V[i]
Prec <- TP_V[i]/tfp
Prec_V <- c(Prec_V, Prec)
}
#Acc
for(i in 1:L)
{
tpn <- TP_V[i] + TN_V[i]
Acc <- tpn/d
Acc_V <- c(Acc_V, Acc)
}
#Err
for(i in 1:L)
{
fpn <- FP_V[i] + FN_V[i]
Err <- fpn/d
Err_V <- c(Err_V, Err)
}
#Return the results as a data frame
df <- data.frame(actual, predicted, TP_V, FP_V, TN_V, FN_V, TPR_V,
TNR_V, Prec_V, Acc_V, Err_V);
return(df)
}
Ytrue <- c(1,0,1,1,0,0,1,0,1,0)
Ypred <- c(0.98,0.92,0.85,0.77,0.71,0.64,0.50,0.39,0.34,0.31)
#using function from Problem 1
result <- prediction_classifier(Ypred, Ytrue)
#Use TNR (specificity) to find FPR (1 - specificity)
FPR <- 0
FPR_V <- vector(mode = "numeric", length = 0);
for(i in 1:length(result$TNR_V))
{
FPR <- 1 - result$TNR_V[i]
FPR_V <- c(FPR_V, FPR)
}
#Show TPR, FPR, and Acc
problem2 <- result[,c("TPR_V","Acc_V")]
problem2$FPR_V <- FPR_V
knitr::kable(problem2)
| TPR_V | Acc_V | FPR_V |
|---|---|---|
| 0.2 | 0.6 | 0.0 |
| 0.2 | 0.5 | 0.2 |
| 0.4 | 0.6 | 0.2 |
| 0.6 | 0.7 | 0.2 |
| 0.6 | 0.6 | 0.4 |
| 0.6 | 0.5 | 0.6 |
| 0.8 | 0.6 | 0.6 |
| 0.8 | 0.5 | 0.8 |
| 1.0 | 0.6 | 0.8 |
| 1.0 | 0.5 | 1.0 |
plot(problem2$FPR_V, problem2$TPR_V, type = "o", xlab = "False Positive Rate",
ylab = "True Positive Rate", main = "ROC Curve")
M1 <- c(30.5, 32.2, 20.7, 20.6, 31.0, 41.0, 27.7, 26.0, 21.5, 26.0)
M2 <- c(22.4, 14.5, 22.4, 19.6, 20.7, 20.4, 22.1, 19.4, 16.2, 35.0)
#1% significance
t.test(M1, M2, conf.level = .99, alternative = "two.sided")
##
## Welch Two Sample t-test
##
## data: M1 and M2
## t = 2.4376, df = 17.649, p-value = 0.02561
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
## -1.184208 14.084208
## sample estimates:
## mean of x mean of y
## 27.72 21.27
The 99% confidence interval for the difference in means contains zero (equivalently, the p-value of 0.0256 is greater than 0.01), so at the 1% significance level there is not enough evidence that the means differ. Therefore, we cannot state which model is better.
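As a quick hedged check of that decision, the p-value can be extracted from the test object and compared directly with the 1% level (assuming M1 and M2 as defined above).
tt <- t.test(M1, M2, conf.level = .99, alternative = "two.sided")
tt$p.value #0.02561
tt$p.value < 0.01 #FALSE, so we fail to reject the null hypothesis at the 1% level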
#GINI index calculation
GINIcalc <- function(numData)
{
GINI_N <- 1 -((numData[1]/sum(numData))^2)-
((numData[2]/sum(numData))^2)- ((numData[3]/sum(numData))^2)
print(GINI_N)
}
#Gain in GINI index for 2 splits
GAIN_GINI <- function(N, N1, N2)
{
GAIN_N <- GINIcalc(N) - sum((length(N1)*GINIcalc(N1)),length(N2)*GINIcalc(N2))/(length(N1)+length(N2))
print(GAIN_N)
}
#Gain in GINI index for 3 splits
GAIN_GINI_3 <- function(N, N1, N2, N3)
{
GAIN_N_3 <- GINIcalc(N) - sum((length(N1)*GINIcalc(N1)),length(N2)*GINIcalc(N2)
,length(N3)*GINIcalc(N3))/(length(N1)+length(N2)+length(N3))
print(GAIN_N_3)
}
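GINIcalc above hard-codes exactly three classes; as a hedged aside, the same index, 1 - sum(p_i^2), can be written for a count vector of any length.
gini <- function(counts) 1 - sum((counts / sum(counts))^2) #generalized Gini index (sketch)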
#Problem 5 Data set
N <- c(100,50,60)
N11 <- c(62,8,0)
N12 <- c(38,42,60)
N21 <- c(65,20,0)
N22 <- c(21,19,20)
N23 <- c(14,11,40)
#First Split GINI Index Gain:
GAIN_GINI(N,N11,N12)
## [1] 0.6349206
## [1] 0.202449
## [1] 0.6526531
## [1] 0.2073696
#Second Split GINI Index Gain:
GAIN_GINI_3(N, N21, N22, N23)
## [1] 0.6349206
## [1] 0.3598616
## [1] 0.6661111
## [1] 0.5462722
## [1] 0.110839
The first split gives the larger gain in the Gini index (0.2074 vs. 0.1108); within that split, node N12 has the higher Gini index (0.6527).
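As a worked check (a sketch using the N12 counts defined above), the Gini index of N12 can be computed directly and matches the 0.6526531 printed by GINIcalc.
1 - sum((N12 / sum(N12))^2) #0.6526531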
#Entropy Calculation
EntropyCalc <- function(N)
{
entropy <- 0
calc <- TRUE
#test to see if calculation possible
for(i in 1:length(N))
{
if(N[i] == 0)
{
#print("There is a '0' element. This calculation requires log(0) which is undefined.")
calc <- FALSE
}
}
#entropy calculation
if(calc == TRUE)
{
for(i in 1:length(N))
{
entropy = entropy -N[i]/sum(N)*log(N[i]/sum(N))
}
print(entropy)
}
}
#Gain in Entropy for 2 splits
GAIN_entropy <- function(N, N1, N2)
{
GAIN_E <- EntropyCalc(N) - sum((length(N1)*EntropyCalc(N1)),
length(N2)*EntropyCalc(N2))/(length(N1)+length(N2))
print(GAIN_E)
}
#Gain in Entropy for 3 splits
GAIN_entropy_3 <- function(N, N1, N2, N3){
GAIN_E_3 <- EntropyCalc(N) - sum((length(N1)*EntropyCalc(N1)),length(N2)*EntropyCalc(N2),
length(N3)*EntropyCalc(N3))/(length(N1)+length(N2)+length(N3))
print(GAIN_E_3)
}
#First Split Entropy Gain: 0.5137838
GAIN_entropy(N,N11,N12)
## [1] 1.052923
## [1] 1.078278
## [1] 0.5137838
#Second Split Entropy Gain: 0.3769638
GAIN_entropy_3(N,N21,N22,N23)
## [1] 1.052923
## [1] 1.097779
## [1] 0.9300976
## [1] 0.3769638
The first split gives the larger entropy gain (0.5138 vs. 0.3770), and node N22 has the highest entropy (1.0978) among the child nodes for which the entropy was computed.
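As a worked check of the entropy values (a sketch using the root counts N defined above), the root node's entropy matches the 1.052923 printed by EntropyCalc.
p <- N / sum(N)
-sum(p * log(p)) #1.052923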
spam.main = read.csv("spam.csv", header=T)
spam = subset(spam.main, select = -c(id, isuid, domain, spampct, category, cappct))
kable(head(spam, n = 5))
| day.of.week | time.of.day | size.kb | box | local | digits | name | special | credit | sucker | porn | chain | username | large.text | spam |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Thu | 0 | 7 | no | no | 0 | name | 1 | no | no | no | no | no | no | no |
| Thu | 0 | 2 | no | no | 0 | name | 5 | no | no | no | no | no | no | yes |
| Thu | 14 | 3 | no | yes | 0 | name | 2 | no | no | no | yes | no | no | no |
| Thu | 3 | 3 | yes | no | 0 | name | 0 | no | no | no | no | no | no | no |
| Thu | 3 | 4 | no | no | 0 | name | 2 | no | no | no | no | no | no | no |
set.seed(143498)
row.spam = nrow(spam)
sample.count = round(0.2 * row.spam, 0)
sample.spam = sample(1:row.spam, sample.count)
spam.train = spam[-sample.spam, ]
spam.test = spam[sample.spam, ]
kable(head(spam.train, n = 5))
| | day.of.week | time.of.day | size.kb | box | local | digits | name | special | credit | sucker | porn | chain | username | large.text | spam |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Thu | 0 | 7 | no | no | 0 | name | 1 | no | no | no | no | no | no | no |
| 2 | Thu | 0 | 2 | no | no | 0 | name | 5 | no | no | no | no | no | no | yes |
| 3 | Thu | 14 | 3 | no | yes | 0 | name | 2 | no | no | no | yes | no | no | no |
| 5 | Thu | 3 | 4 | no | no | 0 | name | 2 | no | no | no | no | no | no | no |
| 6 | Thu | 4 | 4 | no | no | 0 | name | 1 | no | no | no | no | no | no | no |
kable(head(spam.test, n = 5))
| | day.of.week | time.of.day | size.kb | box | local | digits | name | special | credit | sucker | porn | chain | username | large.text | spam |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1951 | Tue | 15 | 1 | no | yes | 0 | name | 0 | no | no | no | no | no | no | no |
| 554 | Fri | 8 | 2 | yes | yes | 0 | name | 7 | no | no | no | no | no | no | no |
| 1076 | Wed | 23 | 2 | yes | yes | 0 | name | 1 | no | no | no | no | no | no | no |
| 1422 | Wed | 9 | 1 | no | yes | 0 | name | 1 | no | no | no | no | no | no | no |
| 2144 | Wed | 13 | 2 | yes | yes | 0 | name | 1 | no | no | no | no | no | no | no |
spam.tree = rpart(spam ~., data = spam.train, method = "class")
spam.tree
## n= 1737
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1737 563 no (0.67587795 0.32412205)
## 2) box=yes 886 24 no (0.97291196 0.02708804) *
## 3) box=no 851 312 yes (0.36662750 0.63337250)
## 6) local=yes 163 8 no (0.95092025 0.04907975) *
## 7) local=no 688 157 yes (0.22819767 0.77180233)
## 14) large.text=no 418 151 yes (0.36124402 0.63875598)
## 28) sucker=no 371 150 yes (0.40431267 0.59568733)
## 56) name=name 235 115 yes (0.48936170 0.51063830)
## 112) day.of.week=Mon,Thu 93 33 no (0.64516129 0.35483871)
## 224) time.of.day>=5.5 70 18 no (0.74285714 0.25714286) *
## 225) time.of.day< 5.5 23 8 yes (0.34782609 0.65217391) *
## 113) day.of.week=Fri,Sat,Sun,Tue,Wed 142 55 yes (0.38732394 0.61267606) *
## 57) name=empty,single 136 35 yes (0.25735294 0.74264706) *
## 29) sucker=yes 47 1 yes (0.02127660 0.97872340) *
## 15) large.text=yes 270 6 yes (0.02222222 0.97777778) *
print(summary(spam.tree))
## Call:
## rpart(formula = spam ~ ., data = spam.train, method = "class")
## n= 1737
##
## CP nsplit rel error xerror xstd
## 1 0.40319716 0 1.0000000 1.0000000 0.03464814
## 2 0.26110124 1 0.5968028 0.5968028 0.02924020
## 3 0.01198934 2 0.3357016 0.3357016 0.02305197
## 4 0.01000000 7 0.2753108 0.3303730 0.02289044
##
## Variable importance
## box local large.text size.kb sucker name
## 32 29 12 8 8 7
## chain day.of.week time.of.day
## 2 1 1
##
## Node number 1: 1737 observations, complexity param=0.4031972
## predicted class=no expected loss=0.324122 P(node) =1
## class counts: 1174 563
## probabilities: 0.676 0.324
## left son=2 (886 obs) right son=3 (851 obs)
## Primary splits:
## box splits as RL, improve=319.1144, (0 missing)
## local splits as RL, improve=263.5342, (0 missing)
## large.text splits as LR, improve=234.1405, (0 missing)
## sucker splits as LR, improve=156.2093, (0 missing)
## digits < 0.5 to the left, improve=100.8942, (0 missing)
## Surrogate splits:
## local splits as RL, agree=0.740, adj=0.469, (0 split)
## large.text splits as LR, agree=0.637, adj=0.260, (0 split)
## size.kb < 3.5 to the left, agree=0.622, adj=0.228, (0 split)
## sucker splits as LR, agree=0.599, adj=0.182, (0 split)
## name splits as RLR, agree=0.598, adj=0.180, (0 split)
##
## Node number 2: 886 observations
## predicted class=no expected loss=0.02708804 P(node) =0.5100748
## class counts: 862 24
## probabilities: 0.973 0.027
##
## Node number 3: 851 observations, complexity param=0.2611012
## predicted class=yes expected loss=0.3666275 P(node) =0.4899252
## class counts: 312 539
## probabilities: 0.367 0.633
## left son=6 (163 obs) right son=7 (688 obs)
## Primary splits:
## local splits as RL, improve=137.66380, (0 missing)
## large.text splits as LR, improve= 93.81813, (0 missing)
## sucker splits as LR, improve= 52.26650, (0 missing)
## name splits as RLR, improve= 37.45028, (0 missing)
## size.kb < 5.5 to the left, improve= 32.19858, (0 missing)
## Surrogate splits:
## chain splits as RL, agree=0.831, adj=0.117, (0 split)
## size.kb < 75.5 to the right, agree=0.810, adj=0.006, (0 split)
##
## Node number 6: 163 observations
## predicted class=no expected loss=0.04907975 P(node) =0.09383995
## class counts: 155 8
## probabilities: 0.951 0.049
##
## Node number 7: 688 observations, complexity param=0.01198934
## predicted class=yes expected loss=0.2281977 P(node) =0.3960852
## class counts: 157 531
## probabilities: 0.228 0.772
## left son=14 (418 obs) right son=15 (270 obs)
## Primary splits:
## large.text splits as LR, improve=37.708290, (0 missing)
## sucker splits as LR, improve=19.745120, (0 missing)
## size.kb < 5.5 to the left, improve= 9.249632, (0 missing)
## day.of.week splits as RLRRLLL, improve= 8.219463, (0 missing)
## name splits as LLR, improve= 8.072838, (0 missing)
## Surrogate splits:
## sucker splits as LR, agree=0.706, adj=0.252, (0 split)
## size.kb < 9.5 to the left, agree=0.692, adj=0.215, (0 split)
## digits < 7.5 to the left, agree=0.644, adj=0.093, (0 split)
## time.of.day < 21.5 to the left, agree=0.618, adj=0.026, (0 split)
## special < 6.5 to the left, agree=0.618, adj=0.026, (0 split)
##
## Node number 14: 418 observations, complexity param=0.01198934
## predicted class=yes expected loss=0.361244 P(node) =0.2406448
## class counts: 151 267
## probabilities: 0.361 0.639
## left son=28 (371 obs) right son=29 (47 obs)
## Primary splits:
## sucker splits as LR, improve=12.240660, (0 missing)
## name splits as RLR, improve=11.158850, (0 missing)
## day.of.week splits as RLRRLLL, improve=10.519770, (0 missing)
## username splits as LR, improve= 7.547117, (0 missing)
## digits < 0.5 to the left, improve= 6.553703, (0 missing)
##
## Node number 15: 270 observations
## predicted class=yes expected loss=0.02222222 P(node) =0.1554404
## class counts: 6 264
## probabilities: 0.022 0.978
##
## Node number 28: 371 observations, complexity param=0.01198934
## predicted class=yes expected loss=0.4043127 P(node) =0.2135866
## class counts: 150 221
## probabilities: 0.404 0.596
## left son=56 (235 obs) right son=57 (136 obs)
## Primary splits:
## name splits as RLR, improve=9.274097, (0 missing)
## day.of.week splits as LLRRLLL, improve=9.153530, (0 missing)
## username splits as LR, improve=8.229523, (0 missing)
## digits < 0.5 to the left, improve=8.170205, (0 missing)
## time.of.day < 5.5 to the right, improve=6.251197, (0 missing)
## Surrogate splits:
## credit splits as LR, agree=0.658, adj=0.066, (0 split)
## special < 6.5 to the left, agree=0.652, adj=0.051, (0 split)
## size.kb < 2.5 to the right, agree=0.644, adj=0.029, (0 split)
## day.of.week splits as LLLRLLL, agree=0.639, adj=0.015, (0 split)
## digits < 1.5 to the left, agree=0.636, adj=0.007, (0 split)
##
## Node number 29: 47 observations
## predicted class=yes expected loss=0.0212766 P(node) =0.02705815
## class counts: 1 46
## probabilities: 0.021 0.979
##
## Node number 56: 235 observations, complexity param=0.01198934
## predicted class=yes expected loss=0.4893617 P(node) =0.1352907
## class counts: 115 120
## probabilities: 0.489 0.511
## left son=112 (93 obs) right son=113 (142 obs)
## Primary splits:
## day.of.week splits as RLRRLRR, improve=7.471797, (0 missing)
## time.of.day < 5.5 to the right, improve=6.739738, (0 missing)
## username splits as LR, improve=5.200525, (0 missing)
## digits < 0.5 to the left, improve=4.889873, (0 missing)
## size.kb < 1.5 to the right, improve=4.558681, (0 missing)
## Surrogate splits:
## size.kb < 50 to the right, agree=0.613, adj=0.022, (0 split)
## chain splits as RL, agree=0.613, adj=0.022, (0 split)
## porn splits as RL, agree=0.609, adj=0.011, (0 split)
##
## Node number 57: 136 observations
## predicted class=yes expected loss=0.2573529 P(node) =0.07829591
## class counts: 35 101
## probabilities: 0.257 0.743
##
## Node number 112: 93 observations, complexity param=0.01198934
## predicted class=no expected loss=0.3548387 P(node) =0.05354059
## class counts: 60 33
## probabilities: 0.645 0.355
## left son=224 (70 obs) right son=225 (23 obs)
## Primary splits:
## time.of.day < 5.5 to the right, improve=5.40300500, (0 missing)
## special < 2.5 to the left, improve=3.44218400, (0 missing)
## size.kb < 2.5 to the right, improve=2.82090500, (0 missing)
## digits < 0.5 to the left, improve=0.69456920, (0 missing)
## day.of.week splits as -L--R--, improve=0.02083655, (0 missing)
## Surrogate splits:
## credit splits as LR, agree=0.796, adj=0.174, (0 split)
## special < 2.5 to the left, agree=0.785, adj=0.130, (0 split)
## porn splits as LR, agree=0.774, adj=0.087, (0 split)
##
## Node number 113: 142 observations
## predicted class=yes expected loss=0.3873239 P(node) =0.08175014
## class counts: 55 87
## probabilities: 0.387 0.613
##
## Node number 224: 70 observations
## predicted class=no expected loss=0.2571429 P(node) =0.04029937
## class counts: 52 18
## probabilities: 0.743 0.257
##
## Node number 225: 23 observations
## predicted class=yes expected loss=0.3478261 P(node) =0.01324122
## class counts: 8 15
## probabilities: 0.348 0.652
##
## n= 1737
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1737 563 no (0.67587795 0.32412205)
## 2) box=yes 886 24 no (0.97291196 0.02708804) *
## 3) box=no 851 312 yes (0.36662750 0.63337250)
## 6) local=yes 163 8 no (0.95092025 0.04907975) *
## 7) local=no 688 157 yes (0.22819767 0.77180233)
## 14) large.text=no 418 151 yes (0.36124402 0.63875598)
## 28) sucker=no 371 150 yes (0.40431267 0.59568733)
## 56) name=name 235 115 yes (0.48936170 0.51063830)
## 112) day.of.week=Mon,Thu 93 33 no (0.64516129 0.35483871)
## 224) time.of.day>=5.5 70 18 no (0.74285714 0.25714286) *
## 225) time.of.day< 5.5 23 8 yes (0.34782609 0.65217391) *
## 113) day.of.week=Fri,Sat,Sun,Tue,Wed 142 55 yes (0.38732394 0.61267606) *
## 57) name=empty,single 136 35 yes (0.25735294 0.74264706) *
## 29) sucker=yes 47 1 yes (0.02127660 0.97872340) *
## 15) large.text=yes 270 6 yes (0.02222222 0.97777778) *
fancyRpartPlot(spam.tree, sub = "Decision Tree: Using The Training Dataset")
pred = predict(spam.tree, spam.test, type = "class")
conf.matrix = confusionMatrix(pred, spam.test$spam)
conf.matrix$table
## Reference
## Prediction no yes
## no 267 14
## yes 20 133
spam.acc = conf.matrix$overall[1]
spam.acc
## Accuracy
## 0.921659
paste0("Error rate = 1 - accuracy = 1 - ", spam.acc, " = ", round(1 - spam.acc, 5))
## [1] "Error rate = 1 - accuracy = 1 - 0.921658986175115 = 0.07834"
pred2 = prediction(as.numeric(pred), as.numeric( spam.test$spam))
auc.te = performance(pred2, "auc")
paste0("AUC = ", as.numeric(auc.te@y.values))
## [1] "AUC = 0.91753774680604"
pred.train = predict(spam.tree, spam.train, type = "class")
conf.matrix.train = confusionMatrix(pred.train, spam.train$spam)
conf.matrix.train$table
## Reference
## Prediction no yes
## no 1069 50
## yes 105 513
spam.acc.train = conf.matrix.train$overall[1]
spam.acc.train
## Accuracy
## 0.9107657
paste0("Error rate = 1 - accuracy = 1 - ", spam.acc.train, " = ", round(1 - spam.acc.train, 5))
## [1] "Error rate = 1 - accuracy = 1 - 0.91076568796776 = 0.08923"
pred2 = prediction(as.numeric(pred.train), as.numeric(spam.train$spam))
auc.te = performance(pred2, "auc")
paste0("AUC = ", as.numeric(auc.te@y.values))
## [1] "AUC = 0.910876116932592"
pruned.1 = prune(spam.tree, cp = 0.1)
fancyRpartPlot(pruned.1, sub = "Pruned Decision Tree - With Complexity Parameter 0.1")
pred = predict(pruned.1, spam.test, type = "class")
conf.matrix = confusionMatrix(pred, spam.test$spam)
conf.matrix$table
## Reference
## Prediction no yes
## no 259 6
## yes 28 141
spam.acc = conf.matrix$overall[1]
spam.acc
## Accuracy
## 0.921659
paste0("Error rate = 1 - accuracy = 1 - ", spam.acc, " = ", round(1 - spam.acc, 5))
## [1] "Error rate = 1 - accuracy = 1 - 0.921658986175115 = 0.07834"
pred2 = prediction(as.numeric(pred), as.numeric(spam.test$spam))
auc.te = performance(pred2, "auc")
paste0("AUC = ", as.numeric(auc.te@y.values))
## [1] "AUC = 0.930811348929816"
Accuracy and the error rate remained the same; however, the AUC increased.
pred.train = predict(pruned.1, spam.train, type = "class")
conf.matrix.train = confusionMatrix(pred.train, spam.train$spam)
conf.matrix.train$table
## Reference
## Prediction no yes
## no 1017 32
## yes 157 531
spam.acc.train = conf.matrix.train$overall[1]
spam.acc.train
## Accuracy
## 0.8911917
paste0("Error rate = 1 - accuracy = 1 - ", spam.acc.train, " = ", round(1 - spam.acc.train, 5))
## [1] "Error rate = 1 - accuracy = 1 - 0.89119170984456 = 0.10881"
pred2 = prediction(as.numeric(pred.train), as.numeric(spam.train$spam))
auc.te = performance(pred2, "auc")
paste0("AUC = ", as.numeric(auc.te@y.values))
## [1] "AUC = 0.904715399675019"
Accuracy decreased and, as a result, the error rate increased; the AUC also decreased.
Thus, the pruned decision tree does not perform as well on the training data set.
pruned.2 = prune(spam.tree, cp = 0.3)
fancyRpartPlot(pruned.2, sub = "Pruned Decision Tree - With Complexity Parameter 0.3")
music.data = read.csv("music.csv", header=T)
music = subset(music.data, select = -c(songtitle, artistname, songID,
artistID, timesignature_confidence, tempo_confidence, key_confidence))
normalize = function(data)
{
return ((data - min(data)) / (max(data) - min(data)))
}
music = as.data.frame(lapply(music, normalize))
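Because KNN (used later on this data) is distance-based, every predictor is rescaled to the [0, 1] interval with min-max normalization. A quick hedged check that the scaling behaved as expected:
sapply(music, range) #every column should now range from 0 to 1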
kable(head(music, n = 5))
| year | timesignature | loudness | tempo | key | energy | pitch | timbre_0_min | timbre_0_max | timbre_1_min | timbre_1_max | timbre_2_min | timbre_2_max | timbre_3_min | timbre_3_max | timbre_4_min | timbre_4_max | timbre_5_min | timbre_5_max | timbre_6_min | timbre_6_max | timbre_7_min | timbre_7_max | timbre_8_min | timbre_8_max | timbre_9_min | timbre_9_max | timbre_10_min | timbre_10_max | timbre_11_min | timbre_11_max | Top10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.4285714 | 0.8727717 | 0.3746311 | 1.0000000 | 0.9681145 | 0.0443623 | 4.14e-05 | 0.8703938 | 0.7153206 | 0.3931543 | 0.6766215 | 0.2411221 | 0.4438659 | 0.5065554 | 0.6448221 | 0.2874073 | 0.7162544 | 0.5504263 | 0.4046581 | 0.3109549 | 0.6531844 | 0.3353556 | 0.6825367 | 0.3806657 | 0.7576586 | 0.4129704 | 0.4156798 | 0.1258552 | 0.7248566 | 0.1822900 | 0 |
| 1 | 0.5714286 | 0.8775939 | 0.5732460 | 0.9090909 | 0.9861960 | 0.0462107 | 0.00e+00 | 0.8717939 | 0.6478704 | 0.3932136 | 0.4876386 | 0.4552745 | 0.2431661 | 0.7628129 | 0.6128019 | 0.3910397 | 0.7953092 | 0.1746606 | 0.4166321 | 0.3206946 | 0.6773724 | 0.4581107 | 0.6230959 | 0.3587984 | 0.4485917 | 0.4321209 | 0.5298796 | 0.6454200 | 0.7671133 | 0.1485661 | 0 |
| 1 | 0.5714286 | 0.8885639 | 0.6570094 | 0.1818182 | 0.9913949 | 0.0480591 | 6.20e-05 | 0.8719494 | 0.6918732 | 0.3931015 | 0.7009716 | 0.3989601 | 0.6149570 | 0.4897756 | 0.5455583 | 0.5910034 | 0.7435350 | 0.4386697 | 0.4040007 | 0.2749548 | 0.6699796 | 0.3260446 | 0.6329889 | 0.4208143 | 0.6851381 | 0.3363030 | 0.5071476 | 0.1995160 | 0.7323115 | 0.1799131 | 0 |
| 1 | 0.5714286 | 0.8829875 | 0.3991904 | 0.0909091 | 0.9406241 | 0.0240296 | 0.00e+00 | 0.8786193 | 0.6593864 | 0.4729224 | 0.5182236 | 0.4377612 | 0.8903700 | 0.7408849 | 0.5859346 | 0.4635638 | 0.8395755 | 0.4381688 | 0.5194258 | 0.2649187 | 0.6871022 | 0.4066693 | 0.5122079 | 0.3922019 | 0.6658812 | 0.5684082 | 0.5355916 | 0.2655300 | 0.6194016 | 0.2899138 | 0 |
| 1 | 0.5714286 | 0.8626017 | 0.5732664 | 0.5454545 | 0.9892287 | 0.1164510 | 0.00e+00 | 0.8612543 | 0.2400185 | 0.3932136 | 0.4944161 | 0.4192729 | 0.7751513 | 0.7736043 | 0.6311931 | 0.3868533 | 0.7550011 | 0.1626208 | 0.2643276 | 0.3050375 | 0.7222566 | 0.4752561 | 0.6542392 | 0.3715339 | 0.6699038 | 0.3863739 | 0.7872832 | 0.1471355 | 0.6842820 | 0.2479626 | 0 |
paste0("Below is the class distribution of the entire data set")
## [1] "Below is the class distribution of the entire data set"
prop.table(table(music$Top10))
##
## 0 1
## 0.8522577 0.1477423
set.seed(120634)
n.folds = 10
fold.strat = createFolds(music$Top10, k=10) #k-folds with stratification. caret package.
dt.prop.test = data.frame('false'=1:n.folds, 'true'=1:n.folds)
dt.prop.train = data.frame('false'=1:n.folds, 'true'=1:n.folds)
for(i in 1:n.folds)
{
te.index = as.numeric(unlist(fold.strat[i]))
test.cv = music[te.index, ]
prop = prop.table(table(test.cv$Top10))
dt.prop.test[i,] = prop
train.cv = music[-te.index, ]
prop = prop.table(table(train.cv$Top10))
dt.prop.train[i,] = prop
}
row.names(dt.prop.test) = names(fold.strat)
row.names(dt.prop.train) = names(fold.strat)
kable(dt.prop.test)
| | false | true |
|---|---|---|
| Fold01 | 0.8496042 | 0.1503958 |
| Fold02 | 0.8575198 | 0.1424802 |
| Fold03 | 0.8614776 | 0.1385224 |
| Fold04 | 0.8548813 | 0.1451187 |
| Fold05 | 0.8454425 | 0.1545575 |
| Fold06 | 0.8612946 | 0.1387054 |
| Fold07 | 0.8401585 | 0.1598415 |
| Fold08 | 0.8639366 | 0.1360634 |
| Fold09 | 0.8428005 | 0.1571995 |
| Fold10 | 0.8454425 | 0.1545575 |
kable(dt.prop.train)
| | false | true |
|---|---|---|
| Fold01 | 0.8525528 | 0.1474472 |
| Fold02 | 0.8516725 | 0.1483275 |
| Fold03 | 0.8512324 | 0.1487676 |
| Fold04 | 0.8519660 | 0.1480340 |
| Fold05 | 0.8530145 | 0.1469855 |
| Fold06 | 0.8512542 | 0.1487458 |
| Fold07 | 0.8536013 | 0.1463987 |
| Fold08 | 0.8509608 | 0.1490392 |
| Fold09 | 0.8533079 | 0.1466921 |
| Fold10 | 0.8530145 | 0.1469855 |
set.seed(1874)
KNN.test.error = rep(NA, n.folds)
KNN.test.accu = rep(NA, n.folds)
KNN.test.auc = rep(NA, n.folds)
minErr = list(NA,NA,NA)
minAccu = list(NA,NA,NA)
minAuc = list(NA,NA,NA)
j = 1
k.values = c(3,5,9)
for (k in k.values)
{
for(i in 1:n.folds)
{
fold.index = as.numeric(unlist(fold.strat[i]))
test_data = music[fold.index, ][, -32]
test_class = music[fold.index, ][, 32]
train_data = music[-fold.index, ][, -32]
train_class = music[-fold.index, ][, 32]
test.fit = knn(train=train_data, test=test_data, cl=train_class, k = k)
#train.fit = knn(train=train_data, train_data, cl=train_data, k = k)
KNN.test.error[i] = c(1 - sum(test.fit == test_class) / nrow(test_data))
KNN.test.accu[i] = c(sum(test.fit == test_class) / nrow(test_data))
pred = prediction(as.numeric(test.fit), test_class)
auc.te = performance(pred, "auc")
KNN.test.auc[i] = c(as.numeric(auc.te@y.values))
}
minErr[j] = data.frame(test.error = KNN.test.error)
minAccu[j] = data.frame(test.accu = KNN.test.accu)
minAuc[j] = data.frame(test.auc = KNN.test.auc)
j = j + 1
}
error.k_3 = data.frame(fold=1:10, "error" = minErr[1], "accuracy" = minAccu[1], "auc" = minAuc[1])
error.k_5 = data.frame(fold=1:10, "error" = minErr[2], "accuracy" = minAccu[2], "auc" = minAuc[2])
error.k_9 = data.frame(fold=1:10, "error" = minErr[3], "accuracy" = minAccu[3], "auc" = minAuc[3])
names(error.k_3) = c("fold", "error", "accuracy", "auc")
names(error.k_5) = c("fold", "error", "accuracy", "auc")
names(error.k_9) = c("fold", "error", "accuracy", "auc")
#mean(knn.kcv.error)
kable(error.k_3, row.names = F)
| fold | error | accuracy | auc |
|---|---|---|---|
| 1 | 0.1728232 | 0.8271768 | 0.5878691 |
| 2 | 0.1701847 | 0.8298153 | 0.6035185 |
| 3 | 0.1715040 | 0.8284960 | 0.5687742 |
| 4 | 0.1503958 | 0.8496042 | 0.6063552 |
| 5 | 0.1730515 | 0.8269485 | 0.5658921 |
| 6 | 0.1690885 | 0.8309115 | 0.5822378 |
| 7 | 0.1690885 | 0.8309115 | 0.6249935 |
| 8 | 0.1558785 | 0.8441215 | 0.5866883 |
| 9 | 0.1585205 | 0.8414795 | 0.6120097 |
| 10 | 0.1770145 | 0.8229855 | 0.5845019 |
paste0("Mean_Error = ", mean(error.k_3$error))
## [1] "Mean_Error = 0.166754965964106"
paste0("Mean_Accuracy = ", mean(error.k_3$accuracy))
## [1] "Mean_Accuracy = 0.833245034035894"
paste0("Mean_AUC = ", mean(error.k_3$auc))
## [1] "Mean_AUC = 0.592284032694927"
kable(error.k_5, row.names = F)
| fold | error | accuracy | auc |
|---|---|---|---|
| 1 | 0.1715040 | 0.8284960 | 0.5597690 |
| 2 | 0.1583113 | 0.8416887 | 0.5795584 |
| 3 | 0.1437995 | 0.8562005 | 0.5688690 |
| 4 | 0.1424802 | 0.8575198 | 0.5921156 |
| 5 | 0.1532365 | 0.8467635 | 0.5741186 |
| 6 | 0.1519155 | 0.8480845 | 0.5722320 |
| 7 | 0.1466314 | 0.8533686 | 0.6283201 |
| 8 | 0.1321004 | 0.8678996 | 0.6004498 |
| 9 | 0.1532365 | 0.8467635 | 0.5843827 |
| 10 | 0.1571995 | 0.8428005 | 0.5997129 |
paste0("Mean_Error = ", mean(error.k_5$error))
## [1] "Mean_Error = 0.151041466976644"
paste0("Mean_Accuracy = ", mean(error.k_5$accuracy))
## [1] "Mean_Accuracy = 0.848958533023356"
paste0("Mean_AUC = ", mean(error.k_5$auc))
## [1] "Mean_AUC = 0.585952791648983"
kable(error.k_9, row.names = F)
| fold | error | accuracy | auc |
|---|---|---|---|
| 1 | 0.1583113 | 0.8416887 | 0.5422660 |
| 2 | 0.1332454 | 0.8667546 | 0.5710114 |
| 3 | 0.1306069 | 0.8693931 | 0.5725297 |
| 4 | 0.1358839 | 0.8641161 | 0.5657828 |
| 5 | 0.1492734 | 0.8507266 | 0.5485243 |
| 6 | 0.1373844 | 0.8626156 | 0.5606924 |
| 7 | 0.1453104 | 0.8546896 | 0.5889534 |
| 8 | 0.1387054 | 0.8612946 | 0.5434592 |
| 9 | 0.1545575 | 0.8454425 | 0.5528371 |
| 10 | 0.1558785 | 0.8441215 | 0.5585871 |
paste0("Mean_Error = ", mean(error.k_9$error))
## [1] "Mean_Error = 0.143915713673262"
paste0("Mean_Accuracy = ", mean(error.k_9$accuracy))
## [1] "Mean_Accuracy = 0.856084286326738"
paste0("Mean_AUC = ", mean(error.k_9$auc))
## [1] "Mean_AUC = 0.560464345557562"
set.seed(1294)
accu = rep(NA, n.folds)
err = rep(NA, n.folds)
auc = rep(NA, n.folds)
accu.p1 = rep(NA, n.folds)
err.p1 = rep(NA, n.folds)
auc.p1 = rep(NA, n.folds)
accu.p2 = rep(NA, n.folds)
err.p2 = rep(NA, n.folds)
auc.p2 = rep(NA, n.folds)
for(i in 1:10)
{
fold.index = as.numeric(unlist(fold.strat[i]))
test_data = music[fold.index, ]
test_class = music[fold.index, ][, 32]
train_data = music[-fold.index, ]
tree = rpart(Top10 ~., train_data, method="class")
fancyRpartPlot(tree, sub = paste("Decision Tree For Fold", i))
pred = predict(tree, test_data, type = "class")
conf.matrix = confusionMatrix(pred, test_class)
cf.mat.tab = conf.matrix$table
acc = conf.matrix$overall[1]
values = as.numeric(pred)-1
pred = prediction(as.numeric(values), test_class)
auc.te = performance(pred, "auc")
accu[i] = round(acc, 5)
err[i] = round(1 - acc, 5)
auc[i] = c(as.numeric(auc.te@y.values))
#Prune with cp = 0.1
tree.prune.1 = prune(tree, cp = 0.1)
#fancyRpartPlot(tree.prune.1)
pred = predict(tree.prune.1, test_data, type = "class")
conf.matrix = confusionMatrix(pred, test_class)
cf.mat.tab = conf.matrix$table
acc = conf.matrix$overall[1]
values = as.numeric(pred)-1
pred = prediction(as.numeric(values), test_class)
auc.te = performance(pred, "auc")
accu.p1[i] = round(acc, 5)
err.p1[i] = round(1 - acc, 5)
auc.p1[i] = c(as.numeric(auc.te@y.values))
#Prune with cp = 0.03
tree.prune.2 = prune(tree, cp = 0.03)
#fancyRpartPlot(tree.prune.2, sub = paste("cp=0.03 - Pruned Decision Tree - Fold", i))
pred = predict(tree.prune.2, test_data, type = "class")
conf.matrix = confusionMatrix(pred, test_class)
cf.mat.tab = conf.matrix$table
acc = conf.matrix$overall[1]
values = as.numeric(pred)-1
pred = prediction(as.numeric(values), test_class)
auc.te = performance(pred, "auc")
accu.p2[i] = round(acc, 5)
err.p2[i] = round(1 - acc, 5)
auc.p2[i] = c(as.numeric(auc.te@y.values))
}
result = data.frame(fold=1:10, "error" = err, "accuracy" = accu, "auc" = auc)
kable(result)
| fold | error | accuracy | auc |
|---|---|---|---|
| 1 | 0.14248 | 0.85752 | 0.5804593 |
| 2 | 0.13720 | 0.86280 | 0.5725641 |
| 3 | 0.14512 | 0.85488 | 0.5601108 |
| 4 | 0.12665 | 0.87335 | 0.5825056 |
| 5 | 0.15456 | 0.84544 | 0.5768296 |
| 6 | 0.13474 | 0.86526 | 0.5422509 |
| 7 | 0.13871 | 0.86129 | 0.5928842 |
| 8 | 0.13738 | 0.86262 | 0.5278644 |
| 9 | 0.16380 | 0.83620 | 0.5405153 |
| 10 | 0.14663 | 0.85337 | 0.5954861 |
names(result) = c("fold", "error", "accuracy", "auc")
paste0("Mean_Error = ", mean(result$error))
## [1] "Mean_Error = 0.142727"
paste0("Mean_Accuracy = ", mean(result$accuracy))
## [1] "Mean_Accuracy = 0.857273"
paste0("Mean_AUC = ", mean(result$auc))
## [1] "Mean_AUC = 0.567147039875754"
result = data.frame(fold=1:10, "error" = err.p1, "accuracy" = accu.p1, "auc" = auc.p1)
kable(result)
| fold | error | accuracy | auc |
|---|---|---|---|
| 1 | 0.15040 | 0.84960 | 0.5 |
| 2 | 0.14248 | 0.85752 | 0.5 |
| 3 | 0.13852 | 0.86148 | 0.5 |
| 4 | 0.14512 | 0.85488 | 0.5 |
| 5 | 0.15456 | 0.84544 | 0.5 |
| 6 | 0.13871 | 0.86129 | 0.5 |
| 7 | 0.15984 | 0.84016 | 0.5 |
| 8 | 0.13606 | 0.86394 | 0.5 |
| 9 | 0.15720 | 0.84280 | 0.5 |
| 10 | 0.15456 | 0.84544 | 0.5 |
names(result) = c("fold", "error", "accuracy", "auc")
paste0("Mean_Error = ", mean(result$error))
## [1] "Mean_Error = 0.147745"
paste0("Mean_Accuracy = ", mean(result$accuracy))
## [1] "Mean_Accuracy = 0.852255"
paste0("Mean_AUC = ", mean(result$auc))
## [1] "Mean_AUC = 0.5"
result = data.frame(fold=1:10, "error" = err.p2, "accuracy" = accu.p2, "auc" = auc.p2)
kable(result)
| fold | error | accuracy | auc |
|---|---|---|---|
| 1 | 0.15040 | 0.84960 | 0.5 |
| 2 | 0.14248 | 0.85752 | 0.5 |
| 3 | 0.13852 | 0.86148 | 0.5 |
| 4 | 0.14512 | 0.85488 | 0.5 |
| 5 | 0.15456 | 0.84544 | 0.5 |
| 6 | 0.13871 | 0.86129 | 0.5 |
| 7 | 0.15984 | 0.84016 | 0.5 |
| 8 | 0.13606 | 0.86394 | 0.5 |
| 9 | 0.15720 | 0.84280 | 0.5 |
| 10 | 0.15456 | 0.84544 | 0.5 |
names(result) = c("fold", "error", "accuracy", "auc")
paste0("Mean_Error = ", mean(result$error))
## [1] "Mean_Error = 0.147745"
paste0("Mean_Accuracy = ", mean(result$accuracy))
## [1] "Mean_Accuracy = 0.852255"
paste0("Mean_AUC = ", mean(result$auc))
## [1] "Mean_AUC = 0.5"
set.seed(123694)
accu = rep(NA, n.folds)
err = rep(NA, n.folds)
auc = rep(NA, n.folds)
for(i in 1:10)
{
fold.index = as.numeric(unlist(fold.strat[i]))
test_data = music[fold.index, ]
test_data$Top10 = as.factor(test_data$Top10)
test_class = music[fold.index, ][, 32]
train_data = music[-fold.index, ]
train_data$Top10 = as.factor(train_data$Top10)
train_class = music[-fold.index, ][, 32]
#nb.model = naiveBayes(train_data, train_class)
#NaiveBayes() comes from the klaR package (loaded above)
nb.model = NaiveBayes(Top10 ~., data = train_data)
pred = predict(nb.model, test_data)
error = 1 - sum(pred$class == test_class) / nrow(test_data)
acc = 1 - error
values = as.numeric(pred$class)-1
pred = prediction(as.numeric(values), test_class)
auc.te = performance(pred, "auc")
accu[i] = c(acc)
err[i] = c(error)
auc[i] = c(as.numeric(auc.te@y.values))
}
result = data.frame(fold=1:10, "error" = err, "accuracy" = accu, "auc" = auc)
kable(result)
| fold | error | accuracy | auc |
|---|---|---|---|
| 1 | 0.2585752 | 0.7414248 | 0.6709573 |
| 2 | 0.2770449 | 0.7229551 | 0.6879060 |
| 3 | 0.2532982 | 0.7467018 | 0.7051265 |
| 4 | 0.2401055 | 0.7598945 | 0.7086139 |
| 5 | 0.2589168 | 0.7410832 | 0.6757545 |
| 6 | 0.2496697 | 0.7503303 | 0.6912650 |
| 7 | 0.2721268 | 0.7278732 | 0.7142458 |
| 8 | 0.2708058 | 0.7291942 | 0.6714988 |
| 9 | 0.2721268 | 0.7278732 | 0.6779129 |
| 10 | 0.2708058 | 0.7291942 | 0.6722155 |
names(result) = c("fold", "error", "accuracy", "auc")
paste0("Mean_Error = ", mean(result$error))
## [1] "Mean_Error = 0.26234755300572"
paste0("Mean_Accuracy = ", mean(result$accuracy))
## [1] "Mean_Accuracy = 0.73765244699428"
paste0("Mean_AUC = ", mean(result$auc))
## [1] "Mean_AUC = 0.687549619294451"
set.seed(1429461)
accu = rep(NA, n.folds)
err = rep(NA, n.folds)
auc = rep(NA, n.folds)
for(i in 1:10)
{
fold.index = as.numeric(unlist(fold.strat[i]))
test_data = music[fold.index, ][, -32]
test_class = music[fold.index, ][, 32]
train_data = music[-fold.index, ][, -32]
train_class = music[-fold.index, ][, 32]
rf.model = randomForest(train_data, as.factor(as.numeric(train_class)))
pred = predict(rf.model, test_data)
conf.matrix = confusionMatrix(pred, test_class)
cf.mat.tab = conf.matrix$table
acc = conf.matrix$overall[1]
values = as.numeric(pred) - 1
pred.2 = prediction(as.numeric(values), test_class)
auc.te = performance(pred.2, "auc")
accu[i] = round(acc, 5)
err[i] = round(1 - acc, 5)
auc[i] = c(as.numeric(auc.te@y.values))
}
result = data.frame(fold=1:10, "error" = err, "accuracy" = accu, "auc" = auc)
kable(result)
| fold | error | accuracy | auc |
|---|---|---|---|
| 1 | 0.13588 | 0.86412 | 0.5590743 |
| 2 | 0.13193 | 0.86807 | 0.5524786 |
| 3 | 0.12005 | 0.87995 | 0.5826515 |
| 4 | 0.13193 | 0.86807 | 0.5567761 |
| 5 | 0.14927 | 0.85073 | 0.5345553 |
| 6 | 0.12285 | 0.87715 | 0.5731230 |
| 7 | 0.13738 | 0.86262 | 0.5802861 |
| 8 | 0.12682 | 0.87318 | 0.5544298 |
| 9 | 0.13871 | 0.86129 | 0.5656595 |
| 10 | 0.14267 | 0.85733 | 0.5489383 |
names(result) = c("fold", "error", "accuracy", "auc")
paste0("Mean_Error = ", mean(result$error))
## [1] "Mean_Error = 0.133749"
paste0("Mean_Accuracy = ", mean(result$accuracy))
## [1] "Mean_Accuracy = 0.866251"
paste0("Mean_AUC = ", mean(result$auc))
## [1] "Mean_AUC = 0.560797255021492"
set.seed(18740)
KNN.test.error = rep(NA, n.folds)
KNN.test.accu = rep(NA, n.folds)
KNN.test.auc = rep(NA, n.folds)
knn.test.bestK = rep(NA, n.folds)
KNN.test.accu.in = rep(NA, n.folds-1)
k.values = c(1,3,5,7,9)
for(i in 1:n.folds)
{
fold.index = as.numeric(unlist(fold.strat[i]))
test_data = music[fold.index, ][, -32]
test_class = music[fold.index, ][, 32]
new.data = music[-fold.index, ]
train_data = new.data[, -32]
train_class = music[-fold.index, ][, 32]
#internal CV on the training folds to select the best value of k
#creating the internal folds
fold.in = createFolds(new.data$Top10, k = n.folds - 1)
acc.in = data.frame("kValue" = k.values, "accuracyMean" = NA)
w = 1
for(kk in k.values)
{
for(jj in 1:9)
{
fold.index.in = as.numeric(unlist(fold.in[jj]))
test_data.in = new.data[fold.index.in, ][, -32]
test_class.in = new.data[fold.index.in, ][, 32]
train_data.in = new.data[-fold.index.in, ][, -32]
train_class.in = new.data[-fold.index.in, ][, 32]
test.fit.in = knn(train=train_data.in, test=test_data.in, cl=train_class.in, k = kk)
KNN.test.accu.in[jj] = c(sum(test.fit.in == test_class.in) / nrow(test_data.in))
}
acc.in$accuracyMean[w] = mean(KNN.test.accu.in)
w = w+1
}
#now: picking the best k
k.meanAve = acc.in[order(-acc.in$accuracyMean), ]
best.k = k.meanAve$kValue[1]
knn.test.bestK[i] = best.k
test.fit = knn(train=train_data, test=test_data, cl=train_class, k = best.k)
KNN.test.error[i] = c(1 - sum(test.fit == test_class) / nrow(test_data))
KNN.test.accu[i] = c(sum(test.fit == test_class) / nrow(test_data))
pred = prediction(as.numeric(test.fit), test_class)
auc.te = performance(pred, "auc")
KNN.test.auc[i] = c(as.numeric(auc.te@y.values))
}
detail.k_1 = data.frame(fold=1:10, "error" = KNN.test.error,
"accuracy" = KNN.test.accu, "auc" = KNN.test.auc, "best_K_used" = knn.test.bestK)
kable(detail.k_1, row.names = F)
| fold | error | accuracy | auc | best_K_used |
|---|---|---|---|---|
| 1 | 0.1583113 | 0.8416887 | 0.5422660 | 9 |
| 2 | 0.1345646 | 0.8654354 | 0.5702422 | 9 |
| 3 | 0.1306069 | 0.8693931 | 0.5725297 | 9 |
| 4 | 0.1358839 | 0.8641161 | 0.5808782 | 7 |
| 5 | 0.1492734 | 0.8507266 | 0.5485243 | 9 |
| 6 | 0.1373844 | 0.8626156 | 0.5606924 | 9 |
| 7 | 0.1453104 | 0.8546896 | 0.5889534 | 9 |
| 8 | 0.1387054 | 0.8612946 | 0.5434592 | 9 |
| 9 | 0.1545575 | 0.8454425 | 0.5528371 | 9 |
| 10 | 0.1558785 | 0.8441215 | 0.5585871 | 9 |
ave.detail.k = data.frame(mean_error = mean(detail.k_1$error),
mean_accuracy = mean(detail.k_1$accuracy), mean_auc = mean(detail.k_1$auc))
kable(ave.detail.k, row.names = F)
| mean_error | mean_accuracy | mean_auc |
|---|---|---|
| 0.1440476 | 0.8559524 | 0.561897 |
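As a hedged alternative to the hand-rolled nested cross-validation above, caret's train() can tune k with its own 10-fold CV; the sketch below assumes the same music data frame and selects k by accuracy, caret's default metric.
ctrl <- trainControl(method = "cv", number = 10)
knn.tuned <- train(as.factor(Top10) ~ ., data = music, method = "knn",
tuneGrid = data.frame(k = c(1, 3, 5, 7, 9)), trControl = ctrl)
knn.tuned$bestTune #the value of k chosen by caret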