Project

Ibrahim Odumas Odufowora & Jamie Berger

2017-05-13

library('ggplot2'); library('lattice'); library('rpart')
library('rpart.plot'); library('knitr'); library('caret')
library('e1071'); library('AUC'); library('robustbase')
library('cvTools'); library('class'); library('gplots')
library('e1071'); library('ROCR'); library('class')
library('dplyr'); library('pROC'); library('rattle')

Question 1

prediction_classifier <- function(predicted, actual)
{
  #Sort the predicted values in decreasing order
  predicted <- sort(predicted, decreasing = TRUE)
 
  #Initialize Variables
  L <- length(predicted); #length of predicted = actual
  P <- 0;
  N <- 0; 
  TP <- 0; 
  TN <- 0; 
  FP <- 0; 
  FN <- 0; 
  TPR <- 0; 
  FPR <- 0; 
  Prec <- 0; 
  Acc <- 0; 
  Err <- 0; 
  tpn <- 0; 
  fpn <- 0; 
  tfp <- 0;
  TP_V <- vector(mode = "numeric", length = 0); 
  FP_V <- vector(mode = "numeric", length = 0); 
  TN_V <- vector(mode = "numeric", length = 0); 
  FN_V <- vector(mode = "numeric", length = 0); 
  TPR_V <- vector(mode = "numeric", length = 0);
  TNR_V <- vector(mode = "numeric", length = 0); 
  Prec_V <- vector(mode = "numeric", length = 0); 
  Acc_V <- vector(mode = "numeric", length = 0); 
  Err_V <- vector(mode = "numeric", length = 0);
 
  #P, N, TP, FP  
  for(i in 1:L)
  { 
    if(actual[i] == 1)
    {
      P <- P + 1;
      TP <- TP + 1;
    }
    else
    {
      N <- N + 1;
      FP <- FP + 1;
    }
    
    TP_V <- c(TP_V, TP)
    FP_V <- c(FP_V, FP)
  }
  
  d <- P + N; 
  
  #TN, FN  
  for(i in 1:L)
  {
    TN <- N - FP_V[i]
    FN <- P - TP_V[i]
    TN_V <- c(TN_V, TN)
    FN_V <- c(FN_V, FN)
  }

  #TPR (sensitivity, recall), TNR (specificity)
  for(i in 1:L)
  {
    TPR <- TP_V[i]/P
    TNR <- TN_V[i]/N
    TPR_V <- c(TPR_V, TPR)
    TNR_V <- c(TNR_V, TNR)
  }

  #Prec
  for(i in 1:L)
  {
    tfp <- TP_V[i] + FP_V[i]
    Prec <- TP_V[i]/tfp
    Prec_V <- c(Prec_V, Prec)
  }
  
  #Acc
  for(i in 1:L)
  {
    tpn <- TP_V[i] + TN_V[i]
    Acc <- tpn/d
    Acc_V <- c(Acc_V, Acc)
  }
  
  #Err
  for(i in 1:L)
  {
    fpn <- FP_V[i] + FN_V[i]
    Err <- fpn/d
    Err_V <- c(Err_V, Err)
  }

  #Show results in Data Frame
  df <- data.frame(actual, predicted, TP_V, FP_V, TN_V, FN_V, TPR_V,
            TNR_V, Prec_V, Acc_V, Err_V);
}

Question 2

Ytrue <- c(1,0,1,1,0,0,1,0,1,0)
Ypred <- c(0.98,0.92,0.85,0.77,0.71,0.64,0.50,0.39,0.34,0.31)

#using function from Problem 1
result <- prediction_classifier(Ypred, Ytrue)

#Use TNR (specificity) to find FPR (1 - specificity)
FPR <- 0
FPR_V <- vector(mode = "numeric", length = 0); 

for(i in 1:length(result$TNR_V))
{
  FPR <- 1 - result$TNR_V[i]
  FPR_V <- c(FPR_V, FPR)
}

#Show TPR, FPR, and Acc
problem2 <- result[,c("TPR_V","Acc_V")]
problem2$FPR_V <- FPR_V

knitr::kable(problem2, format = "html")
TPR_V Acc_V FPR_V
0.2 0.6 0.0
0.2 0.5 0.2
0.4 0.6 0.2
0.6 0.7 0.2
0.6 0.6 0.4
0.6 0.5 0.6
0.8 0.6 0.6
0.8 0.5 0.8
1.0 0.6 0.8
1.0 0.5 1.0

Question 3

plot(problem2$FPR_V, problem2$TPR_V, type = "o", xlab = "False Positive
     Rate", ylab = "True Positive Rate", main="ROC Curve")

Question 4

M1 <- c(30.5, 32.2, 20.7, 20.6, 31.0, 41.0, 27.7, 26.0, 21.5, 26.0)
M2 <- c(22.4, 14.5, 22.4, 19.6, 20.7, 20.4, 22.1, 19.4, 16.2, 35.0)

#1% significance
t.test(M1, M2, conf.level = .99, alternative = "two.sided")
## 
##  Welch Two Sample t-test
## 
## data:  M1 and M2
## t = 2.4376, df = 17.649, p-value = 0.02561
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
##  -1.184208 14.084208
## sample estimates:
## mean of x mean of y 
##     27.72     21.27

The t-value falls within the confidence interval meaning there is not enough evidence to suggest that the means are different. Therefore, we can not state which model is better.

Question 5a

#GINI index calculation
GINIcalc <- function(numData)
{
  GINI_N <- 1 -((numData[1]/sum(numData))^2)-
    ((numData[2]/sum(numData))^2)- ((numData[3]/sum(numData))^2)
  print(GINI_N)
}

#Gain in GINI index for 2 splits
GAIN_GINI <- function(N, N1, N2)
{
  GAIN_N <- GINIcalc(N) - sum((length(N1)*GINIcalc(N1)),length(N2)*GINIcalc(N2))/(length(N1)+length(N2))
  print(GAIN_N)
}

#Gain in GINI index for 3 splits
GAIN_GINI_3 <- function(N, N1, N2, N3)
{
  GAIN_N_3 <- GINIcalc(N) - sum((length(N1)*GINIcalc(N1)),length(N2)*GINIcalc(N2)
                  ,length(N3)*GINIcalc(N3))/(length(N1)+length(N2)+length(N3))
  print(GAIN_N_3)
}

#Problem 5 Data set
N <- c(100,50,60)
N11 <- c(62,8,0)
N12 <- c(38,42,60)
N21 <- c(65,20,0)
N22 <- c(21,19,20)
N23 <- c(14,11,40)

#First Split GINI Index Gain: 
GAIN_GINI(N,N11,N12)
## [1] 0.6349206
## [1] 0.202449
## [1] 0.6526531
## [1] 0.2073696
#Second Split GINI Index Gain: 
GAIN_GINI_3(N, N21, N22, N23)
## [1] 0.6349206
## [1] 0.3598616
## [1] 0.6661111
## [1] 0.5462722
## [1] 0.110839

Question 5b

The first split gives the largest gain, and the node N12 has the highest GINI index.

Question 5c

#Entropy Calculation
EntropyCalc <- function(N)
{
  entropy <- 0
  calc <- TRUE
  
  #test to see if calculation possible
  for(i in 1:length(N))
  {
     if(N[i] == 0)
     {
       #print("There is a '0' element. This calculation requires log(0) which is undefined.")
       calc <- FALSE
     }
  }
  
  #entropy calculation
  if(calc == TRUE)
  {
    for(i in 1:length(N))
    { 
       entropy = entropy -N[i]/sum(N)*log(N[i]/sum(N))
    }
    
    print(entropy)
  }
}

#Gain in Entropy for 2 splits
GAIN_entropy <- function(N, N1, N2)
{
  GAIN_E <- EntropyCalc(N) - sum((length(N1)*EntropyCalc(N1)),
                      length(N2)*EntropyCalc(N2))/(length(N1)+length(N2))
  print(GAIN_E)
}

#Gain in Entropy for 2 splits
GAIN_entropy_3 <- function(N, N1, N2, N3){
  GAIN_E_3 <- EntropyCalc(N) - sum((length(N1)*EntropyCalc(N1)),length(N2)*EntropyCalc(N2),
                                   length(N3)*EntropyCalc(N3))/(length(N1)+length(N2)+length(N3))
  print(GAIN_E_3)
}

#First Split Entropy Gain: 0.5137838
GAIN_entropy(N,N11,N12)
## [1] 1.052923
## [1] 1.078278
## [1] 0.5137838
#Second Split Entropy Gain: 0.3769638
GAIN_entropy_3(N,N21,N22,N23)
## [1] 1.052923
## [1] 1.097779
## [1] 0.9300976
## [1] 0.3769638

Question 5d

The first split and the second node have the highest gain and entropy, respectively.

Question 6: Social Media Usage

social = read.csv("syp-16-data.csv", header=T)
adult = subset(social, Adult == "Adult")
hs = subset(social, Adult == "HS")

adult.sum = as.data.frame(colSums(adult[, -20]))
names(adult.sum) = "adult"

hs.sum = as.data.frame(colSums(hs[, -20]))
names(hs.sum) = "hs"

sum = cbind(adult.sum, hs.sum)
sum = t(sum)

Q6a: Grouped Bar Plots: Side-by-Side Barplot

par(mfrow=c(10, 2))

for(i in 1:19)
{
  par = names(social[i])
  gbp = as.data.frame(social[, c(i,20)])
  gbp.adult = subset(gbp, Adult=="Adult")
  gbp.adult.yes = nrow(subset(gbp.adult, gbp.adult[,1] == 1))
  gbp.adult.no = nrow(subset(gbp.adult, gbp.adult[,1] == 0))
  
  gbp.hs = subset(gbp, Adult=="HS")
  gbp.hs.yes = nrow(subset(gbp.hs, gbp.hs[,1] == 1))
  gbp.hs.no = nrow(subset(gbp.hs, gbp.hs[,1] == 0))
  
  yes = data.frame("Adult" = gbp.adult.yes, "HS" = gbp.hs.yes)
  no = data.frame("Adult" = gbp.adult.no, "HS" = gbp.hs.no)
  
  gbp.final = rbind(yes, no)
  rownames(gbp.final) = c("yes", "no")
  gbp.final = as.matrix(gbp.final)
  gbp.final = t(gbp.final)
  
  barplot(gbp.final,col=c("orange","blue"), beside=T, main = paste0(par))
  legend("top", col = c("orange","blue"), legend = c("Adult", "HS"), lwd = 5, lty = 1, bty = 'n', cex = 1)
}

barplot(sum, col=c("orange","blue"), cex.names=0.5, beside=T, legend.text = c("Adult", "HS"), 
        main = "Side-by-Side Barplot of Aggregated Respondents by Age")

Q6b:

Snapchat should be at the root of the decision tree because it seems to be the purest with the least entropy.

Q6c: Decision Tree

d.tree = rpart(Adult ~., data = social, method = "class")
fancyRpartPlot(d.tree, sub = "Decision Tree: Using All Data Points")

The variable at the root matches the intuition from part b above.

Question 7: Spam Email

spam.main = read.csv("spam.csv", header=T)


Q7a: Removing some varibles

spam = subset(spam.main, select = -c(id, isuid, domain, spampct, category, cappct))

kable(head(spam, n = 5))
day.of.week time.of.day size.kb box local digits name special credit sucker porn chain username large.text spam
Thu 0 7 no no 0 name 1 no no no no no no no
Thu 0 2 no no 0 name 5 no no no no no no yes
Thu 14 3 no yes 0 name 2 no no no yes no no no
Thu 3 3 yes no 0 name 0 no no no no no no no
Thu 3 4 no no 0 name 2 no no no no no no no

Q7b: Spliting Data into Training and Test Data

set.seed(143498)
row.spam = nrow(spam)
sample.count = round(0.2 * row.spam, 0)
sample.spam = sample(1:row.spam, sample.count) 

spam.train = spam[-sample.spam, ]
spam.test = spam[sample.spam, ]


Training Dataset

kable(head(spam.train, n = 5))
day.of.week time.of.day size.kb box local digits name special credit sucker porn chain username large.text spam
1 Thu 0 7 no no 0 name 1 no no no no no no no
2 Thu 0 2 no no 0 name 5 no no no no no no yes
3 Thu 14 3 no yes 0 name 2 no no no yes no no no
5 Thu 3 4 no no 0 name 2 no no no no no no no
6 Thu 4 4 no no 0 name 1 no no no no no no no
<br / >

Test Dataset

kable(head(spam.test, n = 5))
day.of.week time.of.day size.kb box local digits name special credit sucker porn chain username large.text spam
1951 Tue 15 1 no yes 0 name 0 no no no no no no no
554 Fri 8 2 yes yes 0 name 7 no no no no no no no
1076 Wed 23 2 yes yes 0 name 1 no no no no no no no
1422 Wed 9 1 no yes 0 name 1 no no no no no no no
2144 Wed 13 2 yes yes 0 name 1 no no no no no no no

Q7c: Classification Tree

spam.tree = rpart(spam ~., data = spam.train, method = "class")
spam.tree
## n= 1737 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 1737 563 no (0.67587795 0.32412205)  
##     2) box=yes 886  24 no (0.97291196 0.02708804) *
##     3) box=no 851 312 yes (0.36662750 0.63337250)  
##       6) local=yes 163   8 no (0.95092025 0.04907975) *
##       7) local=no 688 157 yes (0.22819767 0.77180233)  
##        14) large.text=no 418 151 yes (0.36124402 0.63875598)  
##          28) sucker=no 371 150 yes (0.40431267 0.59568733)  
##            56) name=name 235 115 yes (0.48936170 0.51063830)  
##             112) day.of.week=Mon,Thu 93  33 no (0.64516129 0.35483871)  
##               224) time.of.day>=5.5 70  18 no (0.74285714 0.25714286) *
##               225) time.of.day< 5.5 23   8 yes (0.34782609 0.65217391) *
##             113) day.of.week=Fri,Sat,Sun,Tue,Wed 142  55 yes (0.38732394 0.61267606) *
##            57) name=empty,single 136  35 yes (0.25735294 0.74264706) *
##          29) sucker=yes 47   1 yes (0.02127660 0.97872340) *
##        15) large.text=yes 270   6 yes (0.02222222 0.97777778) *


Q7d: Analysing the Decision Tree

print(summary(spam.tree))
## Call:
## rpart(formula = spam ~ ., data = spam.train, method = "class")
##   n= 1737 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.40319716      0 1.0000000 1.0000000 0.03464814
## 2 0.26110124      1 0.5968028 0.5968028 0.02924020
## 3 0.01198934      2 0.3357016 0.3357016 0.02305197
## 4 0.01000000      7 0.2753108 0.3303730 0.02289044
## 
## Variable importance
##         box       local  large.text     size.kb      sucker        name 
##          32          29          12           8           8           7 
##       chain day.of.week time.of.day 
##           2           1           1 
## 
## Node number 1: 1737 observations,    complexity param=0.4031972
##   predicted class=no   expected loss=0.324122  P(node) =1
##     class counts:  1174   563
##    probabilities: 0.676 0.324 
##   left son=2 (886 obs) right son=3 (851 obs)
##   Primary splits:
##       box        splits as  RL,       improve=319.1144, (0 missing)
##       local      splits as  RL,       improve=263.5342, (0 missing)
##       large.text splits as  LR,       improve=234.1405, (0 missing)
##       sucker     splits as  LR,       improve=156.2093, (0 missing)
##       digits     < 0.5  to the left,  improve=100.8942, (0 missing)
##   Surrogate splits:
##       local      splits as  RL,       agree=0.740, adj=0.469, (0 split)
##       large.text splits as  LR,       agree=0.637, adj=0.260, (0 split)
##       size.kb    < 3.5  to the left,  agree=0.622, adj=0.228, (0 split)
##       sucker     splits as  LR,       agree=0.599, adj=0.182, (0 split)
##       name       splits as  RLR,      agree=0.598, adj=0.180, (0 split)
## 
## Node number 2: 886 observations
##   predicted class=no   expected loss=0.02708804  P(node) =0.5100748
##     class counts:   862    24
##    probabilities: 0.973 0.027 
## 
## Node number 3: 851 observations,    complexity param=0.2611012
##   predicted class=yes  expected loss=0.3666275  P(node) =0.4899252
##     class counts:   312   539
##    probabilities: 0.367 0.633 
##   left son=6 (163 obs) right son=7 (688 obs)
##   Primary splits:
##       local      splits as  RL,       improve=137.66380, (0 missing)
##       large.text splits as  LR,       improve= 93.81813, (0 missing)
##       sucker     splits as  LR,       improve= 52.26650, (0 missing)
##       name       splits as  RLR,      improve= 37.45028, (0 missing)
##       size.kb    < 5.5  to the left,  improve= 32.19858, (0 missing)
##   Surrogate splits:
##       chain   splits as  RL,       agree=0.831, adj=0.117, (0 split)
##       size.kb < 75.5 to the right, agree=0.810, adj=0.006, (0 split)
## 
## Node number 6: 163 observations
##   predicted class=no   expected loss=0.04907975  P(node) =0.09383995
##     class counts:   155     8
##    probabilities: 0.951 0.049 
## 
## Node number 7: 688 observations,    complexity param=0.01198934
##   predicted class=yes  expected loss=0.2281977  P(node) =0.3960852
##     class counts:   157   531
##    probabilities: 0.228 0.772 
##   left son=14 (418 obs) right son=15 (270 obs)
##   Primary splits:
##       large.text  splits as  LR,       improve=37.708290, (0 missing)
##       sucker      splits as  LR,       improve=19.745120, (0 missing)
##       size.kb     < 5.5  to the left,  improve= 9.249632, (0 missing)
##       day.of.week splits as  RLRRLLL,  improve= 8.219463, (0 missing)
##       name        splits as  LLR,      improve= 8.072838, (0 missing)
##   Surrogate splits:
##       sucker      splits as  LR,       agree=0.706, adj=0.252, (0 split)
##       size.kb     < 9.5  to the left,  agree=0.692, adj=0.215, (0 split)
##       digits      < 7.5  to the left,  agree=0.644, adj=0.093, (0 split)
##       time.of.day < 21.5 to the left,  agree=0.618, adj=0.026, (0 split)
##       special     < 6.5  to the left,  agree=0.618, adj=0.026, (0 split)
## 
## Node number 14: 418 observations,    complexity param=0.01198934
##   predicted class=yes  expected loss=0.361244  P(node) =0.2406448
##     class counts:   151   267
##    probabilities: 0.361 0.639 
##   left son=28 (371 obs) right son=29 (47 obs)
##   Primary splits:
##       sucker      splits as  LR,       improve=12.240660, (0 missing)
##       name        splits as  RLR,      improve=11.158850, (0 missing)
##       day.of.week splits as  RLRRLLL,  improve=10.519770, (0 missing)
##       username    splits as  LR,       improve= 7.547117, (0 missing)
##       digits      < 0.5  to the left,  improve= 6.553703, (0 missing)
## 
## Node number 15: 270 observations
##   predicted class=yes  expected loss=0.02222222  P(node) =0.1554404
##     class counts:     6   264
##    probabilities: 0.022 0.978 
## 
## Node number 28: 371 observations,    complexity param=0.01198934
##   predicted class=yes  expected loss=0.4043127  P(node) =0.2135866
##     class counts:   150   221
##    probabilities: 0.404 0.596 
##   left son=56 (235 obs) right son=57 (136 obs)
##   Primary splits:
##       name        splits as  RLR,      improve=9.274097, (0 missing)
##       day.of.week splits as  LLRRLLL,  improve=9.153530, (0 missing)
##       username    splits as  LR,       improve=8.229523, (0 missing)
##       digits      < 0.5  to the left,  improve=8.170205, (0 missing)
##       time.of.day < 5.5  to the right, improve=6.251197, (0 missing)
##   Surrogate splits:
##       credit      splits as  LR,       agree=0.658, adj=0.066, (0 split)
##       special     < 6.5  to the left,  agree=0.652, adj=0.051, (0 split)
##       size.kb     < 2.5  to the right, agree=0.644, adj=0.029, (0 split)
##       day.of.week splits as  LLLRLLL,  agree=0.639, adj=0.015, (0 split)
##       digits      < 1.5  to the left,  agree=0.636, adj=0.007, (0 split)
## 
## Node number 29: 47 observations
##   predicted class=yes  expected loss=0.0212766  P(node) =0.02705815
##     class counts:     1    46
##    probabilities: 0.021 0.979 
## 
## Node number 56: 235 observations,    complexity param=0.01198934
##   predicted class=yes  expected loss=0.4893617  P(node) =0.1352907
##     class counts:   115   120
##    probabilities: 0.489 0.511 
##   left son=112 (93 obs) right son=113 (142 obs)
##   Primary splits:
##       day.of.week splits as  RLRRLRR,  improve=7.471797, (0 missing)
##       time.of.day < 5.5  to the right, improve=6.739738, (0 missing)
##       username    splits as  LR,       improve=5.200525, (0 missing)
##       digits      < 0.5  to the left,  improve=4.889873, (0 missing)
##       size.kb     < 1.5  to the right, improve=4.558681, (0 missing)
##   Surrogate splits:
##       size.kb < 50   to the right, agree=0.613, adj=0.022, (0 split)
##       chain   splits as  RL,       agree=0.613, adj=0.022, (0 split)
##       porn    splits as  RL,       agree=0.609, adj=0.011, (0 split)
## 
## Node number 57: 136 observations
##   predicted class=yes  expected loss=0.2573529  P(node) =0.07829591
##     class counts:    35   101
##    probabilities: 0.257 0.743 
## 
## Node number 112: 93 observations,    complexity param=0.01198934
##   predicted class=no   expected loss=0.3548387  P(node) =0.05354059
##     class counts:    60    33
##    probabilities: 0.645 0.355 
##   left son=224 (70 obs) right son=225 (23 obs)
##   Primary splits:
##       time.of.day < 5.5  to the right, improve=5.40300500, (0 missing)
##       special     < 2.5  to the left,  improve=3.44218400, (0 missing)
##       size.kb     < 2.5  to the right, improve=2.82090500, (0 missing)
##       digits      < 0.5  to the left,  improve=0.69456920, (0 missing)
##       day.of.week splits as  -L--R--,  improve=0.02083655, (0 missing)
##   Surrogate splits:
##       credit  splits as  LR,       agree=0.796, adj=0.174, (0 split)
##       special < 2.5  to the left,  agree=0.785, adj=0.130, (0 split)
##       porn    splits as  LR,       agree=0.774, adj=0.087, (0 split)
## 
## Node number 113: 142 observations
##   predicted class=yes  expected loss=0.3873239  P(node) =0.08175014
##     class counts:    55    87
##    probabilities: 0.387 0.613 
## 
## Node number 224: 70 observations
##   predicted class=no   expected loss=0.2571429  P(node) =0.04029937
##     class counts:    52    18
##    probabilities: 0.743 0.257 
## 
## Node number 225: 23 observations
##   predicted class=yes  expected loss=0.3478261  P(node) =0.01324122
##     class counts:     8    15
##    probabilities: 0.348 0.652 
## 
## n= 1737 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 1737 563 no (0.67587795 0.32412205)  
##     2) box=yes 886  24 no (0.97291196 0.02708804) *
##     3) box=no 851 312 yes (0.36662750 0.63337250)  
##       6) local=yes 163   8 no (0.95092025 0.04907975) *
##       7) local=no 688 157 yes (0.22819767 0.77180233)  
##        14) large.text=no 418 151 yes (0.36124402 0.63875598)  
##          28) sucker=no 371 150 yes (0.40431267 0.59568733)  
##            56) name=name 235 115 yes (0.48936170 0.51063830)  
##             112) day.of.week=Mon,Thu 93  33 no (0.64516129 0.35483871)  
##               224) time.of.day>=5.5 70  18 no (0.74285714 0.25714286) *
##               225) time.of.day< 5.5 23   8 yes (0.34782609 0.65217391) *
##             113) day.of.week=Fri,Sat,Sun,Tue,Wed 142  55 yes (0.38732394 0.61267606) *
##            57) name=empty,single 136  35 yes (0.25735294 0.74264706) *
##          29) sucker=yes 47   1 yes (0.02127660 0.97872340) *
##        15) large.text=yes 270   6 yes (0.02222222 0.97777778) *
fancyRpartPlot(spam.tree, sub = "Decision Tree: Using The Training Dataset")

  1. Number of terminal leaves (leaf node) is 8.
  2. Total number of nodes: 15.
  3. Internal nodes(inner nodes): 7
  4. The root is box attribute with 68% yes and 32% no, thus majority of the root node is positive. 100% of the entire training set passes through the root.

Q7e:

Performance of the Decision Tree on the Test Dataset

pred = predict(spam.tree, spam.test, type = "class")

conf.matrix = confusionMatrix(pred, spam.test$spam)
conf.matrix$table
##           Reference
## Prediction  no yes
##        no  267  14
##        yes  20 133
spam.acc = conf.matrix$overall[1]
spam.acc
## Accuracy 
## 0.921659
paste0("Error rate = 1 - accuracy = 1 - ", spam.acc, " = ", round(1 - spam.acc, 5))
## [1] "Error rate = 1 - accuracy = 1 - 0.921658986175115 = 0.07834"
pred2 = prediction(as.numeric(pred), as.numeric( spam.test$spam))
auc.te = performance(pred2, "auc")
paste0("AUC = ", as.numeric(auc.te@y.values))
## [1] "AUC = 0.91753774680604"


Performance of the Decision Tree on the Training Dataset

pred.train = predict(spam.tree, spam.train, type = "class")
conf.matrix.train = confusionMatrix(pred.train, spam.train$spam)
conf.matrix.train$table
##           Reference
## Prediction   no  yes
##        no  1069   50
##        yes  105  513
spam.acc.train = conf.matrix.train$overall[1]
spam.acc.train
##  Accuracy 
## 0.9107657
paste0("Error rate = 1 - accuracy = 1 - ", spam.acc.train, " = ", round(1 - spam.acc.train, 5))
## [1] "Error rate = 1 - accuracy = 1 - 0.91076568796776 = 0.08923"
pred2 = prediction(as.numeric(pred.train), as.numeric(spam.train$spam))
auc.te = performance(pred2, "auc")
paste0("AUC = ", as.numeric(auc.te@y.values))
## [1] "AUC = 0.910876116932592"


Q7f: Pruning the Decision Tree

pruned.1 = prune(spam.tree, cp = 0.1)
fancyRpartPlot(pruned.1, sub = "Pruned Decision Tree - With Complex Parameter 0.1")


Performance of the Pruned Decision Tree on Test Dataset

pred = predict(pruned.1, spam.test, type = "class")

conf.matrix = confusionMatrix(pred, spam.test$spam)
conf.matrix$table
##           Reference
## Prediction  no yes
##        no  259   6
##        yes  28 141
spam.acc = conf.matrix$overall[1]
spam.acc
## Accuracy 
## 0.921659
paste0("Error rate = 1 - accuracy = 1 - ", spam.acc, " = ", round(1 - spam.acc, 5))
## [1] "Error rate = 1 - accuracy = 1 - 0.921658986175115 = 0.07834"
pred2 = prediction(as.numeric(pred), as.numeric(spam.test$spam))
auc.te = performance(pred2, "auc")
paste0("AUC = ", as.numeric(auc.te@y.values))
## [1] "AUC = 0.930811348929816"

Accuracy and error rate remained the same, however, AUC increased.


Performance of the Pruned Decision Tree on Training Dataset

pred.train = predict(pruned.1, spam.train, type = "class")
conf.matrix.train = confusionMatrix(pred.train, spam.train$spam)
conf.matrix.train$table
##           Reference
## Prediction   no  yes
##        no  1017   32
##        yes  157  531
spam.acc.train = conf.matrix.train$overall[1]
spam.acc.train
##  Accuracy 
## 0.8911917
paste0("Error rate = 1 - accuracy = 1 - ", spam.acc.train, " = ", round(1 - spam.acc.train, 5))
## [1] "Error rate = 1 - accuracy = 1 - 0.89119170984456 = 0.10881"
pred2 = prediction(as.numeric(pred.train), as.numeric(spam.train$spam))
auc.te = performance(pred2, "auc")
paste0("AUC = ", as.numeric(auc.te@y.values))
## [1] "AUC = 0.904715399675019"

Accuracy has reduced, as a result the error rate increased.Also, AUC reduced.
Thus, the pruned decision tree is not performing well on the training data set.


Cutting the Tree Further

pruned.2 = prune(spam.tree, cp = 0.3)
fancyRpartPlot(pruned.2, sub = "Pruned Decision Tree - With Complex Parameter 0.3")

Question 8: Music Popularity

music.data = read.csv("music.csv", header=T)

Q8a: Removing some varibles

music = subset(music.data, select = -c(songtitle, artistname, songID,
                                       artistID, timesignature_confidence, tempo_confidence, key_confidence))

normalize = function(data) 
{
  return ((data - min(data)) / (max(data) - min(data))) 
}

music = as.data.frame(lapply(music, normalize))

kable(head(music, n = 5))
year timesignature loudness tempo key energy pitch timbre_0_min timbre_0_max timbre_1_min timbre_1_max timbre_2_min timbre_2_max timbre_3_min timbre_3_max timbre_4_min timbre_4_max timbre_5_min timbre_5_max timbre_6_min timbre_6_max timbre_7_min timbre_7_max timbre_8_min timbre_8_max timbre_9_min timbre_9_max timbre_10_min timbre_10_max timbre_11_min timbre_11_max Top10
1 0.4285714 0.8727717 0.3746311 1.0000000 0.9681145 0.0443623 4.14e-05 0.8703938 0.7153206 0.3931543 0.6766215 0.2411221 0.4438659 0.5065554 0.6448221 0.2874073 0.7162544 0.5504263 0.4046581 0.3109549 0.6531844 0.3353556 0.6825367 0.3806657 0.7576586 0.4129704 0.4156798 0.1258552 0.7248566 0.1822900 0
1 0.5714286 0.8775939 0.5732460 0.9090909 0.9861960 0.0462107 0.00e+00 0.8717939 0.6478704 0.3932136 0.4876386 0.4552745 0.2431661 0.7628129 0.6128019 0.3910397 0.7953092 0.1746606 0.4166321 0.3206946 0.6773724 0.4581107 0.6230959 0.3587984 0.4485917 0.4321209 0.5298796 0.6454200 0.7671133 0.1485661 0
1 0.5714286 0.8885639 0.6570094 0.1818182 0.9913949 0.0480591 6.20e-05 0.8719494 0.6918732 0.3931015 0.7009716 0.3989601 0.6149570 0.4897756 0.5455583 0.5910034 0.7435350 0.4386697 0.4040007 0.2749548 0.6699796 0.3260446 0.6329889 0.4208143 0.6851381 0.3363030 0.5071476 0.1995160 0.7323115 0.1799131 0
1 0.5714286 0.8829875 0.3991904 0.0909091 0.9406241 0.0240296 0.00e+00 0.8786193 0.6593864 0.4729224 0.5182236 0.4377612 0.8903700 0.7408849 0.5859346 0.4635638 0.8395755 0.4381688 0.5194258 0.2649187 0.6871022 0.4066693 0.5122079 0.3922019 0.6658812 0.5684082 0.5355916 0.2655300 0.6194016 0.2899138 0
1 0.5714286 0.8626017 0.5732664 0.5454545 0.9892287 0.1164510 0.00e+00 0.8612543 0.2400185 0.3932136 0.4944161 0.4192729 0.7751513 0.7736043 0.6311931 0.3868533 0.7550011 0.1626208 0.2643276 0.3050375 0.7222566 0.4752561 0.6542392 0.3715339 0.6699038 0.3863739 0.7872832 0.1471355 0.6842820 0.2479626 0

Q8b: 10-fold CV

paste0("Below is the class distribution of the entire data set")
## [1] "Below is the class distribution of the entire data set"
prop.table(table(music$Top10))
## 
##         0         1 
## 0.8522577 0.1477423
set.seed(120634)
n.folds = 10

fold.strat = createFolds(music$Top10, k=10) #k-folds with stratification. caret package.
dt.prop.test = data.frame('false'=1:n.folds, 'true'=1:n.folds)
dt.prop.train = data.frame('false'=1:n.folds, 'true'=1:n.folds)

for(i in 1:n.folds)
{
  te.index = as.numeric(unlist(fold.strat[i]))
  test.cv = music[te.index, ]
  prop = prop.table(table(test.cv$Top10))
  dt.prop.test[i,] = prop
  
  train.cv = music[-te.index, ]
  prop = prop.table(table(train.cv$Top10))
  dt.prop.train[i,] = prop
}

r.names = c("Fold01", "Fold02", "Fold03", "Fold04", "Fold05", "Fold06", "Fold07", "Fold08", "Fold09", "Fold10")

row.names(dt.prop.test) = names(fold.strat)
row.names(dt.prop.train) = names(fold.strat)


Class distribution of the the test data per folds

kable(dt.prop.test)
false true
Fold01 0.8496042 0.1503958
Fold02 0.8575198 0.1424802
Fold03 0.8614776 0.1385224
Fold04 0.8548813 0.1451187
Fold05 0.8454425 0.1545575
Fold06 0.8612946 0.1387054
Fold07 0.8401585 0.1598415
Fold08 0.8639366 0.1360634
Fold09 0.8428005 0.1571995
Fold10 0.8454425 0.1545575

Class distribution of the training data per folds

kable(dt.prop.train)
false true
Fold01 0.8525528 0.1474472
Fold02 0.8516725 0.1483275
Fold03 0.8512324 0.1487676
Fold04 0.8519660 0.1480340
Fold05 0.8530145 0.1469855
Fold06 0.8512542 0.1487458
Fold07 0.8536013 0.1463987
Fold08 0.8509608 0.1490392
Fold09 0.8533079 0.1466921
Fold10 0.8530145 0.1469855

Q8c: KNN

set.seed(1874)
KNN.test.error = rep(NA, n.folds)
KNN.test.accu = rep(NA, n.folds)
KNN.test.auc = rep(NA, n.folds)
minErr = list(NA,NA,NA)
minAccu = list(NA,NA,NA)
minAuc = list(NA,NA,NA)
j = 1
k.values = c(3,5,9)

for (k in k.values) 
{
  for(i in 1:n.folds)
  {
    fold.index = as.numeric(unlist(fold.strat[i]))
    test_data = music[fold.index, ][, -32]
    test_class = music[fold.index, ][, 32]
    train_data = music[-fold.index, ][, -32]
    train_class = music[-fold.index, ][, 32]
    test.fit  = knn(train=train_data, test=test_data, cl=train_class, k = k)
    
    #train.fit = knn(train=train_data, train_data, cl=train_data, k = k)
    
    KNN.test.error[i] = c(1 - sum(test.fit == test_class) / nrow(test_data))
    KNN.test.accu[i] = c(sum(test.fit == test_class) / nrow(test_data))
    
    pred = prediction(as.numeric(test.fit), test_class)
    auc.te = performance(pred, "auc")
    KNN.test.auc[i] = c(as.numeric(auc.te@y.values))
  }
  
  minErr[j] = data.frame(test.error = KNN.test.error)
  minAccu[j] = data.frame(test.accu = KNN.test.accu)
  minAuc[j] = data.frame(test.auc = KNN.test.auc)
  j = j + 1
}

error.k_3 = data.frame(fold=1:10, "error" = minErr[1], "accuracy" = minAccu[1], "auc" = minAuc[1])
error.k_5 = data.frame(fold=1:10, "error" = minErr[2], "accuracy" = minAccu[2], "auc" = minAuc[2])
error.k_9 = data.frame(fold=1:10, "error" = minErr[3], "accuracy" = minAccu[3], "auc" = minAuc[3])
names(error.k_3) = c("fold", "error", "accuracy", "auc")
names(error.k_5) = c("fold", "error", "accuracy", "auc")
names(error.k_9) = c("fold", "error", "accuracy", "auc")
#mean(knn.kcv.error)


Errors, Accuracy, & AUC : k = 3

kable(error.k_3, rownames = F) 
fold error accuracy auc
1 0.1728232 0.8271768 0.5878691
2 0.1701847 0.8298153 0.6035185
3 0.1715040 0.8284960 0.5687742
4 0.1503958 0.8496042 0.6063552
5 0.1730515 0.8269485 0.5658921
6 0.1690885 0.8309115 0.5822378
7 0.1690885 0.8309115 0.6249935
8 0.1558785 0.8441215 0.5866883
9 0.1585205 0.8414795 0.6120097
10 0.1770145 0.8229855 0.5845019

paste0("Mean_Error = ", mean(error.k_3$error))
## [1] "Mean_Error = 0.166754965964106"
paste0("Mean_Accuracy = ", mean(error.k_3$accuracy))
## [1] "Mean_Accuracy = 0.833245034035894"
paste0("Mean_AUC = ", mean(error.k_3$auc))
## [1] "Mean_AUC = 0.592284032694927"


Errors, Accuracy, & AUC : k = 5

kable(error.k_5, rownames = F)
fold error accuracy auc
1 0.1715040 0.8284960 0.5597690
2 0.1583113 0.8416887 0.5795584
3 0.1437995 0.8562005 0.5688690
4 0.1424802 0.8575198 0.5921156
5 0.1532365 0.8467635 0.5741186
6 0.1519155 0.8480845 0.5722320
7 0.1466314 0.8533686 0.6283201
8 0.1321004 0.8678996 0.6004498
9 0.1532365 0.8467635 0.5843827
10 0.1571995 0.8428005 0.5997129

paste0("Mean_Error = ", mean(error.k_5$error))
## [1] "Mean_Error = 0.151041466976644"
paste0("Mean_Accuracy = ", mean(error.k_5$accuracy))
## [1] "Mean_Accuracy = 0.848958533023356"
paste0("Mean_AUC = ", mean(error.k_5$auc))
## [1] "Mean_AUC = 0.585952791648983"


Error, Accuracy, & AUC : k = 9

kable(error.k_9, rownames = F)
fold error accuracy auc
1 0.1583113 0.8416887 0.5422660
2 0.1332454 0.8667546 0.5710114
3 0.1306069 0.8693931 0.5725297
4 0.1358839 0.8641161 0.5657828
5 0.1492734 0.8507266 0.5485243
6 0.1373844 0.8626156 0.5606924
7 0.1453104 0.8546896 0.5889534
8 0.1387054 0.8612946 0.5434592
9 0.1545575 0.8454425 0.5528371
10 0.1558785 0.8441215 0.5585871

paste0("Mean_Error = ", mean(error.k_9$error))
## [1] "Mean_Error = 0.143915713673262"
paste0("Mean_Accuracy = ", mean(error.k_9$accuracy))
## [1] "Mean_Accuracy = 0.856084286326738"
paste0("Mean_AUC = ", mean(error.k_9$auc))
## [1] "Mean_AUC = 0.560464345557562"


Q8d: Decision Tree

set.seed(1294)
accu = rep(NA, n.folds)
err = rep(NA, n.folds)
auc = rep(NA, n.folds)

accu.p1 = rep(NA, n.folds)
err.p1 = rep(NA, n.folds)
auc.p1 = rep(NA, n.folds)

accu.p2 = rep(NA, n.folds)
err.p2 = rep(NA, n.folds)
auc.p2 = rep(NA, n.folds)

for(i in 1:10)
{
  fold.index = as.numeric(unlist(fold.strat[i]))
  
  test_data = music[fold.index, ]
  test_class = music[fold.index, ][, 32]
  train_data = music[-fold.index, ]
  
  tree = rpart(Top10 ~., train_data, method="class")
  fancyRpartPlot(tree, sub = paste("Decision Tree For Fold", i))
  pred = predict(tree, test_data, type = "class")

  conf.matrix = confusionMatrix(pred, test_class)
  cf.mat.tab = conf.matrix$table
  acc = conf.matrix$overall[1]
  
  values = as.numeric(pred)-1
  
  pred = prediction(as.numeric(values), test_class)
  auc.te = performance(pred, "auc")
  
  accu[i] = c(round(1-(1 - acc), 5))
  err[i] = c(round(1 - acc, 5))
  auc[i] = c(as.numeric(auc.te@y.values))
  
  #Prune with cp = 0.1
  tree.prune.1 = prune(tree, cp = 0.1)
  #fancyRpartPlot(tree.prune.1)
  
  pred = predict(tree.prune.1, test_data, type = "class")

  conf.matrix = confusionMatrix(pred, test_class)
  cf.mat.tab = conf.matrix$table
  acc = conf.matrix$overall[1]
  
  values = as.numeric(pred)-1
  
  pred = prediction(as.numeric(values), test_class)
  auc.te = performance(pred, "auc")
  
  accu.p1[i] = c(round(1-(1 - acc), 5))
  err.p1[i] = c(round(1 - acc, 5))
  auc.p1[i] = c(as.numeric(auc.te@y.values))
  
  #Prune with cp = 0.3
  tree.prune.2 = prune(tree, cp = 0.03)
  #fancyRpartPlot(tree.prune.2, sub = paste("cp=0.3 - Pruned Decision Tree -  Fold", i))
  
  pred = predict(tree.prune.2, test_data, type = "class")

  conf.matrix = confusionMatrix(pred, test_class)
  cf.mat.tab = conf.matrix$table
  acc = conf.matrix$overall[1]
  
  values = as.numeric(pred)-1
  
  pred = prediction(as.numeric(values), test_class)
  auc.te = performance(pred, "auc")
  
  accu.p2[i] = c(round(1-(1 - acc), 5))
  err.p2[i] = c(round(1 - acc, 5))
  auc.p2[i] = c(as.numeric(auc.te@y.values))
}


Error, Accuracy, & AUC Without Pruning

result = data.frame(fold=1:10, "error" = err, "accuracy" = accu, "auc" = auc)
kable(result)
fold error accuracy auc
1 0.14248 0.85752 0.5804593
2 0.13720 0.86280 0.5725641
3 0.14512 0.85488 0.5601108
4 0.12665 0.87335 0.5825056
5 0.15456 0.84544 0.5768296
6 0.13474 0.86526 0.5422509
7 0.13871 0.86129 0.5928842
8 0.13738 0.86262 0.5278644
9 0.16380 0.83620 0.5405153
10 0.14663 0.85337 0.5954861
names(result) = c("fold", "error", "accuracy", "auc")

paste0("Mean_Error = ", mean(result$error))
## [1] "Mean_Error = 0.142727"
paste0("Mean_Accuracy = ", mean(result$accuracy))
## [1] "Mean_Accuracy = 0.857273"
paste0("Mean_AUC = ", mean(result$auc))
## [1] "Mean_AUC = 0.567147039875754"


Error, Accuracy, & AUC With Pruning cp = 0.1

result = data.frame(fold=1:10, "error" = err.p1, "accuracy" = accu.p1, "auc" = auc.p1)
kable(result)
fold error accuracy auc
1 0.15040 0.84960 0.5
2 0.14248 0.85752 0.5
3 0.13852 0.86148 0.5
4 0.14512 0.85488 0.5
5 0.15456 0.84544 0.5
6 0.13871 0.86129 0.5
7 0.15984 0.84016 0.5
8 0.13606 0.86394 0.5
9 0.15720 0.84280 0.5
10 0.15456 0.84544 0.5
names(result) = c("fold", "error", "accuracy", "auc")

paste0("Mean_Error = ", mean(result$error))
## [1] "Mean_Error = 0.147745"
paste0("Mean_Accuracy = ", mean(result$accuracy))
## [1] "Mean_Accuracy = 0.852255"
paste0("Mean_AUC = ", mean(result$auc))
## [1] "Mean_AUC = 0.5"


Error, Accuracy, & AUC With Pruning cp = 0.3

result = data.frame(fold=1:10, "error" = err.p2, "accuracy" = accu.p2, "auc" = auc.p2)
kable(result)
fold error accuracy auc
1 0.15040 0.84960 0.5
2 0.14248 0.85752 0.5
3 0.13852 0.86148 0.5
4 0.14512 0.85488 0.5
5 0.15456 0.84544 0.5
6 0.13871 0.86129 0.5
7 0.15984 0.84016 0.5
8 0.13606 0.86394 0.5
9 0.15720 0.84280 0.5
10 0.15456 0.84544 0.5
names(result) = c("fold", "error", "accuracy", "auc")

paste0("Mean_Error = ", mean(result$error))
## [1] "Mean_Error = 0.147745"
paste0("Mean_Accuracy = ", mean(result$accuracy))
## [1] "Mean_Accuracy = 0.852255"
paste0("Mean_AUC = ", mean(result$auc))
## [1] "Mean_AUC = 0.5"


Q8e: Naive Bayes

set.seed(123694)
accu = rep(NA, n.folds)
err = rep(NA, n.folds)
auc = rep(NA, n.folds)

for(i in 1:10)
{
  fold.index = as.numeric(unlist(fold.strat[i]))
  
  test_data = music[fold.index, ]
  test_data$Top10 = as.factor(test_data$Top10)
  test_class = music[fold.index, ][, 32]
  train_data = music[-fold.index, ]
  train_data$Top10 = as.factor(train_data$Top10)
  train_class = music[-fold.index, ][, 32]
  
  #nb.model = naiveBayes(train_data, train_class)
  library(klaR)
  nb.model = NaiveBayes(Top10 ~., data = train_data)
  pred = predict(nb.model, test_data)
  
  error = 1 - sum(pred$class == test_class) / nrow(test_data)
  acc = 1 - error
  
  values = as.numeric(pred$class)-1
  
  pred = prediction(as.numeric(values), test_class)
  auc.te = performance(pred, "auc")
  
  accu[i] = c(acc)
  err[i] = c(error)
  auc[i] = c(as.numeric(auc.te@y.values))
}


Error, Accuracy, & AUC

result = data.frame(fold=1:10, "error" = err, "accuracy" = accu, "auc" = auc)
kable(result)
fold error accuracy auc
1 0.2585752 0.7414248 0.6709573
2 0.2770449 0.7229551 0.6879060
3 0.2532982 0.7467018 0.7051265
4 0.2401055 0.7598945 0.7086139
5 0.2589168 0.7410832 0.6757545
6 0.2496697 0.7503303 0.6912650
7 0.2721268 0.7278732 0.7142458
8 0.2708058 0.7291942 0.6714988
9 0.2721268 0.7278732 0.6779129
10 0.2708058 0.7291942 0.6722155
names(result) = c("fold", "error", "accuracy", "auc")

paste0("Mean_Error = ", mean(result$error))
## [1] "Mean_Error = 0.26234755300572"
paste0("Mean_Accuracy = ", mean(result$accuracy))
## [1] "Mean_Accuracy = 0.73765244699428"
paste0("Mean_AUC = ", mean(result$auc))
## [1] "Mean_AUC = 0.687549619294451"


Q8f: Random Forest

set.seed(1429461)
accu = rep(NA, n.folds)
err = rep(NA, n.folds)
auc = rep(NA, n.folds)

for(i in 1:10)
{
  fold.index = as.numeric(unlist(fold.strat[i]))
  test_data = music[fold.index, ][, -32]
  test_class = music[fold.index, ][, 32]
  train_data = music[-fold.index, ][, -32]
  train_class = music[-fold.index, ][, 32]

  rf.model = randomForest(train_data, as.factor(as.numeric(train_class)))
  pred = predict(rf.model, test_data)

  conf.matrix = confusionMatrix(pred, test_class)
  cf.mat.tab = conf.matrix$table
  acc = conf.matrix$overall[1]
  
  values = as.numeric(pred) - 1
  
  pred.2 = prediction(as.numeric(values), test_class)
  auc.te = performance(pred.2, "auc")
  
  accu[i] = c(round(1-(1 - acc), 5))
  err[i] = c(round(1 - acc, 5))
  auc[i] = c(as.numeric(auc.te@y.values))
}


Error, Accuracy, & AUC

result = data.frame(fold=1:10, "error" = err, "accuracy" = accu, "auc" = auc)
kable(result)
fold error accuracy auc
1 0.13588 0.86412 0.5590743
2 0.13193 0.86807 0.5524786
3 0.12005 0.87995 0.5826515
4 0.13193 0.86807 0.5567761
5 0.14927 0.85073 0.5345553
6 0.12285 0.87715 0.5731230
7 0.13738 0.86262 0.5802861
8 0.12682 0.87318 0.5544298
9 0.13871 0.86129 0.5656595
10 0.14267 0.85733 0.5489383
names(result) = c("fold", "error", "accuracy", "auc")

paste0("Mean_Error = ", mean(result$error))
## [1] "Mean_Error = 0.133749"
paste0("Mean_Accuracy = ", mean(result$accuracy))
## [1] "Mean_Accuracy = 0.866251"
paste0("Mean_AUC = ", mean(result$auc))
## [1] "Mean_AUC = 0.560797255021492"


Q8g: Nested CV

Part (a) KNN - Determining the best value of k (Nested CV)

set.seed(18740)
KNN.test.error = rep(NA, n.folds)
KNN.test.accu = rep(NA, n.folds)
KNN.test.auc = rep(NA, n.folds)
knn.test.bestK = rep(NA, n.folds)

KNN.test.accu.in = rep(NA, n.folds-1)
k.values = c(1,3,5,7,9)

for(i in 1:n.folds)
{
  fold.index = as.numeric(unlist(fold.strat[i]))
  test_data = music[fold.index, ][, -32]
  test_class = music[fold.index, ][, 32]
  new.data = music[-fold.index, ]
  train_data = new.data[, -32]
  train_class = music[-fold.index, ][, 32]

  #internal CV on the: for selecting the best value of k
  #creating the internal folds
  fold.in = createFolds(new.data$Top10, k = n.folds - 1)
  acc.in = data.frame("kValue" = k.values, "accuracyMean" = NA)
  w = 1
  
  for(kk in k.values)
  {
    for(jj in 1:9)
    {
      fold.index.in = as.numeric(unlist(fold.in[jj]))
      test_data.in = new.data[fold.index.in, ][, -32]
      test_class.in = new.data[fold.index.in, ][, 32]
      train_data.in = new.data[-fold.index.in, ][, -32]
      train_class.in = new.data[-fold.index.in, ][, 32]
      
      test.fit.in  = knn(train=train_data.in, test=test_data.in, cl=train_class.in, k = kk)
      KNN.test.accu.in[jj] = c(sum(test.fit.in == test_class.in) / nrow(test_data.in))
    }
    
    acc.in$accuracyMean[w] = mean(KNN.test.accu.in)
    w = w+1
  }
  
  #now: picking the best k
  k.meanAve = acc.in[order(-acc.in$accuracyMean), ] 
  best.k = k.meanAve$kValue[1]
  knn.test.bestK[i] = best.k
  
  test.fit  = knn(train=train_data, test=test_data, cl=train_class, k = best.k)
  
  KNN.test.error[i] = c(1 - sum(test.fit == test_class) / nrow(test_data))
  KNN.test.accu[i] = c(sum(test.fit == test_class) / nrow(test_data))
    
  pred = prediction(as.numeric(test.fit), test_class)
  auc.te = performance(pred, "auc")
  KNN.test.auc[i] = c(as.numeric(auc.te@y.values))
}

detail.k_1 = data.frame(fold=1:10, "error" = KNN.test.error, 
                        "accuracy" = KNN.test.accu, "auc" = KNN.test.auc, "best_K_used" = knn.test.bestK)

Errors, Accuracy, & AUC - (Having used nested CV to pick best value for K)

kable(detail.k_1, row.names = F)
fold error accuracy auc best_K_used
1 0.1583113 0.8416887 0.5422660 9
2 0.1345646 0.8654354 0.5702422 9
3 0.1306069 0.8693931 0.5725297 9
4 0.1358839 0.8641161 0.5808782 7
5 0.1492734 0.8507266 0.5485243 9
6 0.1373844 0.8626156 0.5606924 9
7 0.1453104 0.8546896 0.5889534 9
8 0.1387054 0.8612946 0.5434592 9
9 0.1545575 0.8454425 0.5528371 9
10 0.1558785 0.8441215 0.5585871 9


ave.detail.k = data.frame(mean_error = mean(detail.k_1$error),
                          mean_accuracy = mean(detail.k_1$accuracy), mean_auc = mean(detail.k_1$auc))
kable(ave.detail.k, row.names = F)
mean_error mean_accuracy mean_auc
0.1440476 0.8559524 0.561897