Question 1

prediction_classifier <- function(predicted, actual)
{
  #Sort the predicted values in decreasing order
  predicted <- sort(predicted, decreasing = TRUE)
 
  #Initialize Variables
  L <- length(predicted); #length of predicted = actual
  P <- 0;
  N <- 0; 
  TP <- 0; 
  TN <- 0; 
  FP <- 0; 
  FN <- 0; 
  TPR <- 0; 
  FPR <- 0; 
  Prec <- 0; 
  Acc <- 0; 
  Err <- 0; 
  tpn <- 0; 
  fpn <- 0; 
  tfp <- 0;
  TP_V <- vector(mode = "numeric", length = 0); 
  FP_V <- vector(mode = "numeric", length = 0); 
  TN_V <- vector(mode = "numeric", length = 0); 
  FN_V <- vector(mode = "numeric", length = 0); 
  TPR_V <- vector(mode = "numeric", length = 0);
  TNR_V <- vector(mode = "numeric", length = 0); 
  Prec_V <- vector(mode = "numeric", length = 0); 
  Acc_V <- vector(mode = "numeric", length = 0); 
  Err_V <- vector(mode = "numeric", length = 0);
 
  #P, N, TP, FP  
  for(i in 1:L)
  { 
    if(actual[i] == 1)
    {
      P <- P + 1;
      TP <- TP + 1;
    }
    else
    {
      N <- N + 1;
      FP <- FP + 1;
    }
    
    TP_V <- c(TP_V, TP)
    FP_V <- c(FP_V, FP)
  }
  
  d <- P + N; 
  
  #TN, FN  
  for(i in 1:L)
  {
    TN <- N - FP_V[i]
    FN <- P - TP_V[i]
    TN_V <- c(TN_V, TN)
    FN_V <- c(FN_V, FN)
  }

  #TPR (sensitivity, recall), TNR (specificity)
  for(i in 1:L)
  {
    TPR <- TP_V[i]/P
    TNR <- TN_V[i]/N
    TPR_V <- c(TPR_V, TPR)
    TNR_V <- c(TNR_V, TNR)
  }

  #Prec
  for(i in 1:L)
  {
    tfp <- TP_V[i] + FP_V[i]
    Prec <- TP_V[i]/tfp
    Prec_V <- c(Prec_V, Prec)
  }
  
  #Acc
  for(i in 1:L)
  {
    tpn <- TP_V[i] + TN_V[i]
    Acc <- tpn/d
    Acc_V <- c(Acc_V, Acc)
  }
  
  #Err
  for(i in 1:L)
  {
    fpn <- FP_V[i] + FN_V[i]
    Err <- fpn/d
    Err_V <- c(Err_V, Err)
  }

  #Show results in Data Frame
  df <- data.frame(actual, predicted, TP_V, FP_V, TN_V, FN_V, TPR_V,
            TNR_V, Prec_V, Acc_V, Err_V);
}

Question 2

Ytrue <- c(1,0,1,1,0,0,1,0,1,0)
Ypred <- c(0.98,0.92,0.85,0.77,0.71,0.64,0.50,0.39,0.34,0.31)

#using function from Problem 1
result <- prediction_classifier(Ypred, Ytrue)

#Use TNR (specificity) to find FPR (1 - specificity)
FPR <- 0
FPR_V <- vector(mode = "numeric", length = 0); 

for(i in 1:length(result$TNR_V))
{
  FPR <- 1 - result$TNR_V[i]
  FPR_V <- c(FPR_V, FPR)
}

#Show TPR, FPR, and Acc
problem2 <- result[,c("TPR_V","Acc_V")]
problem2$FPR_V <- FPR_V

knitr::kable(problem2, format = "html")

TPR_V	Acc_V	FPR_V
0.2	0.6	0.0
0.2	0.5	0.2
0.4	0.6	0.2
0.6	0.7	0.2
0.6	0.6	0.4
0.6	0.5	0.6
0.8	0.6	0.6
0.8	0.5	0.8
1.0	0.6	0.8
1.0	0.5	1.0

Question 3

plot(problem2$FPR_V, problem2$TPR_V, type = "o", xlab = "False Positive
     Rate", ylab = "True Positive Rate", main="ROC Curve")

Question 4

M1 <- c(30.5, 32.2, 20.7, 20.6, 31.0, 41.0, 27.7, 26.0, 21.5, 26.0)
M2 <- c(22.4, 14.5, 22.4, 19.6, 20.7, 20.4, 22.1, 19.4, 16.2, 35.0)

#1% significance
t.test(M1, M2, conf.level = .99, alternative = "two.sided")

## 
##  Welch Two Sample t-test
## 
## data:  M1 and M2
## t = 2.4376, df = 17.649, p-value = 0.02561
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
##  -1.184208 14.084208
## sample estimates:
## mean of x mean of y 
##     27.72     21.27

The t-value falls within the confidence interval meaning there is not enough evidence to suggest that the means are different. Therefore, we can not state which model is better.

Question 5a

#GINI index calculation
GINIcalc <- function(numData)
{
  GINI_N <- 1 -((numData[1]/sum(numData))^2)-
    ((numData[2]/sum(numData))^2)- ((numData[3]/sum(numData))^2)
  print(GINI_N)
}

#Gain in GINI index for 2 splits
GAIN_GINI <- function(N, N1, N2)
{
  GAIN_N <- GINIcalc(N) - sum((length(N1)*GINIcalc(N1)),length(N2)*GINIcalc(N2))/(length(N1)+length(N2))
  print(GAIN_N)
}

#Gain in GINI index for 3 splits
GAIN_GINI_3 <- function(N, N1, N2, N3)
{
  GAIN_N_3 <- GINIcalc(N) - sum((length(N1)*GINIcalc(N1)),length(N2)*GINIcalc(N2)
                  ,length(N3)*GINIcalc(N3))/(length(N1)+length(N2)+length(N3))
  print(GAIN_N_3)
}

#Problem 5 Data set
N <- c(100,50,60)
N11 <- c(62,8,0)
N12 <- c(38,42,60)
N21 <- c(65,20,0)
N22 <- c(21,19,20)
N23 <- c(14,11,40)

#First Split GINI Index Gain: 
GAIN_GINI(N,N11,N12)

## [1] 0.6349206
## [1] 0.202449
## [1] 0.6526531
## [1] 0.2073696

#Second Split GINI Index Gain: 
GAIN_GINI_3(N, N21, N22, N23)

## [1] 0.6349206
## [1] 0.3598616
## [1] 0.6661111
## [1] 0.5462722
## [1] 0.110839

Question 5b

The first split gives the largest gain, and the node N12 has the highest GINI index.

Question 5c

#Entropy Calculation
EntropyCalc <- function(N)
{
  entropy <- 0
  calc <- TRUE
  
  #test to see if calculation possible
  for(i in 1:length(N))
  {
     if(N[i] == 0)
     {
       #print("There is a '0' element. This calculation requires log(0) which is undefined.")
       calc <- FALSE
     }
  }
  
  #entropy calculation
  if(calc == TRUE)
  {
    for(i in 1:length(N))
    { 
       entropy = entropy -N[i]/sum(N)*log(N[i]/sum(N))
    }
    
    print(entropy)
  }
}

#Gain in Entropy for 2 splits
GAIN_entropy <- function(N, N1, N2)
{
  GAIN_E <- EntropyCalc(N) - sum((length(N1)*EntropyCalc(N1)),
                      length(N2)*EntropyCalc(N2))/(length(N1)+length(N2))
  print(GAIN_E)
}

#Gain in Entropy for 2 splits
GAIN_entropy_3 <- function(N, N1, N2, N3){
  GAIN_E_3 <- EntropyCalc(N) - sum((length(N1)*EntropyCalc(N1)),length(N2)*EntropyCalc(N2),
                                   length(N3)*EntropyCalc(N3))/(length(N1)+length(N2)+length(N3))
  print(GAIN_E_3)
}

#First Split Entropy Gain: 0.5137838
GAIN_entropy(N,N11,N12)

## [1] 1.052923
## [1] 1.078278
## [1] 0.5137838

#Second Split Entropy Gain: 0.3769638
GAIN_entropy_3(N,N21,N22,N23)

## [1] 1.052923
## [1] 1.097779
## [1] 0.9300976
## [1] 0.3769638

Question 5d

The first split and the second node have the highest gain and entropy, respectively.

Question 6: Social Media Usage

social = read.csv("syp-16-data.csv", header=T)
adult = subset(social, Adult == "Adult")
hs = subset(social, Adult == "HS")

adult.sum = as.data.frame(colSums(adult[, -20]))
names(adult.sum) = "adult"

hs.sum = as.data.frame(colSums(hs[, -20]))
names(hs.sum) = "hs"

sum = cbind(adult.sum, hs.sum)
sum = t(sum)

Q6a: Grouped Bar Plots: Side-by-Side Barplot

par(mfrow=c(10, 2))

for(i in 1:19)
{
  par = names(social[i])
  gbp = as.data.frame(social[, c(i,20)])
  gbp.adult = subset(gbp, Adult=="Adult")
  gbp.adult.yes = nrow(subset(gbp.adult, gbp.adult[,1] == 1))
  gbp.adult.no = nrow(subset(gbp.adult, gbp.adult[,1] == 0))
  
  gbp.hs = subset(gbp, Adult=="HS")
  gbp.hs.yes = nrow(subset(gbp.hs, gbp.hs[,1] == 1))
  gbp.hs.no = nrow(subset(gbp.hs, gbp.hs[,1] == 0))
  
  yes = data.frame("Adult" = gbp.adult.yes, "HS" = gbp.hs.yes)
  no = data.frame("Adult" = gbp.adult.no, "HS" = gbp.hs.no)
  
  gbp.final = rbind(yes, no)
  rownames(gbp.final) = c("yes", "no")
  gbp.final = as.matrix(gbp.final)
  gbp.final = t(gbp.final)
  
  barplot(gbp.final,col=c("orange","blue"), beside=T, main = paste0(par))
  legend("top", col = c("orange","blue"), legend = c("Adult", "HS"), lwd = 5, lty = 1, bty = 'n', cex = 1)
}

barplot(sum, col=c("orange","blue"), cex.names=0.5, beside=T, legend.text = c("Adult", "HS"), 
        main = "Side-by-Side Barplot of Aggregated Respondents by Age")

Q6b:

Snapchat should be at the root of the decision tree because it seems to be the purest with the least entropy.

Q6c: Decision Tree

d.tree = rpart(Adult ~., data = social, method = "class")
fancyRpartPlot(d.tree, sub = "Decision Tree: Using All Data Points")

The variable at the root matches the intuition from part b above.

Question 7: Spam Email

spam.main = read.csv("spam.csv", header=T)

Q7a: Removing some varibles

spam = subset(spam.main, select = -c(id, isuid, domain, spampct, category, cappct))

kable(head(spam, n = 5))

day.of.week	time.of.day	size.kb	box	local	digits	name	special	credit	sucker	porn	chain	username	large.text	spam
Thu	0	7	no	no	0	name	1	no	no	no	no	no	no	no
Thu	0	2	no	no	0	name	5	no	no	no	no	no	no	yes
Thu	14	3	no	yes	0	name	2	no	no	no	yes	no	no	no
Thu	3	3	yes	no	0	name	0	no	no	no	no	no	no	no
Thu	3	4	no	no	0	name	2	no	no	no	no	no	no	no

Q7b: Spliting Data into Training and Test Data

set.seed(143498)
row.spam = nrow(spam)
sample.count = round(0.2 * row.spam, 0)
sample.spam = sample(1:row.spam, sample.count) 

spam.train = spam[-sample.spam, ]
spam.test = spam[sample.spam, ]

Training Dataset

kable(head(spam.train, n = 5))

	day.of.week	time.of.day	size.kb	box	local	digits	name	special	credit	sucker	porn	chain	username	large.text	spam
1	Thu	0	7	no	no	0	name	1	no	no	no	no	no	no	no
2	Thu	0	2	no	no	0	name	5	no	no	no	no	no	no	yes
3	Thu	14	3	no	yes	0	name	2	no	no	no	yes	no	no	no
5	Thu	3	4	no	no	0	name	2	no	no	no	no	no	no	no
6	Thu	4	4	no	no	0	name	1	no	no	no	no	no	no	no
<br /	>

Test Dataset

kable(head(spam.test, n = 5))

	day.of.week	time.of.day	size.kb	box	local	digits	name	special	credit	sucker	porn	chain	username	large.text	spam
1951	Tue	15	1	no	yes	0	name	0	no	no	no	no	no	no	no
554	Fri	8	2	yes	yes	0	name	7	no	no	no	no	no	no	no
1076	Wed	23	2	yes	yes	0	name	1	no	no	no	no	no	no	no
1422	Wed	9	1	no	yes	0	name	1	no	no	no	no	no	no	no
2144	Wed	13	2	yes	yes	0	name	1	no	no	no	no	no	no	no

Q7c: Classification Tree

spam.tree = rpart(spam ~., data = spam.train, method = "class")
spam.tree

## n= 1737 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 1737 563 no (0.67587795 0.32412205)  
##     2) box=yes 886  24 no (0.97291196 0.02708804) *
##     3) box=no 851 312 yes (0.36662750 0.63337250)  
##       6) local=yes 163   8 no (0.95092025 0.04907975) *
##       7) local=no 688 157 yes (0.22819767 0.77180233)  
##        14) large.text=no 418 151 yes (0.36124402 0.63875598)  
##          28) sucker=no 371 150 yes (0.40431267 0.59568733)  
##            56) name=name 235 115 yes (0.48936170 0.51063830)  
##             112) day.of.week=Mon,Thu 93  33 no (0.64516129 0.35483871)  
##               224) time.of.day>=5.5 70  18 no (0.74285714 0.25714286) *
##               225) time.of.day< 5.5 23   8 yes (0.34782609 0.65217391) *
##             113) day.of.week=Fri,Sat,Sun,Tue,Wed 142  55 yes (0.38732394 0.61267606) *
##            57) name=empty,single 136  35 yes (0.25735294 0.74264706) *
##          29) sucker=yes 47   1 yes (0.02127660 0.97872340) *
##        15) large.text=yes 270   6 yes (0.02222222 0.97777778) *

Q7d: Analysing the Decision Tree

print(summary(spam.tree))

## Call:
## rpart(formula = spam ~ ., data = spam.train, method = "class")
##   n= 1737 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.40319716      0 1.0000000 1.0000000 0.03464814
## 2 0.26110124      1 0.5968028 0.5968028 0.02924020
## 3 0.01198934      2 0.3357016 0.3357016 0.02305197
## 4 0.01000000      7 0.2753108 0.3303730 0.02289044
## 
## Variable importance
##         box       local  large.text     size.kb      sucker        name 
##          32          29          12           8           8           7 
##       chain day.of.week time.of.day 
##           2           1           1 
## 
## Node number 1: 1737 observations,    complexity param=0.4031972
##   predicted class=no   expected loss=0.324122  P(node) =1
##     class counts:  1174   563
##    probabilities: 0.676 0.324 
##   left son=2 (886 obs) right son=3 (851 obs)
##   Primary splits:
##       box        splits as  RL,       improve=319.1144, (0 missing)
##       local      splits as  RL,       improve=263.5342, (0 missing)
##       large.text splits as  LR,       improve=234.1405, (0 missing)
##       sucker     splits as  LR,       improve=156.2093, (0 missing)
##       digits     < 0.5  to the left,  improve=100.8942, (0 missing)
##   Surrogate splits:
##       local      splits as  RL,       agree=0.740, adj=0.469, (0 split)
##       large.text splits as  LR,       agree=0.637, adj=0.260, (0 split)
##       size.kb    < 3.5  to the left,  agree=0.622, adj=0.228, (0 split)
##       sucker     splits as  LR,       agree=0.599, adj=0.182, (0 split)
##       name       splits as  RLR,      agree=0.598, adj=0.180, (0 split)
## 
## Node number 2: 886 observations
##   predicted class=no   expected loss=0.02708804  P(node) =0.5100748
##     class counts:   862    24
##    probabilities: 0.973 0.027 
## 
## Node number 3: 851 observations,    complexity param=0.2611012
##   predicted class=yes  expected loss=0.3666275  P(node) =0.4899252
##     class counts:   312   539
##    probabilities: 0.367 0.633 
##   left son=6 (163 obs) right son=7 (688 obs)
##   Primary splits:
##       local      splits as  RL,       improve=137.66380, (0 missing)
##       large.text splits as  LR,       improve= 93.81813, (0 missing)
##       sucker     splits as  LR,       improve= 52.26650, (0 missing)
##       name       splits as  RLR,      improve= 37.45028, (0 missing)
##       size.kb    < 5.5  to the left,  improve= 32.19858, (0 missing)
##   Surrogate splits:
##       chain   splits as  RL,       agree=0.831, adj=0.117, (0 split)
##       size.kb < 75.5 to the right, agree=0.810, adj=0.006, (0 split)
## 
## Node number 6: 163 observations
##   predicted class=no   expected loss=0.04907975  P(node) =0.09383995
##     class counts:   155     8
##    probabilities: 0.951 0.049 
## 
## Node number 7: 688 observations,    complexity param=0.01198934
##   predicted class=yes  expected loss=0.2281977  P(node) =0.3960852
##     class counts:   157   531
##    probabilities: 0.228 0.772 
##   left son=14 (418 obs) right son=15 (270 obs)
##   Primary splits:
##       large.text  splits as  LR,       improve=37.708290, (0 missing)
##       sucker      splits as  LR,       improve=19.745120, (0 missing)
##       size.kb     < 5.5  to the left,  improve= 9.249632, (0 missing)
##       day.of.week splits as  RLRRLLL,  improve= 8.219463, (0 missing)
##       name        splits as  LLR,      improve= 8.072838, (0 missing)
##   Surrogate splits:
##       sucker      splits as  LR,       agree=0.706, adj=0.252, (0 split)
##       size.kb     < 9.5  to the left,  agree=0.692, adj=0.215, (0 split)
##       digits      < 7.5  to the left,  agree=0.644, adj=0.093, (0 split)
##       time.of.day < 21.5 to the left,  agree=0.618, adj=0.026, (0 split)
##       special     < 6.5  to the left,  agree=0.618, adj=0.026, (0 split)
## 
## Node number 14: 418 observations,    complexity param=0.01198934
##   predicted class=yes  expected loss=0.361244  P(node) =0.2406448
##     class counts:   151   267
##    probabilities: 0.361 0.639 
##   left son=28 (371 obs) right son=29 (47 obs)
##   Primary splits:
##       sucker      splits as  LR,       improve=12.240660, (0 missing)
##       name        splits as  RLR,      improve=11.158850, (0 missing)
##       day.of.week splits as  RLRRLLL,  improve=10.519770, (0 missing)
##       username    splits as  LR,       improve= 7.547117, (0 missing)
##       digits      < 0.5  to the left,  improve= 6.553703, (0 missing)
## 
## Node number 15: 270 observations
##   predicted class=yes  expected loss=0.02222222  P(node) =0.1554404
##     class counts:     6   264
##    probabilities: 0.022 0.978 
## 
## Node number 28: 371 observations,    complexity param=0.01198934
##   predicted class=yes  expected loss=0.4043127  P(node) =0.2135866
##     class counts:   150   221
##    probabilities: 0.404 0.596 
##   left son=56 (235 obs) right son=57 (136 obs)
##   Primary splits:
##       name        splits as  RLR,      improve=9.274097, (0 missing)
##       day.of.week splits as  LLRRLLL,  improve=9.153530, (0 missing)
##       username    splits as  LR,       improve=8.229523, (0 missing)
##       digits      < 0.5  to the left,  improve=8.170205, (0 missing)
##       time.of.day < 5.5  to the right, improve=6.251197, (0 missing)
##   Surrogate splits:
##       credit      splits as  LR,       agree=0.658, adj=0.066, (0 split)
##       special     < 6.5  to the left,  agree=0.652, adj=0.051, (0 split)
##       size.kb     < 2.5  to the right, agree=0.644, adj=0.029, (0 split)
##       day.of.week splits as  LLLRLLL,  agree=0.639, adj=0.015, (0 split)
##       digits      < 1.5  to the left,  agree=0.636, adj=0.007, (0 split)
## 
## Node number 29: 47 observations
##   predicted class=yes  expected loss=0.0212766  P(node) =0.02705815
##     class counts:     1    46
##    probabilities: 0.021 0.979 
## 
## Node number 56: 235 observations,    complexity param=0.01198934
##   predicted class=yes  expected loss=0.4893617  P(node) =0.1352907
##     class counts:   115   120
##    probabilities: 0.489 0.511 
##   left son=112 (93 obs) right son=113 (142 obs)
##   Primary splits:
##       day.of.week splits as  RLRRLRR,  improve=7.471797, (0 missing)
##       time.of.day < 5.5  to the right, improve=6.739738, (0 missing)
##       username    splits as  LR,       improve=5.200525, (0 missing)
##       digits      < 0.5  to the left,  improve=4.889873, (0 missing)
##       size.kb     < 1.5  to the right, improve=4.558681, (0 missing)
##   Surrogate splits:
##       size.kb < 50   to the right, agree=0.613, adj=0.022, (0 split)
##       chain   splits as  RL,       agree=0.613, adj=0.022, (0 split)
##       porn    splits as  RL,       agree=0.609, adj=0.011, (0 split)
## 
## Node number 57: 136 observations
##   predicted class=yes  expected loss=0.2573529  P(node) =0.07829591
##     class counts:    35   101
##    probabilities: 0.257 0.743 
## 
## Node number 112: 93 observations,    complexity param=0.01198934
##   predicted class=no   expected loss=0.3548387  P(node) =0.05354059
##     class counts:    60    33
##    probabilities: 0.645 0.355 
##   left son=224 (70 obs) right son=225 (23 obs)
##   Primary splits:
##       time.of.day < 5.5  to the right, improve=5.40300500, (0 missing)
##       special     < 2.5  to the left,  improve=3.44218400, (0 missing)
##       size.kb     < 2.5  to the right, improve=2.82090500, (0 missing)
##       digits      < 0.5  to the left,  improve=0.69456920, (0 missing)
##       day.of.week splits as  -L--R--,  improve=0.02083655, (0 missing)
##   Surrogate splits:
##       credit  splits as  LR,       agree=0.796, adj=0.174, (0 split)
##       special < 2.5  to the left,  agree=0.785, adj=0.130, (0 split)
##       porn    splits as  LR,       agree=0.774, adj=0.087, (0 split)
## 
## Node number 113: 142 observations
##   predicted class=yes  expected loss=0.3873239  P(node) =0.08175014
##     class counts:    55    87
##    probabilities: 0.387 0.613 
## 
## Node number 224: 70 observations
##   predicted class=no   expected loss=0.2571429  P(node) =0.04029937
##     class counts:    52    18
##    probabilities: 0.743 0.257 
## 
## Node number 225: 23 observations
##   predicted class=yes  expected loss=0.3478261  P(node) =0.01324122
##     class counts:     8    15
##    probabilities: 0.348 0.652 
## 
## n= 1737 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 1737 563 no (0.67587795 0.32412205)  
##     2) box=yes 886  24 no (0.97291196 0.02708804) *
##     3) box=no 851 312 yes (0.36662750 0.63337250)  
##       6) local=yes 163   8 no (0.95092025 0.04907975) *
##       7) local=no 688 157 yes (0.22819767 0.77180233)  
##        14) large.text=no 418 151 yes (0.36124402 0.63875598)  
##          28) sucker=no 371 150 yes (0.40431267 0.59568733)  
##            56) name=name 235 115 yes (0.48936170 0.51063830)  
##             112) day.of.week=Mon,Thu 93  33 no (0.64516129 0.35483871)  
##               224) time.of.day>=5.5 70  18 no (0.74285714 0.25714286) *
##               225) time.of.day< 5.5 23   8 yes (0.34782609 0.65217391) *
##             113) day.of.week=Fri,Sat,Sun,Tue,Wed 142  55 yes (0.38732394 0.61267606) *
##            57) name=empty,single 136  35 yes (0.25735294 0.74264706) *
##          29) sucker=yes 47   1 yes (0.02127660 0.97872340) *
##        15) large.text=yes 270   6 yes (0.02222222 0.97777778) *

fancyRpartPlot(spam.tree, sub = "Decision Tree: Using The Training Dataset")

Number of terminal leaves (leaf node) is 8.
Total number of nodes: 15.
Internal nodes(inner nodes): 7
The root is box attribute with 68% yes and 32% no, thus majority of the root node is positive. 100% of the entire training set passes through the root.

Q7e:

Performance of the Decision Tree on the Test Dataset

pred = predict(spam.tree, spam.test, type = "class")

conf.matrix = confusionMatrix(pred, spam.test$spam)
conf.matrix$table

##           Reference
## Prediction  no yes
##        no  267  14
##        yes  20 133

spam.acc = conf.matrix$overall[1]
spam.acc

## Accuracy 
## 0.921659

paste0("Error rate = 1 - accuracy = 1 - ", spam.acc, " = ", round(1 - spam.acc, 5))

## [1] "Error rate = 1 - accuracy = 1 - 0.921658986175115 = 0.07834"

pred2 = prediction(as.numeric(pred), as.numeric( spam.test$spam))
auc.te = performance(pred2, "auc")
paste0("AUC = ", as.numeric(auc.te@y.values))

## [1] "AUC = 0.91753774680604"

Performance of the Decision Tree on the Training Dataset

pred.train = predict(spam.tree, spam.train, type = "class")
conf.matrix.train = confusionMatrix(pred.train, spam.train$spam)
conf.matrix.train$table

##           Reference
## Prediction   no  yes
##        no  1069   50
##        yes  105  513

spam.acc.train = conf.matrix.train$overall[1]
spam.acc.train

##  Accuracy 
## 0.9107657

paste0("Error rate = 1 - accuracy = 1 - ", spam.acc.train, " = ", round(1 - spam.acc.train, 5))

## [1] "Error rate = 1 - accuracy = 1 - 0.91076568796776 = 0.08923"

pred2 = prediction(as.numeric(pred.train), as.numeric(spam.train$spam))
auc.te = performance(pred2, "auc")
paste0("AUC = ", as.numeric(auc.te@y.values))

## [1] "AUC = 0.910876116932592"

Q7f: Pruning the Decision Tree

pruned.1 = prune(spam.tree, cp = 0.1)
fancyRpartPlot(pruned.1, sub = "Pruned Decision Tree - With Complex Parameter 0.1")

Performance of the Pruned Decision Tree on Test Dataset

pred = predict(pruned.1, spam.test, type = "class")

conf.matrix = confusionMatrix(pred, spam.test$spam)
conf.matrix$table

##           Reference
## Prediction  no yes
##        no  259   6
##        yes  28 141

spam.acc = conf.matrix$overall[1]
spam.acc

## Accuracy 
## 0.921659

paste0("Error rate = 1 - accuracy = 1 - ", spam.acc, " = ", round(1 - spam.acc, 5))

## [1] "Error rate = 1 - accuracy = 1 - 0.921658986175115 = 0.07834"

pred2 = prediction(as.numeric(pred), as.numeric(spam.test$spam))
auc.te = performance(pred2, "auc")
paste0("AUC = ", as.numeric(auc.te@y.values))

## [1] "AUC = 0.930811348929816"

Accuracy and error rate remained the same, however, AUC increased.

Performance of the Pruned Decision Tree on Training Dataset

pred.train = predict(pruned.1, spam.train, type = "class")
conf.matrix.train = confusionMatrix(pred.train, spam.train$spam)
conf.matrix.train$table

##           Reference
## Prediction   no  yes
##        no  1017   32
##        yes  157  531

spam.acc.train = conf.matrix.train$overall[1]
spam.acc.train

##  Accuracy 
## 0.8911917

paste0("Error rate = 1 - accuracy = 1 - ", spam.acc.train, " = ", round(1 - spam.acc.train, 5))

## [1] "Error rate = 1 - accuracy = 1 - 0.89119170984456 = 0.10881"

pred2 = prediction(as.numeric(pred.train), as.numeric(spam.train$spam))
auc.te = performance(pred2, "auc")
paste0("AUC = ", as.numeric(auc.te@y.values))

## [1] "AUC = 0.904715399675019"

Accuracy has reduced, as a result the error rate increased.Also, AUC reduced.
Thus, the pruned decision tree is not performing well on the training data set.

Cutting the Tree Further

pruned.2 = prune(spam.tree, cp = 0.3)
fancyRpartPlot(pruned.2, sub = "Pruned Decision Tree - With Complex Parameter 0.3")

Question 8: Music Popularity

music.data = read.csv("music.csv", header=T)

Q8a: Removing some varibles

music = subset(music.data, select = -c(songtitle, artistname, songID,
                                       artistID, timesignature_confidence, tempo_confidence, key_confidence))

normalize = function(data) 
{
  return ((data - min(data)) / (max(data) - min(data))) 
}

music = as.data.frame(lapply(music, normalize))

kable(head(music, n = 5))

year	timesignature	loudness	tempo	key	energy	pitch	timbre_0_min	timbre_0_max	timbre_1_min	timbre_1_max	timbre_2_min	timbre_2_max	timbre_3_min	timbre_3_max	timbre_4_min	timbre_4_max	timbre_5_min	timbre_5_max	timbre_6_min	timbre_6_max	timbre_7_min	timbre_7_max	timbre_8_min	timbre_8_max	timbre_9_min	timbre_9_max	timbre_10_min	timbre_10_max	timbre_11_min	timbre_11_max	Top10
1	0.4285714	0.8727717	0.3746311	1.0000000	0.9681145	0.0443623	4.14e-05	0.8703938	0.7153206	0.3931543	0.6766215	0.2411221	0.4438659	0.5065554	0.6448221	0.2874073	0.7162544	0.5504263	0.4046581	0.3109549	0.6531844	0.3353556	0.6825367	0.3806657	0.7576586	0.4129704	0.4156798	0.1258552	0.7248566	0.1822900	0
1	0.5714286	0.8775939	0.5732460	0.9090909	0.9861960	0.0462107	0.00e+00	0.8717939	0.6478704	0.3932136	0.4876386	0.4552745	0.2431661	0.7628129	0.6128019	0.3910397	0.7953092	0.1746606	0.4166321	0.3206946	0.6773724	0.4581107	0.6230959	0.3587984	0.4485917	0.4321209	0.5298796	0.6454200	0.7671133	0.1485661	0
1	0.5714286	0.8885639	0.6570094	0.1818182	0.9913949	0.0480591	6.20e-05	0.8719494	0.6918732	0.3931015	0.7009716	0.3989601	0.6149570	0.4897756	0.5455583	0.5910034	0.7435350	0.4386697	0.4040007	0.2749548	0.6699796	0.3260446	0.6329889	0.4208143	0.6851381	0.3363030	0.5071476	0.1995160	0.7323115	0.1799131	0
1	0.5714286	0.8829875	0.3991904	0.0909091	0.9406241	0.0240296	0.00e+00	0.8786193	0.6593864	0.4729224	0.5182236	0.4377612	0.8903700	0.7408849	0.5859346	0.4635638	0.8395755	0.4381688	0.5194258	0.2649187	0.6871022	0.4066693	0.5122079	0.3922019	0.6658812	0.5684082	0.5355916	0.2655300	0.6194016	0.2899138	0
1	0.5714286	0.8626017	0.5732664	0.5454545	0.9892287	0.1164510	0.00e+00	0.8612543	0.2400185	0.3932136	0.4944161	0.4192729	0.7751513	0.7736043	0.6311931	0.3868533	0.7550011	0.1626208	0.2643276	0.3050375	0.7222566	0.4752561	0.6542392	0.3715339	0.6699038	0.3863739	0.7872832	0.1471355	0.6842820	0.2479626	0

Q8b: 10-fold CV

paste0("Below is the class distribution of the entire data set")

## [1] "Below is the class distribution of the entire data set"

prop.table(table(music$Top10))

## 
##         0         1 
## 0.8522577 0.1477423

set.seed(120634)
n.folds = 10

fold.strat = createFolds(music$Top10, k=10) #k-folds with stratification. caret package.
dt.prop.test = data.frame('false'=1:n.folds, 'true'=1:n.folds)
dt.prop.train = data.frame('false'=1:n.folds, 'true'=1:n.folds)

for(i in 1:n.folds)
{
  te.index = as.numeric(unlist(fold.strat[i]))
  test.cv = music[te.index, ]
  prop = prop.table(table(test.cv$Top10))
  dt.prop.test[i,] = prop
  
  train.cv = music[-te.index, ]
  prop = prop.table(table(train.cv$Top10))
  dt.prop.train[i,] = prop
}

r.names = c("Fold01", "Fold02", "Fold03", "Fold04", "Fold05", "Fold06", "Fold07", "Fold08", "Fold09", "Fold10")

row.names(dt.prop.test) = names(fold.strat)
row.names(dt.prop.train) = names(fold.strat)

Class distribution of the the test data per folds

kable(dt.prop.test)

	false	true
Fold01	0.8496042	0.1503958
Fold02	0.8575198	0.1424802
Fold03	0.8614776	0.1385224
Fold04	0.8548813	0.1451187
Fold05	0.8454425	0.1545575
Fold06	0.8612946	0.1387054
Fold07	0.8401585	0.1598415
Fold08	0.8639366	0.1360634
Fold09	0.8428005	0.1571995
Fold10	0.8454425	0.1545575

Class distribution of the training data per folds

kable(dt.prop.train)

	false	true
Fold01	0.8525528	0.1474472
Fold02	0.8516725	0.1483275
Fold03	0.8512324	0.1487676
Fold04	0.8519660	0.1480340
Fold05	0.8530145	0.1469855
Fold06	0.8512542	0.1487458
Fold07	0.8536013	0.1463987
Fold08	0.8509608	0.1490392
Fold09	0.8533079	0.1466921
Fold10	0.8530145	0.1469855

Q8c: KNN

set.seed(1874)
KNN.test.error = rep(NA, n.folds)
KNN.test.accu = rep(NA, n.folds)
KNN.test.auc = rep(NA, n.folds)
minErr = list(NA,NA,NA)
minAccu = list(NA,NA,NA)
minAuc = list(NA,NA,NA)
j = 1
k.values = c(3,5,9)

for (k in k.values) 
{
  for(i in 1:n.folds)
  {
    fold.index = as.numeric(unlist(fold.strat[i]))
    test_data = music[fold.index, ][, -32]
    test_class = music[fold.index, ][, 32]
    train_data = music[-fold.index, ][, -32]
    train_class = music[-fold.index, ][, 32]
    test.fit  = knn(train=train_data, test=test_data, cl=train_class, k = k)
    
    #train.fit = knn(train=train_data, train_data, cl=train_data, k = k)
    
    KNN.test.error[i] = c(1 - sum(test.fit == test_class) / nrow(test_data))
    KNN.test.accu[i] = c(sum(test.fit == test_class) / nrow(test_data))
    
    pred = prediction(as.numeric(test.fit), test_class)
    auc.te = performance(pred, "auc")
    KNN.test.auc[i] = c(as.numeric(auc.te@y.values))
  }
  
  minErr[j] = data.frame(test.error = KNN.test.error)
  minAccu[j] = data.frame(test.accu = KNN.test.accu)
  minAuc[j] = data.frame(test.auc = KNN.test.auc)
  j = j + 1
}

error.k_3 = data.frame(fold=1:10, "error" = minErr[1], "accuracy" = minAccu[1], "auc" = minAuc[1])
error.k_5 = data.frame(fold=1:10, "error" = minErr[2], "accuracy" = minAccu[2], "auc" = minAuc[2])
error.k_9 = data.frame(fold=1:10, "error" = minErr[3], "accuracy" = minAccu[3], "auc" = minAuc[3])
names(error.k_3) = c("fold", "error", "accuracy", "auc")
names(error.k_5) = c("fold", "error", "accuracy", "auc")
names(error.k_9) = c("fold", "error", "accuracy", "auc")
#mean(knn.kcv.error)

Errors, Accuracy, & AUC : k = 3

kable(error.k_3, rownames = F)

fold	error	accuracy	auc
1	0.1728232	0.8271768	0.5878691
2	0.1701847	0.8298153	0.6035185
3	0.1715040	0.8284960	0.5687742
4	0.1503958	0.8496042	0.6063552
5	0.1730515	0.8269485	0.5658921
6	0.1690885	0.8309115	0.5822378
7	0.1690885	0.8309115	0.6249935
8	0.1558785	0.8441215	0.5866883
9	0.1585205	0.8414795	0.6120097
10	0.1770145	0.8229855	0.5845019

paste0("Mean_Error = ", mean(error.k_3$error))

## [1] "Mean_Error = 0.166754965964106"

paste0("Mean_Accuracy = ", mean(error.k_3$accuracy))

## [1] "Mean_Accuracy = 0.833245034035894"

paste0("Mean_AUC = ", mean(error.k_3$auc))

## [1] "Mean_AUC = 0.592284032694927"

Errors, Accuracy, & AUC : k = 5

kable(error.k_5, rownames = F)

fold	error	accuracy	auc
1	0.1715040	0.8284960	0.5597690
2	0.1583113	0.8416887	0.5795584
3	0.1437995	0.8562005	0.5688690
4	0.1424802	0.8575198	0.5921156
5	0.1532365	0.8467635	0.5741186
6	0.1519155	0.8480845	0.5722320
7	0.1466314	0.8533686	0.6283201
8	0.1321004	0.8678996	0.6004498
9	0.1532365	0.8467635	0.5843827
10	0.1571995	0.8428005	0.5997129

paste0("Mean_Error = ", mean(error.k_5$error))

## [1] "Mean_Error = 0.151041466976644"

paste0("Mean_Accuracy = ", mean(error.k_5$accuracy))

## [1] "Mean_Accuracy = 0.848958533023356"

paste0("Mean_AUC = ", mean(error.k_5$auc))

## [1] "Mean_AUC = 0.585952791648983"

Error, Accuracy, & AUC : k = 9

kable(error.k_9, rownames = F)

fold	error	accuracy	auc
1	0.1583113	0.8416887	0.5422660
2	0.1332454	0.8667546	0.5710114
3	0.1306069	0.8693931	0.5725297
4	0.1358839	0.8641161	0.5657828
5	0.1492734	0.8507266	0.5485243
6	0.1373844	0.8626156	0.5606924
7	0.1453104	0.8546896	0.5889534
8	0.1387054	0.8612946	0.5434592
9	0.1545575	0.8454425	0.5528371
10	0.1558785	0.8441215	0.5585871

paste0("Mean_Error = ", mean(error.k_9$error))

## [1] "Mean_Error = 0.143915713673262"

paste0("Mean_Accuracy = ", mean(error.k_9$accuracy))

## [1] "Mean_Accuracy = 0.856084286326738"

paste0("Mean_AUC = ", mean(error.k_9$auc))

## [1] "Mean_AUC = 0.560464345557562"

Q8d: Decision Tree

set.seed(1294)
accu = rep(NA, n.folds)
err = rep(NA, n.folds)
auc = rep(NA, n.folds)

accu.p1 = rep(NA, n.folds)
err.p1 = rep(NA, n.folds)
auc.p1 = rep(NA, n.folds)

accu.p2 = rep(NA, n.folds)
err.p2 = rep(NA, n.folds)
auc.p2 = rep(NA, n.folds)

for(i in 1:10)
{
  fold.index = as.numeric(unlist(fold.strat[i]))
  
  test_data = music[fold.index, ]
  test_class = music[fold.index, ][, 32]
  train_data = music[-fold.index, ]
  
  tree = rpart(Top10 ~., train_data, method="class")
  fancyRpartPlot(tree, sub = paste("Decision Tree For Fold", i))
  pred = predict(tree, test_data, type = "class")

  conf.matrix = confusionMatrix(pred, test_class)
  cf.mat.tab = conf.matrix$table
  acc = conf.matrix$overall[1]
  
  values = as.numeric(pred)-1
  
  pred = prediction(as.numeric(values), test_class)
  auc.te = performance(pred, "auc")
  
  accu[i] = c(round(1-(1 - acc), 5))
  err[i] = c(round(1 - acc, 5))
  auc[i] = c(as.numeric(auc.te@y.values))
  
  #Prune with cp = 0.1
  tree.prune.1 = prune(tree, cp = 0.1)
  #fancyRpartPlot(tree.prune.1)
  
  pred = predict(tree.prune.1, test_data, type = "class")

  conf.matrix = confusionMatrix(pred, test_class)
  cf.mat.tab = conf.matrix$table
  acc = conf.matrix$overall[1]
  
  values = as.numeric(pred)-1
  
  pred = prediction(as.numeric(values), test_class)
  auc.te = performance(pred, "auc")
  
  accu.p1[i] = c(round(1-(1 - acc), 5))
  err.p1[i] = c(round(1 - acc, 5))
  auc.p1[i] = c(as.numeric(auc.te@y.values))
  
  #Prune with cp = 0.3
  tree.prune.2 = prune(tree, cp = 0.03)
  #fancyRpartPlot(tree.prune.2, sub = paste("cp=0.3 - Pruned Decision Tree -  Fold", i))
  
  pred = predict(tree.prune.2, test_data, type = "class")

  conf.matrix = confusionMatrix(pred, test_class)
  cf.mat.tab = conf.matrix$table
  acc = conf.matrix$overall[1]
  
  values = as.numeric(pred)-1
  
  pred = prediction(as.numeric(values), test_class)
  auc.te = performance(pred, "auc")
  
  accu.p2[i] = c(round(1-(1 - acc), 5))
  err.p2[i] = c(round(1 - acc, 5))
  auc.p2[i] = c(as.numeric(auc.te@y.values))
}

Error, Accuracy, & AUC Without Pruning

result = data.frame(fold=1:10, "error" = err, "accuracy" = accu, "auc" = auc)
kable(result)

fold	error	accuracy	auc
1	0.14248	0.85752	0.5804593
2	0.13720	0.86280	0.5725641
3	0.14512	0.85488	0.5601108
4	0.12665	0.87335	0.5825056
5	0.15456	0.84544	0.5768296
6	0.13474	0.86526	0.5422509
7	0.13871	0.86129	0.5928842
8	0.13738	0.86262	0.5278644
9	0.16380	0.83620	0.5405153
10	0.14663	0.85337	0.5954861

names(result) = c("fold", "error", "accuracy", "auc")

paste0("Mean_Error = ", mean(result$error))

## [1] "Mean_Error = 0.142727"

paste0("Mean_Accuracy = ", mean(result$accuracy))

## [1] "Mean_Accuracy = 0.857273"

paste0("Mean_AUC = ", mean(result$auc))

## [1] "Mean_AUC = 0.567147039875754"

Error, Accuracy, & AUC With Pruning cp = 0.1

result = data.frame(fold=1:10, "error" = err.p1, "accuracy" = accu.p1, "auc" = auc.p1)
kable(result)

fold	error	accuracy	auc
1	0.15040	0.84960	0.5
2	0.14248	0.85752	0.5
3	0.13852	0.86148	0.5
4	0.14512	0.85488	0.5
5	0.15456	0.84544	0.5
6	0.13871	0.86129	0.5
7	0.15984	0.84016	0.5
8	0.13606	0.86394	0.5
9	0.15720	0.84280	0.5
10	0.15456	0.84544	0.5

names(result) = c("fold", "error", "accuracy", "auc")

paste0("Mean_Error = ", mean(result$error))

## [1] "Mean_Error = 0.147745"

paste0("Mean_Accuracy = ", mean(result$accuracy))

## [1] "Mean_Accuracy = 0.852255"

paste0("Mean_AUC = ", mean(result$auc))

## [1] "Mean_AUC = 0.5"

Error, Accuracy, & AUC With Pruning cp = 0.3

result = data.frame(fold=1:10, "error" = err.p2, "accuracy" = accu.p2, "auc" = auc.p2)
kable(result)

fold	error	accuracy	auc
1	0.15040	0.84960	0.5
2	0.14248	0.85752	0.5
3	0.13852	0.86148	0.5
4	0.14512	0.85488	0.5
5	0.15456	0.84544	0.5
6	0.13871	0.86129	0.5
7	0.15984	0.84016	0.5
8	0.13606	0.86394	0.5
9	0.15720	0.84280	0.5
10	0.15456	0.84544	0.5

names(result) = c("fold", "error", "accuracy", "auc")

paste0("Mean_Error = ", mean(result$error))

## [1] "Mean_Error = 0.147745"

paste0("Mean_Accuracy = ", mean(result$accuracy))

## [1] "Mean_Accuracy = 0.852255"

paste0("Mean_AUC = ", mean(result$auc))

## [1] "Mean_AUC = 0.5"

Q8e: Naive Bayes

set.seed(123694)
accu = rep(NA, n.folds)
err = rep(NA, n.folds)
auc = rep(NA, n.folds)

for(i in 1:10)
{
  fold.index = as.numeric(unlist(fold.strat[i]))
  
  test_data = music[fold.index, ]
  test_data$Top10 = as.factor(test_data$Top10)
  test_class = music[fold.index, ][, 32]
  train_data = music[-fold.index, ]
  train_data$Top10 = as.factor(train_data$Top10)
  train_class = music[-fold.index, ][, 32]
  
  #nb.model = naiveBayes(train_data, train_class)
  library(klaR)
  nb.model = NaiveBayes(Top10 ~., data = train_data)
  pred = predict(nb.model, test_data)
  
  error = 1 - sum(pred$class == test_class) / nrow(test_data)
  acc = 1 - error
  
  values = as.numeric(pred$class)-1
  
  pred = prediction(as.numeric(values), test_class)
  auc.te = performance(pred, "auc")
  
  accu[i] = c(acc)
  err[i] = c(error)
  auc[i] = c(as.numeric(auc.te@y.values))
}

Error, Accuracy, & AUC

result = data.frame(fold=1:10, "error" = err, "accuracy" = accu, "auc" = auc)
kable(result)

fold	error	accuracy	auc
1	0.2585752	0.7414248	0.6709573
2	0.2770449	0.7229551	0.6879060
3	0.2532982	0.7467018	0.7051265
4	0.2401055	0.7598945	0.7086139
5	0.2589168	0.7410832	0.6757545
6	0.2496697	0.7503303	0.6912650
7	0.2721268	0.7278732	0.7142458
8	0.2708058	0.7291942	0.6714988
9	0.2721268	0.7278732	0.6779129
10	0.2708058	0.7291942	0.6722155

names(result) = c("fold", "error", "accuracy", "auc")

paste0("Mean_Error = ", mean(result$error))

## [1] "Mean_Error = 0.26234755300572"

paste0("Mean_Accuracy = ", mean(result$accuracy))

## [1] "Mean_Accuracy = 0.73765244699428"

paste0("Mean_AUC = ", mean(result$auc))

## [1] "Mean_AUC = 0.687549619294451"

Q8f: Random Forest

set.seed(1429461)
accu = rep(NA, n.folds)
err = rep(NA, n.folds)
auc = rep(NA, n.folds)

for(i in 1:10)
{
  fold.index = as.numeric(unlist(fold.strat[i]))
  test_data = music[fold.index, ][, -32]
  test_class = music[fold.index, ][, 32]
  train_data = music[-fold.index, ][, -32]
  train_class = music[-fold.index, ][, 32]

  rf.model = randomForest(train_data, as.factor(as.numeric(train_class)))
  pred = predict(rf.model, test_data)

  conf.matrix = confusionMatrix(pred, test_class)
  cf.mat.tab = conf.matrix$table
  acc = conf.matrix$overall[1]
  
  values = as.numeric(pred) - 1
  
  pred.2 = prediction(as.numeric(values), test_class)
  auc.te = performance(pred.2, "auc")
  
  accu[i] = c(round(1-(1 - acc), 5))
  err[i] = c(round(1 - acc, 5))
  auc[i] = c(as.numeric(auc.te@y.values))
}

Error, Accuracy, & AUC

result = data.frame(fold=1:10, "error" = err, "accuracy" = accu, "auc" = auc)
kable(result)

fold	error	accuracy	auc
1	0.13588	0.86412	0.5590743
2	0.13193	0.86807	0.5524786
3	0.12005	0.87995	0.5826515
4	0.13193	0.86807	0.5567761
5	0.14927	0.85073	0.5345553
6	0.12285	0.87715	0.5731230
7	0.13738	0.86262	0.5802861
8	0.12682	0.87318	0.5544298
9	0.13871	0.86129	0.5656595
10	0.14267	0.85733	0.5489383

names(result) = c("fold", "error", "accuracy", "auc")

paste0("Mean_Error = ", mean(result$error))

## [1] "Mean_Error = 0.133749"

paste0("Mean_Accuracy = ", mean(result$accuracy))

## [1] "Mean_Accuracy = 0.866251"

paste0("Mean_AUC = ", mean(result$auc))

## [1] "Mean_AUC = 0.560797255021492"

Q8g: Nested CV

Part (a) KNN - Determining the best value of k (Nested CV)

set.seed(18740)
KNN.test.error = rep(NA, n.folds)
KNN.test.accu = rep(NA, n.folds)
KNN.test.auc = rep(NA, n.folds)
knn.test.bestK = rep(NA, n.folds)

KNN.test.accu.in = rep(NA, n.folds-1)
k.values = c(1,3,5,7,9)

for(i in 1:n.folds)
{
  fold.index = as.numeric(unlist(fold.strat[i]))
  test_data = music[fold.index, ][, -32]
  test_class = music[fold.index, ][, 32]
  new.data = music[-fold.index, ]
  train_data = new.data[, -32]
  train_class = music[-fold.index, ][, 32]

  #internal CV on the: for selecting the best value of k
  #creating the internal folds
  fold.in = createFolds(new.data$Top10, k = n.folds - 1)
  acc.in = data.frame("kValue" = k.values, "accuracyMean" = NA)
  w = 1
  
  for(kk in k.values)
  {
    for(jj in 1:9)
    {
      fold.index.in = as.numeric(unlist(fold.in[jj]))
      test_data.in = new.data[fold.index.in, ][, -32]
      test_class.in = new.data[fold.index.in, ][, 32]
      train_data.in = new.data[-fold.index.in, ][, -32]
      train_class.in = new.data[-fold.index.in, ][, 32]
      
      test.fit.in  = knn(train=train_data.in, test=test_data.in, cl=train_class.in, k = kk)
      KNN.test.accu.in[jj] = c(sum(test.fit.in == test_class.in) / nrow(test_data.in))
    }
    
    acc.in$accuracyMean[w] = mean(KNN.test.accu.in)
    w = w+1
  }
  
  #now: picking the best k
  k.meanAve = acc.in[order(-acc.in$accuracyMean), ] 
  best.k = k.meanAve$kValue[1]
  knn.test.bestK[i] = best.k
  
  test.fit  = knn(train=train_data, test=test_data, cl=train_class, k = best.k)
  
  KNN.test.error[i] = c(1 - sum(test.fit == test_class) / nrow(test_data))
  KNN.test.accu[i] = c(sum(test.fit == test_class) / nrow(test_data))
    
  pred = prediction(as.numeric(test.fit), test_class)
  auc.te = performance(pred, "auc")
  KNN.test.auc[i] = c(as.numeric(auc.te@y.values))
}

detail.k_1 = data.frame(fold=1:10, "error" = KNN.test.error, 
                        "accuracy" = KNN.test.accu, "auc" = KNN.test.auc, "best_K_used" = knn.test.bestK)

Errors, Accuracy, & AUC - (Having used nested CV to pick best value for K)

kable(detail.k_1, row.names = F)

fold	error	accuracy	auc	best_K_used
1	0.1583113	0.8416887	0.5422660	9
2	0.1345646	0.8654354	0.5702422	9
3	0.1306069	0.8693931	0.5725297	9
4	0.1358839	0.8641161	0.5808782	7
5	0.1492734	0.8507266	0.5485243	9
6	0.1373844	0.8626156	0.5606924	9
7	0.1453104	0.8546896	0.5889534	9
8	0.1387054	0.8612946	0.5434592	9
9	0.1545575	0.8454425	0.5528371	9
10	0.1558785	0.8441215	0.5585871	9

ave.detail.k = data.frame(mean_error = mean(detail.k_1$error),
                          mean_accuracy = mean(detail.k_1$accuracy), mean_auc = mean(detail.k_1$auc))
kable(ave.detail.k, row.names = F)

mean_error	mean_accuracy	mean_auc
0.1440476	0.8559524	0.561897

TPR_V	Acc_V	FPR_V
0.2	0.6	0.0
0.2	0.5	0.2
0.4	0.6	0.2
0.6	0.7	0.2
0.6	0.6	0.4
0.6	0.5	0.6
0.8	0.6	0.6
0.8	0.5	0.8
1.0	0.6	0.8
1.0	0.5	1.0

TPR_V	Acc_V	FPR_V
0.2	0.6	0.0
0.2	0.5	0.2
0.4	0.6	0.2
0.6	0.7	0.2
0.6	0.6	0.4
0.6	0.5	0.6
0.8	0.6	0.6
0.8	0.5	0.8
1.0	0.6	0.8
1.0	0.5	1.0

Project

Ibrahim Odumas Odufowora & Jamie Berger

2017-05-13

Question 1

Question 2

Question 3

Question 4

Question 5a

Question 5b

Question 5c

Question 5d

Question 6: Social Media Usage

Q6a: Grouped Bar Plots: Side-by-Side Barplot

Q6b:

Q6c: Decision Tree

Question 7: Spam Email

Q7a: Removing some varibles

Q7b: Spliting Data into Training and Test Data

Training Dataset

Test Dataset

Q7c: Classification Tree

Q7d: Analysing the Decision Tree

Q7e:

Performance of the Decision Tree on the Test Dataset

Performance of the Decision Tree on the Training Dataset

Q7f: Pruning the Decision Tree

Performance of the Pruned Decision Tree on Test Dataset

Performance of the Pruned Decision Tree on Training Dataset

Cutting the Tree Further

Question 8: Music Popularity

Q8a: Removing some varibles

Q8b: 10-fold CV

Class distribution of the the test data per folds

Class distribution of the training data per folds

Q8c: KNN

Errors, Accuracy, & AUC : k = 3

Errors, Accuracy, & AUC : k = 5

Error, Accuracy, & AUC : k = 9

Q8d: Decision Tree

Error, Accuracy, & AUC Without Pruning

Error, Accuracy, & AUC With Pruning cp = 0.1

Error, Accuracy, & AUC With Pruning cp = 0.3

Q8e: Naive Bayes

Error, Accuracy, & AUC

Q8f: Random Forest

Error, Accuracy, & AUC

Q8g: Nested CV

Part (a) KNN - Determining the best value of k (Nested CV)

Errors, Accuracy, & AUC - (Having used nested CV to pick best value for K)

TPR_V	Acc_V	FPR_V
0.2	0.6	0.0
0.2	0.5	0.2
0.4	0.6	0.2
0.6	0.7	0.2
0.6	0.6	0.4
0.6	0.5	0.6
0.8	0.6	0.6
0.8	0.5	0.8
1.0	0.6	0.8
1.0	0.5	1.0