ISYE 6501x HW1

0.2 KKNN Model

# Leave-one-out evaluation of kknn on the full dataset for k = 1..50.
# For each k, every row i is predicted by a model fit on all the other rows.
# The share of correct 0/1 predictions is stored in knn_acc_vector, so the k
# with the highest accuracy can be read off afterwards.

k_grid <- 1:50 # candidate numbers of neighbours
n_obs <- nrow(ccdata)
knn_pred <- rep(0, n_obs) # rounded 0/1 prediction per row; every entry is overwritten for each k
knn_acc_vector <- numeric(length(k_grid)) # one accuracy per k, preallocated
for (idx in seq_along(k_grid)) {
  K <- k_grid[idx]
  for (i in seq_len(n_obs)) {
    knn_model <- kknn(ccdata[-i, 11] ~ ., # response: column 11 without row i (can also be "V11 ~ .")
                      ccdata[-i, 1:10], # train on the predictors of all rows except i
                      ccdata[i, 1:10], # predict the held-out row i
                      k = K,
                      kernel = "optimal",
                      scale = TRUE)
    # kknn treats the response as continuous, so round the fitted value to 0 or 1.
    knn_pred[i] <- round(fitted(knn_model))
  }
  # Accuracy is computed once per k, after every row has been predicted,
  # instead of being recomputed on each inner iteration.
  knn_acc <- sum(knn_pred == ccdata[, 11]) / n_obs
  knn_acc_vector[idx] <- knn_acc
}

# Accuracy for each k tried; the loop ran k over 1:50, so the x-axis index is k itself.
plot(knn_acc_vector)

max(knn_acc_vector) # Best leave-one-out accuracy: about 85.32%

## [1] 0.853211

which.max(knn_acc_vector) # Index of the maximum, i.e. the best k: K = 12

## [1] 12

Question 3.1a

Using the full ccdata set, we can evaluate our model with k-fold cross-validation using the cv.kknn function from library(kknn). Keeping the number of folds constant, we can iterate over K, the number of nearest neighbors, to find the best value. We could also do the opposite: keep K constant and iterate over the number of folds to determine the best number of folds to use!

# 10-fold cross-validation of kknn on the full dataset for k = 1..50.
# cv.kknn assigns rows to folds at random, so the seed is fixed here to make
# the accuracies (and therefore the selected k) reproducible between runs.
set.seed(3)
k_acc_vec <- numeric(50) # one cross-validated accuracy per k, preallocated
for (K in 1:50) {
  kmodel3 <- cv.kknn(V11 ~ .,
                     ccdata,
                     kcv = 10, # number of folds
                     k = K,
                     kernel = "optimal",
                     scale = TRUE)
  # cv.kknn returns a list; its first element is a matrix whose 2nd column
  # holds the cross-validated predictions. Reading that column directly avoids
  # coercing the whole list with data.frame(), which only works when the short
  # error vector in the second element happens to recycle into the row count.
  kmodelpred2 <- kmodel3[[1]][, 2]
  rpred2 <- round(kmodelpred2) # round so that predictions are 1 or 0
  k_accuracy3 <- sum(rpred2 == ccdata[, 11]) / nrow(ccdata)
  k_acc_vec[K] <- k_accuracy3
}
plot(k_acc_vec) # cross-validated accuracy for each k tried (index = k)

max(k_acc_vec) # about 85.78% in the run printed below; the exact figure depends on the random fold assignment

## [1] 0.8577982

which.max(k_acc_vec) # Most accurate with a K value of 5 in the run printed below

## [1] 5

Training of kknn via leave-one-out cross validation method

set.seed(3)

# Let train.kknn search k = 1..100 and keep the best-scoring model.
kmodel <- train.kknn(
  formula = V11 ~ .,
  data = ccdata,
  kmax = 100,
  kernel = "optimal",
  scale = TRUE
)

# NOTE: the predictions below are for the same rows the model was fit on,
# so k_accuracy is an in-sample figure.
kpred <- predict(kmodel, newdata = ccdata)
roundedpred <- round(kpred) # continuous response -> 0/1 class
n_correct <- sum(roundedpred == ccdata[, 11])
k_accuracy <- n_correct / nrow(ccdata)
k_accuracy

## [1] 0.8776758

kmodel

## 
## Call:
## train.kknn(formula = V11 ~ ., data = ccdata, kmax = 100, kernel = "optimal",     scale = TRUE)
## 
## Type of response variable: continuous
## minimal mean absolute error: 0.1850153
## Minimal mean squared error: 0.1073792
## Best kernel: optimal
## Best k: 58

Question 3.1b

Splitting the data into training, validation, and test data, we can compare between the KNN and SVM.

set.seed(3)
# Draw a group label for every row: 1 = training, 2 = validation, 3 = testing,
# with probabilities 0.70 / 0.15 / 0.15. Labels are sampled independently per
# row, so the realised group sizes are only approximately 70% / 15% / 15%.
ccdatasplit <- sample(
  x = 1:3,
  size = nrow(ccdata),
  replace = TRUE,
  prob = c(.7, .15, .15)
)
in_train <- ccdatasplit == 1
in_valid <- ccdatasplit == 2
in_test <- ccdatasplit == 3
cctrain <- ccdata[in_train, ]
ccvalid <- ccdata[in_valid, ]
cctest <- ccdata[in_test, ]

# Fit a linear C-SVC on the training split for each cost value in Cloop and
# record its accuracy on that same training split, so the effect of C on the
# training fit can be compared.
Cloop <- 10^(-3:3)
ksvm_acc_vec <- numeric(length(Cloop)) # one training accuracy per C, preallocated
for (idx in seq_along(Cloop)) {
  lambda <- Cloop[idx] # cost parameter C for this fit
  ksvm_model <- ksvm(as.matrix(cctrain[, 1:10]),
                     as.factor(cctrain[, 11]),
                     type = "C-svc",
                     kernel = "vanilladot",
                     C = lambda,
                     scaled = TRUE)
  # The hyperplane coefficients (a, a0) were previously derived here but never
  # used; only the predictions are needed to score the model.
  prediction <- predict(ksvm_model, cctrain[, 1:10])
  ksvm_acc <- sum(prediction == cctrain[, 11]) / nrow(cctrain)
  ksvm_acc_vec[idx] <- ksvm_acc
}

##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters

ksvm_acc_vec

## [1] 0.8200000 0.8711111 0.8711111 0.8711111 0.8711111 0.8711111 0.8711111

Our ksvm_model has a consistent training accuracy of 87.11% as long as the C value is not too small (here, C ≥ 0.01), so we will use C = 100 for our KSVM model for validation.

set.seed(3)
# Linear C-SVC with C = 100, fit on the training split only.
train_x <- as.matrix(cctrain[, 1:10])
train_y <- as.factor(cctrain[, 11])
ksvm_model2 <- ksvm(
  x = train_x,
  y = train_y,
  type = "C-svc",
  kernel = "vanilladot",
  C = 100,
  scaled = TRUE
)

##  Setting default kernel parameters

# Apply the trained model to the validation split's predictors and score it
# against the validation labels.
ksvm_prediction_valid <- predict(ksvm_model2, newdata = ccvalid[, 1:10])
n_valid_correct <- sum(ksvm_prediction_valid == ccvalid[, 11])
ksvm.acc <- n_valid_correct / nrow(ccvalid)
ksvm.acc # 87.75% accurate!

## [1] 0.877551

I was skeptical about the predict function, so I wanted to see if I could reproduce this accuracy by training the model on the validation set itself.

# Fit a separate linear C-SVC (same settings, C = 100) directly on the
# validation split. Printing the object afterwards reports its training error.
set.seed(3)
valid_x <- as.matrix(ccvalid[, 1:10])
valid_y <- as.factor(ccvalid[, 11])
ksvm_model_valid <- ksvm(
  x = valid_x,
  y = valid_y,
  type = "C-svc",
  kernel = "vanilladot",
  C = 100,
  scaled = TRUE
)

##  Setting default kernel parameters

ksvm_model_valid

## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 100 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 42 
## 
## Objective Function Value : -2400.483 
## Training error : 0.122449

# Accuracy = 1 - training error, read from the fitted model rather than
# retyped from the printed summary.
1 - error(ksvm_model_valid)

## [1] 0.877551

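Got the same answer! Fit on the validation data, this ksvm model has a training error of about 0.122449, so it is accurate about 87.75% of the time, which is slightly higher than on our training set. (Note that this is the training error of a separate model fit on the validation set, so its agreement with the predict-based figure is not guaranteed in general.) Let's see how our KNN model performs on the training set.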

set.seed(3)
# Choose k on the training split by leave-one-out: for each k in 1..50, every
# training row i is predicted by a model fit on the remaining training rows.
# The k with the highest accuracy is the one carried forward to validation.

k_grid <- 1:50 # candidate numbers of neighbours
n_train <- nrow(cctrain)
knn_pred2 <- rep(0, n_train) # rounded 0/1 prediction per training row; overwritten for each k
knn_acc_vector2 <- numeric(length(k_grid)) # one accuracy per k, preallocated
for (idx in seq_along(k_grid)) {
  K <- k_grid[idx]
  for (i in seq_len(n_train)) {
    knn_model2 <- kknn(cctrain[-i, 11] ~ ., # response: column 11 without row i
                       cctrain[-i, 1:10], # fit on all training rows except i
                       cctrain[i, 1:10], # predict the held-out row i
                       k = K,
                       kernel = "optimal",
                       scale = TRUE)
    knn_pred2[i] <- round(fitted(knn_model2)) # continuous fit -> 0/1 class
  }
  # Accuracy is computed once per k, after every row has been predicted,
  # instead of being recomputed on each inner iteration.
  knn_acc2 <- sum(knn_pred2 == cctrain[, 11]) / n_train
  knn_acc_vector2[idx] <- knn_acc2
}

# Leave-one-out accuracy on the training split for each k tried (index = k).
plot(knn_acc_vector2)

max(knn_acc_vector2) # Best training-split accuracy: about 84.667%

## [1] 0.8466667

which.max(knn_acc_vector2) # Index of the maximum, i.e. the best k: K = 10

## [1] 10

K = 10 had the highest accuracy on the training set, with an accuracy of 84.667%. Let's see how this performs on the validation set.

set.seed(3)

# Validate the k selected on the training split (k = 10): the model is fit on
# cctrain and every ccvalid row is predicted in a single call. This mirrors
# how the SVM is validated (train on cctrain, score on ccvalid).
# Previously this step ran leave-one-out inside ccvalid, so the training split
# was never used. The accuracy figure printed from that earlier run will
# therefore differ from what this code produces.
knn_model3 <- kknn(V11 ~ .,
                   cctrain, # neighbours are drawn from the training split only
                   ccvalid, # rows to predict
                   k = 10,
                   kernel = "optimal",
                   scale = TRUE)
knn_pred3 <- round(fitted(knn_model3)) # continuous fit -> 0/1 class, one per validation row
knn.acc <- sum(knn_pred3 == ccvalid[, 11]) / nrow(ccvalid)

knn.acc # 85.7% accurate witht he validation set

## [1] 0.8571429

The KKNN model performed slightly better on the validation set than it did on the training set. Since the SVM model performed best on the validation set, we will use our SVM model on the test data set to see how well it predicts.

set.seed(3)
# Final check: apply the SVM fit on the training split (ksvm_model2) to the
# held-out test split and score it against the test labels.
ksvm_prediction_test <- predict(ksvm_model2, newdata = cctest[, 1:10])
n_test_correct <- sum(ksvm_prediction_test == cctest[, 11])
ksvm.acc2 <- n_test_correct / nrow(cctest)
ksvm.acc2 # 82.07% accurate on the test set!

## [1] 0.8207547

##Conclusion

The KSVM model is a better model for our data due to its higher performance on the validation set compared to KNN. Using our KSVM model on the test set, our model is accurate about 82.07% of the time, down from the 87.11% accuracy we observed on the training set.

ISYE 6501x HW1

Trae Taylor

05/16/2020

0.1 KSVM Model

0.2 KKNN Model