Homework Week 2

Problem 3.1(a)

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(kknn)
## 
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
## 
##     contr.dummy
library(ggplot2)

## Load the credit card data. read.table() assigns the default column names
## V1..V11 (see the preview below); V11 is used as the response later on.
data_cv <- read.table("credit_card_data.txt")

head(data_cv)
##   V1    V2    V3   V4 V5 V6 V7 V8  V9 V10 V11
## 1  1 30.83 0.000 1.25  1  0  1  1 202   0   1
## 2  0 58.67 4.460 3.04  1  0  6  1  43 560   1
## 3  0 24.50 0.500 1.50  1  1  0  1 280 824   1
## 4  1 27.83 1.540 3.75  1  0  5  0 100   3   1
## 5  1 20.17 5.625 1.71  1  1  0  1 120   0   1
## 6  1 32.08 4.000 2.50  1  1  0  0 360   0   1
set.seed(1)
## Draw a random 20% of the row indices as the held-out TEST set; the
## remaining 80% of the rows form the training set used for cross-validation.
mask_test <- sample(nrow(data_cv), size = floor(nrow(data_cv) * 0.2))
test_cv <- data_cv[mask_test, ]
train_cv <- data_cv[-mask_test, ]

set.seed(1)
## Put the training rows in random order so the contiguous folds below are
## random subsets
data_shuffled <- train_cv[sample(nrow(train_cv)), ]

## cut() is deterministic; the seed reset is kept only so the RNG state after
## this section stays the same as before.
set.seed(1)
## Label each shuffled row with a fold number 1..10 by cutting the row
## positions into 10 (near-)equally sized contiguous ranges
folds <- cut(seq_len(nrow(data_shuffled)), breaks = 10, labels = FALSE)

## Mean k-fold cross-validated accuracy of a kknn model with X neighbours.
##
## Every distinct fold label in `fold_ids` is held out once: a kknn model is
## fit on the rows of `data` outside the fold and scored on the rows inside
## it. The fitted values are turned into 0/1 predictions by adding 0.5 and
## truncating to integer, then compared with column 11 (V11).
##
## Arguments:
##   X         Number of nearest neighbours, passed to kknn() as `k`.
##   data      Data frame; columns 1:11 are used and column 11 must be the
##             response V11. Defaults to the global `data_shuffled`.
##   fold_ids  Fold label for each row of `data`. Defaults to the global
##             `folds`.
##
## Returns: a single numeric, the average of the per-fold accuracies.
check_accuracy <- function(X, data = data_shuffled, fold_ids = folds) {
  stopifnot(length(fold_ids) == nrow(data))

  ## Iterate over the labels actually present rather than a hard-coded 1:10
  fold_labels <- sort(unique(fold_ids))

  ## vapply() preallocates the result and guarantees one numeric per fold,
  ## instead of growing a list with c() on every iteration.
  fold_accuracy <- vapply(fold_labels, function(fold) {
    ## Logical mask: rows in the current fold validate, all others train
    held_out <- fold_ids %in% fold
    validationData <- data[held_out, ]
    trainData <- data[!held_out, ]

    kknn.model <- kknn(V11 ~ .,
                       trainData[, 1:11],
                       validationData[, 1:11],
                       k = X,
                       scale = TRUE)

    ## No set.seed() here: it ran after the fit and only reset the global RNG
    ## state as a side effect.
    preds <- as.integer(fitted(kknn.model) + 0.5)

    ## Share of validation rows predicted correctly
    sum(preds == validationData[, 11]) / nrow(validationData)
  }, numeric(1))

  mean(fold_accuracy)
}

## Candidate numbers of neighbours to evaluate: k = 1..100
X <- seq_len(100)

## Mean 10-fold cross-validated accuracy for each candidate k. vapply()
## returns a plain numeric vector, so no list / data.frame round-trip is
## needed afterwards.
acc <- vapply(X, check_accuracy, numeric(1))

## Pair every k (column A) with its accuracy (column B) for ggplot next
combined <- data.frame(A = X, B = acc)

## take a look at combined
head(combined)
##   A         B
## 1 1 0.8242743
## 2 2 0.8242743
## 3 3 0.8242743
## 4 4 0.8242743
## 5 5 0.8527576
## 6 6 0.8527939
## Scatter plot of the mean cross-validated accuracy (B) against the number
## of neighbours k (A). Per the preview above, accuracy is flat for k = 1..4
## and higher from k = 5, so k = 1 is not the best value.
g <- ggplot(combined, aes(x = A, y = B)) +
  geom_point()
g + labs(title = "Accuracy by K value", x = "k value", y = "Accuracy")

set.seed(1)
## Refit on the full training set (training + validation rows) with k = 17
## and predict the held-out test rows, which played no part in choosing k.
## NOTE(review): k = 17 is hard-coded; confirm it is the k with the highest
## cross-validated accuracy in `combined` (earlier notes mentioned k = 1).
kknn.model_optimized <- kknn(formula = V11 ~ .,
                             train = train_cv[, 1:11], # full training set
                             test = test_cv[, 1:11],   # unseen test rows
                             k = 17,
                             scale = TRUE)
set.seed(1)
## Turn the fitted values into 0/1 predictions (add 0.5, truncate to integer)
preds_optimized <- as.integer(fitted(kknn.model_optimized) + 0.5)

## Final test accuracy: 83.07%. This is below the cross-validated accuracy
## seen while tuning k (reported as about 86.8%), as expected for data the
## model has never seen.
sum(preds_optimized == test_cv[, 11]) / nrow(test_cv)
## [1] 0.8307692

Problem 3.1(b)

library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(ggplot2)
## Load the credit card data (columns V1..V10 are predictors, V11 the response)
data <- read.table("credit_card_data.txt")

## Look at a sample of the data
head(data)
##   V1    V2    V3   V4 V5 V6 V7 V8  V9 V10 V11
## 1  1 30.83 0.000 1.25  1  0  1  1 202   0   1
## 2  0 58.67 4.460 3.04  1  0  6  1  43 560   1
## 3  0 24.50 0.500 1.50  1  1  0  1 280 824   1
## 4  1 27.83 1.540 3.75  1  0  5  0 100   3   1
## 5  1 20.17 5.625 1.71  1  1  0  1 120   0   1
## 6  1 32.08 4.000 2.50  1  1  0  0 360   0   1
set.seed(1)
## Take a random sample of 60% of the data aside for training
mask_train <- sample(nrow(data), size = floor(nrow(data) * 0.6))

## Store this data in the 'train' dataframe
train <- data[mask_train, ]

## Store the remaining 40% as 'leftover'
leftover <- data[-mask_train, ]

## Split the leftover rows in half: first half -> validation, rest -> test.
## floor() keeps the boundary an integer; with an odd row count the previous
## `nrow(leftover)/2` bound produced fractional indices and silently dropped
## the last row. With an odd count the extra row now goes to `test`.
## NOTE(review): `leftover` keeps the original row order, so validation gets
## the earlier rows and test the later ones. If the file is ordered (e.g. by
## class) the two sets differ systematically; consider shuffling `leftover`
## first. Not changed here because it would alter the recorded results.
half <- floor(nrow(leftover) / 2)
validation <- leftover[seq_len(half), ]
test <- leftover[half + seq_len(nrow(leftover) - half), ]

## make the results reproducible
set.seed(1)

## Candidate C values: powers of ten from 1e-08 to 1e+08 (17 values to test)
x <- 10^seq(-8, 8, 1)

## Validation accuracy of an RBF-kernel C-svc model for each candidate C.
## vapply() guarantees one numeric per candidate, and the callback argument
## is named `cost` so it no longer shadows the outer vector `x`.
accuracy <- vapply(x, function(cost) {
  ## Reseed before every fit so each candidate is trained from the same RNG
  ## state (presumably ksvm's automatic kernel-parameter estimate is
  ## randomised; see the kernlab documentation).
  set.seed(1)
  model_scaled <- ksvm(V11 ~ .,
                       data = train,      # the 60% training data
                       type = "C-svc",    # C-classification
                       kernel = "rbfdot",
                       C = cost,
                       scaled = TRUE)     # have ksvm scale the data for you

  set.seed(1)
  ## Predict on the validation data (20% of our 654 rows)
  pred_scaled <- predict(model_scaled, validation[, 1:10])

  ## Share of validation rows classified correctly
  sum(pred_scaled == validation$V11) / nrow(validation)
}, numeric(1))


## Pair each candidate C (column A) with its validation accuracy (column B)
## for ggplot next; this overwrites the `combined` built for the kNN results.
combined <- data.frame(A = x, B = accuracy)

## take a look at combined
combined
##        A         B
## 1  1e-08 0.3358779
## 2  1e-07 0.3358779
## 3  1e-06 0.3358779
## 4  1e-05 0.3358779
## 5  1e-04 0.3358779
## 6  1e-03 0.3358779
## 7  1e-02 0.3358779
## 8  1e-01 0.8244275
## 9  1e+00 0.8244275
## 10 1e+01 0.7862595
## 11 1e+02 0.7557252
## 12 1e+03 0.7251908
## 13 1e+04 0.7251908
## 14 1e+05 0.7022901
## 15 1e+06 0.7022901
## 16 1e+07 0.7022901
## 17 1e+08 0.7022901
## Plot validation accuracy (B) against C (A). C runs from 1e-08 to 1e+08, so
## the x axis is log10-scaled; on a linear axis every point except the
## largest C values would be squashed against zero.
g <- ggplot(data = combined, aes(A, B)) + geom_point() + scale_x_log10()
g + ggtitle("Accuracy by C value") +
  xlab("C value") + ylab("Accuracy")

## C = 0.1 and C = 1 tie for the highest validation accuracy (82.44%).

set.seed(1)
## Train the final model on the training data with C = 1, one of the two
## tied best values (the other, C = 0.1, is not used here).
model_highest_C <- ksvm(V11 ~ .,
                        data = train,
                        type = "C-svc",    # C-classification
                        kernel = "rbfdot",
                        C = 1,
                        scaled = TRUE)     # have ksvm scale the data for you

## Predict on 'test', the remaining 20% of our 654 rows, which was used
## neither for training nor for choosing C
set.seed(1)
pred_highest_C <- predict(model_highest_C, test[, 1:10])

## Final accuracy of the model on the test data: 90.8%
sum(pred_highest_C == test$V11) / nrow(test)
## [1] 0.9083969