## Problem 3.1(a) ----
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(kknn)
##
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
##
## contr.dummy
library(ggplot2)
## Read the credit card data set from the working directory.
## The printed preview shows the default column names V1..V11.
data_cv <- read.table(file = "credit_card_data.txt")
## Preview the first rows of the imported table
head(x = data_cv)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## 1 1 30.83 0.000 1.25 1 0 1 1 202 0 1
## 2 0 58.67 4.460 3.04 1 0 6 1 43 560 1
## 3 0 24.50 0.500 1.50 1 1 0 1 280 824 1
## 4 1 27.83 1.540 3.75 1 0 5 0 100 3 1
## 5 1 20.17 5.625 1.71 1 1 0 1 120 0 1
## 6 1 32.08 4.000 2.50 1 1 0 0 360 0 1
## Hold out a random 20% of the rows as the final test set; the remaining 80%
## is kept for training and cross-validation.
set.seed(1)
mask_test <- sample(nrow(data_cv), size = floor(nrow(data_cv) * 0.2))
test_cv <- data_cv[mask_test, ]
train_cv <- data_cv[-mask_test, ]
## Shuffle the training rows so that fold membership is random.
## The seed is reset first so the shuffle is reproducible on its own.
set.seed(1)
data_shuffled <- train_cv[sample(nrow(train_cv)), ]
## Assign every shuffled row to one of 10 contiguous, (near-)equally sized
## folds. cut() is deterministic, so no seed is needed for this step.
folds <- cut(seq_len(nrow(data_shuffled)), breaks = 10, labels = FALSE)
## Estimate the cross-validated accuracy of a k-nearest-neighbour classifier.
##
## For every fold id present in the global `folds` vector, the rows of the
## global `data_shuffled` that belong to that fold are held out as validation
## data, a kknn model is fitted on the remaining rows, and the share of
## correctly classified validation rows is recorded.
##
## Args:
##   X: number of nearest neighbours, passed to kknn() as `k`.
## Returns:
##   The mean validation accuracy over all folds (a single number).
check_accuracy <- function(X) {
  ## Take the fold ids from `folds` itself instead of hard-coding 10, so the
  ## loop always matches the folds that actually exist.
  fold_ids <- sort(unique(folds))
  ## vapply() collects one accuracy per fold without growing a list
  fold_accuracy <- vapply(fold_ids, function(i) {
    ## A logical mask splits the rows safely: negating it can never select
    ## "no rows" the way a negative index built from an empty which() would.
    in_fold <- folds == i
    validationData <- data_shuffled[in_fold, ]
    trainData <- data_shuffled[!in_fold, ]
    kknn.model <- kknn(V11 ~ .,
                       trainData[, 1:11],
                       validationData[, 1:11],
                       k = X,
                       scale = TRUE)
    ## Adding 0.5 and truncating rounds the fitted values to a 0/1 label
    ## (presumably continuous because V11 is stored as a number - confirm).
    ## No seed is set here: nothing in this function draws random numbers.
    preds <- as.integer(fitted(kknn.model) + 0.5)
    ## Share of validation rows whose predicted label matches column 11
    sum(preds == validationData[, 11]) / nrow(validationData)
  }, numeric(1))
  ## Average the per-fold accuracies for this value of X
  mean(fold_accuracy)
}
## Candidate neighbour counts to evaluate
X <- seq(1, 100)
## Mean 10-fold cross-validated accuracy for every candidate, as a plain
## numeric vector (one call to check_accuracy() per value of X)
acc <- vapply(X, check_accuracy, numeric(1))
## One row per candidate: A = number of neighbours, B = mean CV accuracy
combined <- data.frame(A = X, B = acc)
## take a look at combined
head(combined)
## A B
## 1 1 0.8242743
## 2 2 0.8242743
## 3 3 0.8242743
## 4 4 0.8242743
## 5 5 0.8527576
## 6 6 0.8527939
## Scatter plot of the mean cross-validated accuracy (B) against the number
## of neighbours (A). In the rows printed above, k = 5 and k = 6 already score
## higher than k = 1 to 4, so k = 1 is not the best choice.
g <- ggplot(data = combined, mapping = aes(x = A, y = B)) +
  geom_point()
g +
  labs(title = "Accuracy by K value", x = "k value", y = "Accuracy")
## Refit with the chosen number of neighbours (k = 17) on the full training
## set (training + validation rows) and predict the held-out test rows, which
## played no part in the cross-validation above.
set.seed(1)
kknn.model_optimized <- kknn(
  formula = V11 ~ .,
  train = train_cv[, 1:11],  # the full training dataset
  test = test_cv[, 1:11],    # data the model has never seen before
  k = 17,
  scale = TRUE
)
set.seed(1)
## Turn the fitted values into 0/1 labels by adding 0.5 and truncating
preds_optimized <- as.integer(fitted(kknn.model_optimized) + 0.5)
## Final test accuracy: share of test rows predicted correctly (83.08% in the
## recorded run). The original note compares this with roughly 86.8% in
## cross-validation, where it is expected to be somewhat lower.
sum(preds_optimized == test_cv[["V11"]]) / nrow(test_cv)
## [1] 0.8307692
## Problem 3.1(b) ----
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(ggplot2)
## Read the credit card data set from the working directory
data <- read.table(file = "credit_card_data.txt")
## Preview the first rows of the imported table
head(x = data)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## 1 1 30.83 0.000 1.25 1 0 1 1 202 0 1
## 2 0 58.67 4.460 3.04 1 0 6 1 43 560 1
## 3 0 24.50 0.500 1.50 1 1 0 1 280 824 1
## 4 1 27.83 1.540 3.75 1 0 5 0 100 3 1
## 5 1 20.17 5.625 1.71 1 1 0 1 120 0 1
## 6 1 32.08 4.000 2.50 1 1 0 0 360 0 1
## Take a random sample of 60% of the data aside for training
set.seed(1)
mask_train <- sample(nrow(data), size = floor(nrow(data) * 0.6))
## Store this data in the 'train' dataframe
train <- data[mask_train, ]
## Store the remaining 40% as 'leftover'
leftover <- data[-mask_train, ]
## Split the leftover rows into a validation half and a test half.
## Integer division keeps every index whole: with an odd number of leftover
## rows, n / 2 is fractional and the last row would silently land in neither
## set. Here the extra row goes to 'test'.
## NOTE(review): 'leftover' keeps the row order of the input file, so the two
## halves are its earlier and later rows rather than a random partition. If
## the file is ordered by class this skews both sets - consider shuffling
## 'leftover' first (doing so would change the recorded results).
half <- nrow(leftover) %/% 2
validation <- leftover[seq_len(half), ]
test <- leftover[half + seq_len(nrow(leftover) - half), ]
## Candidate C values in powers of 10 from 1e-08 to 1e+08 (17 values to test).
## The grid is deterministic, so no seed is needed here.
x <- 10^seq(-8, 8, 1)
## Fit one RBF-kernel SVM per candidate C on the training data and record the
## accuracy of each model on the validation data.
## vapply() guarantees exactly one numeric accuracy per candidate, and the
## lambda parameter has its own name so it does not shadow the grid 'x'.
accuracy <- vapply(x, function(c_value) {
  ## Re-seed before every fit so each candidate is trained under the same
  ## random state (presumably needed because ksvm's automatic kernel-width
  ## estimate for rbfdot is sample-based - confirm against the kernlab docs).
  set.seed(1)
  model_scaled <- ksvm(V11 ~ .,
                       data = train,        # use the 60% training data
                       type = "C-svc",      # C-classification
                       kernel = "rbfdot",
                       C = c_value,
                       scaled = TRUE)       # have ksvm scale the data
  ## Predict on the validation data (the first half of the leftover rows)
  pred_scaled <- predict(model_scaled, validation[, 1:10])
  ## Share of validation rows predicted correctly
  sum(pred_scaled == validation$V11) / nrow(validation)
}, numeric(1))
## Pair every candidate C (column A) with its validation accuracy (column B)
combined <- data.frame(A = x, B = accuracy)
## Print the full table
combined
## A B
## 1 1e-08 0.3358779
## 2 1e-07 0.3358779
## 3 1e-06 0.3358779
## 4 1e-05 0.3358779
## 5 1e-04 0.3358779
## 6 1e-03 0.3358779
## 7 1e-02 0.3358779
## 8 1e-01 0.8244275
## 9 1e+00 0.8244275
## 10 1e+01 0.7862595
## 11 1e+02 0.7557252
## 12 1e+03 0.7251908
## 13 1e+04 0.7251908
## 14 1e+05 0.7022901
## 15 1e+06 0.7022901
## 16 1e+07 0.7022901
## 17 1e+08 0.7022901
## Scatter plot of validation accuracy (B) against the candidate C value (A).
## The candidates span 1e-08 to 1e+08, so the x axis is drawn on a log10
## scale; on a linear axis all but the largest values collapse near zero.
g <- ggplot(data = combined, aes(A, B)) + geom_point()
g + scale_x_log10() +
  ggtitle("Accuracy by C value") +
  xlab("C value") + ylab("Accuracy")
## C = 0.1 and C = 1 tie for the highest validation accuracy (82.44%);
## the final model below is trained with C = 1.
set.seed(1)
model_highest_C <- ksvm(
  V11 ~ .,
  data = train,
  type = "C-svc",     # C-classification
  kernel = "rbfdot",
  C = 1,
  scaled = TRUE       # have ksvm scale the data
)
## Predict on 'test', the half of the leftover rows that was used neither for
## training nor for choosing C
set.seed(1)
pred_highest_C <- predict(model_highest_C, test[, 1:10])
## Final accuracy of the model on the test rows (90.8% in the recorded run)
sum(pred_highest_C == test[["V11"]]) / nrow(test)
## [1] 0.9083969