knitr::opts_chunk$set(echo = TRUE)
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.3.3
## 
## Attaching package: 'kernlab'
## 
## The following object is masked from 'package:purrr':
## 
##     cross
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha
# Strongly prefer fixed over scientific notation when numbers are printed.
options(scipen = 999)
# Read the credit-card data set; the columns are referred to as V1..V11 in
# the rest of the script.
data <- read.table(file = "credit_card_data.txt")
library(kknn)
## Warning: package 'kknn' was built under R version 4.3.3
###Q1###
# A classification problem I have run into at work is predicting whether my boss will buy lunch for the team on a given day.
# Predictors for whether lunch gets bought could include:
# the number of team members in the office, the time since lunch was last bought, whether he packed his own lunch (an automatic no), and the average time employees spend in the office.
# Plenty of other predictors could be used as well.
###Q2###
# Hold out part of the data so the model can be scored on rows it was not
# fitted on. Fixing the seed makes the random split reproducible.
set.seed(101)

n <- nrow(data)

# Draw 75% of the row indices (rounded down) for training; every row that
# was not drawn goes into the test set.
Data_Manip <- sample(seq_len(n), size = floor(0.75 * n))
Training <- data[Data_Manip, ]
Testing <- data[-Data_Manip, ]


# Fit a linear-kernel ("vanilladot") C-classification SVM with the first ten
# columns as predictors and column 11 as the class label. scaled = TRUE asks
# ksvm to scale the predictors itself.
Model <- ksvm(
  as.matrix(Training[, 1:10]),
  as.factor(Training[, 11]),
  type = "C-svc",
  kernel = "vanilladot",
  C = 1000,
  scaled = TRUE
)
##  Setting default kernel parameters
# Weights a1..a10 of the separating hyperplane: each support vector's row is
# multiplied by its coefficient and the products are summed per column.
a <- local({
  support_vectors <- Model@xmatrix[[1]]
  sv_coefficients <- Model@coef[[1]]
  colSums(support_vectors * sv_coefficients)
})
a
##             V1             V2             V3             V4             V5 
## -0.00041315694 -0.00001943761  0.00018928438  0.00054986850  0.99498813752 
##             V6             V7             V8             V9            V10 
##  0.00074831825  0.00020880793 -0.00011441726  0.00001301294  0.00200001563
# calculate a0
# kernlab stores the *negative* intercept in the `b` slot (its decision value
# is the kernel expansion minus b), so the intercept a0 of the classifier
# a1*x1 + ... + a10*x10 + a0 = 0 is -b, not b.
a0 <- -Model@b
a0
## [1] 0.1106239
# Classify the training rows with the fitted model (in-sample predictions)
# and show the predicted labels.
pred <- predict(object = Model, newdata = Training[, 1:10])
print(pred)
##   [1] 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 1 1 1 0 1 1 1 1 0 1 0 0 1 0 1 0 1 1 1
##  [38] 1 1 1 1 1 0 1 1 0 0 1 0 0 1 0 1 0 0 1 1 1 0 1 0 1 1 1 1 1 0 1 0 1 0 0 0 0
##  [75] 0 1 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 1 0 0 1 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1
## [112] 1 0 1 1 0 1 1 1 0 0 0 0 1 0 0 1 1 1 0 0 1 1 1 0 1 1 1 0 1 1 0 0 1 0 0 0 1
## [149] 0 0 1 0 0 1 1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 1 1 0 0 0 1 0 0 0 1 1 0
## [223] 1 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 0 0 1 1 1 1
## [260] 0 0 1 0 1 0 0 0 1 1 1 1 1 1 1 0 1 0 0 1 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 1
## [297] 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 0 0 1 1 1 0 0 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1
## [334] 1 1 0 1 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0
## [371] 0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0 0 0 1
## [408] 1 1 1 0 0 0 0 0 1 1 1 0 1 0 0 1 1 1 1 0 1 1 0 0 0 0 0 0 1 0 1 1 1 1 1 1 0
## [445] 0 1 1 1 0 0 0 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 0 0 1 1 0 0 1 1 0 1
## [482] 1 0 0 1 1 1 0 0 1
## Levels: 0 1
# Fraction of the model's training-set predictions that match the actual
# class in column 11. This is in-sample accuracy, so it is optimistic.
accuracy <- mean(pred == Training[, 11])
# Display the value so the report shows the number the discussion relies on.
accuracy

# Changing C does not seem to change the agreement between the ksvm predictions and the actual credit decisions very much.
# I suspect that taking V5 out of the data would lead to more variable results, since V5 heavily dominates the decision.
# This would likely mean that our credit approval process is heavily driven by a single factor.

## Experiment: take out V5, the variable doing the heavy lifting above
# Training rows without V5; V11 (the class label) stays as the last column.
Data_Minus_5 <- Training %>%
  dplyr::select(V1:V4, V6:V11)
# Held-out rows without V5. This is built from Testing, not Training, so it
# can be used to score the reduced model on data it was not fitted on.
Test_Data_Minus_5 <- Testing %>%
  dplyr::select(V1:V4, V6:V11)

# Refit the linear C-classification SVM on the nine remaining predictors
# (columns 1-9 of Data_Minus_5), with column 10 (V11) as the class label.
# NOTE(review): this fit uses C = 100 while the full model uses C = 1000 —
# confirm the difference is intended, since it changes two things at once.
Model_Minus_5 <- ksvm(
  as.matrix(Data_Minus_5[, 1:9]),
  as.factor(Data_Minus_5[, 10]),
  type = "C-svc",
  kernel = "vanilladot",
  C = 100,
  scaled = TRUE
)
##  Setting default kernel parameters
# Hyperplane weights of the reduced model: coefficient-weighted support
# vectors summed per column.
a_minus <- local({
  support_vectors <- Model_Minus_5@xmatrix[[1]]
  sv_coefficients <- Model_Minus_5@coef[[1]]
  colSums(support_vectors * sv_coefficients)
})
a_minus
##           V1           V2           V3           V4           V6           V7 
##  0.018527112 -0.086552523 -0.005562132  0.660293431 -0.180066087  0.901462584 
##           V8           V9          V10 
##  0.008074218 -0.084098907  2.098990301
# calculate a0
# kernlab's `b` slot holds the negative intercept, so the intercept of the
# reduced model's classifier equation is -b.
a0_minus <- -Model_Minus_5@b
a0_minus
## [1] 0.1714124
# Classify the training rows with the reduced (no-V5) model and show the
# predicted labels.
pred_minus <- predict(object = Model_Minus_5, newdata = Data_Minus_5[, 1:9])
print(pred_minus)
##   [1] 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 1 0 1
##  [38] 1 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 1 1 1 0 0 0 0
##  [75] 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 1 0
## [112] 0 0 0 1 0 1 1 1 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 0 1 1 1 1 0 1 0 0
## [186] 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 1 0 1 0 1 0 0 0 1 0 1
## [223] 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 1 1 1 0 1 0 0 1 0 1 0 1 0 0 0 0 1 1
## [260] 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1
## [297] 1 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 1 1 0 0 1 1
## [334] 1 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0 1 1 0 0 0 1 0 1 0 0 0 1
## [371] 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0
## [408] 1 1 1 0 0 0 0 0 1 1 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 1 0
## [445] 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 1 1 0 1 0 0 0 0 0 1 0 0 1 1 0 1
## [482] 1 0 0 0 1 0 0 0 0
## Levels: 0 1
# Fraction of the reduced model's training-set predictions that match the
# actual class (column 10 of Data_Minus_5, i.e. V11). In-sample accuracy.
accuracy_minus <- mean(pred_minus == Data_Minus_5[, 10])
# Display the value so the report shows the number the discussion relies on.
accuracy_minus

# Honestly, it didn't really change that much; it's interesting to see which variables were the next most important, though.

# Score the full model (fitted on Training) on the held-out Testing rows.
Test_Pred <- predict(Model, Testing[, 1:10])
# Out-of-sample accuracy: share of test predictions that match column 11.
test_comp <- mean(Test_Pred == Testing[, 11])
# Display the value so the report shows the number the discussion relies on.
test_comp

# The test set shows higher accuracy than the training set, which could mean its classes are more cleanly separated than those in the training data.
# The misclassifications that remain suggest that, with the kernel currently used, the classes overlap in ways that cannot be separated correctly in the actual banking data.

###Chat GPT assisted Code for creating a graph of the classification ###

# Editable list of C (cost) values to sweep over when measuring accuracy.
C_values <- c(0.1, 1, 10, 100, 1000, 10000, 20000)

# Fit one linear SVM per C value and record its accuracy on the training and
# test sets. Each iteration stores a one-row data frame in a preallocated
# list, and the rows are bound once after the loop rather than growing
# `results` with rbind() on every pass. The loop value is named `cost` so it
# does not shadow base::c().
result_rows <- vector("list", length(C_values))
for (i in seq_along(C_values)) {
  cost <- C_values[[i]]
  m <- ksvm(as.matrix(Training[, 1:10]), as.factor(Training[, 11]),
            type = "C-svc", kernel = "vanilladot", C = cost, scaled = TRUE)

  train_pred <- predict(m, Training[, 1:10])
  test_pred <- predict(m, Testing[, 1:10])

  # Share of predictions that match the actual class in column 11.
  train_acc <- mean(train_pred == Training[, 11])
  test_acc <- mean(test_pred == Testing[, 11])

  result_rows[[i]] <- data.frame(
    C = cost,
    Training = train_acc,
    Testing = test_acc
  )
}
# One row per C value, with columns C, Training and Testing.
results <- do.call(rbind, result_rows)
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters
## Plot accuracy vs C
# Reshape to long format: one row per (C, dataset) pair, with the dataset
# name in `variable` and its accuracy in `value`, so ggplot can draw one
# coloured line per dataset. The argument is spelled out as `id.vars`
# instead of relying on partial matching of `id`.
results_long <- reshape2::melt(results, id.vars = "C")
#I've never seen melt before, it seems helpful for visualizing and analysis, like transposing data. 

# Line-and-point chart of accuracy against C, one colour per dataset.
# The C values span several orders of magnitude, so the x axis is log10.
ggplot(
  data = results_long,
  mapping = aes(x = C, y = value, color = variable)
) +
  geom_line() +
  geom_point() +
  scale_x_log10() +
  labs(
    title = "K-SVM Accuracy vs C",
    x = "C (log scale)",
    y = "Accuracy",
    color = "Dataset"
  ) +
  theme_minimal()

#There isn't much difference in accuracy no matter the C value, I'm confused why this is

###End of Chat assisted code###
# NOTE(review): `probs` and `actual` are used below, but the code defining
# them does not appear in this script. The fit here is reconstructed from the
# printed kknn call that follows it — confirm it matches the original run.
knn_model <- kknn(formula = V11 ~ ., train = Training, test = Testing,
                  k = 12, scale = TRUE)
knn_model
## 
## Call:
## kknn(formula = V11 ~ ., train = Training, test = Testing, k = 12,     scale = TRUE)
## 
## Response: "continuous"
# The response is reported as continuous, so the fitted values are numeric
# scores; they are read here as the predicted probability of class 1
# (presumably a weighted average of the neighbours' 0/1 labels — TODO confirm).
probs <- fitted(knn_model)
# Actual class labels of the held-out rows.
actual <- Testing$V11

##Chat GPT assisted in coding for visualizing##
# One row per test observation:
#   Actual         - the true class
#   PredictedProb  - the kNN fitted value
#   PredictedClass - the fitted value cut at 0.5
#   Correct        - "Correct" only for confident hits (>= 0.6 for class 1,
#                    < 0.4 for class 0); everything else, including every
#                    fitted value from 0.4 up to 0.6, is "Incorrect".
plot_df <- data.frame(
  Actual = actual,
  PredictedProb = probs,
  PredictedClass = ifelse(probs >= 0.5, 1, 0),
  Correct = ifelse(
    (probs >= 0.6 & actual == 1) | (probs < 0.4 & actual == 0),
    "Correct",
    "Incorrect"
  )
)


# Scatter of each row's predicted value (x) against its actual class (y),
# coloured by the Correct flag. Points are jittered vertically only, so each
# actual class spreads into a visible band without shifting along x.
ggplot(
  data = plot_df,
  mapping = aes(x = PredictedProb, y = Actual, color = Correct)
) +
  geom_jitter(width = 0, height = 0.05, size = 2, alpha = 0.7) +
  scale_color_manual(values = c(Correct = "blue", Incorrect = "red")) +
  labs(
    title = "kNN Predictions vs Actual Classes",
    x = "Predicted Probability of Class 1",
    y = "Actual Class",
    color = "Prediction"
  ) +
  theme_minimal()

##End of AI assisted code##

# You can change how many points the graph marks as correct by setting stricter or more lenient probability thresholds for what counts as a correct prediction.