# KNN with LOOCV ----

library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
# Training and validation sets. The modelling code below reads the `quality`,
# `alcohol` and `density` columns of `train` and the `quality` column of
# `test`. NOTE: the data frame `train` shares its name with caret::train();
# R still resolves `train(...)` calls to the function.
train <- read.csv("wineq_train.csv", stringsAsFactors = FALSE)
test <- read.csv("wineq_validation.csv", stringsAsFactors = FALSE)

# Fit a KNN regression of `quality` on `alcohol` and `density`, evaluated with
# leave-one-out cross-validation.
#
# Reads the global data frame `train`.
#
# Args:
#   k:      Number of neighbours; used as the `k` column of the tuning grid.
#   center: If TRUE, the predictors are centered and scaled.
#
# Returns:
#   The fitted caret `train` object.
train_2vars <- function(k, center = FALSE) {
  # Scaling is delegated to train() instead of being applied to the whole data
  # frame up front. That way only the predictors are transformed (the response
  # stays in its original units), the scaling statistics are estimated inside
  # each resample, and predict() applies the same transformation to new data.
  pre_proc_methods <- if (center) c("center", "scale") else NULL

  ctrl <- trainControl(method = "LOOCV")
  train(
    quality ~ alcohol + density,
    data = train,
    method = "knn",
    preProcess = pre_proc_methods,
    tuneGrid = data.frame(k = k),
    trControl = ctrl
  )
}

# Fit a KNN regression of `quality` on every other column of the training
# data, evaluated with leave-one-out cross-validation.
#
# Reads the global data frame `train`.
#
# Args:
#   k:      Number of neighbours; used as the `k` column of the tuning grid.
#   center: If TRUE, the predictors are centered and scaled.
#
# Returns:
#   The fitted caret `train` object.
train_allvars <- function(k, center = FALSE) {
  # Scaling is delegated to train() instead of being applied to the whole data
  # frame up front. That way only the predictors are transformed (the response
  # stays in its original units), the scaling statistics are estimated inside
  # each resample, and predict() applies the same transformation to new data.
  pre_proc_methods <- if (center) c("center", "scale") else NULL

  # Specify the cross-validation method.
  ctrl <- trainControl(method = "LOOCV")
  train(
    quality ~ .,
    data = train,
    method = "knn",
    preProcess = pre_proc_methods,
    tuneGrid = data.frame(k = k),
    trControl = ctrl
  )
}

# Share of validation rows whose predicted quality matches the observed one.
#
# Reads the global data frame `test` and compares against its `quality`
# column.
#
# Args:
#   model: A fitted model accepted by predict(model, newdata = test).
#
# Returns:
#   A single number: the mean of the element-wise match indicator.
compute_model_accuracy <- function(model) {
  preds <- predict(model, newdata = test)
  actual <- test$quality

  # Regression models return continuous predictions, which would essentially
  # never be exactly equal to a quality score. Snap them to the nearest whole
  # number first; non-numeric (e.g. factor) predictions are compared as-is.
  if (is.numeric(preds)) {
    preds <- round(preds)
  }

  mean(preds == actual)
}


# Unscaled LOOCV fits, kept for reference:
# models_list <- lapply(1:21, train_2vars)
# allvar_models_list <- lapply(1:21, train_allvars)
# save(models_list, file = "knn_models.RData")
# save(allvar_models_list, file = "all_knn_models.RData")

# Candidate neighbourhood sizes; one LOOCV fit is produced per value.
k_values <- 1:21

scaled_models_list <- lapply(k_values, train_2vars, center = TRUE)
scaled_allvar_models_list <- lapply(k_values, train_allvars, center = TRUE)

save(scaled_models_list, file = "scaled_knn_models.RData")
save(scaled_allvar_models_list, file = "scaled_all_knn_models.RData")

# To reuse earlier fits instead of refitting. load() restores the saved
# objects under their original names; its return value is only those names.
# load("scaled_knn_models.RData")
# load("scaled_all_knn_models.RData")

# Each model was tuned over a single k, so min() just extracts its one RMSE.
# vapply() guarantees a plain numeric vector.
rmse_values <- vapply(
  scaled_models_list,
  function(model) min(model$results$RMSE),
  numeric(1)
)
rmse_df <- data.frame(k = k_values, RMSE = rmse_values)
plot(rmse_df, main = "KNN with LOOCV. density and alcohol as predictors")

all_rmse_values <- vapply(
  scaled_allvar_models_list,
  function(model) min(model$results$RMSE),
  numeric(1)
)
all_rmse_df <- data.frame(k = k_values, RMSE = all_rmse_values)
plot(all_rmse_df, main = "KNN with LOOCV. All variables as predictors")

# Final model: all predictors with k = 21. The list above already holds the
# result of train_allvars(21, TRUE), so reuse it instead of running LOOCV
# a second time.
final_k <- 21
all <- scaled_allvar_models_list[[match(final_k, k_values)]]

yhat <- predict(all, newdata = test)
write.table(
  yhat,
  file = "mySubmission.txt",
  row.names = FALSE,
  col.names = FALSE
)

# KNN with 10-fold cross-validation ----

# Fit a KNN regression of `quality` on `alcohol` and `density`, evaluated with
# 10-fold cross-validation.
#
# Reads the global data frame `train`.
#
# Args:
#   k:      Number of neighbours; used as the `k` column of the tuning grid.
#   center: If TRUE, the predictors are centered and scaled.
#
# Returns:
#   The fitted caret `train` object.
train_2vars_kfold <- function(k, center = FALSE) {
  # Scaling is delegated to train() instead of being applied to the whole data
  # frame up front. That way only the predictors are transformed (the response
  # stays in its original units), the scaling statistics are estimated inside
  # each fold, and predict() applies the same transformation to new data.
  pre_proc_methods <- if (center) c("center", "scale") else NULL

  ctrl <- trainControl(method = "cv", number = 10)
  train(
    quality ~ alcohol + density,
    data = train,
    method = "knn",
    preProcess = pre_proc_methods,
    tuneGrid = data.frame(k = k),
    trControl = ctrl
  )
}


# Fit a KNN regression of `quality` on every other column of the training
# data, evaluated with 10-fold cross-validation.
#
# Reads the global data frame `train`.
#
# Args:
#   k:      Number of neighbours; used as the `k` column of the tuning grid.
#   center: If TRUE, the predictors are centered and scaled.
#
# Returns:
#   The fitted caret `train` object.
train_allvars_kfold <- function(k, center = FALSE) {
  # Scaling is delegated to train() instead of being applied to the whole data
  # frame up front. That way only the predictors are transformed (the response
  # stays in its original units), the scaling statistics are estimated inside
  # each fold, and predict() applies the same transformation to new data.
  pre_proc_methods <- if (center) c("center", "scale") else NULL

  ctrl <- trainControl(method = "cv", number = 10)
  train(
    quality ~ .,
    data = train,
    method = "knn",
    preProcess = pre_proc_methods,
    tuneGrid = data.frame(k = k),
    trControl = ctrl
  )
}

# Candidate neighbourhood sizes; one 10-fold CV fit is produced per value.
kfold_k_values <- 1:21

# The two-predictor list must come from train_2vars_kfold; building it with
# the all-variables trainer would make both plots below show the same model
# family.
models_2var_kfold_list <- lapply(
  kfold_k_values, train_2vars_kfold,
  center = TRUE
)

models_allvar_kfold_list <- lapply(
  kfold_k_values, train_allvars_kfold,
  center = TRUE
)

# save(models_2var_kfold_list, file = "models_2var_kfold_list.RData")
# save(models_allvar_kfold_list, file = "models_allvar_kfold_list.RData")

# Each model was tuned over a single k, so min() just extracts its one RMSE.
# vapply() guarantees a plain numeric vector.
kfold_2var_rmse_values <- vapply(
  models_2var_kfold_list,
  function(model) min(model$results$RMSE),
  numeric(1)
)
kfold_2var_rmse_df <- data.frame(
  k = kfold_k_values,
  RMSE = kfold_2var_rmse_values
)
plot(
  kfold_2var_rmse_df,
  main = "KNN with 10-FOLD. density and alcohol as predictors"
)

kfold_all_rmse_values <- vapply(
  models_allvar_kfold_list,
  function(model) min(model$results$RMSE),
  numeric(1)
)
kfold_all_rmse_df <- data.frame(
  k = kfold_k_values,
  RMSE = kfold_all_rmse_values
)
plot(
  kfold_all_rmse_df,
  main = "KNN with 10-FOLD. All variables as predictors"
)