Using all features: accuracy plot, LOOCV vs 10-fold CV

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

set.seed(123)

# Load the training data
training_data <- read.csv("wineq_train.csv", stringsAsFactors = FALSE)
if (!"quality" %in% names(training_data)) {
  stop("wineq_train.csv must contain a 'quality' column", call. = FALSE)
}

# Convert the target variable to a factor (classification)
training_data$quality <- as.factor(training_data$quality)

# Pick the predictors by name rather than by position: dropping "the last
# column" silently leaves a real feature unscaled whenever `quality` is not
# stored last in the CSV.
feature_cols <- setdiff(names(training_data), "quality")

# Standardize the features (centering and scaling parameters are estimated
# from the training predictors only)
preprocess_params <- preProcess(training_data[, feature_cols, drop = FALSE],
                                method = c("center", "scale"))
training_scaled <- predict(preprocess_params, training_data)

# Make sure the (unscaled) factor target is present in the modelling frame
training_scaled$quality <- training_data$quality

# Function to run KNN with different k values and cross-validation methods
# Evaluate KNN classifiers (k = 1..20) on `training_scaled` under one
# resampling scheme, print the accuracy table and draw the accuracy curve.
#
# Arguments:
#   train_control_method  resampling method handed to trainControl(),
#                         e.g. "cv" or "LOOCV".
#   number                value handed to trainControl(number = ); the number
#                         of folds when the method is "cv".
#   plot_color            colour used for the curve.
#   first_plot            TRUE starts a new plot, FALSE overlays the curve on
#                         the currently open plot with lines().
#   ylim                  optional y-axis limits for a new plot. When NULL the
#                         range of this run's accuracies is padded slightly so
#                         a curve overlaid later is less likely to be clipped.
#
# Returns a list with `k_values`, `accuracy_results` (one accuracy per k) and
# `cv_method_name` (label for the legend).
#
# Relies on the global data frame `training_scaled` with a factor column
# `quality`.
run_knn_cv <- function(train_control_method, number = 10, plot_color = "blue",
                       first_plot = FALSE, ylim = NULL) {
  # Set up the resampling scheme
  train_control <- trainControl(method = train_control_method, number = number)

  # Range of k values to test
  k_values <- 1:20

  # Tune every k in ONE train() call. Calling train() once per k (as before)
  # draws new folds for each k, so differences between k values are mixed with
  # fold-to-fold noise, and it repeats the resampling set-up 20 times.
  knn_model <- train(quality ~ ., data = training_scaled,
                     method = "knn",
                     trControl = train_control,
                     tuneGrid = data.frame(k = k_values))

  # Align the accuracies with k_values explicitly instead of trusting row order
  tuning_results <- knn_model$results
  accuracy_results <- tuning_results$Accuracy[match(k_values, tuning_results$k)]

  # Print accuracy for each k
  print(data.frame(k = k_values, Accuracy = accuracy_results))

  # Legend label: derive the fold count from `number` so the label stays
  # correct when a caller does not use 10 folds, and do not mislabel other
  # resampling methods as LOOCV.
  if (train_control_method == "cv") {
    cv_method_name <- paste0(number, "-fold CV")
  } else if (train_control_method == "LOOCV") {
    cv_method_name <- "LOOCV"
  } else {
    cv_method_name <- train_control_method
  }

  if (isTRUE(first_plot)) {
    # The axis range is fixed by the first curve; pad it so that a second
    # curve that is slightly higher or lower is still visible.
    if (is.null(ylim)) {
      ylim <- range(accuracy_results, na.rm = TRUE) + c(-0.03, 0.03)
    }
    plot(k_values, accuracy_results, type = "b", col = plot_color, pch = 19,
         xlab = "Number of Neighbors (k)", ylab = "Accuracy",
         main = "KNN Accuracy for Different k Values (All features)",
         ylim = ylim)
  } else {
    lines(k_values, accuracy_results, type = "b", col = plot_color, pch = 19)
  }

  list(k_values = k_values, accuracy_results = accuracy_results,
       cv_method_name = cv_method_name)
}

# 10-fold CV run; this call opens the plot that later curves are drawn onto
results_10fold <- run_knn_cv(
  train_control_method = "cv",
  plot_color = "blue",
  first_plot = TRUE
)

##     k  Accuracy
## 1   1 0.6230041
## 2   2 0.5359432
## 3   3 0.5430319
## 4   4 0.5442802
## 5   5 0.5473253
## 6   6 0.5359875
## 7   7 0.5354078
## 8   8 0.5367635
## 9   9 0.5395096
## 10 10 0.5378508
## 11 11 0.5399875
## 12 12 0.5457075
## 13 13 0.5500562
## 14 14 0.5397921
## 15 15 0.5492152
## 16 16 0.5516148
## 17 17 0.5562261
## 18 18 0.5530414
## 19 19 0.5548828
## 20 20 0.5581404

# Leave-one-out run, overlaid in red on the already open plot
results_loocv <- run_knn_cv(train_control_method = "LOOCV", plot_color = "red")

##     k  Accuracy
## 1   1 0.6379124
## 2   2 0.5478637
## 3   3 0.5413737
## 4   4 0.5405625
## 5   5 0.5527312
## 6   6 0.5435370
## 7   7 0.5465116
## 8   8 0.5351541
## 9   9 0.5359654
## 10 10 0.5416441
## 11 11 0.5392104
## 12 12 0.5443483
## 13 13 0.5427258
## 14 14 0.5478637
## 15 15 0.5551650
## 16 16 0.5448891
## 17 17 0.5516495
## 18 18 0.5516495
## 19 19 0.5521904
## 20 20 0.5570579

# Legend: one entry per resampling method, in the same colours as the curves
legend(
  "topright",
  legend = c(results_10fold[["cv_method_name"]],
             results_loocv[["cv_method_name"]]),
  col = c("blue", "red"),
  lty = 1,
  pch = 19
)

Test the model on the validation data set

library(caret)

set.seed(123)

# Load the training data and treat the quality score as a class label
training_data <- read.csv("wineq_train.csv", stringsAsFactors = FALSE)
if (!"quality" %in% names(training_data)) {
  stop("wineq_train.csv must contain a 'quality' column", call. = FALSE)
}
training_data$quality <- as.factor(training_data$quality)

# Pick the predictors by name rather than by position: dropping "the last
# column" silently leaves a real feature unscaled whenever `quality` is not
# stored last in the CSV.
feature_cols <- setdiff(names(training_data), "quality")

# Standardize the training features
preprocess_params <- preProcess(training_data[, feature_cols, drop = FALSE],
                                method = c("center", "scale"))
training_scaled <- predict(preprocess_params, training_data)

# Load the validation data
validation_data <- read.csv("wineq_validation.csv", stringsAsFactors = FALSE)

# Fail early, with a readable message, if the validation file lacks predictors
missing_cols <- setdiff(feature_cols, names(validation_data))
if (length(missing_cols) > 0) {
  stop("wineq_validation.csv is missing column(s): ",
       paste(missing_cols, collapse = ", "), call. = FALSE)
}

# Standardize the validation features using the same parameters as training
validation_scaled <- predict(preprocess_params, validation_data)

# Set up 10-fold cross-validation
train_control <- trainControl(method = "cv", number = 10)

# Train the KNN model with k = 1
knn_model <- train(quality ~ ., data = training_scaled, method = "knn",
                   trControl = train_control, tuneGrid = data.frame(k = 1))

# Predict on the scaled validation data
predictions_validation <- predict(knn_model, newdata = validation_scaled)

# Convert the factor predictions back to the numeric quality scores; going
# through as.character() yields the labels rather than the internal level codes
predictions_numeric <- as.numeric(as.character(predictions_validation))

# Save the predictions to a text file, one value per line
write.table(predictions_numeric, file = "all_11.txt", row.names = FALSE,
            col.names = FALSE, quote = FALSE)

print("text generated")

## [1] "text generated"

=================================================================================

Using the alcohol and density features: accuracy plot, LOOCV vs 10-fold CV

library(caret)

set.seed(123)

# Read the training set; the quality score becomes a class label
training_data <- read.csv("wineq_train.csv", stringsAsFactors = FALSE)
training_data$quality <- as.factor(training_data$quality)

# Centre and scale the two predictors used in this section
preprocess_params <- preProcess(
  training_data[, c("alcohol", "density")],
  method = c("center", "scale")
)
training_scaled <- predict(
  preprocess_params,
  training_data[, c("alcohol", "density")]
)

# Re-attach the class label to the scaled predictors
training_scaled$quality <- training_data$quality

# Function to run KNN with different k values and cross-validation methods
# Evaluate KNN classifiers (k = 1..20) on the alcohol and density columns of
# `training_scaled` under one resampling scheme, print the accuracy table and
# draw the accuracy curve.
#
# Arguments:
#   train_control_method  resampling method handed to trainControl(),
#                         e.g. "cv" or "LOOCV".
#   number                value handed to trainControl(number = ); the number
#                         of folds when the method is "cv".
#   plot_color            colour used for the curve.
#   first_plot            TRUE starts a new plot, FALSE overlays the curve on
#                         the currently open plot with lines().
#   ylim                  optional y-axis limits for a new plot. When NULL the
#                         range of this run's accuracies is padded slightly so
#                         a curve overlaid later is less likely to be clipped.
#
# Returns a list with `k_values`, `accuracy_results` (one accuracy per k) and
# `cv_method_name` (label for the legend).
#
# Relies on the global data frame `training_scaled` with columns `alcohol`,
# `density` and a factor column `quality`.
run_knn_cv <- function(train_control_method, number = 10, plot_color = "blue",
                       first_plot = FALSE, ylim = NULL) {
  # Set up the resampling scheme
  train_control <- trainControl(method = train_control_method, number = number)

  # Range of k values to test
  k_values <- 1:20

  # Tune every k in ONE train() call. Calling train() once per k (as before)
  # draws new folds for each k, so differences between k values are mixed with
  # fold-to-fold noise, and it repeats the resampling set-up 20 times.
  knn_model <- train(quality ~ alcohol + density, data = training_scaled,
                     method = "knn",
                     trControl = train_control,
                     tuneGrid = data.frame(k = k_values))

  # Align the accuracies with k_values explicitly instead of trusting row order
  tuning_results <- knn_model$results
  accuracy_results <- tuning_results$Accuracy[match(k_values, tuning_results$k)]

  # Print accuracy for each k
  print(data.frame(k = k_values, Accuracy = accuracy_results))

  # Legend label: derive the fold count from `number` so the label stays
  # correct when a caller does not use 10 folds, and do not mislabel other
  # resampling methods as LOOCV.
  if (train_control_method == "cv") {
    cv_method_name <- paste0(number, "-fold CV")
  } else if (train_control_method == "LOOCV") {
    cv_method_name <- "LOOCV"
  } else {
    cv_method_name <- train_control_method
  }

  if (isTRUE(first_plot)) {
    # The axis range is fixed by the first curve; pad it so that a second
    # curve that is slightly higher or lower is still visible.
    if (is.null(ylim)) {
      ylim <- range(accuracy_results, na.rm = TRUE) + c(-0.03, 0.03)
    }
    plot(k_values, accuracy_results, type = "b", col = plot_color, pch = 19,
         xlab = "Number of Neighbors (k)", ylab = "Accuracy",
         main = "KNN Accuracy for Different k Values (Alcohol & Density)",
         ylim = ylim)
  } else {
    lines(k_values, accuracy_results, type = "b", col = plot_color, pch = 19)
  }

  list(k_values = k_values, accuracy_results = accuracy_results,
       cv_method_name = cv_method_name)
}

# 10-fold CV run; this call opens the plot that later curves are drawn onto
results_10fold <- run_knn_cv(
  train_control_method = "cv",
  plot_color = "blue",
  first_plot = TRUE
)

##     k  Accuracy
## 1   1 0.5353918
## 2   2 0.4862212
## 3   3 0.4951410
## 4   4 0.4972981
## 5   5 0.5043134
## 6   6 0.4913790
## 7   7 0.4934900
## 8   8 0.4864527
## 9   9 0.4892096
## 10 10 0.4896968
## 11 11 0.4856530
## 12 12 0.4975807
## 13 13 0.4905149
## 14 14 0.4916091
## 15 15 0.4945603
## 16 16 0.4883979
## 17 17 0.4891987
## 18 18 0.4907888
## 19 19 0.4872731
## 20 20 0.4884251

# Leave-one-out run, overlaid in red on the already open plot
results_loocv <- run_knn_cv(train_control_method = "LOOCV", plot_color = "red")

##     k  Accuracy
## 1   1 0.5532720
## 2   2 0.4956733
## 3   3 0.5051379
## 4   4 0.5024337
## 5   5 0.5010817
## 6   6 0.4959438
## 7   7 0.4978367
## 8   8 0.4889129
## 9   9 0.4862088
## 10 10 0.4775554
## 11 11 0.4837750
## 12 12 0.4970254
## 13 13 0.4967550
## 14 14 0.4940508
## 15 15 0.4991888
## 16 16 0.4981071
## 17 17 0.4970254
## 18 18 0.4954029
## 19 19 0.4972958
## 20 20 0.4867496

# Legend: one entry per resampling method, in the same colours as the curves
legend(
  "topright",
  legend = c(results_10fold[["cv_method_name"]],
             results_loocv[["cv_method_name"]]),
  col = c("blue", "red"),
  lty = 1,
  pch = 19
)

Test the model on the validation data set

library(caret)

set.seed(123)

# Read the training set; the quality score becomes a class label
training_data <- read.csv("wineq_train.csv", stringsAsFactors = FALSE)
training_data$quality <- as.factor(training_data$quality)

# Keep the two predictors of interest plus the label
training_selected <- training_data[, c("density", "alcohol", "quality")]

# Estimate centring/scaling from the training predictors, then apply it
preprocess_params <- preProcess(
  training_selected[, c("density", "alcohol")],
  method = c("center", "scale")
)
training_scaled <- predict(preprocess_params, training_selected)

# Re-attach the class label
training_scaled$quality <- training_selected$quality

# Read the validation set and keep the same two predictors
validation_data <- read.csv("wineq_validation.csv", stringsAsFactors = FALSE)
validation_selected <- validation_data[, c("density", "alcohol")]

# Scale the validation predictors with the training-set parameters
validation_scaled <- predict(preprocess_params, validation_selected)

# 10-fold cross-validation as the resampling scheme
train_control <- trainControl(method = "cv", number = 10)

# Fit the 1-nearest-neighbour classifier
knn_model <- train(
  quality ~ .,
  data = training_scaled,
  method = "knn",
  trControl = train_control,
  tuneGrid = data.frame(k = 1)
)

# Classify the validation rows
predictions_validation <- predict(knn_model, newdata = validation_scaled)

# Factor labels -> numeric quality scores (via character, not level codes)
predictions_numeric <- as.numeric(as.character(predictions_validation))

# Write one prediction per line, without header, row names or quotes
write.table(
  predictions_numeric,
  file = "2-features.txt",
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)

print("text generated")

## [1] "text generated"

Final

2025-03-30

Using All features plot - LOOCV vs 10 fold CV

test the model on Validation data set

=================================================================================

=================================================================================

=================================================================================

=================================================================================

using Alcohol & Density features plot - LOOCV vs 10 fold CV

test the model on Validation data set