Using all features: accuracy plot, LOOCV vs. 10-fold CV
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Seed the RNG once so the resampling done further down is repeatable
set.seed(123)
# Load the training data
training_data <- read.csv("wineq_train.csv", stringsAsFactors = FALSE)
# Convert the target variable to a factor so caret fits a classifier
training_data$quality <- as.factor(training_data$quality)
# Fit centering/scaling on the predictors only. The target is excluded by
# name rather than by position, so this keeps working if "quality" is not
# the last column of the CSV.
feature_cols <- setdiff(names(training_data), "quality")
preprocess_params <- preProcess(
  training_data[, feature_cols, drop = FALSE],
  method = c("center", "scale")
)
training_scaled <- predict(preprocess_params, training_data)
# Re-attach the (unscaled) target explicitly
training_scaled$quality <- training_data$quality
# Estimate KNN accuracy on `training_scaled` (all predictors) for k = 1..20
# under one resampling scheme, print the accuracy table, and draw the curve.
#
# Args:
#   train_control_method: resampling method handed to trainControl(),
#     e.g. "cv" or "LOOCV".
#   number: `number` argument handed to trainControl() (default 10).
#   plot_color: colour of the curve.
#   first_plot: TRUE opens a new plot; otherwise the curve is added to the
#     plot that is already open.
#   ylim: optional y-axis limits, used only when first_plot is TRUE. When
#     NULL, the range of this curve padded by 0.05 (clamped to [0, 1]) is
#     used so a second curve added later is less likely to be clipped.
#
# Returns:
#   A list with `k_values`, `accuracy_results` and `cv_method_name`
#   (the label intended for the legend).
#
# Note: reads the global `training_scaled`.
run_knn_cv <- function(train_control_method, number = 10, plot_color = "blue",
                       first_plot = FALSE, ylim = NULL) {
  train_control <- trainControl(method = train_control_method, number = number)
  # Range of k values to test
  k_values <- 1:20
  # One train() call per k, in increasing order of k. vapply() guarantees a
  # plain numeric vector and fails loudly if a fit does not yield exactly
  # one accuracy value.
  accuracy_results <- vapply(k_values, function(k) {
    knn_model <- train(quality ~ ., data = training_scaled,
                       method = "knn",
                       trControl = train_control,
                       tuneGrid = data.frame(k = k))
    knn_model$results$Accuracy
  }, numeric(1))
  # Print accuracy for each k
  print(data.frame(k = k_values, Accuracy = accuracy_results))
  # Legend label: reflect the actual fold count, and do not call an
  # unrelated resampling method "LOOCV".
  cv_method_name <- if (train_control_method == "cv") {
    paste0(number, "-fold CV")
  } else if (train_control_method == "LOOCV") {
    "LOOCV"
  } else {
    train_control_method
  }
  if (isTRUE(first_plot)) {
    if (is.null(ylim)) {
      # plot() fixes the axis range from the first series; without padding
      # a later lines() call is clipped wherever it leaves that range.
      acc_range <- range(accuracy_results, na.rm = TRUE)
      ylim <- c(max(0, acc_range[1] - 0.05), min(1, acc_range[2] + 0.05))
    }
    plot(k_values, accuracy_results, type = "b", col = plot_color, pch = 19,
         ylim = ylim,
         xlab = "Number of Neighbors (k)", ylab = "Accuracy",
         main = "KNN Accuracy for Different k Values (All features)")
  } else {
    lines(k_values, accuracy_results, type = "b", col = plot_color, pch = 19)
  }
  list(k_values = k_values, accuracy_results = accuracy_results,
       cv_method_name = cv_method_name)
}
# Evaluate k = 1..20 with 10-fold cross-validation; this call opens the plot
results_10fold <- run_knn_cv(
  train_control_method = "cv",
  plot_color = "blue",
  first_plot = TRUE
)
## k Accuracy
## 1 1 0.6230041
## 2 2 0.5359432
## 3 3 0.5430319
## 4 4 0.5442802
## 5 5 0.5473253
## 6 6 0.5359875
## 7 7 0.5354078
## 8 8 0.5367635
## 9 9 0.5395096
## 10 10 0.5378508
## 11 11 0.5399875
## 12 12 0.5457075
## 13 13 0.5500562
## 14 14 0.5397921
## 15 15 0.5492152
## 16 16 0.5516148
## 17 17 0.5562261
## 18 18 0.5530414
## 19 19 0.5548828
## 20 20 0.5581404
# Evaluate k = 1..20 with leave-one-out CV; the curve is added to the open plot
results_loocv <- run_knn_cv(
  train_control_method = "LOOCV",
  plot_color = "red"
)
## k Accuracy
## 1 1 0.6379124
## 2 2 0.5478637
## 3 3 0.5413737
## 4 4 0.5405625
## 5 5 0.5527312
## 6 6 0.5435370
## 7 7 0.5465116
## 8 8 0.5351541
## 9 9 0.5359654
## 10 10 0.5416441
## 11 11 0.5392104
## 12 12 0.5443483
## 13 13 0.5427258
## 14 14 0.5478637
## 15 15 0.5551650
## 16 16 0.5448891
## 17 17 0.5516495
## 18 18 0.5516495
## 19 19 0.5521904
## 20 20 0.5570579
# Label the two curves, using the names returned by run_knn_cv()
legend_labels <- c(
  results_10fold$cv_method_name,
  results_loocv$cv_method_name
)
legend("topright", legend = legend_labels,
       col = c("blue", "red"), lty = 1, pch = 19)

Test the model on the validation data set
library(caret)
# Seed the RNG so the resampling inside train() is repeatable
set.seed(123)
# Load the training data
training_data <- read.csv("wineq_train.csv", stringsAsFactors = FALSE)
training_data$quality <- as.factor(training_data$quality)
# Standardize the training features. The target is excluded by name rather
# than by position, so this does not depend on "quality" being the last
# column of the CSV.
feature_cols <- setdiff(names(training_data), "quality")
preprocess_params <- preProcess(
  training_data[, feature_cols, drop = FALSE],
  method = c("center", "scale")
)
training_scaled <- predict(preprocess_params, training_data)
# Load the validation data
validation_data <- read.csv("wineq_validation.csv", stringsAsFactors = FALSE)
# Fail early with a clear message if the validation file lacks a feature
missing_cols <- setdiff(feature_cols, names(validation_data))
if (length(missing_cols) > 0) {
  stop("Validation data is missing feature column(s): ",
       paste(missing_cols, collapse = ", "), call. = FALSE)
}
# Standardize the validation features with the parameters learned on the
# training set (never re-fit the scaler on validation data)
validation_scaled <- predict(preprocess_params, validation_data)
# Set up 10-fold cross-validation
train_control <- trainControl(method = "cv", number = 10)
# Train the KNN model with k = 1
knn_model <- train(
  quality ~ .,
  data = training_scaled,
  method = "knn",
  trControl = train_control,
  tuneGrid = data.frame(k = 1)
)
# Predict on the scaled validation data
predictions_validation <- predict(knn_model, newdata = validation_scaled)
# Go through character so the quality labels are recovered, not the
# factor's internal integer codes
predictions_numeric <- as.numeric(as.character(predictions_validation))
# Save the predictions to a text file, one value per line
write.table(predictions_numeric, file = "all_11.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
print("text generated")
## [1] "text generated"
=================================================================================
=================================================================================
=================================================================================
=================================================================================
Using alcohol & density features: accuracy plot, LOOCV vs. 10-fold CV
library(caret)
# Seed the RNG before any resampling takes place
set.seed(123)
# Read the training set from disk
training_data <- read.csv("wineq_train.csv", stringsAsFactors = FALSE)
# Quality becomes a factor so the task is treated as classification
training_data[["quality"]] <- as.factor(training_data[["quality"]])
# Center and scale just the two predictors of interest
selected_features <- training_data[, c("alcohol", "density")]
preprocess_params <- preProcess(
  selected_features,
  method = c("center", "scale")
)
training_scaled <- predict(preprocess_params, selected_features)
# Attach the class label to the scaled predictors
training_scaled[["quality"]] <- training_data[["quality"]]
# Estimate KNN accuracy on `training_scaled` (alcohol + density only) for
# k = 1..20 under one resampling scheme, print the accuracy table, and draw
# the curve.
#
# Args:
#   train_control_method: resampling method handed to trainControl(),
#     e.g. "cv" or "LOOCV".
#   number: `number` argument handed to trainControl() (default 10).
#   plot_color: colour of the curve.
#   first_plot: TRUE opens a new plot; otherwise the curve is added to the
#     plot that is already open.
#   ylim: optional y-axis limits, used only when first_plot is TRUE. When
#     NULL, the range of this curve padded by 0.05 (clamped to [0, 1]) is
#     used so a second curve added later is less likely to be clipped.
#
# Returns:
#   A list with `k_values`, `accuracy_results` and `cv_method_name`
#   (the label intended for the legend).
#
# Note: reads the global `training_scaled`.
run_knn_cv <- function(train_control_method, number = 10, plot_color = "blue",
                       first_plot = FALSE, ylim = NULL) {
  train_control <- trainControl(method = train_control_method, number = number)
  # Range of k values to test
  k_values <- 1:20
  # One train() call per k, in increasing order of k. vapply() guarantees a
  # plain numeric vector and fails loudly if a fit does not yield exactly
  # one accuracy value.
  accuracy_results <- vapply(k_values, function(k) {
    knn_model <- train(quality ~ alcohol + density, data = training_scaled,
                       method = "knn",
                       trControl = train_control,
                       tuneGrid = data.frame(k = k))
    knn_model$results$Accuracy
  }, numeric(1))
  # Print accuracy for each k
  print(data.frame(k = k_values, Accuracy = accuracy_results))
  # Legend label: reflect the actual fold count, and do not call an
  # unrelated resampling method "LOOCV".
  cv_method_name <- if (train_control_method == "cv") {
    paste0(number, "-fold CV")
  } else if (train_control_method == "LOOCV") {
    "LOOCV"
  } else {
    train_control_method
  }
  if (isTRUE(first_plot)) {
    if (is.null(ylim)) {
      # plot() fixes the axis range from the first series; without padding
      # a later lines() call is clipped wherever it leaves that range.
      acc_range <- range(accuracy_results, na.rm = TRUE)
      ylim <- c(max(0, acc_range[1] - 0.05), min(1, acc_range[2] + 0.05))
    }
    plot(k_values, accuracy_results, type = "b", col = plot_color, pch = 19,
         ylim = ylim,
         xlab = "Number of Neighbors (k)", ylab = "Accuracy",
         main = "KNN Accuracy for Different k Values (Alcohol & Density)")
  } else {
    lines(k_values, accuracy_results, type = "b", col = plot_color, pch = 19)
  }
  list(k_values = k_values, accuracy_results = accuracy_results,
       cv_method_name = cv_method_name)
}
# Evaluate k = 1..20 with 10-fold cross-validation; this call opens the plot
results_10fold <- run_knn_cv(
  train_control_method = "cv",
  plot_color = "blue",
  first_plot = TRUE
)
## k Accuracy
## 1 1 0.5353918
## 2 2 0.4862212
## 3 3 0.4951410
## 4 4 0.4972981
## 5 5 0.5043134
## 6 6 0.4913790
## 7 7 0.4934900
## 8 8 0.4864527
## 9 9 0.4892096
## 10 10 0.4896968
## 11 11 0.4856530
## 12 12 0.4975807
## 13 13 0.4905149
## 14 14 0.4916091
## 15 15 0.4945603
## 16 16 0.4883979
## 17 17 0.4891987
## 18 18 0.4907888
## 19 19 0.4872731
## 20 20 0.4884251
# Evaluate k = 1..20 with leave-one-out CV; the curve is added to the open plot
results_loocv <- run_knn_cv(
  train_control_method = "LOOCV",
  plot_color = "red"
)
## k Accuracy
## 1 1 0.5532720
## 2 2 0.4956733
## 3 3 0.5051379
## 4 4 0.5024337
## 5 5 0.5010817
## 6 6 0.4959438
## 7 7 0.4978367
## 8 8 0.4889129
## 9 9 0.4862088
## 10 10 0.4775554
## 11 11 0.4837750
## 12 12 0.4970254
## 13 13 0.4967550
## 14 14 0.4940508
## 15 15 0.4991888
## 16 16 0.4981071
## 17 17 0.4970254
## 18 18 0.4954029
## 19 19 0.4972958
## 20 20 0.4867496
# Label the two curves, using the names returned by run_knn_cv()
legend_labels <- c(
  results_10fold$cv_method_name,
  results_loocv$cv_method_name
)
legend("topright", legend = legend_labels,
       col = c("blue", "red"), lty = 1, pch = 19)

Test the model on the validation data set
library(caret)
# Seed the RNG before any resampling takes place
set.seed(123)
# Read the training set and make the target a factor (classification)
training_data <- read.csv("wineq_train.csv", stringsAsFactors = FALSE)
training_data[["quality"]] <- as.factor(training_data[["quality"]])
# Keep only the two predictors plus the target
feature_names <- c("density", "alcohol")
training_selected <- training_data[, c(feature_names, "quality")]
# Learn centering/scaling from the training predictors and apply it
preprocess_params <- preProcess(
  training_selected[, feature_names],
  method = c("center", "scale")
)
training_scaled <- predict(preprocess_params, training_selected)
# Attach the class label to the scaled predictors
training_scaled[["quality"]] <- training_selected[["quality"]]
# Read the validation set and keep the same two predictors
validation_data <- read.csv("wineq_validation.csv", stringsAsFactors = FALSE)
validation_selected <- validation_data[, feature_names]
# Scale validation predictors with the parameters learned on training data
validation_scaled <- predict(preprocess_params, validation_selected)
# 10-fold cross-validation settings for train()
train_control <- trainControl(method = "cv", number = 10)
# Fit KNN with k fixed at 1
knn_model <- train(
  quality ~ .,
  data = training_scaled,
  method = "knn",
  trControl = train_control,
  tuneGrid = data.frame(k = 1)
)
# Classify the validation rows
predictions_validation <- predict(knn_model, newdata = validation_scaled)
# Factor -> character -> numeric recovers the label values themselves
predictions_numeric <- as.numeric(as.character(predictions_validation))
# Write one prediction per line, with no header or row names
write.table(
  predictions_numeric,
  file = "2-features.txt",
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)
print("text generated")
## [1] "text generated"