# Load necessary library
library(class)

# Read the training and validation splits from CSV, keeping text columns as
# character vectors rather than factors
training_data <- read.csv(
  file = "wineq_train.csv",
  stringsAsFactors = FALSE
)
validation_data <- read.csv(
  file = "wineq_validation.csv",
  stringsAsFactors = FALSE
)

# Print a compact summary of the training columns and their types
str(training_data)

## 'data.frame':    3698 obs. of  12 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 7 8.1 8.1 8.6 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.27 0.22 0.27 0.23 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.36 0.43 0.41 0.4 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 20.7 1.5 1.45 4.2 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.044 0.033 0.035 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 45 28 11 17 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 170 129 63 109 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3 3.22 2.99 3.14 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.45 0.45 0.56 0.53 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 8.8 11 12 9.7 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 5 5 ...

11 features, 74.83% — cross-validation is used to select the best k, which is 1

# Load required libraries
library(class)   # For KNN
library(caret)   # For cross-validation

## Loading required package: ggplot2

## Loading required package: lattice

library(ggplot2) # For visualization

set.seed(123)  # Ensure reproducibility

# Split the training data into a feature table (every column except the
# target) and the target vector
feature_names <- setdiff(names(training_data), "quality")
train_features <- training_data[, feature_names]
train_target <- training_data$quality

# Standardize features. scale() records the column means and SDs in the
# "scaled:center" / "scaled:scale" attributes, which are read back when the
# validation data is transformed.
train_features_scaled <- scale(train_features)

# Optimize k with leave-one-out cross-validation (knn.cv) over odd k in 1..25.
# vapply() is used instead of sapply() so the result is guaranteed to be a
# numeric vector holding exactly one accuracy per candidate k.
k_values <- seq(1, 25, 2)
cv_accuracies <- vapply(k_values, function(k) {
  pred_cv <- knn.cv(train = train_features_scaled, cl = train_target, k = k)
  mean(pred_cv == train_target)  # Share of held-out rows predicted correctly
}, numeric(1))

# Choose the best k; which.max() returns the first maximum, so ties resolve
# to the smallest k
best_k <- k_values[which.max(cv_accuracies)]
print(paste("Best k:", best_k))

## [1] "Best k: 1"

# Visualize how the cross-validated accuracy changes with k
accuracy_plot <- data.frame(k = k_values, accuracy = cv_accuracies)
ggplot(data = accuracy_plot, mapping = aes(x = k, y = accuracy)) +
  geom_line(color = "blue") +
  geom_point(color = "red", size = 2) +
  labs(title = "KNN Accuracy vs. K", x = "K Value", y = "Accuracy") +
  theme_minimal()

# Apply the training-set centering and scaling to the validation features so
# both sets live on the same scale
test_features <- validation_data[, feature_names]
test_features_scaled <- scale(
  test_features,
  center = attr(train_features_scaled, "scaled:center"),
  scale = attr(train_features_scaled, "scaled:scale")
)

# Classify every validation row with the k selected by cross-validation
predictions_validation <- knn(
  train = train_features_scaled,
  test = test_features_scaled,
  cl = train_target,
  k = best_k
)

# Write one predicted label per line, without header, row names, or quotes
write.table(
  predictions_validation,
  file = "all_11.txt",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

2 features, best so far at 95.22% — cross-validation is used to select the best k, which is 1

# Load required libraries
library(class)  # For KNN
library(caret)  # For cross-validation
library(ggplot2) # For visualization

set.seed(123)  # Ensure reproducibility

# Keep only the two chosen predictors and standardize them. scale() records
# the column means and SDs in the "scaled:center" / "scaled:scale" attributes,
# which are read back when the validation data is transformed.
feature_names <- c("density", "alcohol")
train_features <- training_data[, feature_names]
train_features_scaled <- scale(train_features)
train_target <- training_data$quality

# Optimize k with leave-one-out cross-validation (knn.cv) over odd k in 1..25.
# vapply() is used instead of sapply() so the result is guaranteed to be a
# numeric vector holding exactly one accuracy per candidate k.
k_values <- seq(1, 25, 2)
cv_results <- vapply(k_values, function(k) {
  pred_cv <- knn.cv(train = train_features_scaled, cl = train_target, k = k)
  mean(pred_cv == train_target)  # Share of held-out rows predicted correctly
}, numeric(1))

# Choose the best k; which.max() returns the first maximum, so ties resolve
# to the smallest k
best_k <- k_values[which.max(cv_results)]
print(paste("Best k:", best_k))

## [1] "Best k: 1"

# Visualize how the cross-validated accuracy of the 2-feature KNN changes
# with k
accuracy_plot <- data.frame(k = k_values, accuracy = cv_results)
ggplot(data = accuracy_plot, mapping = aes(x = k, y = accuracy)) +
  geom_line(color = "blue") +
  geom_point(color = "red", size = 2) +
  labs(
    title = "KNN Accuracy vs. K (Two Features)",
    x = "K Value",
    y = "Accuracy"
  ) +
  theme_minimal()

# Classify the training rows against themselves using the selected k
predictions_train <- knn(
  train = train_features_scaled,
  test = train_features_scaled,
  cl = train_target,
  k = best_k
)

# Report the share of training rows whose predicted label equals the target.
# Because train and test are the same matrix here, this is a resubstitution
# accuracy rather than an estimate of out-of-sample performance.
train_accuracy <- mean(predictions_train == train_target)
print(paste("Training Accuracy:", train_accuracy))

## [1] "Training Accuracy: 0.902650081124932"

# Apply the training-set centering and scaling to the validation features so
# both sets live on the same scale
validation_features <- validation_data[, feature_names]
validation_features_scaled <- scale(
  validation_features,
  center = attr(train_features_scaled, "scaled:center"),
  scale = attr(train_features_scaled, "scaled:scale")
)

# Classify every validation row with the k selected by cross-validation
predictions_validation <- knn(
  train = train_features_scaled,
  test = validation_features_scaled,
  cl = train_target,
  k = best_k
)

# Write one predicted label per line, without header, row names, or quotes
write.table(
  predictions_validation,
  file = "2-features.txt",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

==================================================================================================

2 features plot

# Load required libraries
library(class)   # For KNN
library(caret)   # For cross-validation
library(ggplot2) # For visualization

set.seed(123)  # Fix the RNG state so results can be reproduced

# Restrict the predictors to density and alcohol, then standardize them
feature_names <- c("density", "alcohol")
train_features <- training_data[, feature_names]
train_features_scaled <- scale(train_features)

# Target as a numeric vector so squared errors can be computed on it
train_target <- as.numeric(training_data[["quality"]])

# Mean squared error of a fitted model on a given data set.
#   model      - any object with a predict() method accepting `newdata`
#   train_data - predictors passed to predict() as `newdata`
#   target     - observed values compared against the predictions
# Returns the mean of the squared differences, ignoring NA entries.
MSE <- function(model, train_data, target) {
  errors <- target - predict(model, newdata = train_data)
  mean(errors^2, na.rm = TRUE)
}

# Candidate neighbourhood sizes (1 to 10)
k_values <- 1:10

# LOOCV: Leave-One-Out Cross-Validation MSE calculation.
# For each k, knn.cv() predicts every training row from all the other rows and
# the squared difference between prediction and target is averaged.
loocv_mse <- vapply(k_values, function(k) {
  loocv_pred <- knn.cv(train = train_features_scaled, cl = train_target, k = k)

  # knn.cv() returns a factor. as.numeric() applied directly to a factor yields
  # the internal level codes (1, 2, ...) instead of the quality labels, so the
  # labels are recovered through as.character() first.
  loocv_pred_numeric <- as.numeric(as.character(loocv_pred))

  # NA predictions (if any) are dropped rather than propagated
  mean((train_target - loocv_pred_numeric)^2, na.rm = TRUE)
}, numeric(1))

# 10-Fold Cross-Validation MSE calculation.
# savePredictions = "final" makes train() keep the held-out prediction of every
# row, so the error below is computed from out-of-fold predictions. Predicting
# on the full training matrix with the fitted model (as the MSE() helper would
# do here) measures resubstitution error, not cross-validated error.
ctrl <- trainControl(method = "cv", number = 10, savePredictions = "final")
cv10_mse <- vapply(k_values, function(k) {
  knn_model <- train(train_features_scaled, train_target, method = "knn",
                     trControl = ctrl, tuneGrid = data.frame(k = k))
  held_out <- knn_model$pred  # one row per held-out observation: obs, pred, ...
  mean((held_out$obs - held_out$pred)^2, na.rm = TRUE)
}, numeric(1))
# NOTE(review): knn.cv() in the LOOCV step predicts a class label by vote, while
# train() with a numeric outcome fits a KNN regression, so the two curves are
# not strictly like-for-like - confirm this comparison is intended.

# Stack both error curves in long format: one row per (k, method) pair
results_df <- data.frame(k = rep(k_values, 2),
                         MSE = c(loocv_mse, cv10_mse),
                         Method = rep(c("LOOCV", "10-Fold CV"), each = length(k_values)))

# Plot MSE vs k for both LOOCV and 10-fold CV.
# `linewidth` replaces `size` for lines, which is deprecated since ggplot2 3.4.0.
# Colours are named explicitly so the mapping does not depend on level order.
ggplot(results_df, aes(x = k, y = MSE, color = Method, group = Method)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  ggtitle("MSE vs K for LOOCV and 10-Fold CV") +
  xlab("K Value") +
  ylab("Mean Squared Error (MSE)") +
  scale_color_manual(values = c("10-Fold CV" = "blue", "LOOCV" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())  # Remove legend title

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

11 features plot

# Load required libraries
library(class)   # For KNN
library(caret)   # For cross-validation
library(ggplot2) # For visualization

set.seed(123)  # Fix the RNG state so results can be reproduced

# Re-read the training split from disk
training_data <- read.csv(
  file = "wineq_train.csv",
  stringsAsFactors = FALSE
)

# Every column other than the target is used as a predictor
feature_names <- setdiff(names(training_data), "quality")
train_features <- training_data[, feature_names]

# Center and scale each predictor column
train_features_scaled <- scale(train_features)

# Target as a numeric vector so squared errors can be computed on it
train_target <- as.numeric(training_data[["quality"]])

# Mean squared error of a fitted model on a given data set.
#   model      - any object with a predict() method accepting `newdata`
#   train_data - predictors passed to predict() as `newdata`
#   target     - observed values compared against the predictions
# Returns the mean of the squared differences, ignoring NA entries.
MSE <- function(model, train_data, target) {
  squared_error <- (target - predict(model, newdata = train_data))^2
  mean(squared_error, na.rm = TRUE)
}

# Candidate neighbourhood sizes (1 to 10)
k_values <- 1:10

# LOOCV: Leave-One-Out Cross-Validation MSE calculation.
# For each k, knn.cv() predicts every training row from all the other rows and
# the squared difference between prediction and target is averaged.
loocv_mse <- vapply(k_values, function(k) {
  loocv_pred <- knn.cv(train = train_features_scaled, cl = train_target, k = k)

  # knn.cv() returns a factor. as.numeric() applied directly to a factor yields
  # the internal level codes (1, 2, ...) instead of the quality labels, so the
  # labels are recovered through as.character() first.
  loocv_pred_numeric <- as.numeric(as.character(loocv_pred))

  # NA predictions (if any) are dropped rather than propagated
  mean((train_target - loocv_pred_numeric)^2, na.rm = TRUE)
}, numeric(1))

# 10-Fold Cross-Validation MSE calculation.
# savePredictions = "final" makes train() keep the held-out prediction of every
# row, so the error below is computed from out-of-fold predictions. Predicting
# on the full training matrix with the fitted model (as the MSE() helper would
# do here) measures resubstitution error, not cross-validated error.
ctrl <- trainControl(method = "cv", number = 10, savePredictions = "final")
cv10_mse <- vapply(k_values, function(k) {
  knn_model <- train(train_features_scaled, train_target, method = "knn",
                     trControl = ctrl, tuneGrid = data.frame(k = k))
  held_out <- knn_model$pred  # one row per held-out observation: obs, pred, ...
  mean((held_out$obs - held_out$pred)^2, na.rm = TRUE)
}, numeric(1))
# NOTE(review): knn.cv() in the LOOCV step predicts a class label by vote, while
# train() with a numeric outcome fits a KNN regression, so the two curves are
# not strictly like-for-like - confirm this comparison is intended.

# Stack both error curves in long format: one row per (k, method) pair
results_df <- data.frame(k = rep(k_values, 2),
                         MSE = c(loocv_mse, cv10_mse),
                         Method = rep(c("LOOCV", "10-Fold CV"), each = length(k_values)))

# Plot MSE vs k for both LOOCV and 10-Fold CV.
# `linewidth` replaces `size` for lines, which is deprecated since ggplot2 3.4.0.
# Colours are named explicitly so the mapping does not depend on level order.
ggplot(results_df, aes(x = k, y = MSE, color = Method, group = Method)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  ggtitle("MSE vs K for LOOCV and 10-Fold CV (Using All Features)") +
  xlab("K Value") +
  ylab("Mean Squared Error (MSE)") +
  scale_color_manual(values = c("10-Fold CV" = "blue", "LOOCV" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())  # Remove legend title

final

2025-03-29

11 features, 74.83% — cross-validation is used to select the best k, which is 1

2 features, best so far at 95.22% — cross-validation is used to select the best k, which is 1

==================================================================================================

==================================================================================================

==================================================================================================

2 features plot

11 features plot