Preprocessing and EDA

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
image_data <- read.csv('C:\\Alliance University\\SEM 3\\ML2\\Puneetha_Submission\\grayscale_image_data.csv')
# Check for any missing values
sum(is.na(image_data))
## [1] 0
# Convert 'label' column to a factor
image_data$label <- as.factor(image_data$label)
# Scale the columns V1 to V10000
preprocess_params <- preProcess(image_data[, 1:10000], method = c("center", "scale"))
image_data_scaled <- predict(preprocess_params, image_data[, 1:10000])
image_data <- cbind(image_data_scaled, label = image_data$label)
#Summary statistics
selected_columns <- image_data[, c(1:5, ncol(image_data))]
summary(selected_columns)
##        V1                V2                V3                V4         
##  Min.   :-2.2703   Min.   :-2.3135   Min.   :-2.3054   Min.   :-2.3364  
##  1st Qu.:-0.9093   1st Qu.:-0.8668   1st Qu.:-0.8252   1st Qu.:-0.8797  
##  Median : 0.4768   Median : 0.4276   Median : 0.5032   Median : 0.4237  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.8888   3rd Qu.: 0.8971   3rd Qu.: 0.8953   3rd Qu.: 0.8965  
##  Max.   : 0.9138   Max.   : 0.9225   Max.   : 0.9207   Max.   : 0.9220  
##        V5                   label    
##  Min.   :-2.3110   avocado     :116  
##  1st Qu.:-0.8400   banana      :142  
##  Median : 0.4155   bitter_gourd:139  
##  Mean   : 0.0000   grape       :144  
##  3rd Qu.: 0.8974                     
##  Max.   : 0.9228
# Plot label distribution
ggplot(image_data, aes(x = label)) + 
  geom_bar() + 
  theme_minimal() +
  labs(title = "Distribution of Labels", x = "Label", y = "Count")

The bar plot suggests that labels like “grape” and “bitter gourd” have more instances than others.

#Visualize a grayscale image
visualize_image <- function(image_row) {
  image_matrix <- matrix(as.numeric(image_row), nrow = 100, ncol = 100, byrow = TRUE)
  image(1:100, 1:100, t(image_matrix)[, nrow(image_matrix):1], col = gray.colors(256), axes = FALSE)
}

par(mfrow = c(2, 2)) # 2x2 grid of images
for (i in 1:4) {
  visualize_image(image_data[i, 1:10000])
  title(main = image_data$label[i])
}

# Correlation Matrix
subset_data <- image_data[, 1:50]
corrplot::corrplot(cor(subset_data), method = "color", type = "upper", tl.cex = 0.6)

PCA

library(tidyverse)
library(caret)

image_data$label <- as.factor(image_data$label)
preprocess_params <- preProcess(image_data[, 1:10000], method = c("center", "scale"))
image_data_scaled <- predict(preprocess_params, image_data[, 1:10000])
pca_result <- prcomp(image_data_scaled, center = TRUE, scale. = TRUE)
plot(pca_result, type = "l", main = "Scree Plot - Variance Explained by PCs")

explained_variance <- cumsum(pca_result$sdev^2 / sum(pca_result$sdev^2))
n_components <- which(explained_variance > 0.90)[1]
cat("Number of components for 90% variance explained:", n_components, "\n")
## Number of components for 90% variance explained: 82
pca_data <- as.data.frame(pca_result$x[, 1:n_components])
pca_data$label <- image_data$label

PCA was applied to reduce the dimensionality of the data, keeping 82 components to explain 90% of the variance. This step reduces the computational complexity of the data, making it more manageable for further analysis.
A scree plot was used to visualize the variance explained by each principal component, helping to decide how many components to retain.

Logistic Regression

library(tidyverse)
library(caret)
library(nnet)
library(MLmetrics)
## Warning: package 'MLmetrics' was built under R version 4.3.3
## 
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall
# The 'label' column is the target, and the rest are the features

# Split training and testing sets
set.seed(123)  # For reproducibility
trainIndex <- createDataPartition(pca_data$label, p = 0.7, list = FALSE)
train_data <- pca_data[trainIndex, ]
test_data <- pca_data[-trainIndex, ]

# Logistic regression model
logistic_model <- multinom(label ~ ., data = train_data)
## # weights:  336 (249 variable)
## initial  value 528.178152 
## iter  10 value 330.530315
## iter  20 value 288.811501
## iter  30 value 284.083106
## iter  40 value 280.197457
## iter  50 value 269.437346
## iter  60 value 251.048568
## iter  70 value 244.313666
## iter  80 value 237.552415
## iter  90 value 224.899849
## iter 100 value 220.146428
## final  value 220.146428 
## stopped after 100 iterations
predictions <- predict(logistic_model, newdata = test_data)
confusionMatrix(predictions, test_data$label)
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     avocado banana bitter_gourd grape
##   avocado           19      9           10     4
##   banana             6     27            3     4
##   bitter_gourd       4      2           20     8
##   grape              5      4            8    27
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5812          
##                  95% CI : (0.5008, 0.6587)
##     No Information Rate : 0.2688          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.4418          
##                                           
##  Mcnemar's Test P-Value : 0.7463          
## 
## Statistics by Class:
## 
##                      Class: avocado Class: banana Class: bitter_gourd
## Sensitivity                  0.5588        0.6429              0.4878
## Specificity                  0.8175        0.8898              0.8824
## Pos Pred Value               0.4524        0.6750              0.5882
## Neg Pred Value               0.8729        0.8750              0.8333
## Prevalence                   0.2125        0.2625              0.2562
## Detection Rate               0.1187        0.1688              0.1250
## Detection Prevalence         0.2625        0.2500              0.2125
## Balanced Accuracy            0.6881        0.7663              0.6851
##                      Class: grape
## Sensitivity                0.6279
## Specificity                0.8547
## Pos Pred Value             0.6136
## Neg Pred Value             0.8621
## Prevalence                 0.2687
## Detection Rate             0.1688
## Detection Prevalence       0.2750
## Balanced Accuracy          0.7413
accuracy <- mean(predictions == test_data$label)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.58125
precision <- Precision(test_data$label, predictions)
recall <- Recall(test_data$label, predictions)
f1 <- F1_Score(test_data$label, predictions)

cat("Precision:", precision, "\n")
## Precision: 0.452381
cat("Recall:", recall, "\n")
## Recall: 0.5588235
cat("F1-Score:", f1, "\n")
## F1-Score: 0.5

A logistic regression model was applied to the PCA-transformed data for multi-class classification. The model achieved an accuracy of around 54%, with moderate precision and recall values for different classes. The confusion matrix shows where the model performed well and where it struggled, such as classifying “banana” with high accuracy but “bitter_gourd” with lower performance.

Clustering

Clusterability

if (!require(hopkins)) {
  install.packages("hopkins")
}
## Loading required package: hopkins
## Warning: package 'hopkins' was built under R version 4.3.3
library(hopkins)

hopkins_stat_value <- hopkins(pca_data[, -ncol(pca_data)])
cat("Hopkins Statistic:", hopkins_stat_value, "\n")
## Hopkins Statistic: 1

K-Means

library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster) 
## Warning: package 'cluster' was built under R version 4.3.3
data <- read.csv('C:\\Alliance University\\SEM 3\\ML2\\Puneetha_Submission\\grayscale_image_data.csv')

data_features <- data[, -ncol(data)]  # All columns except the label

# Elbow method
set.seed(123)
fviz_nbclust(data_features, kmeans, method = "wss") +
  labs(title = "Elbow Plot", x = "Number of Clusters (k)", y = "Total Within-Cluster Sum of Squares")

# Perform K-Means clustering with optimal k (based on the elbow plot)
set.seed(123)
optimal_k <- 2
kmeans_model <- kmeans(data_features, centers = optimal_k, nstart = 25)
data$cluster <- as.factor(kmeans_model$cluster)

# Silhouette Score
silhouette_score <- silhouette(kmeans_model$cluster, dist(data_features))
fviz_silhouette(silhouette_score) +
  labs(title = "Silhouette Plot")
##   cluster size ave.sil.width
## 1       1  309          0.24
## 2       2  232          0.31

cat("Average Silhouette Width:", mean(silhouette_score[, 3]), "\n")
## Average Silhouette Width: 0.2708388

Hopkins statistic was calculated, which returned a value of 1, indicating that the data is not clusterable. Despite this, K-Means clustering was performed, and the resulting silhouette score was low (~0.2), suggesting poor clustering quality. K-Means clustering with k=4, but given the low silhouette score, these clusters are not well-separated.

Neural Networks

library(caret)
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 4.3.3
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
## 
##     compute
library(ggplot2)

fit_and_evaluate_nn <- function(neurons) {
  pca_data <- as.data.frame(pca_data)

  pca_data[] <- lapply(pca_data, function(x) {
    if (is.factor(x)) {
      return(as.numeric(as.character(x)))
    } else {
      return(x)
    }
  })

  # Split data into training and testing sets
  set.seed(123)
  train_index <- createDataPartition(pca_data[, ncol(pca_data)], p = 0.8, list = FALSE)
  train_data <- pca_data[train_index, ]
  test_data <- pca_data[-train_index, ]

  # Scale the predictor variables (excluding the target variable)
  train_data_scaled <- as.data.frame(scale(train_data[, -ncol(train_data)]))  # Scale features only
  train_data_scaled$label <- train_data[, ncol(train_data)]  # Add label back

  set.seed(1)
  nn_model <- neuralnet(label ~ ., data = train_data_scaled, hidden = neurons, linear.output = FALSE)

  print(class(nn_model))

  plot(nn_model)

  test_data_scaled <- as.data.frame(scale(test_data[, -ncol(test_data)]))  # Scale features only
  test_data_scaled$label <- test_data[, ncol(test_data)]  # Add label back

  nn_predictions <- neuralnet::compute(nn_model, test_data_scaled[, -ncol(test_data_scaled)])$net.result
  
  print("NN Predictions (raw):")
  print(nn_predictions)

  predicted_labels <- max.col(nn_predictions)  # Get the index of the max probability for each prediction
  predicted_labels <- factor(predicted_labels, levels = 1:length(levels(test_data$label)))  # Adjust the factor levels

  predicted_labels <- factor(predicted_labels, levels = levels(test_data_scaled$label))

  cat("Predicted Levels:", levels(predicted_labels), "\n")
  cat("Actual Levels:", levels(test_data_scaled$label), "\n")

  confusion_matrix <- confusionMatrix(predicted_labels, test_data_scaled$label)
  print(confusion_matrix)

  performance_df <- data.frame(
    Actual = test_data_scaled$label,
    Predicted = predicted_labels
  )

  ggplot(performance_df, aes(x = Actual, fill = as.factor(Predicted))) +
    geom_bar(position = "dodge") +
    labs(title = paste("Actual vs Predicted with", neurons, "Neurons"),
         x = "Actual Label",
         y = "Count",
         fill = "Predicted Label") +
    theme_minimal()
}

Constructed and visualized neural networks with 5, 10, and 20 neurons in a single hidden layer. These models, however, returned low accuracy (~25.93%), suggesting that the neural networks struggled with the given data.
The confusion matrices show that the models tend to predict only one class, indicating a significant performance issue.

Logistic Regression typically provides a straightforward interpretation of results and is less prone to overfitting, especially with smaller datasets. Neural Networks, while flexible and powerful, may struggle to generalize well on limited data, especially when they have a larger number of parameters relative to the number of training samples.
Logistic Regression is a simpler model that fits well on datasets with linear relationships between the features and the target. In cases where the underlying relationship is linear or nearly linear, Logistic Regression can be very effective.
In conclusion, while Neural Networks are powerful models capable of capturing complex relationships in data, they require larger datasets and careful tuning to avoid overfitting and ensure generalizability. Logistic Regression, with its simplicity and interpretability, can outperform more complex models in scenarios with smaller datasets or when the relationships in the data are primarily linear.