Logistic Regression vs. Neural Network

Load necessary libraries

options(warn = -1)
library(imager)
## Loading required package: magrittr
## 
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
## 
##     add
## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum
## The following object is masked from 'package:graphics':
## 
##     frame
## The following object is masked from 'package:base':
## 
##     save.image
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:imager':
## 
##     where
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(nnet)  # For multinomial logistic regression
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(keras)
library(tensorflow)
## 
## Attaching package: 'tensorflow'
## The following object is masked from 'package:caret':
## 
##     train
library(pROC)  # For ROC analysis
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:imager':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
options(warn = -1)

Define the path to the image folders

# Root folder holding one sub-directory per image class. Set the
# NATURAL_IMAGES_DIR environment variable to override the default location.
base_dir <- Sys.getenv(
  "NATURAL_IMAGES_DIR",
  unset = "C:/Users/raksh/Downloads/natural_images"
)
# Fail fast: list.dirs() silently returns character(0) for a missing folder,
# which would otherwise only surface later as an obscure error.
if (!dir.exists(base_dir)) {
  stop("Image directory not found: ", base_dir, call. = FALSE)
}
# Full paths of the class sub-directories (one per category)
classes <- list.dirs(base_dir, full.names = TRUE, recursive = FALSE)

Prepare lists for images and labels

# Accumulators filled while reading the image folders: `images` collects one
# flattened pixel vector per entry, `labels` the matching class names
labels <- NULL
images <- vector("list", 0)

Load images, convert to grayscale, and store them

# Load every image, convert it to a 64x64 grayscale image, and store five
# flattened variants per source image, always in this order:
#   original, rotated 180 degrees, rotated 90 degrees, brightened, darkened.
# Results are gathered with lapply() and appended once instead of growing
# `images` and `labels` element by element, which re-copies them each time.
n_variants <- 5L
class_files <- lapply(classes, list.files, full.names = TRUE)

augmented <- lapply(unlist(class_files), function(img_path) {
  # Read, convert to grayscale, and resize to a uniform size
  img_resized <- resize(grayscale(load.image(img_path)), 64, 64)

  list(
    as.vector(img_resized),
    # imrotate(..., 180) is a half-turn rotation, not a mirror flip
    as.vector(imrotate(img_resized, 180)),
    as.vector(imrotate(img_resized, 90)),
    as.vector(img_resized * 1.2),  # Increase brightness
    as.vector(img_resized * 0.8)   # Decrease brightness
  )
})

# Flatten one level so `images` stays a list of pixel vectors
images <- c(images, unlist(augmented, recursive = FALSE))
# Every file of a class yields `n_variants` rows carrying that class label
labels <- c(
  labels,
  rep(basename(classes), times = n_variants * lengths(class_files))
)

Convert the lists to a feature matrix and a label factor

# Class labels as a factor, and the feature matrix with one image per row
y <- factor(labels)
X <- do.call(what = rbind, args = images)

Normalize the feature set

# Standardise each pixel column to zero mean and unit variance. Constant
# columns are removed first: their standard deviation is 0, so scale() would
# turn them into NaN and the PCA that follows would fail.
pixel_sd <- apply(X, 2, stats::sd)
X <- scale(X[, which(pixel_sd > 0), drop = FALSE])

Dimensionality reduction using PCA

# Principal component analysis of the feature matrix; prcomp() centres the
# columns by default and scale. = TRUE rescales them to unit variance
pca <- stats::prcomp(X, scale. = TRUE)

Determine the number of components to retain based on explained variance

# Proportion of total variance explained by each component, computed from the
# component standard deviations (unrounded, unlike summary(pca)$importance)
explained_variance <- pca$sdev^2 / sum(pca$sdev^2)
# Smallest number of leading components reaching 95% cumulative variance
num_components <- which(cumsum(explained_variance) >= 0.95)[1]
# Guard against the threshold never being reached (e.g. NaN variances)
if (is.na(num_components)) {
  num_components <- length(explained_variance)
}
num_components <- min(num_components, 50)  # Limit number of components
# drop = FALSE keeps X_pca a matrix even when a single component is retained
X_pca <- pca$x[, seq_len(num_components), drop = FALSE]

Split the data into training and testing sets

set.seed(42)  # For reproducibility
# Each source image contributes `n_variants` consecutive rows (the original
# plus its augmented copies, in the order the loading step stores them).
# Partition by source image so that copies of one picture never end up on
# both sides of the split, which would leak test data into training.
n_variants <- 5L
stopifnot(length(y) > 0, length(y) %% n_variants == 0)
image_id <- rep(seq_len(length(y) / n_variants), each = n_variants)
image_y <- y[seq(1, length(y), by = n_variants)]  # one label per source image
# Stratified 80/20 split over source images; the result indexes `image_y`
train_images <- createDataPartition(image_y, p = 0.8, list = FALSE)
train_index <- which(image_id %in% train_images)
X_train <- X_pca[train_index, ]
y_train <- y[train_index]
X_test <- X_pca[-train_index, ]
y_test <- y[-train_index]

Train the Multinomial Logistic Regression model

# Fit the multinomial logistic regression on the PCA features. The response
# `y_train` is looked up in the calling environment; the predictors are all
# columns of the data frame. The default limit of 100 iterations is reached
# before the optimiser converges on this data, so a higher limit is allowed.
logistic_model <- multinom(
  y_train ~ .,
  data = as.data.frame(X_train),
  maxit = 1000
)
# `convergence` is non-zero when the iteration limit was hit
if (logistic_model$convergence != 0) {
  message("multinom() reached maxit without converging; consider raising it.")
}
## # weights:  416 (357 variable)
## initial  value 1996.263880 
## iter  10 value 1096.757646
## iter  20 value 990.563068
## iter  30 value 965.026206
## iter  40 value 939.066040
## iter  50 value 924.569493
## iter  60 value 913.577387
## iter  70 value 899.445219
## iter  80 value 890.240756
## iter  90 value 881.468122
## iter 100 value 875.539382
## final  value 875.539382 
## stopped after 100 iterations

Make predictions on the test set

# Predicted class label for every test row. Despite its name, `pred_probs`
# holds class labels (type = "class"), not probabilities.
test_features <- as.data.frame(X_test)
pred_probs <- predict(logistic_model, newdata = test_features, type = "class")

Evaluate the model

# Cross-tabulate predictions (rows) against true labels (columns)
confusion_matrix <- table(Predicted = pred_probs, Actual = y_test)
# Overall accuracy: correctly classified cases (the diagonal) over all cases
n_correct <- sum(diag(confusion_matrix))
accuracy <- n_correct / sum(confusion_matrix)

accuracy_pct <- round(accuracy * 100, 2)
print(paste("Accuracy:", accuracy_pct, "%"))
## [1] "Accuracy: 57.92 %"
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
##            Actual
## Predicted   airplane car cat dog flower fruit motorbike person
##   airplane        19   4   2   3      2     0         0      2
##   car              1  16   2   1      2     0         0      1
##   cat              1   1   9   5      1     0         4      1
##   dog              2   5   8  10      4     1         6      1
##   flower           2   1   3   6     18     0         2      1
##   fruit            1   0   2   0      1    29         0      0
##   motorbike        1   1   2   5      1     0        15      1
##   person           3   2   2   0      1     0         3     23

Optionally, create a classification report

# Overall and per-class statistics computed by caret from the confusion table
report <- caret::confusionMatrix(data = confusion_matrix)
print(report)
## Confusion Matrix and Statistics
## 
##            Actual
## Predicted   airplane car cat dog flower fruit motorbike person
##   airplane        19   4   2   3      2     0         0      2
##   car              1  16   2   1      2     0         0      1
##   cat              1   1   9   5      1     0         4      1
##   dog              2   5   8  10      4     1         6      1
##   flower           2   1   3   6     18     0         2      1
##   fruit            1   0   2   0      1    29         0      0
##   motorbike        1   1   2   5      1     0        15      1
##   person           3   2   2   0      1     0         3     23
## 
## Overall Statistics
##                                          
##                Accuracy : 0.5792         
##                  95% CI : (0.514, 0.6424)
##     No Information Rate : 0.125          
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.519          
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: airplane Class: car Class: cat Class: dog
## Sensitivity                  0.63333    0.53333    0.30000    0.33333
## Specificity                  0.93810    0.96667    0.93810    0.87143
## Pos Pred Value               0.59375    0.69565    0.40909    0.27027
## Neg Pred Value               0.94712    0.93548    0.90367    0.90148
## Prevalence                   0.12500    0.12500    0.12500    0.12500
## Detection Rate               0.07917    0.06667    0.03750    0.04167
## Detection Prevalence         0.13333    0.09583    0.09167    0.15417
## Balanced Accuracy            0.78571    0.75000    0.61905    0.60238
##                      Class: flower Class: fruit Class: motorbike Class: person
## Sensitivity                 0.6000       0.9667           0.5000       0.76667
## Specificity                 0.9286       0.9810           0.9476       0.94762
## Pos Pred Value              0.5455       0.8788           0.5769       0.67647
## Neg Pred Value              0.9420       0.9952           0.9299       0.96602
## Prevalence                  0.1250       0.1250           0.1250       0.12500
## Detection Rate              0.0750       0.1208           0.0625       0.09583
## Detection Prevalence        0.1375       0.1375           0.1083       0.14167
## Balanced Accuracy           0.7643       0.9738           0.7238       0.85714

Step 2: Clustering

Cluster the PCA-reduced data with k-means

# Fix the RNG state so the random starts below are repeatable
set.seed(42)
# Number of clusters to fit
k <- 5
# k-means on the PCA scores, run from 25 random starting configurations
kmeans_result <- kmeans(x = X_pca, centers = k, nstart = 25)

Add cluster assignments to the PCA-reduced data

# Data-frame copy of the PCA scores with the assigned cluster as a factor
X_clustered <- as.data.frame(X_pca)
X_clustered[["Cluster"]] <- as.factor(kmeans_result$cluster)

Add cluster assignments to the original labels for summary

# One row per stored image vector: the flattened pixels (kept as a list
# column via I()), the class label, and the k-means cluster of that row
original_data <- data.frame(
  Features = I(images),
  Class = labels,
  Cluster = as.factor(kmeans_result$cluster)
)

Create a summary of the most common category in each cluster

# For every cluster, report the class label that occurs most often in it;
# which.max() returns the first maximum, so ties go to the first label in
# table() order
cluster_summary <- summarize(
  group_by(original_data, Cluster),
  Most_Common_Category = names(which.max(table(Class)))
)

Visualize clustering results with a legend

Create the cluster plot

# Cluster plot of the k-means result: points only, one convex hull per
# cluster. The trailing Cluster column is dropped so that only the PCA
# scores are passed as data.
pca_scores <- X_clustered[, -ncol(X_clustered)]
p <- fviz_cluster(
  kmeans_result,
  data = pca_scores,
  geom = "point",
  ellipse.type = "convex",
  ggtheme = theme_minimal()
)

# Title the colour legend "Cluster"
p <- p + labs(color = "Cluster")

# Render the plot
print(p)

# Alternative plot with base R graphics: the first two PCA scores, coloured
# by cluster. plot() must run first so the later calls have a canvas.
plot(
  X_clustered[, 1:2],
  col = X_clustered$Cluster, pch = 19,
  xlab = "PCA 1", ylab = "PCA 2", main = "K-means Clustering"
)

# Mark the cluster centres
centers <- kmeans_result$centers
points(centers, col = 1:k, pch = 8, cex = 2)

# Label each centre with the most common category of its cluster; text() is
# vectorised, so all k labels are drawn in one call
cluster_ids <- seq_len(k)
text(
  centers[cluster_ids, 1], centers[cluster_ids, 2],
  labels = cluster_summary$Most_Common_Category[cluster_ids],
  pos = 3, col = "black", cex = 0.8
)

# Legend mapping colours to cluster numbers
legend(
  "topright",
  legend = paste("Cluster", 1:k),
  col = 1:k, pch = 19, title = "Clusters"
)

Step 3: Neural network

Load necessary libraries

library(imager)
library(dplyr)
library(nnet)
library(FactoMineR)
library(factoextra)

Set the directory and classes

# Root folder holding one sub-directory per image class. Set the
# NATURAL_IMAGES_DIR environment variable to override the default location.
image_dir <- Sys.getenv(
  "NATURAL_IMAGES_DIR",
  unset = "C:/Users/raksh/Downloads/natural_images"
)
# Fail fast: list.files() silently returns character(0) for a missing folder,
# which would otherwise only surface later as an obscure error.
if (!dir.exists(image_dir)) {
  stop("Image directory not found: ", image_dir, call. = FALSE)
}
# Entry names directly under the root, used as class names
classes <- list.files(image_dir)

Define image dimensions (Reduced size)

# Target size passed to resize() for every image: 32 x 32
img_height <- img_width <- 32

Helper function to load and preprocess images

# Load one image file, resize it, and flatten it.
#   file_path   path of the image file to read
#   img_width   target width passed to resize()
#   img_height  target height passed to resize()
# Returns the resized image's pixel values as a plain vector.
process_image <- function(file_path, img_width, img_height) {
  resized <- resize(load.image(file_path), img_width, img_height)
  as.vector(resized)
}

Create an empty list to store data

# Accumulators for the loading step: `image_data` collects one flattened
# image vector per file, `image_labels` the matching class names
image_labels <- NULL
image_data <- vector("list", 0)

Read images and create a dataframe

# Collect the .jpg/.png files of every class first, then load them in a
# single lapply() pass. Appending once avoids re-copying `image_data` and
# `image_labels` on every iteration as they grow.
files_per_class <- lapply(classes, function(class_name) {
  list.files(
    file.path(image_dir, class_name),
    full.names = TRUE,
    pattern = "\\.jpg$|\\.png$"
  )
})

loaded_images <- lapply(
  unlist(files_per_class),
  process_image,
  img_width = img_width,
  img_height = img_height
)
image_data <- c(image_data, loaded_images)

# One label per loaded file, repeated in class order
image_labels <- c(
  image_labels,
  rep(classes, times = lengths(files_per_class))
)

Create the data frame with image vectors and labels

# Stack the flattened images row-wise (one row per image), convert the
# matrix to a data frame, and attach the class labels as a factor column
image_matrix <- do.call(what = rbind, args = image_data)
image_df <- as.data.frame(image_matrix)
image_df[["label"]] <- factor(image_labels)

Split the data into train and validation sets (80-20 split)

# Seed the RNG so the random split is repeatable
set.seed(123)
# Draw 80% of the row numbers for training; the remaining rows validate
n_images <- nrow(image_df)
sample_index <- sample(seq_len(n_images), size = 0.8 * n_images)
validation_df <- image_df[-sample_index, ]
train_df <- image_df[sample_index, ]

Apply PCA to reduce dimensionality of the image data (excluding labels)

# PCA on the pixel columns only (the trailing label column is dropped).
# ncp = 5 is FactoMineR's default, spelled out here because it limits how
# many components pca_result$ind$coord contains.
pca_result <- PCA(image_df[, -ncol(image_df)], ncp = 5, graph = FALSE)

Determine the maximum number of components available

# Number of component columns present in the individuals' coordinates
max_components <- NCOL(pca_result$ind$coord)

Request up to 150 components, capped at the number the PCA result actually provides (FactoMineR's `PCA()` keeps only 5 components by default, which is why the output below reports 5)

# Keep at most 150 components, limited by how many the PCA result provides
num_components <- min(max_components, 150)
cat("Using", num_components, "PCA components...\n")
## Using 5 PCA components...

Extract the top components from PCA results

# Leading `num_components` coordinate columns. seq_len() avoids the 1:0
# pitfall and drop = FALSE keeps a matrix even for a single component.
pca_features <- pca_result$ind$coord[, seq_len(num_components), drop = FALSE]

Normalize the PCA features

# Centre every component column and rescale it to unit standard deviation
pca_features <- scale(pca_features, center = TRUE, scale = TRUE)

Create a new dataframe with PCA features and labels

# Data frame of the scaled components plus the class label of each image
pca_df <- as.data.frame(pca_features)
pca_df[["label"]] <- image_df[["label"]]

Split the PCA-transformed data into train and validation sets (80-20 split)

# Reuse the row sample drawn earlier: sampled rows train, the rest validate
validation_df <- pca_df[-sample_index, ]
train_df <- pca_df[sample_index, ]

Define number of neurons in the hidden layer for experiments

# Candidate hidden-layer sizes; one network is trained per value
hidden_layer_sizes <- c(5, 10, 20, 50)  # Increased layer size for experimentation

Train and evaluate models with different hidden layer sizes using nnet

# Train one single-hidden-layer network per candidate size on the PCA
# features and report its accuracy on the validation rows
for (units in hidden_layer_sizes) {
  cat("Training model with", units, "hidden units using PCA features...\n")

  # Train the neural network model
  nn_model <- nnet(
    label ~ .,
    data = train_df,
    size = units,
    maxit = 300,     # Increase the number of iterations
    decay = 0.001,   # Adjust regularization parameter
    trace = FALSE    # Suppress iteration output
  )

  # Predict on validation data
  validation_pred <- predict(nn_model, validation_df, type = "class")

  # Share of validation rows predicted correctly. Kept under its own name:
  # assigning to `accuracy` would overwrite the logistic-regression accuracy
  # computed earlier in the script.
  nn_accuracy <- mean(validation_pred == validation_df$label)
  cat(
    "Model with", units,
    "hidden units - Accuracy with PCA features:", nn_accuracy, "\n\n"
  )
}
## Training model with 5 hidden units using PCA features...
## Model with 5 hidden units - Accuracy with PCA features: 0.5416667 
## 
## Training model with 10 hidden units using PCA features...
## Model with 10 hidden units - Accuracy with PCA features: 0.5 
## 
## Training model with 20 hidden units using PCA features...
## Model with 20 hidden units - Accuracy with PCA features: 0.5416667 
## 
## Training model with 50 hidden units using PCA features...
## Model with 50 hidden units - Accuracy with PCA features: 0.5

Assuming the following accuracy values are already defined

# Accuracy of the logistic regression model, recomputed from its confusion
# matrix rather than read from `accuracy`, so the value cannot be affected
# by later reassignments of that variable
logistic_accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

Neural network accuracies from previous evaluations

# Validation accuracies of the neural networks, named by hidden-layer size
# (values match the training output printed above)
nn_accuracies <- setNames(
  c(0.5416667, 0.5, 0.5416667, 0.5),
  paste(c(5, 10, 20, 50), "hidden units")
)

Create a comparison data frame

# One row per model: logistic regression first, then each neural network
model_names <- c("Logistic Regression", names(nn_accuracies))
model_accuracy <- c(logistic_accuracy, nn_accuracies)
comparison_df <- data.frame(Model = model_names, Accuracy = model_accuracy)

Plot the accuracies using ggplot2

library(ggplot2)

# Bar chart of accuracy per model, with the rounded value printed above each
# bar and the x-axis labels tilted for readability
ggplot(comparison_df, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_col(position = position_dodge(), width = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
  theme_minimal() +
  geom_text(
    aes(label = round(Accuracy, 2)),
    position = position_dodge(0.7),
    vjust = -0.5
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Set the warning level back to 0 (it was set to -1 earlier in this script)
options(warn = 0)