Logistic_Reg VS NN

Load necessary libraries

options(warn = -1)
library(imager)

## Loading required package: magrittr

## 
## Attaching package: 'imager'

## The following object is masked from 'package:magrittr':
## 
##     add

## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum

## The following object is masked from 'package:graphics':
## 
##     frame

## The following object is masked from 'package:base':
## 
##     save.image

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:imager':
## 
##     where

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(nnet)  # For multinomial logistic regression
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(keras)
library(tensorflow)

## 
## Attaching package: 'tensorflow'

## The following object is masked from 'package:caret':
## 
##     train

library(pROC)  # For ROC analysis

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following object is masked from 'package:imager':
## 
##     ci

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

options(warn = -1)

Define the path to the image folders

base_dir <- "C:/Users/raksh/Downloads/natural_images"  # Adjust path as necessary
classes <- list.dirs(base_dir, full.names = TRUE, recursive = FALSE)

Prepare lists for images and labels

images <- list()
labels <- c()

Load images, convert to grayscale, and store them

# Load every file in each class directory, convert it to a 64x64 grayscale
# image, and store the flattened original plus four augmented variants.
# Each source image contributes five consecutive entries to `images`/`labels`,
# in this order: original, rotated 180 degrees, rotated 90 degrees,
# brightened (x1.2), darkened (x0.8). The label is the directory name.
#
# The per-class results are built with lapply() and appended once at the end;
# growing `images` and `labels` one element at a time inside nested loops
# re-copies the accumulated data on every append.
loaded_classes <- lapply(classes, function(class_dir) {
  img_files <- list.files(class_dir, full.names = TRUE)

  variants <- lapply(img_files, function(img_path) {
    # Read, convert to grayscale, and resize to a uniform size
    img_resized <- resize(grayscale(load.image(img_path)), 64, 64)

    # Data augmentation. Note that imrotate(., 180) is a half-turn rotation,
    # not a mirror flip.
    list(
      as.vector(img_resized),
      as.vector(imrotate(img_resized, 180)),
      as.vector(imrotate(img_resized, 90)),
      as.vector(img_resized * 1.2),
      as.vector(img_resized * 0.8)
    )
  })

  list(
    images = unlist(variants, recursive = FALSE),
    labels = rep(basename(class_dir), 5L * length(img_files))
  )
})

# Append everything in one step, preserving the per-image ordering above
images <- c(
  images,
  unlist(lapply(loaded_classes, `[[`, "images"), recursive = FALSE)
)
labels <- c(labels, unlist(lapply(loaded_classes, `[[`, "labels")))
rm(loaded_classes)

Convert the lists to a feature matrix and a label factor

X <- do.call(rbind, images)
y <- factor(labels)

Normalize the feature set

X <- scale(X)

Dimensionality reduction using PCA

pca <- prcomp(X, center = TRUE, scale. = TRUE)

Determine the number of components to retain based on explained variance

# Proportion of total variance explained by each principal component
# (second row of the importance table returned by summary()).
explained_variance <- summary(pca)$importance[2, ]
# Smallest number of leading components whose cumulative share reaches 95%
num_components <- which(cumsum(explained_variance) >= 0.95)[1]
num_components <- min(num_components, 50)  # Limit number of components if too high
# seq_len() + drop = FALSE keep X_pca a matrix even if only one component
# is retained; `[, 1:1]` would silently collapse it to a plain vector.
X_pca <- pca$x[, seq_len(num_components), drop = FALSE]

Split the data into training and testing sets

# Split into training and test sets at the level of source images.
# Each source image contributes five consecutive rows (the original plus four
# augmented variants), so sampling rows independently would place variants of
# the same picture on both sides of the split and inflate the test accuracy.
set.seed(42)  # For reproducibility
augment_factor <- 5L  # rows produced per source image by the loading loop
stopifnot(length(y) > 0, length(y) %% augment_factor == 0)
source_id <- rep(seq_len(length(y) / augment_factor), each = augment_factor)
source_y <- y[seq(1, length(y), by = augment_factor)]
# Stratified 80/20 partition of the source images, expanded back to row indices
train_sources <- createDataPartition(source_y, p = 0.8, list = FALSE)
train_index <- which(source_id %in% train_sources)
# drop = FALSE keeps the feature sets as matrices even with a single column
X_train <- X_pca[train_index, , drop = FALSE]
y_train <- y[train_index]
X_test <- X_pca[-train_index, , drop = FALSE]
y_test <- y[-train_index]

Train the Multinomial Logistic Regression model

# Fit the multinomial logistic regression on the PCA features.
# The default limit of 100 iterations is hit before the optimizer converges
# (the fit reports "stopped after 100 iterations"), so allow more iterations.
logistic_model <- multinom(
  y_train ~ .,
  data = as.data.frame(X_train),
  maxit = 500
)

## # weights:  416 (357 variable)
## initial  value 1996.263880 
## iter  10 value 1096.757646
## iter  20 value 990.563068
## iter  30 value 965.026206
## iter  40 value 939.066040
## iter  50 value 924.569493
## iter  60 value 913.577387
## iter  70 value 899.445219
## iter  80 value 890.240756
## iter  90 value 881.468122
## iter 100 value 875.539382
## final  value 875.539382 
## stopped after 100 iterations

Make predictions on the test set

pred_probs <- predict(logistic_model, newdata = as.data.frame(X_test), type = "class")

Evaluate the model

confusion_matrix <- table(Predicted = pred_probs, Actual = y_test)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

print(paste("Accuracy:", round(accuracy * 100, 2), "%"))

## [1] "Accuracy: 57.92 %"

print("Confusion Matrix:")

## [1] "Confusion Matrix:"

print(confusion_matrix)

##            Actual
## Predicted   airplane car cat dog flower fruit motorbike person
##   airplane        19   4   2   3      2     0         0      2
##   car              1  16   2   1      2     0         0      1
##   cat              1   1   9   5      1     0         4      1
##   dog              2   5   8  10      4     1         6      1
##   flower           2   1   3   6     18     0         2      1
##   fruit            1   0   2   0      1    29         0      0
##   motorbike        1   1   2   5      1     0        15      1
##   person           3   2   2   0      1     0         3     23

Optionally, create a classification report

report <- confusionMatrix(confusion_matrix)
print(report)

## Confusion Matrix and Statistics
## 
##            Actual
## Predicted   airplane car cat dog flower fruit motorbike person
##   airplane        19   4   2   3      2     0         0      2
##   car              1  16   2   1      2     0         0      1
##   cat              1   1   9   5      1     0         4      1
##   dog              2   5   8  10      4     1         6      1
##   flower           2   1   3   6     18     0         2      1
##   fruit            1   0   2   0      1    29         0      0
##   motorbike        1   1   2   5      1     0        15      1
##   person           3   2   2   0      1     0         3     23
## 
## Overall Statistics
##                                          
##                Accuracy : 0.5792         
##                  95% CI : (0.514, 0.6424)
##     No Information Rate : 0.125          
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.519          
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: airplane Class: car Class: cat Class: dog
## Sensitivity                  0.63333    0.53333    0.30000    0.33333
## Specificity                  0.93810    0.96667    0.93810    0.87143
## Pos Pred Value               0.59375    0.69565    0.40909    0.27027
## Neg Pred Value               0.94712    0.93548    0.90367    0.90148
## Prevalence                   0.12500    0.12500    0.12500    0.12500
## Detection Rate               0.07917    0.06667    0.03750    0.04167
## Detection Prevalence         0.13333    0.09583    0.09167    0.15417
## Balanced Accuracy            0.78571    0.75000    0.61905    0.60238
##                      Class: flower Class: fruit Class: motorbike Class: person
## Sensitivity                 0.6000       0.9667           0.5000       0.76667
## Specificity                 0.9286       0.9810           0.9476       0.94762
## Pos Pred Value              0.5455       0.8788           0.5769       0.67647
## Neg Pred Value              0.9420       0.9952           0.9299       0.96602
## Prevalence                  0.1250       0.1250           0.1250       0.12500
## Detection Rate              0.0750       0.1208           0.0625       0.09583
## Detection Prevalence        0.1375       0.1375           0.1083       0.14167
## Balanced Accuracy           0.7643       0.9738           0.7238       0.85714

Step 2: Clustering

Using the PCA-reduced data for clustering

set.seed(42)  # For reproducibility
k <- 5  # Choose the number of clusters
kmeans_result <- kmeans(X_pca, centers = k, nstart = 25)

Add cluster assignments to the PCA-reduced data

X_clustered <- as.data.frame(X_pca)
X_clustered$Cluster <- as.factor(kmeans_result$cluster)

Add cluster assignments to the original labels for summary

original_data <- data.frame(Features = I(images), Class = labels)
original_data$Cluster <- as.factor(kmeans_result$cluster)

Create a summary of the most common category in each cluster

cluster_summary <- original_data %>%
  group_by(Cluster) %>%
  summarize(Most_Common_Category = names(which.max(table(Class))))  # Adjust if Class column is named differently

Visualize clustering results with a legend

Create the cluster plot

# Cluster plot via factoextra: points only, convex hulls around each cluster.
# The last column of X_clustered is the cluster factor, so it is excluded
# from the data passed to fviz_cluster().
pca_scores <- X_clustered[, -ncol(X_clustered)]
p <- fviz_cluster(
  kmeans_result,
  data = pca_scores,
  geom = "point",
  ellipse.type = "convex",
  ggtheme = theme_minimal()
)

# Give the colour legend a title, then render the plot
p <- p + labs(color = "Cluster")
print(p)

# Alternative plot using base R graphics: first two PCA columns,
# coloured by cluster assignment.
cluster_ids <- seq_len(k)
centers <- kmeans_result$centers
plot(
  X_clustered[, 1:2],
  col = X_clustered$Cluster,
  pch = 19,
  xlab = "PCA 1",
  ylab = "PCA 2",
  main = "K-means Clustering"
)

# Mark the cluster centers
points(centers, col = cluster_ids, pch = 8, cex = 2)

# Label each center with the most common category of its cluster
# (one vectorized text() call instead of a per-cluster loop)
text(
  centers[cluster_ids, 1],
  centers[cluster_ids, 2],
  labels = cluster_summary$Most_Common_Category[cluster_ids],
  pos = 3,
  col = "black",
  cex = 0.8
)

# Custom legend mapping colours to cluster numbers
legend(
  "topright",
  legend = paste("Cluster", cluster_ids),
  col = cluster_ids,
  pch = 19,
  title = "Clusters"
)

Step 3: Neural network

Load necessary libraries

library(imager)
library(dplyr)
library(nnet)
library(FactoMineR)
library(factoextra)

Set the directory and classes

image_dir <- "C:/Users/raksh/Downloads/natural_images"
classes <- list.files(image_dir)

Define image dimensions (Reduced size)

img_width <- 32
img_height <- 32

Helper function to load and preprocess images

# Load the image stored at `file_path`, resize it to `img_width` x `img_height`,
# and return its pixel values flattened into a plain vector.
process_image <- function(file_path, img_width, img_height) {
  resized <- resize(load.image(file_path), img_width, img_height)
  as.vector(resized)
}

Create an empty list to store data

image_data <- list()
image_labels <- c()

Read images and create a dataframe

# Read every .jpg/.png file under each class directory, turn it into a
# flattened pixel vector with process_image(), and label it with the name of
# its directory.
#
# Results are gathered per class with lapply() and appended once at the end;
# calling append() for every single image inside nested loops re-copies the
# accumulated list and label vector each time.
loaded_classes <- lapply(classes, function(class_name) {
  class_dir <- file.path(image_dir, class_name)
  image_files <- list.files(
    class_dir,
    full.names = TRUE,
    pattern = "\\.jpg$|\\.png$"
  )

  list(
    vectors = lapply(
      image_files,
      process_image,
      img_width = img_width,
      img_height = img_height
    ),
    labels = rep(class_name, length(image_files))  # Assign class label
  )
})

image_data <- c(
  image_data,
  unlist(lapply(loaded_classes, `[[`, "vectors"), recursive = FALSE)
)
image_labels <- c(image_labels, unlist(lapply(loaded_classes, `[[`, "labels")))
rm(loaded_classes)

Create the data frame with image vectors and labels

image_matrix <- do.call(rbind, image_data)
image_df <- as.data.frame(image_matrix)
image_df$label <- factor(image_labels)

Split the data into train and validation sets (80-20 split)

set.seed(123)  # For reproducibility
sample_index <- sample(seq_len(nrow(image_df)), size = 0.8 * nrow(image_df))
train_df <- image_df[sample_index, ]
validation_df <- image_df[-sample_index, ]

Apply PCA to reduce dimensionality of the image data (excluding labels)

# Run PCA on the pixel columns only (the last column of image_df is the label).
# `ncp = 5` spells out FactoMineR's default number of retained components; it
# is why only 5 components are available downstream even though up to 150 are
# requested there.
# NOTE(review): raising ncp also enlarges the networks trained later, so check
# nnet's MaxNWts limit before changing it.
pca_result <- PCA(
  image_df[, -ncol(image_df)],
  ncp = 5,
  graph = FALSE
)

Determine the maximum number of components available

max_components <- ncol(pca_result$ind$coord)

Choose a higher number of components (e.g., 150), ensuring it does not exceed the maximum

num_components <- min(150, max_components)  # Use 150 or the maximum available components
cat("Using", num_components, "PCA components...\n")

## Using 5 PCA components...

Extract the top components from PCA results

pca_features <- pca_result$ind$coord[, 1:num_components]

Normalize the PCA features

pca_features <- scale(pca_features)

Create a new dataframe with PCA features and labels

pca_df <- as.data.frame(pca_features)
pca_df$label <- image_df$label

Split the PCA-transformed data into train and validation sets (80-20 split)

train_df <- pca_df[sample_index, ]
validation_df <- pca_df[-sample_index, ]

Define number of neurons in the hidden layer for experiments

hidden_layer_sizes <- c(5, 10, 20, 50)  # Increased layer size for experimentation

Train and evaluate models with different hidden layer sizes using nnet

# Train one single-hidden-layer network per entry of `hidden_layer_sizes` on
# the PCA features and report its accuracy on the validation set.
#
# The per-model accuracy is kept in `nn_accuracy` and collected in
# `nn_accuracy_by_size` (named "<n> hidden units"). It is deliberately NOT
# written to the generic `accuracy` variable, which already holds the
# logistic-regression accuracy and is read again after this loop.
nn_accuracy_by_size <- setNames(
  numeric(length(hidden_layer_sizes)),
  paste(hidden_layer_sizes, "hidden units")
)

for (idx in seq_along(hidden_layer_sizes)) {
  units <- hidden_layer_sizes[idx]
  cat("Training model with", units, "hidden units using PCA features...\n")
  
  # Train the neural network model
  nn_model <- nnet(
    label ~ .,
    data = train_df,
    size = units,
    maxit = 300,     # Increase the number of iterations
    decay = 0.001,   # Adjust regularization parameter
    trace = FALSE    # Suppress iteration output
  )
  
  # Predict on validation data
  validation_pred <- predict(nn_model, validation_df, type = "class")
  
  # Calculate accuracy (share of validation rows predicted correctly)
  nn_accuracy <- mean(validation_pred == validation_df$label)
  nn_accuracy_by_size[idx] <- nn_accuracy
  cat("Model with", units, "hidden units - Accuracy with PCA features:", nn_accuracy, "\n\n")
}

## Training model with 5 hidden units using PCA features...
## Model with 5 hidden units - Accuracy with PCA features: 0.5416667 
## 
## Training model with 10 hidden units using PCA features...
## Model with 10 hidden units - Accuracy with PCA features: 0.5 
## 
## Training model with 20 hidden units using PCA features...
## Model with 20 hidden units - Accuracy with PCA features: 0.5416667 
## 
## Training model with 50 hidden units using PCA features...
## Model with 50 hidden units - Accuracy with PCA features: 0.5

Assuming the following accuracy values are already defined

# Accuracy from the logistic regression model.
# Derived directly from the logistic-regression confusion matrix so the value
# cannot be affected by any later reuse of the generic `accuracy` variable.
logistic_accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

Neural network accuracies from previous evaluations

# Validation accuracies of the networks with 5, 10, 20 and 50 hidden units.
# These numbers are hard-coded (transcribed from the printed training output),
# so they must be updated by hand whenever the models are retrained.
nn_accuracies <- setNames(
  c(0.5416667, 0.5, 0.5416667, 0.5),
  paste(c(5, 10, 20, 50), "hidden units")
)

Create a comparison data frame

comparison_df <- data.frame(
  Model = c("Logistic Regression", names(nn_accuracies)),
  Accuracy = c(logistic_accuracy, nn_accuracies)
)

Plot the accuracies using ggplot2

library(ggplot2)

ggplot(comparison_df, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge(), width = 0.7) +
  labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  geom_text(aes(label = round(Accuracy, 2)), position = position_dodge(0.7), vjust = -0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

options(warn = 0)

Logistic_Reg VS NN

Rakshith Vijay

2024-10-06

Load necessary libraries

Define the path to the image folders

Prepare lists for images and labels

Load images, convert to grayscale, and store them

Convert lists to data frame

Normalize the feature set

Dimensionality reduction using PCA

Determine the number of components to retain based on explained variance

Split the data into training and testing sets

Train the Multinomial Logistic Regression model

Make predictions on the test set

Evaluate the model

Optionally, create a classification report

Step 2: Clustering

Using PCA reduced data or scaled data for clustering

Add cluster assignments to the original data

Add cluster assignments to the original labels for summary

Create a summary of the most common category in each cluster

Visualize clustering results with a legend

Create the cluster plot

Step 3: Neural network

Load necessary libraries

Set the directory and classes

Define image dimensions (Reduced size)

Helper function to load and preprocess images

Create an empty list to store data

Read images and create a dataframe

Create the data frame with image vectors and labels

Split the data into train and validation sets (80-20 split)

Apply PCA to reduce dimensionality of the image data (excluding labels)

Determine the maximum number of components available

Choose a higher number of components (e.g., 150), ensuring it does not exceed the maximum

Extract the top components from PCA results

Normalize the PCA features

Create a new dataframe with PCA features and labels

Split the PCA-transformed data into train and validation sets (80-20 split)

Define number of neurons in the hidden layer for experiments

Train and evaluate models with different hidden layer sizes using nnet

Assuming the following accuracy values are already defined

Neural network accuracies from previous evaluations

Create a comparison data frame

Plot the accuracies using ggplot2