R Markdown

Load necessary libraries

options(warn = -1)
library(imager)
## Loading required package: magrittr
## 
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
## 
##     add
## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum
## The following object is masked from 'package:graphics':
## 
##     frame
## The following object is masked from 'package:base':
## 
##     save.image
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:imager':
## 
##     where
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(nnet)  # For multinomial logistic regression
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(keras)
library(tensorflow)
## 
## Attaching package: 'tensorflow'
## The following object is masked from 'package:caret':
## 
##     train
library(pROC)  # For ROC analysis
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:imager':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
options(warn = -1)

Define the path to the image folders

base_dir <- "C:/Users/MEGHANA/Downloads/Linnaeus 5 256X256/Linnaeus 5 256X256/test"  # Adjust path as necessary
classes <- list.dirs(base_dir, full.names = TRUE, recursive = FALSE)

Prepare lists for images and labels

images <- list()
labels <- c()

Load images, convert to grayscale, and store them

for (class in classes) {
  class_label <- basename(class)
  img_files <- list.files(class, full.names = TRUE)
  
  for (img_path in img_files) {
    # Read and convert to grayscale
    img <- load.image(img_path)
    img_gray <- grayscale(img)
    img_resized <- resize(img_gray, 64, 64)  # Resize images to a uniform size
    
    # Data Augmentation
    img_flipped <- imrotate(img_resized, 180)  # Flip image
    img_rotated <- imrotate(img_resized, 90)   # Rotate image
    img_brightened <- img_resized * 1.2  # Increase brightness
    img_darker <- img_resized * 0.8  # Decrease brightness
    
    # Flatten the images and append to list
    images[[length(images) + 1]] <- as.vector(img_resized)
    labels <- c(labels, class_label)
    
    # Append augmented images
    images[[length(images) + 1]] <- as.vector(img_flipped)
    labels <- c(labels, class_label)
    
    images[[length(images) + 1]] <- as.vector(img_rotated)
    labels <- c(labels, class_label)
    
    images[[length(images) + 1]] <- as.vector(img_brightened)
    labels <- c(labels, class_label)
    
    images[[length(images) + 1]] <- as.vector(img_darker)
    labels <- c(labels, class_label)
  }
}

Convert lists to data frame

X <- do.call(rbind, images)
y <- factor(labels)

Normalize the feature set

X <- scale(X)

Dimensionality reduction using PCA

pca <- prcomp(X, center = TRUE, scale. = TRUE)

Determine the number of components to retain based on explained variance

explained_variance <- summary(pca)$importance[2, ]
num_components <- which(cumsum(explained_variance) >= 0.95)[1]  # Retain components for 95% variance
num_components <- min(num_components, 50)  # Limit number of components if too high
X_pca <- pca$x[, 1:num_components]

Split the data into training and testing sets

set.seed(42)  # For reproducibility
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X_pca[train_index, ]
y_train <- y[train_index]
X_test <- X_pca[-train_index, ]
y_test <- y[-train_index]

Train the Multinomial Logistic Regression model

logistic_model <- multinom(y_train ~ ., data = as.data.frame(X_train))
## # weights:  260 (204 variable)
## initial  value 1609.437912 
## iter  10 value 1389.501652
## iter  20 value 1366.864314
## iter  30 value 1364.177524
## iter  40 value 1361.036433
## iter  50 value 1336.128165
## iter  60 value 1309.438861
## iter  70 value 1299.856231
## iter  80 value 1295.616284
## iter  90 value 1295.177898
## iter 100 value 1292.691010
## final  value 1292.691010 
## stopped after 100 iterations

Make predictions on the test set

pred_probs <- predict(logistic_model, newdata = as.data.frame(X_test), type = "class")

Evaluate the model

confusion_matrix <- table(Predicted = pred_probs, Actual = y_test)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 32.4 %"
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
##          Actual
## Predicted berry bird dog flower other
##    berry     20   13   9      7    13
##    bird       4    9   3      5     9
##    dog       10   12  17     10     8
##    flower     5    6  11     24     9
##    other     11   10  10      4    11

Optionally, create a classification report

report <- confusionMatrix(confusion_matrix)
print(report)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted berry bird dog flower other
##    berry     20   13   9      7    13
##    bird       4    9   3      5     9
##    dog       10   12  17     10     8
##    flower     5    6  11     24     9
##    other     11   10  10      4    11
## 
## Overall Statistics
##                                           
##                Accuracy : 0.324           
##                  95% CI : (0.2664, 0.3858)
##     No Information Rate : 0.2             
##     P-Value [Acc > NIR] : 2.68e-06        
##                                           
##                   Kappa : 0.155           
##                                           
##  Mcnemar's Test P-Value : 0.2207          
## 
## Statistics by Class:
## 
##                      Class: berry Class: bird Class: dog Class: flower
## Sensitivity                0.4000      0.1800     0.3400        0.4800
## Specificity                0.7900      0.8950     0.8000        0.8450
## Pos Pred Value             0.3226      0.3000     0.2982        0.4364
## Neg Pred Value             0.8404      0.8136     0.8290        0.8667
## Prevalence                 0.2000      0.2000     0.2000        0.2000
## Detection Rate             0.0800      0.0360     0.0680        0.0960
## Detection Prevalence       0.2480      0.1200     0.2280        0.2200
## Balanced Accuracy          0.5950      0.5375     0.5700        0.6625
##                      Class: other
## Sensitivity                0.2200
## Specificity                0.8250
## Pos Pred Value             0.2391
## Neg Pred Value             0.8088
## Prevalence                 0.2000
## Detection Rate             0.0440
## Detection Prevalence       0.1840
## Balanced Accuracy          0.5225

Step 2: Clustering

Using PCA reduced data or scaled data for clustering

set.seed(42)  # For reproducibility
k <- 5  # Choose the number of clusters
kmeans_result <- kmeans(X_pca, centers = k, nstart = 25)

Add cluster assignments to the original data

X_clustered <- as.data.frame(X_pca)
X_clustered$Cluster <- as.factor(kmeans_result$cluster)

Add cluster assignments to the original labels for summary

original_data <- data.frame(Features = I(images), Class = labels)
original_data$Cluster <- as.factor(kmeans_result$cluster)

Create a summary of the most common category in each cluster

cluster_summary <- original_data %>%
  group_by(Cluster) %>%
  summarize(Most_Common_Category = names(which.max(table(Class))))  # Adjust if Class column is named differently

Visualize clustering results with a legend

Create the cluster plot

p <- fviz_cluster(kmeans_result, data = X_clustered[, -ncol(X_clustered)],
                  geom = "point", ellipse.type = "convex", 
                  ggtheme = theme_minimal())

# Add the legend title
p <- p + labs(color = "Cluster")  # This adds a legend for cluster colors

# Print the clustering plot
print(p)

# Plotting the first two PCA dimensions with cluster assignments
plot(X_clustered[, 1:2], col = X_clustered$Cluster, pch = 19, 
     xlab = "PCA 1", ylab = "PCA 2", main = "K-means Clustering")

# Add cluster centers
points(kmeans_result$centers, col = 1:k, pch = 8, cex = 2)

# Add labels for the most common category in each cluster
for (i in 1:k) {
  text(kmeans_result$centers[i, 1], kmeans_result$centers[i, 2],
       labels = cluster_summary$Most_Common_Category[i], 
       pos = 3, col = "black", cex = 0.8)
}

# Create a custom legend
legend("topright", legend = paste("Cluster", 1:k), col = 1:k, pch = 19, title = "Clusters")

# Step 3: Neural network # Load necessary libraries

library(imager)
library(dplyr)
library(nnet)
library(FactoMineR)
library(factoextra)

Set the directory and classes

image_dir <- "C:/Users/MEGHANA/Downloads/Linnaeus 5 256X256/Linnaeus 5 256X256/test"
classes <- list.files(image_dir)

Define image dimensions (Reduced size)

img_width <- 32
img_height <- 32

Helper function to load and preprocess images

process_image <- function(file_path, img_width, img_height) {
  img <- load.image(file_path) %>% resize(img_width, img_height)
  as.vector(img)  # Flatten the image to a vector
}

Create an empty list to store data

image_data <- list()
image_labels <- c()

Read images and create a dataframe

for (class_name in classes) {
  class_dir <- file.path(image_dir, class_name)
  image_files <- list.files(class_dir, full.names = TRUE, pattern = "\\.jpg$|\\.png$")
  
  for (image_file in image_files) {
    image_vector <- process_image(image_file, img_width, img_height)
    image_data <- append(image_data, list(image_vector))
    image_labels <- append(image_labels, class_name)  # Assign class label
  }
}

Create the data frame with image vectors and labels

image_matrix <- do.call(rbind, image_data)
image_df <- as.data.frame(image_matrix)
image_df$label <- factor(image_labels)

Split the data into train and validation sets (80-20 split)

set.seed(123)  # For reproducibility
sample_index <- sample(seq_len(nrow(image_df)), size = 0.8 * nrow(image_df))
train_df <- image_df[sample_index, ]
validation_df <- image_df[-sample_index, ]

Apply PCA to reduce dimensionality of the image data (excluding labels)

pca_result <- PCA(image_df[, -ncol(image_df)], graph = FALSE)


# Determine the maximum number of components available
max_components <- ncol(pca_result$ind$coord)

# Choose a higher number of components (e.g., 150), ensuring it does not exceed the maximum
num_components <- min(150, max_components)  # Use 150 or the maximum available components
cat("Using", num_components, "PCA components...\n")
## Using 5 PCA components...
# Extract the top components from PCA results
pca_features <- pca_result$ind$coord[, 1:num_components]

# Normalize the PCA features
pca_features <- scale(pca_features)

# Create a new dataframe with PCA features and labels
pca_df <- as.data.frame(pca_features)
pca_df$label <- image_df$label

# Split the PCA-transformed data into train and validation sets (80-20 split)
train_df <- pca_df[sample_index, ]
validation_df <- pca_df[-sample_index, ]

# Define number of neurons in the hidden layer for experiments
hidden_layer_sizes <- c(5, 10, 20, 50)  # Increased layer size for experimentation

# Train and evaluate models with different hidden layer sizes using nnet
for (units in hidden_layer_sizes) {
  cat("Training model with", units, "hidden units using PCA features...\n")
  
  # Train the neural network model
  nn_model <- nnet(
    label ~ .,
    data = train_df,
    size = units,
    maxit = 300,     # Increase the number of iterations
    decay = 0.001,   # Adjust regularization parameter
    trace = FALSE    # Suppress iteration output
  )
  

# Predict on validation data
  validation_pred <- predict(nn_model, validation_df, type = "class")
  
  # Calculate accuracy
  accuracy <- mean(validation_pred == validation_df$label)
  cat("Model with", units, "hidden units - Accuracy with PCA features:", accuracy, "\n\n")
}
## Training model with 5 hidden units using PCA features...
## Model with 5 hidden units - Accuracy with PCA features: 0.24 
## 
## Training model with 10 hidden units using PCA features...
## Model with 10 hidden units - Accuracy with PCA features: 0.3 
## 
## Training model with 20 hidden units using PCA features...
## Model with 20 hidden units - Accuracy with PCA features: 0.4 
## 
## Training model with 50 hidden units using PCA features...
## Model with 50 hidden units - Accuracy with PCA features: 0.36

Load required libraries

library(ggplot2)
library(caret)  # For confusionMatrix function

Define accuracy values

logistic_accuracy <- 0.324  

Neural network accuracies from previous evaluations

nn_accuracies <- c(0.24, 0.3, 0.4, 0.36)
names(nn_accuracies) <- c("5 hidden units", "10 hidden units", "20 hidden units", "50 hidden units")

Create a comparison data frame

comparison_df <- data.frame(
  Model = c("Logistic Regression", names(nn_accuracies)),
  Accuracy = c(logistic_accuracy, nn_accuracies)
)

Plot the accuracies using ggplot2

ggplot(comparison_df, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge(), width = 0.7) +
  labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  geom_text(aes(label = round(Accuracy, 2)), position = position_dodge(0.7), vjust = -0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the confusion matrix

print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
##          Actual
## Predicted berry bird dog flower other
##    berry     20   13   9      7    13
##    bird       4    9   3      5     9
##    dog       10   12  17     10     8
##    flower     5    6  11     24     9
##    other     11   10  10      4    11

Optionally, create a classification report

report <- confusionMatrix(confusion_matrix)
print(report)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted berry bird dog flower other
##    berry     20   13   9      7    13
##    bird       4    9   3      5     9
##    dog       10   12  17     10     8
##    flower     5    6  11     24     9
##    other     11   10  10      4    11
## 
## Overall Statistics
##                                           
##                Accuracy : 0.324           
##                  95% CI : (0.2664, 0.3858)
##     No Information Rate : 0.2             
##     P-Value [Acc > NIR] : 2.68e-06        
##                                           
##                   Kappa : 0.155           
##                                           
##  Mcnemar's Test P-Value : 0.2207          
## 
## Statistics by Class:
## 
##                      Class: berry Class: bird Class: dog Class: flower
## Sensitivity                0.4000      0.1800     0.3400        0.4800
## Specificity                0.7900      0.8950     0.8000        0.8450
## Pos Pred Value             0.3226      0.3000     0.2982        0.4364
## Neg Pred Value             0.8404      0.8136     0.8290        0.8667
## Prevalence                 0.2000      0.2000     0.2000        0.2000
## Detection Rate             0.0800      0.0360     0.0680        0.0960
## Detection Prevalence       0.2480      0.1200     0.2280        0.2200
## Balanced Accuracy          0.5950      0.5375     0.5700        0.6625
##                      Class: other
## Sensitivity                0.2200
## Specificity                0.8250
## Pos Pred Value             0.2391
## Neg Pred Value             0.8088
## Prevalence                 0.2000
## Detection Rate             0.0440
## Detection Prevalence       0.1840
## Balanced Accuracy          0.5225