R Markdown

Load necessary libraries

options(warn = -1)
library(imager)

## Loading required package: magrittr

## 
## Attaching package: 'imager'

## The following object is masked from 'package:magrittr':
## 
##     add

## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum

## The following object is masked from 'package:graphics':
## 
##     frame

## The following object is masked from 'package:base':
## 
##     save.image

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:imager':
## 
##     where

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(nnet)  # For multinomial logistic regression
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(keras)
library(tensorflow)

## 
## Attaching package: 'tensorflow'

## The following object is masked from 'package:caret':
## 
##     train

library(pROC)  # For ROC analysis

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following object is masked from 'package:imager':
## 
##     ci

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

options(warn = -1)

Define the path to the image folders

base_dir <- "C:/Users/MEGHANA/Downloads/Linnaeus 5 256X256/Linnaeus 5 256X256/test"  # Adjust path as necessary
classes <- list.dirs(base_dir, full.names = TRUE, recursive = FALSE)

Prepare lists for images and labels

images <- list()
labels <- c()

Load images, convert to grayscale, and store them

for (class in classes) {
  class_label <- basename(class)
  img_files <- list.files(class, full.names = TRUE)
  
  for (img_path in img_files) {
    # Read and convert to grayscale
    img <- load.image(img_path)
    img_gray <- grayscale(img)
    img_resized <- resize(img_gray, 64, 64)  # Resize images to a uniform size
    
    # Data Augmentation
    img_flipped <- imrotate(img_resized, 180)  # Flip image
    img_rotated <- imrotate(img_resized, 90)   # Rotate image
    img_brightened <- img_resized * 1.2  # Increase brightness
    img_darker <- img_resized * 0.8  # Decrease brightness
    
    # Flatten the images and append to list
    images[[length(images) + 1]] <- as.vector(img_resized)
    labels <- c(labels, class_label)
    
    # Append augmented images
    images[[length(images) + 1]] <- as.vector(img_flipped)
    labels <- c(labels, class_label)
    
    images[[length(images) + 1]] <- as.vector(img_rotated)
    labels <- c(labels, class_label)
    
    images[[length(images) + 1]] <- as.vector(img_brightened)
    labels <- c(labels, class_label)
    
    images[[length(images) + 1]] <- as.vector(img_darker)
    labels <- c(labels, class_label)
  }
}

Convert lists to data frame

X <- do.call(rbind, images)
y <- factor(labels)

Normalize the feature set

X <- scale(X)

Dimensionality reduction using PCA

pca <- prcomp(X, center = TRUE, scale. = TRUE)

Determine the number of components to retain based on explained variance

explained_variance <- summary(pca)$importance[2, ]
num_components <- which(cumsum(explained_variance) >= 0.95)[1]  # Retain components for 95% variance
num_components <- min(num_components, 50)  # Limit number of components if too high
X_pca <- pca$x[, 1:num_components]

Split the data into training and testing sets

set.seed(42)  # For reproducibility
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X_pca[train_index, ]
y_train <- y[train_index]
X_test <- X_pca[-train_index, ]
y_test <- y[-train_index]

Train the Multinomial Logistic Regression model

logistic_model <- multinom(y_train ~ ., data = as.data.frame(X_train))

## # weights:  260 (204 variable)
## initial  value 1609.437912 
## iter  10 value 1389.501652
## iter  20 value 1366.864314
## iter  30 value 1364.177524
## iter  40 value 1361.036433
## iter  50 value 1336.128165
## iter  60 value 1309.438861
## iter  70 value 1299.856231
## iter  80 value 1295.616284
## iter  90 value 1295.177898
## iter 100 value 1292.691010
## final  value 1292.691010 
## stopped after 100 iterations

Make predictions on the test set

pred_probs <- predict(logistic_model, newdata = as.data.frame(X_test), type = "class")

Evaluate the model

confusion_matrix <- table(Predicted = pred_probs, Actual = y_test)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

print(paste("Accuracy:", round(accuracy * 100, 2), "%"))

## [1] "Accuracy: 32.4 %"

print("Confusion Matrix:")

## [1] "Confusion Matrix:"

print(confusion_matrix)

##          Actual
## Predicted berry bird dog flower other
##    berry     20   13   9      7    13
##    bird       4    9   3      5     9
##    dog       10   12  17     10     8
##    flower     5    6  11     24     9
##    other     11   10  10      4    11

Optionally, create a classification report

report <- confusionMatrix(confusion_matrix)
print(report)

## Confusion Matrix and Statistics
## 
##          Actual
## Predicted berry bird dog flower other
##    berry     20   13   9      7    13
##    bird       4    9   3      5     9
##    dog       10   12  17     10     8
##    flower     5    6  11     24     9
##    other     11   10  10      4    11
## 
## Overall Statistics
##                                           
##                Accuracy : 0.324           
##                  95% CI : (0.2664, 0.3858)
##     No Information Rate : 0.2             
##     P-Value [Acc > NIR] : 2.68e-06        
##                                           
##                   Kappa : 0.155           
##                                           
##  Mcnemar's Test P-Value : 0.2207          
## 
## Statistics by Class:
## 
##                      Class: berry Class: bird Class: dog Class: flower
## Sensitivity                0.4000      0.1800     0.3400        0.4800
## Specificity                0.7900      0.8950     0.8000        0.8450
## Pos Pred Value             0.3226      0.3000     0.2982        0.4364
## Neg Pred Value             0.8404      0.8136     0.8290        0.8667
## Prevalence                 0.2000      0.2000     0.2000        0.2000
## Detection Rate             0.0800      0.0360     0.0680        0.0960
## Detection Prevalence       0.2480      0.1200     0.2280        0.2200
## Balanced Accuracy          0.5950      0.5375     0.5700        0.6625
##                      Class: other
## Sensitivity                0.2200
## Specificity                0.8250
## Pos Pred Value             0.2391
## Neg Pred Value             0.8088
## Prevalence                 0.2000
## Detection Rate             0.0440
## Detection Prevalence       0.1840
## Balanced Accuracy          0.5225

Step 2: Clustering

Using PCA reduced data or scaled data for clustering

set.seed(42)  # For reproducibility
k <- 5  # Choose the number of clusters
kmeans_result <- kmeans(X_pca, centers = k, nstart = 25)

Add cluster assignments to the original data

X_clustered <- as.data.frame(X_pca)
X_clustered$Cluster <- as.factor(kmeans_result$cluster)

Add cluster assignments to the original labels for summary

original_data <- data.frame(Features = I(images), Class = labels)
original_data$Cluster <- as.factor(kmeans_result$cluster)

Create a summary of the most common category in each cluster

cluster_summary <- original_data %>%
  group_by(Cluster) %>%
  summarize(Most_Common_Category = names(which.max(table(Class))))  # Adjust if Class column is named differently

Visualize clustering results with a legend

Create the cluster plot

p <- fviz_cluster(kmeans_result, data = X_clustered[, -ncol(X_clustered)],
                  geom = "point", ellipse.type = "convex", 
                  ggtheme = theme_minimal())

# Add the legend title
p <- p + labs(color = "Cluster")  # This adds a legend for cluster colors

# Print the clustering plot
print(p)

# Plotting the first two PCA dimensions with cluster assignments
plot(X_clustered[, 1:2], col = X_clustered$Cluster, pch = 19, 
     xlab = "PCA 1", ylab = "PCA 2", main = "K-means Clustering")

# Add cluster centers
points(kmeans_result$centers, col = 1:k, pch = 8, cex = 2)

# Add labels for the most common category in each cluster
for (i in 1:k) {
  text(kmeans_result$centers[i, 1], kmeans_result$centers[i, 2],
       labels = cluster_summary$Most_Common_Category[i], 
       pos = 3, col = "black", cex = 0.8)
}

# Create a custom legend
legend("topright", legend = paste("Cluster", 1:k), col = 1:k, pch = 19, title = "Clusters")

# Step 3: Neural network # Load necessary libraries

library(imager)
library(dplyr)
library(nnet)
library(FactoMineR)
library(factoextra)

Set the directory and classes

image_dir <- "C:/Users/MEGHANA/Downloads/Linnaeus 5 256X256/Linnaeus 5 256X256/test"
classes <- list.files(image_dir)

Define image dimensions (Reduced size)

img_width <- 32
img_height <- 32

Helper function to load and preprocess images

process_image <- function(file_path, img_width, img_height) {
  img <- load.image(file_path) %>% resize(img_width, img_height)
  as.vector(img)  # Flatten the image to a vector
}

Create an empty list to store data

image_data <- list()
image_labels <- c()

Read images and create a dataframe

for (class_name in classes) {
  class_dir <- file.path(image_dir, class_name)
  image_files <- list.files(class_dir, full.names = TRUE, pattern = "\\.jpg$|\\.png$")
  
  for (image_file in image_files) {
    image_vector <- process_image(image_file, img_width, img_height)
    image_data <- append(image_data, list(image_vector))
    image_labels <- append(image_labels, class_name)  # Assign class label
  }
}

Create the data frame with image vectors and labels

image_matrix <- do.call(rbind, image_data)
image_df <- as.data.frame(image_matrix)
image_df$label <- factor(image_labels)

Split the data into train and validation sets (80-20 split)

set.seed(123)  # For reproducibility
sample_index <- sample(seq_len(nrow(image_df)), size = 0.8 * nrow(image_df))
train_df <- image_df[sample_index, ]
validation_df <- image_df[-sample_index, ]

Apply PCA to reduce dimensionality of the image data (excluding labels)

pca_result <- PCA(image_df[, -ncol(image_df)], graph = FALSE)


# Determine the maximum number of components available
max_components <- ncol(pca_result$ind$coord)

# Choose a higher number of components (e.g., 150), ensuring it does not exceed the maximum
num_components <- min(150, max_components)  # Use 150 or the maximum available components
cat("Using", num_components, "PCA components...\n")

## Using 5 PCA components...

# Extract the top components from PCA results
pca_features <- pca_result$ind$coord[, 1:num_components]

# Normalize the PCA features
pca_features <- scale(pca_features)

# Create a new dataframe with PCA features and labels
pca_df <- as.data.frame(pca_features)
pca_df$label <- image_df$label

# Split the PCA-transformed data into train and validation sets (80-20 split)
train_df <- pca_df[sample_index, ]
validation_df <- pca_df[-sample_index, ]

# Define number of neurons in the hidden layer for experiments
hidden_layer_sizes <- c(5, 10, 20, 50)  # Increased layer size for experimentation

# Train and evaluate models with different hidden layer sizes using nnet
for (units in hidden_layer_sizes) {
  cat("Training model with", units, "hidden units using PCA features...\n")
  
  # Train the neural network model
  nn_model <- nnet(
    label ~ .,
    data = train_df,
    size = units,
    maxit = 300,     # Increase the number of iterations
    decay = 0.001,   # Adjust regularization parameter
    trace = FALSE    # Suppress iteration output
  )
  

# Predict on validation data
  validation_pred <- predict(nn_model, validation_df, type = "class")
  
  # Calculate accuracy
  accuracy <- mean(validation_pred == validation_df$label)
  cat("Model with", units, "hidden units - Accuracy with PCA features:", accuracy, "\n\n")
}

## Training model with 5 hidden units using PCA features...
## Model with 5 hidden units - Accuracy with PCA features: 0.24 
## 
## Training model with 10 hidden units using PCA features...
## Model with 10 hidden units - Accuracy with PCA features: 0.3 
## 
## Training model with 20 hidden units using PCA features...
## Model with 20 hidden units - Accuracy with PCA features: 0.4 
## 
## Training model with 50 hidden units using PCA features...
## Model with 50 hidden units - Accuracy with PCA features: 0.36

Load required libraries

library(ggplot2)
library(caret)  # For confusionMatrix function

Define accuracy values

logistic_accuracy <- 0.324

Neural network accuracies from previous evaluations

nn_accuracies <- c(0.24, 0.3, 0.4, 0.36)
names(nn_accuracies) <- c("5 hidden units", "10 hidden units", "20 hidden units", "50 hidden units")

Create a comparison data frame

comparison_df <- data.frame(
  Model = c("Logistic Regression", names(nn_accuracies)),
  Accuracy = c(logistic_accuracy, nn_accuracies)
)

Plot the accuracies using ggplot2

ggplot(comparison_df, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge(), width = 0.7) +
  labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  geom_text(aes(label = round(Accuracy, 2)), position = position_dodge(0.7), vjust = -0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the confusion matrix

print("Confusion Matrix:")

## [1] "Confusion Matrix:"

print(confusion_matrix)

##          Actual
## Predicted berry bird dog flower other
##    berry     20   13   9      7    13
##    bird       4    9   3      5     9
##    dog       10   12  17     10     8
##    flower     5    6  11     24     9
##    other     11   10  10      4    11

Optionally, create a classification report

report <- confusionMatrix(confusion_matrix)
print(report)

## Confusion Matrix and Statistics
## 
##          Actual
## Predicted berry bird dog flower other
##    berry     20   13   9      7    13
##    bird       4    9   3      5     9
##    dog       10   12  17     10     8
##    flower     5    6  11     24     9
##    other     11   10  10      4    11
## 
## Overall Statistics
##                                           
##                Accuracy : 0.324           
##                  95% CI : (0.2664, 0.3858)
##     No Information Rate : 0.2             
##     P-Value [Acc > NIR] : 2.68e-06        
##                                           
##                   Kappa : 0.155           
##                                           
##  Mcnemar's Test P-Value : 0.2207          
## 
## Statistics by Class:
## 
##                      Class: berry Class: bird Class: dog Class: flower
## Sensitivity                0.4000      0.1800     0.3400        0.4800
## Specificity                0.7900      0.8950     0.8000        0.8450
## Pos Pred Value             0.3226      0.3000     0.2982        0.4364
## Neg Pred Value             0.8404      0.8136     0.8290        0.8667
## Prevalence                 0.2000      0.2000     0.2000        0.2000
## Detection Rate             0.0800      0.0360     0.0680        0.0960
## Detection Prevalence       0.2480      0.1200     0.2280        0.2200
## Balanced Accuracy          0.5950      0.5375     0.5700        0.6625
##                      Class: other
## Sensitivity                0.2200
## Specificity                0.8250
## Pos Pred Value             0.2391
## Neg Pred Value             0.8088
## Prevalence                 0.2000
## Detection Rate             0.0440
## Detection Prevalence       0.1840
## Balanced Accuracy          0.5225

dd

Meghana

2024-10-05

R Markdown

Load necessary libraries

Define the path to the image folders

Prepare lists for images and labels

Load images, convert to grayscale, and store them

Convert lists to data frame

Normalize the feature set

Dimensionality reduction using PCA

Determine the number of components to retain based on explained variance

Split the data into training and testing sets

Train the Multinomial Logistic Regression model

Make predictions on the test set

Evaluate the model

Optionally, create a classification report

Step 2: Clustering

Using PCA reduced data or scaled data for clustering

Add cluster assignments to the original data

Add cluster assignments to the original labels for summary

Create a summary of the most common category in each cluster

Visualize clustering results with a legend

Create the cluster plot

Set the directory and classes

Define image dimensions (Reduced size)

Helper function to load and preprocess images

Create an empty list to store data

Read images and create a dataframe

Create the data frame with image vectors and labels

Split the data into train and validation sets (80-20 split)

Apply PCA to reduce dimensionality of the image data (excluding labels)

Load required libraries

Define accuracy values

Neural network accuracies from previous evaluations

Create a comparison data frame

Plot the accuracies using ggplot2

Optionally, create a classification report