# Load necessary libraries
library(imager)
## Loading required package: magrittr
## 
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
## 
##     add
## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum
## The following object is masked from 'package:graphics':
## 
##     frame
## The following object is masked from 'package:base':
## 
##     save.image
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:imager':
## 
##     where
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:imager':
## 
##     grow
library(e1071)  #For additional classification metrics
library(xgboost)  #For alternative model
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(factoextra)  #For clustering visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(keras)
library(tensorflow)
## 
## Attaching package: 'tensorflow'
## The following object is masked from 'package:caret':
## 
##     train
# Define the path to the image folders
base_dir <- "D:\\Data"  # Adjust path as necessary
classes <- list.dirs(base_dir, full.names = TRUE, recursive = FALSE)
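# Optional sanity check (not in the original run): confirm each class folder was
# picked up and count the image files it contains
setNames(sapply(classes, function(d) length(list.files(d))), basename(classes))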

# Prepare lists for images and labels
images <- list()
labels <- c()

# Load images, convert to grayscale, and store them
for (class in classes) {
  class_label <- basename(class)
  img_files <- list.files(class, full.names = TRUE)
  
  for (img_path in img_files) {
    # Read and convert to grayscale
    img <- load.image(img_path)
    img_gray <- grayscale(img)
    img_resized <- resize(img_gray, 64, 64)  # Resize images to a uniform size
    
    # Data augmentation: rotate the image and vary its brightness
    img_flipped <- imrotate(img_resized, 180)  # Rotate 180 degrees (upside-down copy)
    img_rotated <- imrotate(img_resized, 90)   # Rotate 90 degrees
    img_brightened <- img_resized * 1.2  # Increase brightness
    img_darker <- img_resized * 0.8      # Decrease brightness
    
    # Flatten the images and append to the list
    images[[length(images) + 1]] <- as.vector(img_resized)
    labels <- c(labels, class_label)  # Append the class label
    
    # Append augmented images
    images[[length(images) + 1]] <- as.vector(img_flipped)
    labels <- c(labels, class_label)  # Append the class label
    
    images[[length(images) + 1]] <- as.vector(img_rotated)
    labels <- c(labels, class_label)  # Append the class label
    
    # Append additional augmented images
    images[[length(images) + 1]] <- as.vector(img_brightened)
    labels <- c(labels, class_label)
    
    images[[length(images) + 1]] <- as.vector(img_darker)
    labels <- c(labels, class_label)
  }
}
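# Note: imrotate(x, 180) above produces an upside-down copy rather than a mirror
# flip; if a true left-right flip is wanted as an extra augmentation, imager's
# mirror() can be used inside the loop. A minimal sketch (the helper name is
# illustrative, not part of the original pipeline):
flip_lr <- function(img) as.vector(mirror(img, "x"))  # flip along the x-axis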

# Convert lists to data frame
X <- do.call(rbind, images)
y <- factor(labels)

# Normalize the feature set
X <- scale(X)
# Step 1: Dimensionality reduction using PCA
pca <- prcomp(X, center = TRUE, scale. = TRUE)

# Create X_pca from PCA result
X_pca <- pca$x  # Extract the principal components

# Determine the number of components to retain based on explained variance
explained_variance <- summary(pca)$importance[2, ]
plot(explained_variance, type = 'b', main = 'PCA Explained Variance', 
     xlab = 'Principal Components', ylab = 'Proportion of Variance Explained')
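# The scree plot above shows per-component variance; to pick a cutoff explicitly,
# the cumulative proportion can be used. A sketch assuming a 90% variance target
# (the threshold is an assumption, not from the original analysis):
cum_variance <- cumsum(explained_variance)
n_components_90 <- which(cum_variance >= 0.90)[1]
cat("Components needed to explain 90% of the variance:", n_components_90, "\n")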

# Step 2: Clustering
# Cluster the PCA-reduced feature set
set.seed(42)  # For reproducibility
k <- 5  # Choose the number of clusters
kmeans_result <- kmeans(X_pca, centers = k, nstart = 25)
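# k = 5 mirrors the five image classes; as a cross-check, factoextra's elbow plot
# can be inspected for a range of k (this can be slow on high-dimensional data):
fviz_nbclust(X_pca, kmeans, method = "wss", k.max = 10)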

# Add cluster assignments to the original data
X_clustered <- as.data.frame(X_pca)
X_clustered$Cluster <- as.factor(kmeans_result$cluster)

# Add cluster assignments to the original labels for summary
original_data <- data.frame(Features = I(images), Class = labels)
original_data$Cluster <- as.factor(kmeans_result$cluster)

# Create a summary of the most common category in each cluster
cluster_summary <- original_data %>%
  group_by(Cluster) %>%
  summarize(Most_Common_Category = names(which.max(table(Class))))  # Adjust if Class column is named differently
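# The full class-by-cluster cross-tabulation is often more informative than the
# modal category alone:
print(table(Class = original_data$Class, Cluster = original_data$Cluster))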

# Visualize clustering results with a legend
# Create the cluster plot
p <- fviz_cluster(kmeans_result, data = X_clustered[, -ncol(X_clustered)],
                  geom = "point", ellipse.type = "convex", 
                  ggtheme = theme_minimal())

# Add the legend title
p <- p + labs(color = "Cluster")  # This adds a legend for cluster colors

# Print the clustering plot
print(p)

# Plotting the first two PCA dimensions with cluster assignments
plot(X_clustered[, 1:2], col = X_clustered$Cluster, pch = 19, 
     xlab = "PCA 1", ylab = "PCA 2", main = "K-means Clustering")

# Add cluster centers
points(kmeans_result$centers, col = 1:k, pch = 8, cex = 2)

# Add labels for the most common category in each cluster
for (i in 1:k) {
  text(kmeans_result$centers[i, 1], kmeans_result$centers[i, 2],
       labels = cluster_summary$Most_Common_Category[i], 
       pos = 3, col = "black", cex = 0.8)
}

# Create a custom legend
legend("topright", legend = paste("Cluster", 1:k), col = 1:k, pch = 19, title ="Clusters")

# Step 3: Neural network
# Load necessary libraries
library(imager)
library(dplyr)
library(nnet)
library(FactoMineR)
library(factoextra)

# Set the directory and classes
image_dir <- "D:\\Data" 
classes <- list.files(image_dir)

# Define image dimensions (Reduced size)
img_width <- 32
img_height <- 32

# Helper function to load and preprocess images
process_image <- function(file_path, img_width, img_height) {
  img <- load.image(file_path) %>% resize(img_width, img_height)
  as.vector(img)  # Flatten the image to a vector
}
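# Note: process_image() keeps whatever colour channels the file has, so mixing RGB
# and greyscale files would give vectors of different lengths and break rbind()
# below. If that is a risk, a greyscale variant could be used instead (sketch;
# the helper name is illustrative):
process_image_gray <- function(file_path, img_width, img_height) {
  img <- load.image(file_path)
  if (spectrum(img) > 1) img <- grayscale(img)  # collapse colour channels
  as.vector(resize(img, img_width, img_height))
}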

# Create an empty list to store data
image_data <- list()
image_labels <- c()

# Read images and create a dataframe
for (class_name in classes) {
  class_dir <- file.path(image_dir, class_name)
  image_files <- list.files(class_dir, full.names = TRUE, pattern = "\\.jpg$|\\.png$")
  
  for (image_file in image_files) {
    image_vector <- process_image(image_file, img_width, img_height)
    image_data <- append(image_data, list(image_vector))
    image_labels <- append(image_labels, class_name)  # Assign class label
  }
}

# Create the data frame with image vectors and labels
image_matrix <- do.call(rbind, image_data)
image_df <- as.data.frame(image_matrix)
image_df$label <- factor(image_labels)
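# Optional sanity check (not in the original run): dataset size and class balance
cat("Loaded", nrow(image_matrix), "images with", ncol(image_matrix), "features each\n")
print(table(image_df$label))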

# Create a reproducible index for an 80-20 train/validation split
# (the train and validation frames are built from the PCA-transformed data below)
set.seed(123)  # For reproducibility
sample_index <- sample(seq_len(nrow(image_df)), size = 0.8 * nrow(image_df))

# Apply PCA to reduce dimensionality of the image data (excluding labels)
# Note: PCA is fitted on all rows here, so validation images also influence the components
pca_result <- PCA(image_df[, -ncol(image_df)], graph = FALSE)

# Determine the maximum number of components available
max_components <- ncol(pca_result$ind$coord)

# Choose a higher number of components (e.g., 150), ensuring it does not exceed the maximum
num_components <- min(150, max_components)  # Use 150 or the maximum available components
cat("Using", num_components, "PCA components...\n")
## Using 5 PCA components...
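# The message above reports 5 components because FactoMineR::PCA() keeps only
# ncp = 5 dimensions by default; to genuinely retain up to 150 components the
# call would need ncp set explicitly, e.g. (not run here, so the results above
# stay as reported):
# pca_result <- PCA(image_df[, -ncol(image_df)], ncp = 150, graph = FALSE)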
# Extract the top components from PCA results
pca_features <- pca_result$ind$coord[, 1:num_components]

# Normalize the PCA features
pca_features <- scale(pca_features)

# Create a new dataframe with PCA features and labels
pca_df <- as.data.frame(pca_features)
pca_df$label <- image_df$label

# Split the PCA-transformed data into train and validation sets (80-20 split)
train_df <- pca_df[sample_index, ]
validation_df <- pca_df[-sample_index, ]

# Define number of neurons in the hidden layer for experiments
hidden_layer_sizes <- c(5, 10, 20, 50)  # Increased layer size for experimentation

# Train and evaluate models with different hidden layer sizes using nnet
for (units in hidden_layer_sizes) {
  cat("Training model with", units, "hidden units using PCA features...\n")
  
  # Train the neural network model
  nn_model <- nnet(
    label ~ .,
    data = train_df,
    size = units,
    maxit = 300,     # Increase the number of iterations
    decay = 0.001,   # Adjust regularization parameter
    trace = FALSE    # Suppress iteration output
  )
  
  # Predict on validation data
  validation_pred <- predict(nn_model, validation_df, type = "class")
  
  # Calculate accuracy
  accuracy <- mean(validation_pred == validation_df$label)
  cat("Model with", units, "hidden units - Accuracy with PCA features:", accuracy,"\n\n")
}
## Training model with 5 hidden units using PCA features...
## Model with 5 hidden units - Accuracy with PCA features: 0.4509804 
## 
## Training model with 10 hidden units using PCA features...
## Model with 10 hidden units - Accuracy with PCA features: 0.3137255 
## 
## Training model with 20 hidden units using PCA features...
## Model with 20 hidden units - Accuracy with PCA features: 0.4313725 
## 
## Training model with 50 hidden units using PCA features...
## Model with 50 hidden units - Accuracy with PCA features: 0.3137255
# Load required libraries
library(ggplot2)
library(caret)  # For confusionMatrix function

# Define accuracy values
logistic_accuracy <- 0.324  

# Neural network accuracies reported by the loop above
nn_accuracies <- c(
  `5 hidden units` = 0.451, 
  `10 hidden units` = 0.314, 
  `20 hidden units` = 0.431, 
  `50 hidden units` = 0.314
)

# Create a comparison data frame
comparison_df <- data.frame(
  Model = c("Logistic Regression", names(nn_accuracies)),
  Accuracy = c(logistic_accuracy, nn_accuracies)
)

# Plot the accuracies using ggplot2
ggplot(comparison_df, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge(), width = 0.7) +
  labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  geom_text(aes(label = round(Accuracy, 2)), position = position_dodge(0.7), vjust = -0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Predict on the validation data using the last model fitted in the loop (50 hidden units)
validation_pred <- predict(nn_model, validation_df, type = "class")

# Calculate the confusion matrix
confusion_matrix <- table(Predicted = validation_pred, Actual = validation_df$label)
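# Overall accuracy straight from the table (should match the caret summary below)
cat("Validation accuracy:", round(sum(diag(confusion_matrix)) / sum(confusion_matrix), 4), "\n")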

# Print the confusion matrix
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
##             Actual
## Predicted    Birds cats dogs Fruits Vegetables
##   Birds          5    2    1      2          1
##   cats           0    2    4      1          3
##   dogs           1    2    2      1          2
##   Fruits         0    0    1      6          2
##   Vegetables     5    1    4      2          1
# Optionally, create a classification report
library(caret)
report <- confusionMatrix(confusion_matrix)
print(report)
## Confusion Matrix and Statistics
## 
##             Actual
## Predicted    Birds cats dogs Fruits Vegetables
##   Birds          5    2    1      2          1
##   cats           0    2    4      1          3
##   dogs           1    2    2      1          2
##   Fruits         0    0    1      6          2
##   Vegetables     5    1    4      2          1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3137          
##                  95% CI : (0.1911, 0.4589)
##     No Information Rate : 0.2353          
##     P-Value [Acc > NIR] : 0.1254          
##                                           
##                   Kappa : 0.1455          
##                                           
##  Mcnemar's Test P-Value : 0.4405          
## 
## Statistics by Class:
## 
##                      Class: Birds Class: cats Class: dogs Class: Fruits
## Sensitivity               0.45455     0.28571     0.16667        0.5000
## Specificity               0.85000     0.81818     0.84615        0.9231
## Pos Pred Value            0.45455     0.20000     0.25000        0.6667
## Neg Pred Value            0.85000     0.87805     0.76744        0.8571
## Prevalence                0.21569     0.13725     0.23529        0.2353
## Detection Rate            0.09804     0.03922     0.03922        0.1176
## Detection Prevalence      0.21569     0.19608     0.15686        0.1765
## Balanced Accuracy         0.65227     0.55195     0.50641        0.7115
##                      Class: Vegetables
## Sensitivity                    0.11111
## Specificity                    0.71429
## Pos Pred Value                 0.07692
## Neg Pred Value                 0.78947
## Prevalence                     0.17647
## Detection Rate                 0.01961
## Detection Prevalence           0.25490
## Balanced Accuracy              0.41270
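# The per-class statistics printed above can also be pulled out of the caret
# object programmatically, which is convenient for building tables or plots:
round(report$byClass[, c("Sensitivity", "Specificity", "Balanced Accuracy")], 3)
report$overall["Accuracy"]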