# Load necessary libraries
library(imager)
## Loading required package: magrittr
##
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
##
## add
## The following objects are masked from 'package:stats':
##
## convolve, spectrum
## The following object is masked from 'package:graphics':
##
## frame
## The following object is masked from 'package:base':
##
## save.image
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:imager':
##
## where
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:imager':
##
## grow
library(e1071) #For additional classification metrics
library(xgboost) #For alternative model
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(factoextra) #For clustering visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(keras)
library(tensorflow)
##
## Attaching package: 'tensorflow'
## The following object is masked from 'package:caret':
##
## train
# Define the path to the image folders
base_dir <- "D:\\Data" # Adjust path as necessary
classes <- list.dirs(base_dir, full.names = TRUE, recursive = FALSE)
# Prepare lists for images and labels
images <- list()
labels <- c()
# Load images, convert to grayscale, and store them
for (class in classes) {
class_label <- basename(class)
img_files <- list.files(class, full.names = TRUE)
for (img_path in img_files) {
# Read and convert to grayscale
img <- load.image(img_path)
img_gray <- grayscale(img)
img_resized <- resize(img_gray, 64, 64) # Resize images to a uniform size
#Data Augmentation (flip, rotate, and additional transformations)
img_flipped <- imrotate(img_resized, 180) # Flip image
img_rotated <- imrotate(img_resized, 90) # Rotate image
img_brightened <- img_resized * 1.2 # Increase brightness
img_darker <- img_resized * 0.8 # Decrease brightness
# Flatten the images and append to list
images[[length(images) + 1]] <- as.vector(img_resized)
labels <- c(labels, class_label) # Append the class label
# Append augmented images
images[[length(images) + 1]] <- as.vector(img_flipped)
labels <- c(labels, class_label) # Append the class label
images[[length(images) + 1]] <- as.vector(img_rotated)
labels <- c(labels, class_label) # Append the class label
# Append additional augmented images
images[[length(images) + 1]] <- as.vector(img_brightened)
labels <- c(labels, class_label)
images[[length(images) + 1]] <- as.vector(img_darker)
labels <- c(labels, class_label)
}
}
# Convert lists to data frame
X <- do.call(rbind, images)
y <- factor(labels)
# Normalize the feature set
X <- scale(X)
# Dimensionality reduction using PCA
pca <- prcomp(X, center = TRUE, scale. = TRUE)
# Create X_pca from PCA result
X_pca <- pca$x # Extract the principal components
# K-Means Clustering
set.seed(42) # For reproducibility
k <- 5 # Choose the number of clusters
kmeans_result <- kmeans(X_pca, centers = k, nstart = 25)
# Add cluster assignments to the original data
X_clustered <- as.data.frame(X_pca)
X_clustered$Cluster <- as.factor(kmeans_result$cluster)
# Print the clustering plot
p <- fviz_cluster(kmeans_result, data = X_clustered[, -ncol(X_clustered)],
geom = "point", ellipse.type = "convex",
ggtheme = theme_minimal())
print(p)

# Dimensionality reduction using PCA
pca <- prcomp(X, center = TRUE, scale. = TRUE)
# Determine the number of components to retain based on explained variance
explained_variance <- summary(pca)$importance[2, ]
plot(explained_variance, type = 'b', main = 'PCA Explained Variance',
xlab = 'Principal Components', ylab = 'Proportion of Variance Explained')

# Step 2: Clustering
# Using PCA reduced data or scaled data for clustering
set.seed(42) # For reproducibility
k <- 5 # Choose the number of clusters
kmeans_result <- kmeans(X_pca, centers = k, nstart = 25)
# Add cluster assignments to the original data
X_clustered <- as.data.frame(X_pca)
X_clustered$Cluster <- as.factor(kmeans_result$cluster)
# Add cluster assignments to the original labels for summary
original_data <- data.frame(Features = I(images), Class = labels)
original_data$Cluster <- as.factor(kmeans_result$cluster)
# Create a summary of the most common category in each cluster
cluster_summary <- original_data %>%
group_by(Cluster) %>%
summarize(Most_Common_Category = names(which.max(table(Class)))) # Adjust if Class column is named differently
# Visualize clustering results with a legend
# Create the cluster plot
p <- fviz_cluster(kmeans_result, data = X_clustered[, -ncol(X_clustered)],
geom = "point", ellipse.type = "convex",
ggtheme = theme_minimal())
# Add the legend title
p <- p + labs(color = "Cluster") # This adds a legend for cluster colors
# Print the clustering plot
print(p)

# Plotting the first two PCA dimensions with cluster assignments
plot(X_clustered[, 1:2], col = X_clustered$Cluster, pch = 19,
xlab = "PCA 1", ylab = "PCA 2", main = "K-means Clustering")
# Add cluster centers
points(kmeans_result$centers, col = 1:k, pch = 8, cex = 2)
# Add labels for the most common category in each cluster
for (i in 1:k) {
text(kmeans_result$centers[i, 1], kmeans_result$centers[i, 2],
labels = cluster_summary$Most_Common_Category[i],
pos = 3, col = "black", cex = 0.8)
}
# Create a custom legend
legend("topright", legend = paste("Cluster", 1:k), col = 1:k, pch = 19, title ="Clusters")

# Step 3: Neural network
# Load necessary libraries
library(imager)
library(dplyr)
library(nnet)
library(FactoMineR)
library(factoextra)
# Set the directory and classes
image_dir <- "D:\\Data"
classes <- list.files(image_dir)
# Define image dimensions (Reduced size)
img_width <- 32
img_height <- 32
# Helper function to load and preprocess images
process_image <- function(file_path, img_width, img_height) {
img <- load.image(file_path) %>% resize(img_width, img_height)
as.vector(img) # Flatten the image to a vector
}
# Create an empty list to store data
image_data <- list()
image_labels <- c()
# Read images and create a dataframe
for (class_name in classes) {
class_dir <- file.path(image_dir, class_name)
image_files <- list.files(class_dir, full.names = TRUE, pattern = "\\.jpg$|\\.png$")
for (image_file in image_files) {
image_vector <- process_image(image_file, img_width, img_height)
image_data <- append(image_data, list(image_vector))
image_labels <- append(image_labels, class_name) # Assign class label
}
}
# Create the data frame with image vectors and labels
image_matrix <- do.call(rbind, image_data)
image_df <- as.data.frame(image_matrix)
image_df$label <- factor(image_labels)
# Split the data into train and validation sets (80-20 split)
set.seed(123) # For reproducibility
sample_index <- sample(seq_len(nrow(image_df)), size = 0.8 * nrow(image_df))
train_df <- image_df[sample_index, ]
validation_df <- image_df[-sample_index, ]
# Apply PCA to reduce dimensionality of the image data (excluding labels)
pca_result <- PCA(image_df[, -ncol(image_df)], graph = FALSE)
# Determine the maximum number of components available
max_components <- ncol(pca_result$ind$coord)
# Choose a higher number of components (e.g., 150), ensuring it does not exceed the maximum
num_components <- min(150, max_components) # Use 150 or the maximum available components
cat("Using", num_components, "PCA components...\n")
## Using 5 PCA components...
# Extract the top components from PCA results
pca_features <- pca_result$ind$coord[, 1:num_components]
# Normalize the PCA features
pca_features <- scale(pca_features)
# Create a new dataframe with PCA features and labels
pca_df <- as.data.frame(pca_features)
pca_df$label <- image_df$label
# Split the PCA-transformed data into train and validation sets (80-20 split)
train_df <- pca_df[sample_index, ]
validation_df <- pca_df[-sample_index, ]
# Define number of neurons in the hidden layer for experiments
hidden_layer_sizes <- c(5, 10, 20, 50) # Increased layer size for experimentation
# Train and evaluate models with different hidden layer sizes using nnet
for (units in hidden_layer_sizes) {
cat("Training model with", units, "hidden units using PCA features...\n")
# Train the neural network model
nn_model <- nnet(
label ~ .,
data = train_df,
size = units,
maxit = 300, # Increase the number of iterations
decay = 0.001, # Adjust regularization parameter
trace = FALSE # Suppress iteration output
)
# Predict on validation data
validation_pred <- predict(nn_model, validation_df, type = "class")
# Calculate accuracy
accuracy <- mean(validation_pred == validation_df$label)
cat("Model with", units, "hidden units - Accuracy with PCA features:", accuracy,"\n\n")
}
## Training model with 5 hidden units using PCA features...
## Model with 5 hidden units - Accuracy with PCA features: 0.4509804
##
## Training model with 10 hidden units using PCA features...
## Model with 10 hidden units - Accuracy with PCA features: 0.3137255
##
## Training model with 20 hidden units using PCA features...
## Model with 20 hidden units - Accuracy with PCA features: 0.4313725
##
## Training model with 50 hidden units using PCA features...
## Model with 50 hidden units - Accuracy with PCA features: 0.3137255
# Load required libraries
library(ggplot2)
library(caret) # For confusionMatrix function
# Define accuracy values
logistic_accuracy <- 0.324
# Neural network accuracies from previous evaluations
nn_accuracies <- c(
`5 hidden units` = 0.24,
`10 hidden units` = 0.3,
`20 hidden units` = 0.4,
`50 hidden units` = 0.36
)
# Create a comparison data frame
comparison_df <- data.frame(
Model = c("Logistic Regression", names(nn_accuracies)),
Accuracy = c(logistic_accuracy, nn_accuracies)
)
# Plot the accuracies using ggplot2
ggplot(comparison_df, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity", position = position_dodge(), width = 0.7) +
labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
scale_fill_brewer(palette = "Set1") +
theme_minimal() +
geom_text(aes(label = round(Accuracy, 2)), position = position_dodge(0.7), vjust = -0.5) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Predict on validation data
validation_pred <- predict(nn_model, validation_df, type = "class")
# Calculate the confusion matrix
confusion_matrix <- table(Predicted = validation_pred, Actual = validation_df$label)
# Print the confusion matrix
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
## Actual
## Predicted Birds cats dogs Fruits Vegetables
## Birds 5 2 1 2 1
## cats 0 2 4 1 3
## dogs 1 2 2 1 2
## Fruits 0 0 1 6 2
## Vegetables 5 1 4 2 1
# Optionally, create a classification report
library(caret)
report <- confusionMatrix(confusion_matrix)
print(report)
## Confusion Matrix and Statistics
##
## Actual
## Predicted Birds cats dogs Fruits Vegetables
## Birds 5 2 1 2 1
## cats 0 2 4 1 3
## dogs 1 2 2 1 2
## Fruits 0 0 1 6 2
## Vegetables 5 1 4 2 1
##
## Overall Statistics
##
## Accuracy : 0.3137
## 95% CI : (0.1911, 0.4589)
## No Information Rate : 0.2353
## P-Value [Acc > NIR] : 0.1254
##
## Kappa : 0.1455
##
## Mcnemar's Test P-Value : 0.4405
##
## Statistics by Class:
##
## Class: Birds Class: cats Class: dogs Class: Fruits
## Sensitivity 0.45455 0.28571 0.16667 0.5000
## Specificity 0.85000 0.81818 0.84615 0.9231
## Pos Pred Value 0.45455 0.20000 0.25000 0.6667
## Neg Pred Value 0.85000 0.87805 0.76744 0.8571
## Prevalence 0.21569 0.13725 0.23529 0.2353
## Detection Rate 0.09804 0.03922 0.03922 0.1176
## Detection Prevalence 0.21569 0.19608 0.15686 0.1765
## Balanced Accuracy 0.65227 0.55195 0.50641 0.7115
## Class: Vegetables
## Sensitivity 0.11111
## Specificity 0.71429
## Pos Pred Value 0.07692
## Neg Pred Value 0.78947
## Prevalence 0.17647
## Detection Rate 0.01961
## Detection Prevalence 0.25490
## Balanced Accuracy 0.41270
# Print the confusion matrix
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
# Ensure you have calculated `confusion_matrix` before printing it
print(confusion_matrix)
## Actual
## Predicted Birds cats dogs Fruits Vegetables
## Birds 5 2 1 2 1
## cats 0 2 4 1 3
## dogs 1 2 2 1 2
## Fruits 0 0 1 6 2
## Vegetables 5 1 4 2 1
# Optionally, create a classification report
report <- confusionMatrix(confusion_matrix)
print(report)
## Confusion Matrix and Statistics
##
## Actual
## Predicted Birds cats dogs Fruits Vegetables
## Birds 5 2 1 2 1
## cats 0 2 4 1 3
## dogs 1 2 2 1 2
## Fruits 0 0 1 6 2
## Vegetables 5 1 4 2 1
##
## Overall Statistics
##
## Accuracy : 0.3137
## 95% CI : (0.1911, 0.4589)
## No Information Rate : 0.2353
## P-Value [Acc > NIR] : 0.1254
##
## Kappa : 0.1455
##
## Mcnemar's Test P-Value : 0.4405
##
## Statistics by Class:
##
## Class: Birds Class: cats Class: dogs Class: Fruits
## Sensitivity 0.45455 0.28571 0.16667 0.5000
## Specificity 0.85000 0.81818 0.84615 0.9231
## Pos Pred Value 0.45455 0.20000 0.25000 0.6667
## Neg Pred Value 0.85000 0.87805 0.76744 0.8571
## Prevalence 0.21569 0.13725 0.23529 0.2353
## Detection Rate 0.09804 0.03922 0.03922 0.1176
## Detection Prevalence 0.21569 0.19608 0.15686 0.1765
## Balanced Accuracy 0.65227 0.55195 0.50641 0.7115
## Class: Vegetables
## Sensitivity 0.11111
## Specificity 0.71429
## Pos Pred Value 0.07692
## Neg Pred Value 0.78947
## Prevalence 0.17647
## Detection Rate 0.01961
## Detection Prevalence 0.25490
## Balanced Accuracy 0.41270