options(warn = -1)
library(imager)
## Loading required package: magrittr
##
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
##
## add
## The following objects are masked from 'package:stats':
##
## convolve, spectrum
## The following object is masked from 'package:graphics':
##
## frame
## The following object is masked from 'package:base':
##
## save.image
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:imager':
##
## where
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(nnet) # For multinomial logistic regression
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(keras)
library(tensorflow)
##
## Attaching package: 'tensorflow'
## The following object is masked from 'package:caret':
##
## train
library(pROC) # For ROC analysis
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:imager':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
options(warn = -1)
base_dir <- "C:/Users/MEGHANA/Downloads/Linnaeus 5 256X256/Linnaeus 5 256X256/test" # Adjust path as necessary
classes <- list.dirs(base_dir, full.names = TRUE, recursive = FALSE)
images <- list()
labels <- c()
for (class in classes) {
class_label <- basename(class)
img_files <- list.files(class, full.names = TRUE)
for (img_path in img_files) {
# Read and convert to grayscale
img <- load.image(img_path)
img_gray <- grayscale(img)
img_resized <- resize(img_gray, 64, 64) # Resize images to a uniform size
# Data Augmentation
img_flipped <- imrotate(img_resized, 180) # Flip image
img_rotated <- imrotate(img_resized, 90) # Rotate image
img_brightened <- img_resized * 1.2 # Increase brightness
img_darker <- img_resized * 0.8 # Decrease brightness
# Flatten the images and append to list
images[[length(images) + 1]] <- as.vector(img_resized)
labels <- c(labels, class_label)
# Append augmented images
images[[length(images) + 1]] <- as.vector(img_flipped)
labels <- c(labels, class_label)
images[[length(images) + 1]] <- as.vector(img_rotated)
labels <- c(labels, class_label)
images[[length(images) + 1]] <- as.vector(img_brightened)
labels <- c(labels, class_label)
images[[length(images) + 1]] <- as.vector(img_darker)
labels <- c(labels, class_label)
}
}
X <- do.call(rbind, images)
y <- factor(labels)
X <- scale(X)
pca <- prcomp(X, center = TRUE, scale. = TRUE)
explained_variance <- summary(pca)$importance[2, ]
num_components <- which(cumsum(explained_variance) >= 0.95)[1] # Retain components for 95% variance
num_components <- min(num_components, 50) # Limit number of components if too high
X_pca <- pca$x[, 1:num_components]
set.seed(42) # For reproducibility
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X_pca[train_index, ]
y_train <- y[train_index]
X_test <- X_pca[-train_index, ]
y_test <- y[-train_index]
logistic_model <- multinom(y_train ~ ., data = as.data.frame(X_train))
## # weights: 260 (204 variable)
## initial value 1609.437912
## iter 10 value 1389.501652
## iter 20 value 1366.864314
## iter 30 value 1364.177524
## iter 40 value 1361.036433
## iter 50 value 1336.128165
## iter 60 value 1309.438861
## iter 70 value 1299.856231
## iter 80 value 1295.616284
## iter 90 value 1295.177898
## iter 100 value 1292.691010
## final value 1292.691010
## stopped after 100 iterations
pred_probs <- predict(logistic_model, newdata = as.data.frame(X_test), type = "class")
confusion_matrix <- table(Predicted = pred_probs, Actual = y_test)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 32.4 %"
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
## Actual
## Predicted berry bird dog flower other
## berry 20 13 9 7 13
## bird 4 9 3 5 9
## dog 10 12 17 10 8
## flower 5 6 11 24 9
## other 11 10 10 4 11
report <- confusionMatrix(confusion_matrix)
print(report)
## Confusion Matrix and Statistics
##
## Actual
## Predicted berry bird dog flower other
## berry 20 13 9 7 13
## bird 4 9 3 5 9
## dog 10 12 17 10 8
## flower 5 6 11 24 9
## other 11 10 10 4 11
##
## Overall Statistics
##
## Accuracy : 0.324
## 95% CI : (0.2664, 0.3858)
## No Information Rate : 0.2
## P-Value [Acc > NIR] : 2.68e-06
##
## Kappa : 0.155
##
## Mcnemar's Test P-Value : 0.2207
##
## Statistics by Class:
##
## Class: berry Class: bird Class: dog Class: flower
## Sensitivity 0.4000 0.1800 0.3400 0.4800
## Specificity 0.7900 0.8950 0.8000 0.8450
## Pos Pred Value 0.3226 0.3000 0.2982 0.4364
## Neg Pred Value 0.8404 0.8136 0.8290 0.8667
## Prevalence 0.2000 0.2000 0.2000 0.2000
## Detection Rate 0.0800 0.0360 0.0680 0.0960
## Detection Prevalence 0.2480 0.1200 0.2280 0.2200
## Balanced Accuracy 0.5950 0.5375 0.5700 0.6625
## Class: other
## Sensitivity 0.2200
## Specificity 0.8250
## Pos Pred Value 0.2391
## Neg Pred Value 0.8088
## Prevalence 0.2000
## Detection Rate 0.0440
## Detection Prevalence 0.1840
## Balanced Accuracy 0.5225
set.seed(42) # For reproducibility
k <- 5 # Choose the number of clusters
kmeans_result <- kmeans(X_pca, centers = k, nstart = 25)
X_clustered <- as.data.frame(X_pca)
X_clustered$Cluster <- as.factor(kmeans_result$cluster)
original_data <- data.frame(Features = I(images), Class = labels)
original_data$Cluster <- as.factor(kmeans_result$cluster)
cluster_summary <- original_data %>%
group_by(Cluster) %>%
summarize(Most_Common_Category = names(which.max(table(Class)))) # Adjust if Class column is named differently
p <- fviz_cluster(kmeans_result, data = X_clustered[, -ncol(X_clustered)],
geom = "point", ellipse.type = "convex",
ggtheme = theme_minimal())
# Add the legend title
p <- p + labs(color = "Cluster") # This adds a legend for cluster colors
# Print the clustering plot
print(p)
# Plotting the first two PCA dimensions with cluster assignments
plot(X_clustered[, 1:2], col = X_clustered$Cluster, pch = 19,
xlab = "PCA 1", ylab = "PCA 2", main = "K-means Clustering")
# Add cluster centers
points(kmeans_result$centers, col = 1:k, pch = 8, cex = 2)
# Add labels for the most common category in each cluster
for (i in 1:k) {
text(kmeans_result$centers[i, 1], kmeans_result$centers[i, 2],
labels = cluster_summary$Most_Common_Category[i],
pos = 3, col = "black", cex = 0.8)
}
# Create a custom legend
legend("topright", legend = paste("Cluster", 1:k), col = 1:k, pch = 19, title = "Clusters")
# Step 3: Neural network # Load necessary libraries
library(imager)
library(dplyr)
library(nnet)
library(FactoMineR)
library(factoextra)
image_dir <- "C:/Users/MEGHANA/Downloads/Linnaeus 5 256X256/Linnaeus 5 256X256/test"
classes <- list.files(image_dir)
img_width <- 32
img_height <- 32
process_image <- function(file_path, img_width, img_height) {
img <- load.image(file_path) %>% resize(img_width, img_height)
as.vector(img) # Flatten the image to a vector
}
image_data <- list()
image_labels <- c()
for (class_name in classes) {
class_dir <- file.path(image_dir, class_name)
image_files <- list.files(class_dir, full.names = TRUE, pattern = "\\.jpg$|\\.png$")
for (image_file in image_files) {
image_vector <- process_image(image_file, img_width, img_height)
image_data <- append(image_data, list(image_vector))
image_labels <- append(image_labels, class_name) # Assign class label
}
}
image_matrix <- do.call(rbind, image_data)
image_df <- as.data.frame(image_matrix)
image_df$label <- factor(image_labels)
set.seed(123) # For reproducibility
sample_index <- sample(seq_len(nrow(image_df)), size = 0.8 * nrow(image_df))
train_df <- image_df[sample_index, ]
validation_df <- image_df[-sample_index, ]
pca_result <- PCA(image_df[, -ncol(image_df)], graph = FALSE)
# Determine the maximum number of components available
max_components <- ncol(pca_result$ind$coord)
# Choose a higher number of components (e.g., 150), ensuring it does not exceed the maximum
num_components <- min(150, max_components) # Use 150 or the maximum available components
cat("Using", num_components, "PCA components...\n")
## Using 5 PCA components...
# Extract the top components from PCA results
pca_features <- pca_result$ind$coord[, 1:num_components]
# Normalize the PCA features
pca_features <- scale(pca_features)
# Create a new dataframe with PCA features and labels
pca_df <- as.data.frame(pca_features)
pca_df$label <- image_df$label
# Split the PCA-transformed data into train and validation sets (80-20 split)
train_df <- pca_df[sample_index, ]
validation_df <- pca_df[-sample_index, ]
# Define number of neurons in the hidden layer for experiments
hidden_layer_sizes <- c(5, 10, 20, 50) # Increased layer size for experimentation
# Train and evaluate models with different hidden layer sizes using nnet
for (units in hidden_layer_sizes) {
cat("Training model with", units, "hidden units using PCA features...\n")
# Train the neural network model
nn_model <- nnet(
label ~ .,
data = train_df,
size = units,
maxit = 300, # Increase the number of iterations
decay = 0.001, # Adjust regularization parameter
trace = FALSE # Suppress iteration output
)
# Predict on validation data
validation_pred <- predict(nn_model, validation_df, type = "class")
# Calculate accuracy
accuracy <- mean(validation_pred == validation_df$label)
cat("Model with", units, "hidden units - Accuracy with PCA features:", accuracy, "\n\n")
}
## Training model with 5 hidden units using PCA features...
## Model with 5 hidden units - Accuracy with PCA features: 0.24
##
## Training model with 10 hidden units using PCA features...
## Model with 10 hidden units - Accuracy with PCA features: 0.3
##
## Training model with 20 hidden units using PCA features...
## Model with 20 hidden units - Accuracy with PCA features: 0.4
##
## Training model with 50 hidden units using PCA features...
## Model with 50 hidden units - Accuracy with PCA features: 0.36
library(ggplot2)
library(caret) # For confusionMatrix function
logistic_accuracy <- 0.324
nn_accuracies <- c(0.24, 0.3, 0.4, 0.36)
names(nn_accuracies) <- c("5 hidden units", "10 hidden units", "20 hidden units", "50 hidden units")
comparison_df <- data.frame(
Model = c("Logistic Regression", names(nn_accuracies)),
Accuracy = c(logistic_accuracy, nn_accuracies)
)
ggplot(comparison_df, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity", position = position_dodge(), width = 0.7) +
labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
scale_fill_brewer(palette = "Set1") +
theme_minimal() +
geom_text(aes(label = round(Accuracy, 2)), position = position_dodge(0.7), vjust = -0.5) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the confusion matrix
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
## Actual
## Predicted berry bird dog flower other
## berry 20 13 9 7 13
## bird 4 9 3 5 9
## dog 10 12 17 10 8
## flower 5 6 11 24 9
## other 11 10 10 4 11
report <- confusionMatrix(confusion_matrix)
print(report)
## Confusion Matrix and Statistics
##
## Actual
## Predicted berry bird dog flower other
## berry 20 13 9 7 13
## bird 4 9 3 5 9
## dog 10 12 17 10 8
## flower 5 6 11 24 9
## other 11 10 10 4 11
##
## Overall Statistics
##
## Accuracy : 0.324
## 95% CI : (0.2664, 0.3858)
## No Information Rate : 0.2
## P-Value [Acc > NIR] : 2.68e-06
##
## Kappa : 0.155
##
## Mcnemar's Test P-Value : 0.2207
##
## Statistics by Class:
##
## Class: berry Class: bird Class: dog Class: flower
## Sensitivity 0.4000 0.1800 0.3400 0.4800
## Specificity 0.7900 0.8950 0.8000 0.8450
## Pos Pred Value 0.3226 0.3000 0.2982 0.4364
## Neg Pred Value 0.8404 0.8136 0.8290 0.8667
## Prevalence 0.2000 0.2000 0.2000 0.2000
## Detection Rate 0.0800 0.0360 0.0680 0.0960
## Detection Prevalence 0.2480 0.1200 0.2280 0.2200
## Balanced Accuracy 0.5950 0.5375 0.5700 0.6625
## Class: other
## Sensitivity 0.2200
## Specificity 0.8250
## Pos Pred Value 0.2391
## Neg Pred Value 0.8088
## Prevalence 0.2000
## Detection Rate 0.0440
## Detection Prevalence 0.1840
## Balanced Accuracy 0.5225