#Introduction
library(imager)
## Loading required package: magrittr
##
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
##
## add
## The following objects are masked from 'package:stats':
##
## convolve, spectrum
## The following object is masked from 'package:graphics':
##
## frame
## The following object is masked from 'package:base':
##
## save.image
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:imager':
##
## where
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caTools)
library(caret)
## Loading required package: lattice
library(nnet)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(keras)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(clustertend)
## Package `clustertend` is deprecated. Use package `hopkins` instead.
# Root folder of the dataset; each immediate sub-folder holds one class.
image_path <- "C:/Users/Ajaykumar/Downloads/ml images"
# Fail fast: list.dirs() returns character(0) for a missing folder without
# complaining, which would otherwise only surface later as an obscure error.
if (!dir.exists(image_path)) {
  stop("Image folder not found: ", image_path, call. = FALSE)
}
# Get a list of all categories (folders)
categories <- list.dirs(image_path, recursive = FALSE)
if (length(categories) == 0L) {
  stop("No category sub-folders found in: ", image_path, call. = FALSE)
}
labels <- basename(categories) # Use folder names as labels
labels
## [1] "Animal" "fruit" "plant"
# Load one image, force it to three colour channels, resize it and flatten it.
#
# img_path: path to an image file that load.image() can read.
# img_size: target width and height in pixels (default 32).
# Returns a plain numeric vector (no dim attribute). For a single-frame image
# its length is img_size * img_size * 3.
resize_image <- function(img_path, img_size = 32) {
  img <- load.image(img_path) # Load the image
  # The caller row-binds these vectors into one matrix, so every image must
  # flatten to the same length whatever its original channel count was.
  n_channels <- imager::spectrum(img)
  if (n_channels == 4L) {
    img <- imager::rm.alpha(img) # RGBA -> RGB
  } else if (n_channels == 2L) {
    img <- imager::channel(img, 1) # grey + alpha -> grey
    n_channels <- 1L
  }
  if (n_channels == 1L) {
    img <- imager::add.colour(img) # grey -> three identical channels
  }
  img <- resize(img, img_size, img_size) # Resize to specified dimensions
  as.numeric(img) # Convert to a numeric vector
}
# Load every image found in the given category folders into one data frame.
#
# image_path: dataset root folder. Not used by the body; kept so existing
#             calls keep working.
# categories: character vector of category folder paths. The folder name
#             (basename) becomes the label of every file inside it.
# Returns a data frame with one row per file and two columns:
#   label      - the category folder name
#   img_vector - list column holding the vector returned by resize_image()
# With no folders or no files the result has zero rows but the same columns.
load_images <- function(image_path, categories) {
  # List the files of each category once, then build all rows in one go.
  # Growing a data frame with rbind() inside a loop copies it on every
  # iteration, which is quadratic in the number of images.
  files_by_category <- lapply(categories, list.files, full.names = TRUE)
  image_files <- unlist(files_by_category, use.names = FALSE)
  image_labels <- rep(basename(categories), lengths(files_by_category))
  img_vectors <- lapply(image_files, resize_image)
  # I() keeps the list as a single list column instead of spreading it out.
  data.frame(label = image_labels, img_vector = I(img_vectors))
}
# Build the labelled image table and report its size
image_data <- load_images(image_path, categories)
dim(image_data)
## [1] 248 2
# Bar chart: number of images available per class
ggplot(data = image_data, mapping = aes(x = label)) +
  geom_bar(fill = "brown") +
  ggtitle("Distribution of images") +
  xlab("Class Labels") +
  ylab("Frequency") +
  theme_minimal()
## LOGISTIC REGRESSION
# Stack the per-image pixel vectors into one matrix (one row per image)
img_matrix <- do.call(rbind, lapply(image_data$img_vector, as.vector))
# Create a new data frame with the label followed by one column per pixel
image_data_flat <- data.frame(label = image_data$label, img_matrix)
# Check the new dimensions
print(dim(image_data_flat))
## [1] 248 3073
# Convert labels to a factor if they are not already
image_data_flat$label <- as.factor(image_data_flat$label)
# Standardise every pixel column (mean 0, standard deviation 1)
image_data_scaled <- scale(image_data_flat[, -which(names(image_data_flat) == "label")])
# Perform PCA. The input is already standardised, so prcomp()'s own centring
# and scaling leave it effectively unchanged.
pca_result <- prcomp(image_data_scaled, center = TRUE, scale. = TRUE)
# Keep the smallest number of components that explains at least 95% of the
# variance. The "Cumulative Proportion" row is already a running total, so it
# is compared with the threshold directly; applying cumsum() to it again
# crosses 0.95 far too early (previously only 2 components were kept).
explained_variance <- summary(pca_result)$importance["Cumulative Proportion", ]
num_components <- which(explained_variance >= 0.95)[1]
# Later plots reference PC1 and PC2, so never keep fewer than two components.
num_components <- max(num_components, 2L)
# NOTE(review): more components are kept than before this fix, so the outputs
# recorded further down (dimensions, accuracies) need regenerating, and
# nnet()'s default cap of 1000 weights may need raising for the wider input.
# Create a new dataset with the PCA components; drop = FALSE keeps a matrix
# (and its PC* column names) whatever the number of components.
image_data_pca <- data.frame(pca_result$x[, seq_len(num_components), drop = FALSE])
image_data_pca$label <- image_data_flat$label
# One column per retained component plus the label column
dim(image_data_pca)
sum(is.na(image_data_pca))
## [1] 0
# Fix the RNG state so the random split below can be reproduced
set.seed(123)
# Share of rows assigned to the training set; the remainder is held out
train_ratio <- 0.7
# Draw the training row numbers at random, without replacement
train_indices <- sample(
  seq_len(nrow(image_data_pca)),
  size = train_ratio * nrow(image_data_pca)
)
# Rows that were drawn form the training set, all other rows the test set
test_data <- image_data_pca[-train_indices, ]
train_data <- image_data_pca[train_indices, ]
# Targets: the class label of each row
y_train <- train_data$label
y_test <- test_data$label
# Predictors: every column except the last one (the label), as a matrix
x_train <- as.matrix(train_data[, -ncol(train_data)])
x_test <- as.matrix(test_data[, -ncol(test_data)])
# Check the dimensions of the training and test sets
dim(x_train) # rows in the split x number of PCA components
## [1] 173 2
dim(x_test)
## [1] 75 2
# Data-frame copy of the training predictors with the label as a factor
x_train_df <- as.data.frame(x_train)
x_train_df$label <- as.factor(y_train)
# Fit a multinomial logistic regression with ridge regularisation (alpha = 0);
# cv.glmnet() chooses the penalty strength by cross-validation.
multinom_glmnet <- cv.glmnet(x_train, y_train, family = "multinomial", alpha = 0)
predictions <- predict(multinom_glmnet, newx = x_test, s = "lambda.min", type = "class")
# Evaluate performance using a confusion matrix.
# The predictions come back as class names in text form, so a class that is
# never predicted would simply be missing from the table. Giving the
# predictions the full set of levels keeps the table square, which diag()
# needs in order to pick out the correctly classified counts.
confusion_matrix <- table(
  Predicted = factor(predictions, levels = levels(y_test)),
  Actual = y_test
)
print(confusion_matrix)
## Actual
## Predicted Animal fruit plant
## Animal 28 0 9
## fruit 0 38 0
## plant 0 0 0
# Accuracy = correctly classified (diagonal) / all test rows
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Test Accuracy:", accuracy, "\n")
## Test Accuracy: 0.88
# Scatter of all images on the first two principal components, coloured by class
ggplot(data = image_data_pca, mapping = aes(x = PC1, y = PC2, color = label)) +
  geom_point(alpha = 0.6) +
  labs(title = "PCA of Image Data") +
  scale_color_discrete(name = "Category") +
  theme_minimal()
# Plot VAT from the distances between principal-component scores.
# Only the PC* columns are passed on: dist() needs numeric input, and the
# factor label column would be coerced to NA (with a warning) and change the
# scale of every distance.
fviz_dist(
  dist(image_data_pca[, grep("^PC[0-9]+$", names(image_data_pca)), drop = FALSE]),
  show_labels = FALSE
)
# Calculate the Hopkins statistic on the pixel matrix (img_matrix, one row
# per image), using n = 100 sampled points.
# NOTE(review): this uses the unscaled pixels while the plot above uses the
# PCA scores - confirm that the difference is intended.
hopkins_stat <- get_clust_tendency(img_matrix, n = 100)
# View result
hopkins_stat$hopkins_stat
## [1] 0.6054725
# Seed the RNG: with a number of centres given, kmeans() starts from randomly
# chosen rows
set.seed(123)
k <- 4
# Cluster on every column except the class label
kmeans_result <- kmeans(image_data_pca[, names(image_data_pca) != "label"], centers = k)
# Store each row's cluster number next to its PCA scores, as a factor
image_data_pca$cluster <- as.factor(kmeans_result$cluster)
# Same scatter as for the classes, now coloured by cluster
ggplot(data = image_data_pca, mapping = aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.6) +
  ggtitle("K-means Clustering of Image Data") +
  xlab("Principal Component 1") +
  ylab("Principal Component 2") +
  theme_minimal()
# Fit a single-hidden-layer neural network and return its accuracy on the
# same data it was fitted to.
#
# hidden_neurons: number of hidden units (passed to nnet() as `size`).
# train_data:     data frame with a `label` column; every other column is
#                 used as a predictor (`label ~ .`).
# Returns the share of rows in train_data whose predicted class equals the
# label, a number between 0 and 1.
fit_nn_model <- function(hidden_neurons, train_data) {
  # nnet() stops when the network needs more weights than MaxNWts (default
  # 1000). Work out an upper bound for this network so wider inputs still
  # fit; the default is kept whenever it is already large enough.
  n_design_cols <- ncol(model.matrix(label ~ ., data = train_data)) # inputs + bias
  n_classes <- nlevels(as.factor(train_data$label))
  n_weights <- n_design_cols * hidden_neurons + (hidden_neurons + 1) * n_classes
  # Fit the neural network model
  nn_model <- nnet(label ~ ., data = train_data, size = hidden_neurons,
                   maxit = 100, trace = FALSE,
                   MaxNWts = max(1000, n_weights))
  # Predict the class labels (returned as class names)
  predicted_classes <- predict(nn_model, train_data, type = "class")
  # Compare row by row instead of reading the diagonal of a confusion table:
  # the table is not square when some class is never predicted, and its
  # diagonal then no longer holds the correctly classified counts.
  mean(predicted_classes == as.character(train_data$label))
}
# Fit models with different hidden layer sizes
neurons_list <- c(5, 10, 20)
# Seed the RNG: nnet() starts from randomly chosen weights, so unseeded runs
# give different accuracies each time.
set.seed(123)
# Model input is restricted to the principal-component scores and the label.
# By this point image_data_pca also carries the k-means `cluster` column,
# which would otherwise be picked up as a predictor by `label ~ .`.
# vapply() also avoids overwriting the `accuracy` value computed for the
# logistic regression, which the previous loop variable did.
# Note: fit_nn_model() scores each model on the rows it was fitted to, so
# these are training accuracies.
accuracy_results <- data.frame(
  Hidden_Neurons = neurons_list,
  Accuracy = vapply(
    neurons_list,
    fit_nn_model,
    numeric(1),
    train_data = image_data_pca[
      , c(grep("^PC[0-9]+$", names(image_data_pca), value = TRUE), "label"),
      drop = FALSE
    ]
  )
)
# Print the results
print(accuracy_results)
# Output recorded before the cluster column was excluded and the seed was
# set; regenerate after running:
## Hidden_Neurons Accuracy
## 1 5 0.8951613
## 2 10 0.9112903
## 3 20 0.9395161