Authentic Machine Learning Submissions

#Introduction

library(imager)

## Loading required package: magrittr

## 
## Attaching package: 'imager'

## The following object is masked from 'package:magrittr':
## 
##     add

## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum

## The following object is masked from 'package:graphics':
## 
##     frame

## The following object is masked from 'package:base':
## 
##     save.image

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:imager':
## 
##     where

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(caTools)
library(caret)

## Loading required package: lattice

library(nnet)
library(glmnet)

## Loading required package: Matrix

## Loaded glmnet 4.1-8

library(keras)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(clustertend)

## Package `clustertend` is deprecated.  Use package `hopkins` instead.

# Set the path to your dataset
image_path <- "C:/Users/Ajaykumar/Downloads/ml  images"

# Get a list of all categories (folders)
categories <- list.dirs(image_path, recursive = FALSE)
labels <- basename(categories)  # Use folder names as labels
labels

## [1] "Animal" "fruit"  "plant"

# Function to resize images
resize_image <- function(img_path, img_size = 32) {
  img <- load.image(img_path)  # Load the image
  img <- resize(img, img_size, img_size)  # Resize to specified dimensions
  as.numeric(img)  # Convert to a numeric vector
}

# Function to load all images and create a dataset
load_images <- function(image_path, categories) {
  image_data <- data.frame()
  for (category in categories) {
    label <- basename(category)  # Get the label from the folder name
    image_files <- list.files(category, full.names = TRUE)  # List all image files in the category
    
    for (image_file in image_files) {
      img_vector <- resize_image(image_file)  # Resize the image and convert to a vector
      image_data <- rbind(image_data, data.frame(label = label, img_vector = I(list(img_vector))))  # Add to data frame
    }
  }
  
  return(image_data)  # Return the populated data frame
}

image_data <- load_images(image_path, categories)
dim(image_data)

## [1] 248   2

ggplot(image_data, aes(x = label)) +
  geom_bar(fill = "brown") +
  theme_minimal() +
  labs(title = "Distribution of images", x = "Class Labels", y = "Frequency")

##LOGISTIC REGRESSION

# Flatten the img_vector column into multiple pixel columns
img_matrix <- do.call(rbind, lapply(image_data$img_vector, as.vector))

# Create a new data frame with flattened image data
image_data_flat <- data.frame(label = image_data$label, img_matrix)

# Check the new dimensions
print(dim(image_data_flat))

## [1]  248 3073

# Convert labels to a factor if they are not already
image_data_flat$label <- as.factor(image_data_flat$label)

# Normalize the features
image_data_scaled <- scale(image_data_flat[, -which(names(image_data_flat) == "label")])

# Perform PCA
pca_result <- prcomp(image_data_scaled, center = TRUE, scale. = TRUE)

# Decide the number of components to keep (e.g., 95% variance)
explained_variance <- summary(pca_result)$importance[3,]
num_components <- min(which(cumsum(explained_variance) >= 0.95))

# Create a new dataset with the PCA components
image_data_pca <- data.frame(pca_result$x[, 1:num_components])
image_data_pca$label <- image_data_flat$label
dim(image_data_pca)

## [1] 248   3

sum(is.na(image_data_pca))

## [1] 0

# Set seed for reproducibility
set.seed(123)

# Specify the training set ratio (70% train, 30% test)
train_ratio <- 0.7

# Create a random sample of row indices for training
train_indices <- sample(seq_len(nrow(image_data_pca)), size = train_ratio * nrow(image_data_pca))

# Split the data into training and test sets
train_data <- image_data_pca[train_indices, ]  # Training set
test_data <- image_data_pca[-train_indices, ]  # Test set

# Optionally, separate features and labels for model input
x_train <- as.matrix(train_data[, -ncol(train_data)])  # Features (PCA components)
y_train <- train_data$label  # Labels (target variable)

x_test <- as.matrix(test_data[, -ncol(test_data)])  # Features (PCA components)
y_test <- test_data$label  # Labels (target variable)

# Check the dimensions of the training and test sets
dim(x_train)  # Should match the number of rows and PCA components

## [1] 173   2

dim(x_test)

## [1] 75  2

x_train_df <- as.data.frame(x_train)


x_train_df$label <- y_train

# Ensure the label is a factor
x_train_df$label <- as.factor(x_train_df$label)


# Fit a multinomial logistic regression model using ridge regularization
multinom_glmnet <- cv.glmnet(x_train,y_train,family = "multinomial",alpha = 0)

predictions <- predict(multinom_glmnet, newx = x_test, s = "lambda.min", type = "class")

# Evaluate performance using a confusion matrix
confusion_matrix <- table(Predicted = predictions, Actual = y_test)
print(confusion_matrix)

##          Actual
## Predicted Animal fruit plant
##    Animal     28     0     9
##    fruit       0    38     0

# Optionally calculate the accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Test Accuracy:", accuracy, "\n")

## Test Accuracy: 0.88

# Plot the first two principal components
ggplot(image_data_pca, aes(x = PC1, y = PC2, color = label)) +
  geom_point(alpha = 0.6) +
  ggtitle("PCA of Image Data") +
  theme_minimal() +
  scale_color_discrete(name = "Category")

# Plot VAT
fviz_dist(dist(image_data_pca), show_labels = FALSE)

# Calculate the Hopkins statistic (assuming image_data is a numeric matrix)
hopkins_stat <- get_clust_tendency(img_matrix, n = 100)

# View result
hopkins_stat$hopkins_stat

## [1] 0.6054725

set.seed(123)  # For reproducibility
k <- 4

# Perform K-means clustering
kmeans_result <- kmeans(image_data_pca[, -which(names(image_data_pca) == "label")], centers = k)
# Add the cluster assignments to the original PCA data
image_data_pca$cluster <- as.factor(kmeans_result$cluster)
ggplot(image_data_pca, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.6) +
  labs(title = "K-means Clustering of Image Data",
       x = "Principal Component 1",
       y = "Principal Component 2") +
  theme_minimal()

# Function to fit a neural network model and return the accuracy
fit_nn_model <- function(hidden_neurons, train_data) {
  # Fit the neural network model
  nn_model <- nnet(label ~ ., data = train_data, size = hidden_neurons, maxit = 100, trace = FALSE)
  
  # Predict the class labels
  predicted_classes <- predict(nn_model, train_data, type = "class")
  
  # Create a confusion matrix
  confusion_matrix <- table(Actual = train_data$label, Predicted = predicted_classes)
  
  # Calculate accuracy
  accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
  
  return(accuracy)
}


# Fit models with different hidden layer sizes
neurons_list <- c(5, 10, 20)
accuracy_results <- data.frame(Hidden_Neurons = neurons_list, Accuracy = NA)

for (neurons in neurons_list) {
  accuracy <- fit_nn_model(neurons, image_data_pca)
  accuracy_results[accuracy_results$Hidden_Neurons == neurons, "Accuracy"] <- accuracy
}

# Print the results
print(accuracy_results)

##   Hidden_Neurons  Accuracy
## 1              5 0.8951613
## 2             10 0.9112903
## 3             20 0.9395161

Authentic Machine Learning Submissions

AJAYKUMAR

2024-10-04