ML assignment Nature Images

NAME : DEEPIKA D

REG_NO : 2023MDTS07ALA034

ASSIGNMENT SUBMITTED TO : K A VENKATESH SIR

PROGRAM NAME : MSc DATA SCIENCE,3rd SEMESTER , ALLIANCE UNIVERSITY BANGLORE

library(imager)

## Loading required package: magrittr

## 
## Attaching package: 'imager'

## The following object is masked from 'package:magrittr':
## 
##     add

## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum

## The following object is masked from 'package:graphics':
## 
##     frame

## The following object is masked from 'package:base':
## 
##     save.image

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:imager':
## 
##     where

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ imager::add()       masks magrittr::add()
## ✖ stringr::boundary() masks imager::boundary()
## ✖ tidyr::extract()    masks magrittr::extract()
## ✖ tidyr::fill()       masks imager::fill()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ purrr::set_names()  masks magrittr::set_names()
## ✖ dplyr::where()      masks imager::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(magick)

## Linking to ImageMagick 6.9.12.98
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11

# Define the path to your images
image_folder <- "C:/Users/deepika.d/Desktop/Nature_image"

# Get a list of all categories (folders)
categories <- list.dirs(image_folder, recursive = FALSE)
labels <- basename(categories)  # Use folder names as labels

# Function to resize and convert images
resize_image <- function(img_path, img_size = 32) {
  tryCatch({
    img <- image_read(img_path)  # Load the image using magick
    img <- image_scale(img, paste0(img_size, "x", img_size))  # Resize to specified dimensions
    return(as.numeric(as.vector(image_data(img))))  # Convert to a numeric vector
  }, error = function(e) {
    cat("Error loading image:", img_path, "\n", e$message, "\n")
    return(NULL)  # Return NULL if there's an error
  })
}

# Function to load images from categories
load_images <- function(image_folder, categories) {
  image_data <- data.frame()  # Initialize an empty data frame
  
  for (category in categories) {
    label <- basename(category)  # Get the label from the folder name
    
    # List all image files in the category (support JPG, PNG, BMP)
    image_files <- list.files(category, pattern = "\\.(jpg|jpeg|png|bmp)$", full.names = TRUE, ignore.case = TRUE)
    
    for (image_file in image_files) {
      img_vector <- resize_image(image_file)  # Resize the image and convert to a vector
      
      if (!is.null(img_vector)) {  # Only add if img_vector is not NULL
        image_data <- rbind(image_data, data.frame(label = label, img_vector = I(list(img_vector))))
      }
    }
  }
  
  return(image_data)  # Return the populated data frame
}

# Load the images into a data frame
image_data <- load_images(image_folder, categories)

# Check the dimensions of the image data
dim(image_data)  # Should show the number of loaded images and columns

## [1] 250   2

# Unlist the image vectors into a matrix
image_matrix <- do.call(rbind, lapply(image_data$img_vector, unlist))

## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)

# Check the dimensions of the matrix (each row corresponds to an image)
dim(image_matrix)

## [1]  250 3072

## LOGISTIC REGRESSION

# Load necessary libraries
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(nnet)
library(ggplot2)

# Step 1: Flatten Image Data and Prepare Dataset
# Assuming image_data contains img_vector for images and label for image categories

# Convert image vectors into matrix form
img_matrix <- do.call(rbind, lapply(image_data$img_vector, as.vector))

## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)

# Create a data frame with the flattened image data and labels
image_data_flat <- data.frame(label = image_data$label, img_matrix)

# Ensure labels are treated as factors (for classification)
image_data_flat$label <- as.factor(image_data_flat$label)

# Step 2: Normalize Features and Perform PCA
# Normalize the pixel data (excluding the label column)
image_data_scaled <- scale(image_data_flat[, -1])  # Exclude the label column for normalization

# Perform Principal Component Analysis (PCA)
pca_result <- prcomp(image_data_scaled, center = TRUE, scale. = TRUE)

# Calculate the explained variance and select components for 95% variance retention
explained_variance <- summary(pca_result)$importance[3,]
num_components <- which(cumsum(explained_variance) >= 0.95)[1]

# Create a new dataset using the selected PCA components along with the labels
image_data_pca <- data.frame(pca_result$x[, 1:num_components])
image_data_pca$label <- image_data_flat$label

# Step 3: Train Logistic Regression Model (Multinomial)
# Fit a multinomial logistic regression model using the PCA components
multinom_model <- multinom(label ~ ., data = image_data_pca)

## # weights:  25 (16 variable)
## initial  value 402.359478 
## iter  10 value 373.543815
## iter  20 value 365.752200
## final  value 365.750375 
## converged

# Step 4: Make Predictions
# Get predicted class probabilities and predicted class labels
predicted_classes <- predict(multinom_model)

# Step 5: Evaluate Model Performance
# Generate a confusion matrix and calculate accuracy
confusion_matrix <- confusionMatrix(as.factor(predicted_classes), image_data_pca$label)
accuracy <- confusion_matrix$overall['Accuracy']

# Step 6: Display Results
# Output accuracy and the confusion matrix
cat("Logistic Regression Accuracy:", round(accuracy, 4), "\n")

## Logistic Regression Accuracy: 0.4

cat("Confusion Matrix:\n")

## Confusion Matrix:

print(confusion_matrix$table)

##             Reference
## Prediction   Animals Birds Flowers Fruits vegetables
##   Animals         11     9       2      5          6
##   Birds            9    19       8      7         10
##   Flowers         16    11      33     10          6
##   Fruits           6     5       5     19         10
##   vegetables       8     6       2      9         18

# Step 7: Visualize PCA Components (Optional)
# Plot the first two principal components with labels
ggplot(data = image_data_pca, aes(x = PC1, y = PC2, color = label)) +
  geom_point(size = 2, alpha = 0.6) +
  labs(title = "PCA of Image Data", x = "Principal Component 1", y = "Principal Component 2") +
  theme_minimal()

##NEURAL NETWORKS

# Load necessary libraries
library(nnet)
library(caret)
library(dplyr)

# Set seed for reproducibility
set.seed(123)

# Step 1: Function to Fit Neural Network with Cross-Validation
fit_nn_model_cv <- function(hidden_neurons, train_data, folds = 5) {
  # Define control method for cross-validation
  train_control <- trainControl(method = "cv", number = folds)
  
  # Define the model formula (label as dependent, rest as independent variables)
  formula <- label ~ .
  
  # Train the neural network model with cross-validation
  nn_model_cv <- train(
    formula,
    data = train_data,
    method = "nnet",
    trace = FALSE,
    linout = FALSE,
    maxit = 100,  # Maximum iterations
    tuneGrid = expand.grid(size = hidden_neurons, decay = 0),  # Tune the hidden layer size
    trControl = train_control  # Cross-validation control
  )
  
  # Extract and return the best accuracy from cross-validation results
  best_accuracy <- max(nn_model_cv$results$Accuracy)
  
  return(best_accuracy)
}

# Step 2: Data Preparation
# Assuming 'image_data_pca' is already PCA-processed image data with labels
# Example: image_data_pca <- your_pca_data

# Step 3: Evaluate Neural Network with Different Hidden Layer Sizes
# Define a list of hidden neuron sizes to evaluate
neurons_list <- c(5, 10, 20)

# Create an empty data frame to store the results
accuracy_results <- data.frame(Hidden_Neurons = neurons_list, Accuracy = NA)

# Loop through each hidden neuron size and compute accuracy
for (neurons in neurons_list) {
  # Fit the neural network and compute accuracy
  accuracy <- fit_nn_model_cv(neurons, image_data_pca)
  
  # Store the accuracy in the results data frame
  accuracy_results[accuracy_results$Hidden_Neurons == neurons, "Accuracy"] <- accuracy
}

# Step 4: Display the Results
print("Cross-Validated Accuracy for Different Hidden Neuron Sizes:")

## [1] "Cross-Validated Accuracy for Different Hidden Neuron Sizes:"

print(accuracy_results)

##   Hidden_Neurons Accuracy
## 1              5    0.292
## 2             10    0.352
## 3             20    0.256

# Optional: Visualize the Accuracy Results
ggplot(accuracy_results, aes(x = Hidden_Neurons, y = Accuracy)) +
  geom_line() +
  geom_point(size = 3) +
  labs(title = "Neural Network Accuracy with Varying Hidden Neurons",
       x = "Number of Hidden Neurons",
       y = "Cross-Validated Accuracy") +
  theme_minimal()

##  Hierarchical Clustering with Dendrogram

# Compute distance matrix and perform hierarchical clustering
dist_matrix <- dist(image_data_pca[, -which(names(image_data_pca) == "label")])
hclust_result <- hclust(dist_matrix)

# Plot the dendrogram
plot(hclust_result, labels = FALSE, main = "Hierarchical Clustering Dendrogram")

# Cut the dendrogram to assign clusters
image_data_pca$hclust_cluster <- cutree(hclust_result, k = 3)  # Define number of clusters

# View the first few rows with cluster assignments
head(image_data_pca)

##           PC1         PC2       PC3   label hclust_cluster
## 1  -0.8350153  -1.6947381 20.809302 Animals              1
## 2  -3.9022212 -14.7137086 -8.810251 Animals              2
## 3 -13.8781805   0.6009621  1.179549 Animals              2
## 4  16.5787465 -18.7657632  5.471768 Animals              1
## 5 -30.3531123  -3.9939294 -7.934942 Animals              2
## 6   7.7178964 -10.6335292 11.549392 Animals              1

library(pheatmap)
library(ggplot2)

## K-means Clustering

# Set seed and perform K-means clustering
set.seed(123)
k <- 4  # Define number of clusters for K-means
kmeans_result <- kmeans(image_data_pca[, -which(names(image_data_pca) == "label")], centers = k)

# Assign clusters to the dataset
image_data_pca$kmeans_cluster <- as.factor(kmeans_result$cluster)

# Step 6: Visualization
# Plot K-means clustering results with the first two principal components
ggplot(image_data_pca, aes(x = PC1, y = PC2, color = kmeans_cluster)) +
  geom_point(alpha = 0.6) +
  labs(title = "K-means Clustering of Image Data",
       x = "Principal Component 1",
       y = "Principal Component 2") +
  theme_minimal()

# Apply 10-fold cross-validation
set.seed(123)
train_control <- trainControl(method = "cv", number = 10)
cv_model <- train(label ~ ., data = image_data_pca, method = "multinom", trControl = train_control)

## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 335.014032
## iter  20 value 319.650830
## iter  30 value 313.053032
## final  value 313.025950 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 335.020366
## iter  20 value 320.152505
## iter  30 value 316.259454
## final  value 316.259413 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 335.014039
## iter  20 value 319.651376
## iter  30 value 313.062556
## final  value 313.035722 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 336.477553
## iter  20 value 322.086442
## iter  30 value 315.045028
## final  value 314.998298 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 336.481401
## iter  20 value 322.512590
## iter  30 value 317.868821
## final  value 317.868641 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 336.477557
## iter  20 value 322.086902
## iter  30 value 315.053056
## final  value 315.006772 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 334.052028
## iter  20 value 315.643235
## iter  30 value 309.328531
## final  value 309.294531 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 334.055761
## iter  20 value 316.133251
## iter  30 value 312.418124
## final  value 312.417441 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 334.052032
## iter  20 value 315.643753
## iter  30 value 309.338213
## final  value 309.304544 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 334.510107
## iter  20 value 317.728329
## iter  30 value 311.165388
## final  value 311.111320 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 334.515486
## iter  20 value 318.326773
## iter  30 value 314.743732
## final  value 314.743629 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 334.510112
## iter  20 value 317.728980
## iter  30 value 311.175957
## final  value 311.122483 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 332.467331
## iter  20 value 317.888814
## iter  30 value 308.502500
## final  value 308.428125 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 332.471442
## iter  20 value 318.888326
## iter  30 value 310.752736
## final  value 310.751858 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 332.467335
## iter  20 value 317.889916
## iter  30 value 308.507614
## final  value 308.433963 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 331.899418
## iter  20 value 316.683041
## iter  30 value 310.848478
## final  value 310.773284 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 331.908264
## iter  20 value 317.564556
## iter  30 value 313.836567
## final  value 313.836204 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 331.899427
## iter  20 value 316.683968
## iter  30 value 310.856940
## final  value 310.782330 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 340.166842
## iter  20 value 325.231018
## iter  30 value 318.782134
## final  value 318.746443 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 340.173069
## iter  20 value 325.796308
## iter  30 value 321.893168
## final  value 321.893066 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 340.166848
## iter  20 value 325.231621
## iter  30 value 318.791018
## final  value 318.755636 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 337.860005
## iter  20 value 320.936486
## iter  30 value 314.757403
## iter  40 value 314.499675
## final  value 314.499668 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 337.865225
## iter  20 value 321.537289
## iter  30 value 318.310528
## final  value 318.310146 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 337.860010
## iter  20 value 320.937134
## iter  30 value 314.766615
## iter  40 value 314.511791
## final  value 314.511784 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 335.087846
## iter  20 value 318.888681
## iter  30 value 313.811822
## final  value 313.774109 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 335.093510
## iter  20 value 319.494100
## iter  30 value 316.929348
## final  value 316.929286 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 335.087852
## iter  20 value 318.889338
## iter  30 value 313.820681
## final  value 313.783299 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 337.718369
## iter  20 value 321.997334
## iter  30 value 317.310981
## final  value 317.255044 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 337.723745
## iter  20 value 322.506425
## iter  30 value 319.803685
## final  value 319.803122 
## converged
## # weights:  45 (32 variable)
## initial  value 362.123530 
## iter  10 value 337.718374
## iter  20 value 321.997883
## iter  30 value 317.317594
## final  value 317.262185 
## converged
## # weights:  45 (32 variable)
## initial  value 402.359478 
## iter  10 value 372.014313
## iter  20 value 355.680810
## iter  30 value 349.746858
## final  value 349.709710 
## converged

# View cross-validation results
print(cv_model)

## Penalized Multinomial Regression 
## 
## 250 samples
##   5 predictor
##   5 classes: 'Animals', 'Birds', 'Flowers', 'Fruits', 'vegetables' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 225, 225, 225, 225, 225, 225, ... 
## Resampling results across tuning parameters:
## 
##   decay  Accuracy  Kappa
##   0e+00  0.352     0.190
##   1e-04  0.352     0.190
##   1e-01  0.324     0.155
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was decay = 1e-04.

# Try different values for k (number of clusters) in K-means
set.seed(123)
wss <- sapply(1:10, function(k){
  kmeans(image_data_pca[, -which(names(image_data_pca) == "label")], centers = k, nstart = 10)$tot.withinss
})

# Plot the total within-cluster sum of squares to determine the optimal k
plot(1:10, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of Clusters", ylab = "Total Within-cluster SS",
     main = "Elbow Method for Optimal K")

ML assignment Nature Images

Deepika D

2024-10-24