Authentic Machine Learning Model

options(repos = c(CRAN = "https://cran.rstudio.com/"))

install.packages("magick")

## Installing package into 'C:/Users/Naman/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'magick' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'magick'

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Naman\AppData\Local\R\win-library\4.4\00LOCK\magick\libs\x64\magick.dll
## to C:\Users\Naman\AppData\Local\R\win-library\4.4\magick\libs\x64\magick.dll:
## Permission denied

## Warning: restored 'magick'

## 
## The downloaded binary packages are in
##  C:\Users\Naman\AppData\Local\Temp\RtmpIxKL9K\downloaded_packages

library(magick)

## Linking to ImageMagick 6.9.12.98
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11

# Define the path to your images
image_folder<-"C:\\Users\\Naman\\Downloads\\Namana Herle G"

# Get a list of all categories (folders)
categories <- list.dirs(image_folder, recursive = FALSE)
labels <- basename(categories)  # Use folder names as labels

# Function to resize and convert images
resize_image <- function(img_path, img_size = 32) {
  tryCatch({
    img <- image_read(img_path)  # Load the image using magick
    img <- image_scale(img, paste0(img_size, "x", img_size))  # Resize to specified dimensions
    return(as.numeric(as.vector(image_data(img))))  # Convert to a numeric vector
  }, error = function(e) {
    cat("Error loading image:", img_path, "\n", e$message, "\n")
    return(NULL)  # Return NULL if there's an error
  })
}

# Function to load images from categories
load_images <- function(image_folder, categories) {
  image_data <- data.frame()  # Initialize an empty data frame
  
  for (category in categories) {
    label <- basename(category)  # Get the label from the folder name
    
    # List all image files in the category (support JPG, PNG, BMP)
    image_files <- list.files(category, pattern = "\\.(jpg|jpeg|png|bmp)$", full.names = TRUE, ignore.case = TRUE)
    
    for (image_file in image_files) {
      img_vector <- resize_image(image_file)  # Resize the image and convert to a vector
      
      if (!is.null(img_vector)) {  # Only add if img_vector is not NULL
        image_data <- rbind(image_data, data.frame(label = label, img_vector = I(list(img_vector))))
      }
    }
  }
  
  return(image_data)  # Return the populated data frame
}

# Load the images into a data frame
image_data <- load_images(image_folder, categories)

# Check the dimensions of the image data
dim(image_data)  # Should show the number of loaded images and columns

## [1] 251   2

# Unlist the image vectors into a matrix
image_matrix <- do.call(rbind, lapply(image_data$img_vector, unlist))

## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)

# Check the dimensions of the matrix (each row corresponds to an image)
dim(image_matrix)

## [1]  251 3072

LOGISTIC REGRESSION

# Load necessary libraries
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(nnet)
library(ggplot2)

# Step 1: Flatten Image Data and Prepare Dataset
# Assuming image_data contains img_vector for images and label for image categories
# Convert image vectors into matrix form
img_matrix <- do.call(rbind, lapply(image_data$img_vector, as.vector))

## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)

# Create a data frame with the flattened image data and labels
image_data_flat <- data.frame(label = image_data$label, img_matrix)

# Ensure labels are treated as factors (for classification)
image_data_flat$label <- as.factor(image_data_flat$label)

# Step 2: Normalize Features and Perform PCA
# Normalize the pixel data (excluding the label column)
image_data_scaled <- scale(image_data_flat[, -1])  # Exclude the label column for normalization

# Perform Principal Component Analysis (PCA)
pca_result <- prcomp(image_data_scaled, center = TRUE, scale. = TRUE)

# Calculate the explained variance and select components for 95% variance retention
explained_variance <- summary(pca_result)$importance[3,]
num_components <- which(cumsum(explained_variance) >= 0.95)[1]

# Create a new dataset using the selected PCA components along with the labels
image_data_pca <- data.frame(pca_result$x[, 1:num_components])
image_data_pca$label <- image_data_flat$label

# Step 3: Train Logistic Regression Model (Multinomial)
# Fit a multinomial logistic regression model using the PCA components
multinom_model <- multinom(label ~ ., data = image_data_pca)

## # weights:  20 (12 variable)
## initial  value 403.968916 
## iter  10 value 355.326246
## final  value 348.220079 
## converged

# Step 4: Make Predictions
# Get predicted class probabilities and predicted class labels
predicted_classes <- predict(multinom_model)

# Step 5: Evaluate Model Performance
# Generate a confusion matrix and calculate accuracy
confusion_matrix <- confusionMatrix(as.factor(predicted_classes), image_data_pca$label)
accuracy <- confusion_matrix$overall['Accuracy']

# Step 6: Display Results
# Output accuracy and the confusion matrix
cat("Logistic Regression Accuracy:", round(accuracy, 4), "\n")

## Logistic Regression Accuracy: 0.3944

cat("Confusion Matrix:\n")

## Confusion Matrix:

print(confusion_matrix$table)

##           Reference
## Prediction Animals Birds Flowers Fruits Trees
##    Animals       7    11       3      9    12
##    Birds        11    16       9      1     5
##    Flowers      13    14      32      4     5
##    Fruits       12     7       4     31    15
##    Trees         8     2       2      5    13

# Step 7: Visualize PCA Components (Optional)
# Plot the first two principal components with labels
ggplot(data = image_data_pca, aes(x = PC1, y = PC2, color = label)) +
  geom_point(size = 2, alpha = 0.6) +
  labs(title = "PCA of Image Data", x = "Principal Component 1", y = "Principal Component 2") + theme_minimal()

# Step 7: Visualize PCA Components (Optional)
# Plot the first two principal components with labels
ggplot(data = image_data_pca, aes(x = PC1, y = PC2, color = label)) +
  geom_point(size = 2, alpha = 0.6) +  labs(title = "PCA of Image Data", 
       x = "Principal Component 1", 
       y = "Principal Component 2") + theme_minimal()

# Load necessary libraries
library(nnet)
library(caret)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Set seed for reproducibility
set.seed(123)

# Step 1: Function to Fit Neural Network with Cross-Validation
fit_nn_model_cv <- function(hidden_neurons, train_data, folds = 5) {
  # Define control method for cross-validation
  train_control <- trainControl(method = "cv", number = folds)
  
  # Define the model formula (label as dependent, rest as independent variables)
  formula <- label ~ .
  
  # Train the neural network model with cross-validation
  nn_model_cv <- train(
    formula,
    data = train_data,
    method = "nnet",
    trace = FALSE,
    linout = FALSE,
    maxit = 100,  # Maximum iterations
    tuneGrid = expand.grid(size = hidden_neurons, decay = 0),  # Tune the hidden layer size
    trControl = train_control  # Cross-validation control
  )
  
  # Extract and return the best accuracy from cross-validation results
  best_accuracy <- max(nn_model_cv$results$Accuracy)
  
  return(best_accuracy)
}

# Step 2: Data Preparation
# Assuming 'image_data_pca' is already PCA-processed image data with labels
# Example: image_data_pca <- your_pca_data

# Step 3: Evaluate Neural Network with Different Hidden Layer Sizes
# Define a list of hidden neuron sizes to evaluate
neurons_list <- c(5, 10, 20)

# Create an empty data frame to store the results
accuracy_results <- data.frame(Hidden_Neurons = neurons_list, Accuracy = NA)

# Loop through each hidden neuron size and compute accuracy
for (neurons in neurons_list) {
  # Fit the neural network and compute accuracy
  accuracy <- fit_nn_model_cv(neurons, image_data_pca)
  
  # Store the accuracy in the results data frame
  accuracy_results[accuracy_results$Hidden_Neurons == neurons, "Accuracy"] <- accuracy
}

# Step 4: Display the Results
print("Cross-Validated Accuracy for Different Hidden Neuron Sizes:")

## [1] "Cross-Validated Accuracy for Different Hidden Neuron Sizes:"

print(accuracy_results)

##   Hidden_Neurons  Accuracy
## 1              5 0.3187451
## 2             10 0.3263529
## 3             20 0.3585882

# Optional: Visualize the Accuracy Results
ggplot(accuracy_results, aes(x = Hidden_Neurons, y = Accuracy)) +
  geom_line() +
  geom_point(size = 3) +
  labs(title = "Neural Network Accuracy with Varying Hidden Neurons",
       x = "Number of Hidden Neurons",
       y = "Cross-Validated Accuracy") +
  theme_minimal()

##  Hierarchical Clustering with Dendrogram

# Compute distance matrix and perform hierarchical clustering
dist_matrix <- dist(image_data_pca[, -which(names(image_data_pca) == "label")])
hclust_result <- hclust(dist_matrix)

# Plot the dendrogram
plot(hclust_result, labels = FALSE, main = "Hierarchical Clustering Dendrogram")

# Cut the dendrogram to assign clusters
image_data_pca$hclust_cluster <- cutree(hclust_result, k = 3)  # Define number of clusters

# View the first few rows with cluster assignments
head(image_data_pca)

##         PC1        PC2   label hclust_cluster
## 1 -45.57601 -10.815621 Animals              1
## 2 -51.72867   9.364695 Animals              1
## 3  10.01604  12.084211 Animals              2
## 4 -62.18010   3.390804 Animals              1
## 5   1.88313  10.511209 Animals              2
## 6  44.95109 -11.087802 Animals              3

install.packages("pheatmap")

## Installing package into 'C:/Users/Naman/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'pheatmap' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naman\AppData\Local\Temp\RtmpIxKL9K\downloaded_packages

library(pheatmap)
library(ggplot2)

K-means Clustering

# Set seed and perform K-means clustering
set.seed(123)
k <- 4  # Define number of clusters for K-means
kmeans_result <- kmeans(image_data_pca[, -which(names(image_data_pca) == "label")], centers = k)

# Assign clusters to the dataset
image_data_pca$kmeans_cluster <- as.factor(kmeans_result$cluster)

# Step 6: Visualization
# Plot K-means clustering results with the first two principal components
ggplot(image_data_pca, aes(x = PC1, y = PC2, color = kmeans_cluster)) +
  geom_point(alpha = 0.6) +
  labs(title = "K-means Clustering of Image Data",
       x = "Principal Component 1",
       y = "Principal Component 2") +
  theme_minimal()

# Apply 10-fold cross-validation
set.seed(123)
train_control <- trainControl(method = "cv", number = 10)
cv_model <- train(label ~ ., data = image_data_pca, method = "multinom", trControl = train_control)

## # weights:  40 (28 variable)
## initial  value 362.123530 
## iter  10 value 316.060857
## iter  20 value 299.989193
## iter  30 value 299.531757
## iter  40 value 299.512328
## iter  50 value 299.511736
## final  value 299.511731 
## converged
## # weights:  40 (28 variable)
## initial  value 362.123530 
## iter  10 value 316.063570
## iter  20 value 301.731437
## final  value 301.518255 
## converged
## # weights:  40 (28 variable)
## initial  value 362.123530 
## iter  10 value 316.060860
## iter  20 value 299.991796
## iter  30 value 299.535922
## iter  40 value 299.519144
## final  value 299.519038 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 323.590670
## iter  20 value 305.571376
## iter  30 value 304.650243
## iter  40 value 304.622313
## iter  50 value 304.620797
## final  value 304.620781 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 323.593416
## iter  20 value 307.913754
## final  value 307.593004 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 323.590673
## iter  20 value 305.575401
## iter  30 value 304.656878
## iter  40 value 304.631641
## final  value 304.631153 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 321.234630
## iter  20 value 302.555716
## iter  30 value 301.829782
## iter  40 value 301.789714
## iter  50 value 301.788457
## iter  50 value 301.788454
## iter  50 value 301.788454
## final  value 301.788454 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 321.240221
## iter  20 value 305.020400
## final  value 304.801919 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 321.234636
## iter  20 value 302.560062
## iter  30 value 301.838110
## iter  40 value 301.804617
## final  value 301.804561 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 324.684150
## iter  20 value 308.481379
## iter  30 value 308.162682
## iter  40 value 308.144567
## iter  50 value 308.141978
## final  value 308.141963 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 324.686289
## iter  20 value 310.912274
## final  value 310.878824 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 324.684152
## iter  20 value 308.485409
## iter  30 value 308.168565
## iter  40 value 308.152405
## final  value 308.151312 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 322.612635
## iter  20 value 305.624309
## iter  30 value 305.148674
## iter  40 value 305.131230
## iter  50 value 305.130710
## final  value 305.130705 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 322.615339
## iter  20 value 307.605912
## final  value 307.423483 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 322.612637
## iter  20 value 305.627448
## iter  30 value 305.153434
## iter  40 value 305.138526
## final  value 305.138471 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 323.914664
## iter  20 value 310.342372
## iter  30 value 309.970113
## iter  40 value 309.954599
## iter  50 value 309.954199
## iter  50 value 309.954197
## iter  50 value 309.954197
## final  value 309.954197 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 323.917037
## iter  20 value 312.145823
## final  value 312.079031 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 323.914667
## iter  20 value 310.345412
## iter  30 value 309.975141
## iter  40 value 309.962688
## final  value 309.962642 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 319.807747
## iter  20 value 303.625568
## iter  30 value 303.180657
## iter  40 value 303.161144
## iter  50 value 303.160102
## final  value 303.160096 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 319.810609
## iter  20 value 305.844262
## final  value 305.677404 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 319.807750
## iter  20 value 303.629189
## iter  30 value 303.186013
## iter  40 value 303.169199
## final  value 303.168984 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 323.763026
## iter  20 value 308.626785
## iter  30 value 307.755843
## iter  40 value 307.737155
## iter  50 value 307.735272
## final  value 307.735266 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 323.765323
## iter  20 value 310.387373
## final  value 310.029463 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 323.763028
## iter  20 value 308.629766
## iter  30 value 307.761478
## iter  40 value 307.744906
## final  value 307.744318 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 323.315693
## iter  20 value 309.039660
## iter  30 value 308.694379
## iter  40 value 308.684256
## iter  50 value 308.683974
## iter  50 value 308.683971
## iter  50 value 308.683971
## final  value 308.683971 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 323.317305
## iter  20 value 310.916096
## final  value 310.795246 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 323.315694
## iter  20 value 309.042820
## iter  30 value 308.699441
## iter  40 value 308.691736
## final  value 308.691712 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 320.097757
## iter  20 value 302.785069
## iter  30 value 302.358744
## iter  40 value 302.338744
## iter  50 value 302.337749
## final  value 302.337739 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 320.100294
## iter  20 value 304.494980
## final  value 304.348439 
## converged
## # weights:  40 (28 variable)
## initial  value 363.732968 
## iter  10 value 320.097760
## iter  20 value 302.787686
## iter  30 value 302.362943
## iter  40 value 302.345502
## final  value 302.345292 
## converged
## # weights:  40 (28 variable)
## initial  value 403.968916 
## iter  10 value 355.215005
## iter  20 value 341.069368
## iter  30 value 340.502608
## iter  40 value 340.485752
## final  value 340.485742 
## converged

# View cross-validation results
print(cv_model)

## Penalized Multinomial Regression 
## 
## 251 samples
##   4 predictor
##   5 classes: 'Animals', 'Birds', 'Flowers', 'Fruits', 'Trees' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 225, 226, 226, 226, 226, 226, ... 
## Resampling results across tuning parameters:
## 
##   decay  Accuracy   Kappa    
##   0e+00  0.3544615  0.1931054
##   1e-04  0.3544615  0.1931054
##   1e-01  0.3506154  0.1884502
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was decay = 1e-04.

# Try different values for k (number of clusters) in K-means
set.seed(123)
wss <- sapply(1:10, function(k){
  kmeans(image_data_pca[, -which(names(image_data_pca) == "label")], centers = k, nstart = 10)$tot.withinss
})

# Plot the total within-cluster sum of squares to determine the optimal k
plot(1:10, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of Clusters", ylab = "Total Within-cluster SS",
     main = "Elbow Method for Optimal K")

Authentic Machine Learning Model

Namana Herle G

2024-10-28

LOGISTIC REGRESSION

K-means Clustering