NAME : DEEPIKA D
REG_NO : 2023MDTS07ALA034
ASSIGNMENT SUBMITTED TO : K A VENKATESH SIR
PROGRAM NAME : MSc DATA SCIENCE,3rd SEMESTER , ALLIANCE UNIVERSITY BANGLORE
library(imager)
## Loading required package: magrittr
##
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
##
## add
## The following objects are masked from 'package:stats':
##
## convolve, spectrum
## The following object is masked from 'package:graphics':
##
## frame
## The following object is masked from 'package:base':
##
## save.image
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:imager':
##
## where
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ imager::add() masks magrittr::add()
## ✖ stringr::boundary() masks imager::boundary()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ tidyr::fill() masks imager::fill()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ dplyr::where() masks imager::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(magick)
## Linking to ImageMagick 6.9.12.98
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11
# Define the path to your images
image_folder <- "C:/Users/deepika.d/Desktop/Nature_image"
# Get a list of all categories (folders)
categories <- list.dirs(image_folder, recursive = FALSE)
labels <- basename(categories) # Use folder names as labels
# Function to resize and convert images
resize_image <- function(img_path, img_size = 32) {
tryCatch({
img <- image_read(img_path) # Load the image using magick
img <- image_scale(img, paste0(img_size, "x", img_size)) # Resize to specified dimensions
return(as.numeric(as.vector(image_data(img)))) # Convert to a numeric vector
}, error = function(e) {
cat("Error loading image:", img_path, "\n", e$message, "\n")
return(NULL) # Return NULL if there's an error
})
}
# Function to load images from categories
load_images <- function(image_folder, categories) {
image_data <- data.frame() # Initialize an empty data frame
for (category in categories) {
label <- basename(category) # Get the label from the folder name
# List all image files in the category (support JPG, PNG, BMP)
image_files <- list.files(category, pattern = "\\.(jpg|jpeg|png|bmp)$", full.names = TRUE, ignore.case = TRUE)
for (image_file in image_files) {
img_vector <- resize_image(image_file) # Resize the image and convert to a vector
if (!is.null(img_vector)) { # Only add if img_vector is not NULL
image_data <- rbind(image_data, data.frame(label = label, img_vector = I(list(img_vector))))
}
}
}
return(image_data) # Return the populated data frame
}
# Load the images into a data frame
image_data <- load_images(image_folder, categories)
# Check the dimensions of the image data
dim(image_data) # Should show the number of loaded images and columns
## [1] 250 2
# Unlist the image vectors into a matrix
image_matrix <- do.call(rbind, lapply(image_data$img_vector, unlist))
## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)
# Check the dimensions of the matrix (each row corresponds to an image)
dim(image_matrix)
## [1] 250 3072
## LOGISTIC REGRESSION
# Load necessary libraries
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(nnet)
library(ggplot2)
# Step 1: Flatten Image Data and Prepare Dataset
# Assuming image_data contains img_vector for images and label for image categories
# Convert image vectors into matrix form
img_matrix <- do.call(rbind, lapply(image_data$img_vector, as.vector))
## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)
# Create a data frame with the flattened image data and labels
image_data_flat <- data.frame(label = image_data$label, img_matrix)
# Ensure labels are treated as factors (for classification)
image_data_flat$label <- as.factor(image_data_flat$label)
# Step 2: Normalize Features and Perform PCA
# Normalize the pixel data (excluding the label column)
image_data_scaled <- scale(image_data_flat[, -1]) # Exclude the label column for normalization
# Perform Principal Component Analysis (PCA)
pca_result <- prcomp(image_data_scaled, center = TRUE, scale. = TRUE)
# Calculate the explained variance and select components for 95% variance retention
explained_variance <- summary(pca_result)$importance[3,]
num_components <- which(cumsum(explained_variance) >= 0.95)[1]
# Create a new dataset using the selected PCA components along with the labels
image_data_pca <- data.frame(pca_result$x[, 1:num_components])
image_data_pca$label <- image_data_flat$label
# Step 3: Train Logistic Regression Model (Multinomial)
# Fit a multinomial logistic regression model using the PCA components
multinom_model <- multinom(label ~ ., data = image_data_pca)
## # weights: 25 (16 variable)
## initial value 402.359478
## iter 10 value 373.543815
## iter 20 value 365.752200
## final value 365.750375
## converged
# Step 4: Make Predictions
# Get predicted class probabilities and predicted class labels
predicted_classes <- predict(multinom_model)
# Step 5: Evaluate Model Performance
# Generate a confusion matrix and calculate accuracy
confusion_matrix <- confusionMatrix(as.factor(predicted_classes), image_data_pca$label)
accuracy <- confusion_matrix$overall['Accuracy']
# Step 6: Display Results
# Output accuracy and the confusion matrix
cat("Logistic Regression Accuracy:", round(accuracy, 4), "\n")
## Logistic Regression Accuracy: 0.4
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(confusion_matrix$table)
## Reference
## Prediction Animals Birds Flowers Fruits vegetables
## Animals 11 9 2 5 6
## Birds 9 19 8 7 10
## Flowers 16 11 33 10 6
## Fruits 6 5 5 19 10
## vegetables 8 6 2 9 18
# Step 7: Visualize PCA Components (Optional)
# Plot the first two principal components with labels
ggplot(data = image_data_pca, aes(x = PC1, y = PC2, color = label)) +
geom_point(size = 2, alpha = 0.6) +
labs(title = "PCA of Image Data", x = "Principal Component 1", y = "Principal Component 2") +
theme_minimal()
##NEURAL NETWORKS
# Load necessary libraries
library(nnet)
library(caret)
library(dplyr)
# Set seed for reproducibility
set.seed(123)
# Step 1: Function to Fit Neural Network with Cross-Validation
fit_nn_model_cv <- function(hidden_neurons, train_data, folds = 5) {
# Define control method for cross-validation
train_control <- trainControl(method = "cv", number = folds)
# Define the model formula (label as dependent, rest as independent variables)
formula <- label ~ .
# Train the neural network model with cross-validation
nn_model_cv <- train(
formula,
data = train_data,
method = "nnet",
trace = FALSE,
linout = FALSE,
maxit = 100, # Maximum iterations
tuneGrid = expand.grid(size = hidden_neurons, decay = 0), # Tune the hidden layer size
trControl = train_control # Cross-validation control
)
# Extract and return the best accuracy from cross-validation results
best_accuracy <- max(nn_model_cv$results$Accuracy)
return(best_accuracy)
}
# Step 2: Data Preparation
# Assuming 'image_data_pca' is already PCA-processed image data with labels
# Example: image_data_pca <- your_pca_data
# Step 3: Evaluate Neural Network with Different Hidden Layer Sizes
# Define a list of hidden neuron sizes to evaluate
neurons_list <- c(5, 10, 20)
# Create an empty data frame to store the results
accuracy_results <- data.frame(Hidden_Neurons = neurons_list, Accuracy = NA)
# Loop through each hidden neuron size and compute accuracy
for (neurons in neurons_list) {
# Fit the neural network and compute accuracy
accuracy <- fit_nn_model_cv(neurons, image_data_pca)
# Store the accuracy in the results data frame
accuracy_results[accuracy_results$Hidden_Neurons == neurons, "Accuracy"] <- accuracy
}
# Step 4: Display the Results
print("Cross-Validated Accuracy for Different Hidden Neuron Sizes:")
## [1] "Cross-Validated Accuracy for Different Hidden Neuron Sizes:"
print(accuracy_results)
## Hidden_Neurons Accuracy
## 1 5 0.292
## 2 10 0.352
## 3 20 0.256
# Optional: Visualize the Accuracy Results
ggplot(accuracy_results, aes(x = Hidden_Neurons, y = Accuracy)) +
geom_line() +
geom_point(size = 3) +
labs(title = "Neural Network Accuracy with Varying Hidden Neurons",
x = "Number of Hidden Neurons",
y = "Cross-Validated Accuracy") +
theme_minimal()
## Hierarchical Clustering with Dendrogram
# Compute distance matrix and perform hierarchical clustering
dist_matrix <- dist(image_data_pca[, -which(names(image_data_pca) == "label")])
hclust_result <- hclust(dist_matrix)
# Plot the dendrogram
plot(hclust_result, labels = FALSE, main = "Hierarchical Clustering Dendrogram")
# Cut the dendrogram to assign clusters
image_data_pca$hclust_cluster <- cutree(hclust_result, k = 3) # Define number of clusters
# View the first few rows with cluster assignments
head(image_data_pca)
## PC1 PC2 PC3 label hclust_cluster
## 1 -0.8350153 -1.6947381 20.809302 Animals 1
## 2 -3.9022212 -14.7137086 -8.810251 Animals 2
## 3 -13.8781805 0.6009621 1.179549 Animals 2
## 4 16.5787465 -18.7657632 5.471768 Animals 1
## 5 -30.3531123 -3.9939294 -7.934942 Animals 2
## 6 7.7178964 -10.6335292 11.549392 Animals 1
library(pheatmap)
library(ggplot2)
## K-means Clustering
# Set seed and perform K-means clustering
set.seed(123)
k <- 4 # Define number of clusters for K-means
kmeans_result <- kmeans(image_data_pca[, -which(names(image_data_pca) == "label")], centers = k)
# Assign clusters to the dataset
image_data_pca$kmeans_cluster <- as.factor(kmeans_result$cluster)
# Step 6: Visualization
# Plot K-means clustering results with the first two principal components
ggplot(image_data_pca, aes(x = PC1, y = PC2, color = kmeans_cluster)) +
geom_point(alpha = 0.6) +
labs(title = "K-means Clustering of Image Data",
x = "Principal Component 1",
y = "Principal Component 2") +
theme_minimal()
# Apply 10-fold cross-validation
set.seed(123)
train_control <- trainControl(method = "cv", number = 10)
cv_model <- train(label ~ ., data = image_data_pca, method = "multinom", trControl = train_control)
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 335.014032
## iter 20 value 319.650830
## iter 30 value 313.053032
## final value 313.025950
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 335.020366
## iter 20 value 320.152505
## iter 30 value 316.259454
## final value 316.259413
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 335.014039
## iter 20 value 319.651376
## iter 30 value 313.062556
## final value 313.035722
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 336.477553
## iter 20 value 322.086442
## iter 30 value 315.045028
## final value 314.998298
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 336.481401
## iter 20 value 322.512590
## iter 30 value 317.868821
## final value 317.868641
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 336.477557
## iter 20 value 322.086902
## iter 30 value 315.053056
## final value 315.006772
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 334.052028
## iter 20 value 315.643235
## iter 30 value 309.328531
## final value 309.294531
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 334.055761
## iter 20 value 316.133251
## iter 30 value 312.418124
## final value 312.417441
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 334.052032
## iter 20 value 315.643753
## iter 30 value 309.338213
## final value 309.304544
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 334.510107
## iter 20 value 317.728329
## iter 30 value 311.165388
## final value 311.111320
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 334.515486
## iter 20 value 318.326773
## iter 30 value 314.743732
## final value 314.743629
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 334.510112
## iter 20 value 317.728980
## iter 30 value 311.175957
## final value 311.122483
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 332.467331
## iter 20 value 317.888814
## iter 30 value 308.502500
## final value 308.428125
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 332.471442
## iter 20 value 318.888326
## iter 30 value 310.752736
## final value 310.751858
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 332.467335
## iter 20 value 317.889916
## iter 30 value 308.507614
## final value 308.433963
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 331.899418
## iter 20 value 316.683041
## iter 30 value 310.848478
## final value 310.773284
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 331.908264
## iter 20 value 317.564556
## iter 30 value 313.836567
## final value 313.836204
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 331.899427
## iter 20 value 316.683968
## iter 30 value 310.856940
## final value 310.782330
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 340.166842
## iter 20 value 325.231018
## iter 30 value 318.782134
## final value 318.746443
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 340.173069
## iter 20 value 325.796308
## iter 30 value 321.893168
## final value 321.893066
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 340.166848
## iter 20 value 325.231621
## iter 30 value 318.791018
## final value 318.755636
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 337.860005
## iter 20 value 320.936486
## iter 30 value 314.757403
## iter 40 value 314.499675
## final value 314.499668
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 337.865225
## iter 20 value 321.537289
## iter 30 value 318.310528
## final value 318.310146
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 337.860010
## iter 20 value 320.937134
## iter 30 value 314.766615
## iter 40 value 314.511791
## final value 314.511784
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 335.087846
## iter 20 value 318.888681
## iter 30 value 313.811822
## final value 313.774109
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 335.093510
## iter 20 value 319.494100
## iter 30 value 316.929348
## final value 316.929286
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 335.087852
## iter 20 value 318.889338
## iter 30 value 313.820681
## final value 313.783299
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 337.718369
## iter 20 value 321.997334
## iter 30 value 317.310981
## final value 317.255044
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 337.723745
## iter 20 value 322.506425
## iter 30 value 319.803685
## final value 319.803122
## converged
## # weights: 45 (32 variable)
## initial value 362.123530
## iter 10 value 337.718374
## iter 20 value 321.997883
## iter 30 value 317.317594
## final value 317.262185
## converged
## # weights: 45 (32 variable)
## initial value 402.359478
## iter 10 value 372.014313
## iter 20 value 355.680810
## iter 30 value 349.746858
## final value 349.709710
## converged
# View cross-validation results
print(cv_model)
## Penalized Multinomial Regression
##
## 250 samples
## 5 predictor
## 5 classes: 'Animals', 'Birds', 'Flowers', 'Fruits', 'vegetables'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 225, 225, 225, 225, 225, 225, ...
## Resampling results across tuning parameters:
##
## decay Accuracy Kappa
## 0e+00 0.352 0.190
## 1e-04 0.352 0.190
## 1e-01 0.324 0.155
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was decay = 1e-04.
# Try different values for k (number of clusters) in K-means
set.seed(123)
wss <- sapply(1:10, function(k){
kmeans(image_data_pca[, -which(names(image_data_pca) == "label")], centers = k, nstart = 10)$tot.withinss
})
# Plot the total within-cluster sum of squares to determine the optimal k
plot(1:10, wss, type = "b", pch = 19, frame = FALSE,
xlab = "Number of Clusters", ylab = "Total Within-cluster SS",
main = "Elbow Method for Optimal K")