SAAIRAM ASSIGNMENT

## #LOADING REQUIRED LIBRARIES
library(imager)

## Warning: package 'imager' was built under R version 4.3.3

## Loading required package: magrittr

## 
## Attaching package: 'imager'

## The following object is masked from 'package:magrittr':
## 
##     add

## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum

## The following object is masked from 'package:graphics':
## 
##     frame

## The following object is masked from 'package:base':
## 
##     save.image

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:imager':
## 
##     where

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ imager::add()       masks magrittr::add()
## ✖ stringr::boundary() masks imager::boundary()
## ✖ tidyr::extract()    masks magrittr::extract()
## ✖ tidyr::fill()       masks imager::fill()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ purrr::set_names()  masks magrittr::set_names()
## ✖ dplyr::where()      masks imager::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

install.packages("BiocManager")

## 
## The downloaded binary packages are in
##  /var/folders/92/s_f3d4816pz3mzm229sxs9k80000gn/T//RtmpeJdrAE/downloaded_packages

BiocManager::install("EBImage")

## 'getOption("repos")' replaces Bioconductor standard repositories, see
## 'help("repositories", package = "BiocManager")' for details.
## Replacement repositories:
##     CRAN: https://cloud.r-project.org/
## Bioconductor version 3.18 (BiocManager 1.30.25), R 4.3.1 (2023-06-16)

## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'EBImage'

## Old packages: 'uwot'

#LOADING THE DATASET

# Set the path to your dataset
image_path <- "/Users/saairam/Downloads/project"

# Get a list of all categories (folders)
categories <- list.dirs(image_path, recursive = FALSE)
labels <- basename(categories)  # Use folder names as labels

# Function to resize images
resize_image <- function(img_path, img_size = 32) {
  img <- load.image(img_path)  # Load the image
  img <- resize(img, img_size, img_size)  # Resize to specified dimensions
  as.numeric(img)  # Convert to a numeric vector
}

# Function to load all images and create a dataset
load_images <- function(image_path, categories) {
  image_data <- data.frame()  # Initialize an empty data frame
  
  for (category in categories) {
    label <- basename(category)  # Get the label from the folder name
    image_files <- list.files(category, full.names = TRUE)  # List all image files in the category
    
    for (image_file in image_files) {
      img_vector <- resize_image(image_file)  # Resize the image and convert to a vector
      image_data <- rbind(image_data, data.frame(label = label, img_vector = I(list(img_vector))))  # Add to data frame
    }
  }
  
  return(image_data)  # Return the populated data frame
}

# Load the images into a data frame
image_data <- load_images(image_path, categories)

# Check the dimensions of the image data
dim(image_data)  # Should show the number of loaded images and columns

## [1] 299   2

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

#UNDERSTANDING OUR DATASET

table(image_data$label)

## 
##    ani fruits plants 
##     82    115    102

# Check the structure of the data
str(image_data)

## 'data.frame':    299 obs. of  2 variables:
##  $ label     : chr  "ani" "ani" "ani" "ani" ...
##  $ img_vector:List of 299
##   ..$ : num  0.624 0.667 0.482 0.471 0.549 ...
##   ..$ : num  0.51 0.494 0.525 0.553 0.553 ...
##   ..$ : num  0.475 0.157 0.176 0.141 0.141 ...
##   ..$ : num  0.651 0.494 0.624 0.663 0.698 ...
##   ..$ : num  0.165 0.145 0.169 0.247 0.2 ...
##   ..$ : num  0.282 0.247 0.161 0.212 0.196 ...
##   ..$ : num  0.5294 0.4471 0.651 0.6431 0.0392 ...
##   ..$ : num  0.325 0.212 0.706 0.718 0.647 ...
##   ..$ : num  0.0784 0.0902 0.0863 0.0392 0.1765 ...
##   ..$ : num  0.596 0.753 0.765 0.757 0.867 ...
##   ..$ : num  0.592 0.624 0.655 0.702 0.706 ...
##   ..$ : num  0.369 0.275 0.267 0.208 0.255 ...
##   ..$ : num  0.227 0.247 0.204 0.165 0.141 ...
##   ..$ : num  0.482 0.506 0.518 0.482 0.467 ...
##   ..$ : num  0.643 0.71 0.565 0.467 0.537 ...
##   ..$ : num  0.118 0.102 0.137 0.137 0.129 ...
##   ..$ : num  0.667 0.667 0.667 0.667 0.667 ...
##   ..$ : num  0.667 0.675 0.631 0.616 0.604 ...
##   ..$ : num  0.318 0.396 0.204 0.165 0.294 ...
##   ..$ : num  0.157 0.18 0.18 0.18 0.18 ...
##   ..$ : num  0.639 0.631 0.62 0.624 0.596 ...
##   ..$ : num  0.188 0.208 0.173 0.329 0.412 ...
##   ..$ : num  0.529 0.71 0.682 0.624 0.804 ...
##   ..$ : num  0.604 0.6 0.596 0.592 0.592 ...
##   ..$ : num  0.192 0.361 0.478 0.333 0.278 ...
##   ..$ : num  0.263 0.435 0.494 0.51 0.49 ...
##   ..$ : num  0.098 0.0235 0.3412 0.3608 0.1686 ...
##   ..$ : num  0.973 0.973 0.973 0.973 0.973 ...
##   ..$ : num  0.314 0.314 0.137 0.204 0.137 ...
##   ..$ : num  0.0353 0.0588 0.1059 0.0431 0.0588 ...
##   ..$ : num  0.451 0.486 0.616 0.482 0.604 ...
##   ..$ : num  0.0941 0.1098 0.1098 0.0745 0.051 ...
##   ..$ : num  0.0902 0.1098 0.1804 0.1098 0.1216 ...
##   ..$ : num  0.6353 0 0.0784 0.2588 0.349 ...
##   ..$ : num  0.0471 0.0667 0.1255 0.3098 0.4392 ...
##   ..$ : num  0.149 0.1804 0.1412 0.0824 0.1294 ...
##   ..$ : num  0.878 0.969 0.906 0.906 0.89 ...
##   ..$ : num  0.675 0.608 0.533 0.529 0.596 ...
##   ..$ : num  0.173 0.169 0.176 0.373 0.506 ...
##   ..$ : num  0.761 0.71 0.643 0.659 0.659 ...
##   ..$ : num  0.314 0.631 0.792 0.616 0.447 ...
##   ..$ : num  0.384 0.384 0.243 0.173 0.259 ...
##   ..$ : num  0.584 0.588 0.573 0.62 0.651 ...
##   ..$ : num  0.3333 0.1725 0.1882 0.1686 0.0863 ...
##   ..$ : num  0.118 0.102 0.188 0.165 0.196 ...
##   ..$ : num  0.18 0.212 0.247 0.255 0.267 ...
##   ..$ : num  0.1882 0.2392 0.302 0.2353 0.0863 ...
##   ..$ : num  0.00392 0.01961 0.01961 0.03137 0.03529 ...
##   ..$ : num  0.612 0.62 0.627 0.639 0.663 ...
##   ..$ : num  0.161 0.38 0.573 0.718 0.588 ...
##   ..$ : num  0 0.0314 0.0118 0 0 ...
##   ..$ : num  0.129 0.133 0.137 0.149 0.157 ...
##   ..$ : num  0.522 0.467 0.478 0.514 0.467 ...
##   ..$ : num  0.557 0.576 0.443 0.271 0.337 ...
##   ..$ : num  0 0 0.557 0.325 0.624 ...
##   ..$ : num  0.114 0.173 0.439 0.42 0.337 ...
##   ..$ : num  0.914 0.906 0.925 0.933 0.945 ...
##   ..$ : num  0.694 0.678 0.659 0.631 0.596 ...
##   ..$ : num  0.584 0.584 0.592 0.6 0.6 ...
##   ..$ : num  0.439 0.243 0.2 0.384 0.604 ...
##   ..$ : num  0.769 0.71 0.725 0.741 0.757 ...
##   ..$ : num  0.255 0.243 0.239 0.239 0.251 ...
##   ..$ : num  0.514 0.471 0.502 0.498 0.506 ...
##   ..$ : num  0.831 0.808 0.82 0.831 0.827 ...
##   ..$ : num  0.447 0.459 0.451 0.482 0.494 ...
##   ..$ : num  0.624 0.647 0.659 0.643 0.627 ...
##   ..$ : num  0.31 0.302 0.302 0.31 0.298 ...
##   ..$ : num  0.769 0.749 0.757 0.753 0.725 ...
##   ..$ : num  0.361 0.396 0.196 0.22 0.184 ...
##   ..$ : num  0.745 0.757 0.788 0.788 0.816 ...
##   ..$ : num  0.345 0.333 0.357 0.31 0.314 ...
##   ..$ : num  0.0471 0.1333 0.0863 0.1137 0 ...
##   ..$ : num  0.698 0.424 0.345 0.404 0.369 ...
##   ..$ : num  0.553 0.51 0.541 0.647 0.518 ...
##   ..$ : num  0.0902 0.2118 0.2706 0.1294 0.0353 ...
##   ..$ : num  0.416 0.439 0.494 0.506 0.455 ...
##   ..$ : num  0.224 0.22 0.267 0.31 0.333 ...
##   ..$ : num  0.475 0.467 0.439 0.286 0.529 ...
##   ..$ : num  0 0 0 0 0.267 ...
##   ..$ : num  0.651 0.706 0.698 0.647 0.537 ...
##   ..$ : num  0.322 0.369 0.384 0.392 0.42 ...
##   ..$ : num  0.596 0.608 0.608 0.616 0.62 ...
##   ..$ : num  1 1 1 1 1 ...
##   ..$ : num  1 1 1 1 1 1 1 1 1 1 ...
##   ..$ : num  1 1 1 1 1 1 1 1 1 1 ...
##   ..$ : num  0.992 0.988 0.973 0.996 0.922 ...
##   ..$ : num  1 1 1 1 1 1 1 1 1 1 ...
##   ..$ : num  0.992 1 0.996 0.996 0.992 ...
##   ..$ : num  0.282 0.325 0.224 0.953 1 ...
##   ..$ : num  1 1 1 1 1 1 1 1 1 1 ...
##   ..$ : num  0.01176 0.00784 0.00784 0.00784 0.01176 ...
##   ..$ : num  0.145 0.18 0.169 0.137 0.408 ...
##   ..$ : num  1 1 1 1 1 1 1 1 1 1 ...
##   ..$ : num  0.353 0.349 0.353 0.38 0.392 ...
##   ..$ : num  0.988 0.988 0.988 0.988 0.988 ...
##   ..$ : num  1 1 1 1 1 1 1 1 1 1 ...
##   ..$ : num  1 1 1 1 1 ...
##   ..$ : num  0.851 0.855 0.863 0.859 0.859 ...
##   ..$ : num  0.796 0.78 0.737 0.329 0.286 ...
##   .. [list output truncated]
##   ..- attr(*, "class")= chr "AsIs"

# Display the names of the columns
colnames(image_data)

## [1] "label"      "img_vector"

# Print the first few rows of the dataset
head(image_data)

#LOGISTIC REGRESSION

# Load necessary libraries
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

install.packages("nnet")

## 
## The downloaded binary packages are in
##  /var/folders/92/s_f3d4816pz3mzm229sxs9k80000gn/T//RtmpeJdrAE/downloaded_packages

library(nnet)

# Flatten the img_vector column into multiple pixel columns
img_matrix <- do.call(rbind, lapply(image_data$img_vector, as.vector))

# Create a new data frame with flattened image data
image_data_flat <- data.frame(label = image_data$label, img_matrix)

# Check the new dimensions
print(dim(image_data_flat))  # Should show the number of loaded images and the number of pixel columns

## [1]  299 3073

# Convert labels to a factor if they are not already
image_data_flat$label <- as.factor(image_data_flat$label)

# Normalize the features
image_data_scaled <- scale(image_data_flat[, -which(names(image_data_flat) == "label")])

# Perform PCA
pca_result <- prcomp(image_data_scaled, center = TRUE, scale. = TRUE)

# Decide the number of components to keep (e.g., 95% variance)
explained_variance <- summary(pca_result)$importance[3,]
num_components <- min(which(cumsum(explained_variance) >= 0.95))

# Create a new dataset with the PCA components
image_data_pca <- data.frame(pca_result$x[, 1:num_components])
image_data_pca$label <- image_data_flat$label

# Split the data into training and testing sets
set.seed(123)  # For reproducibility
train_index <- createDataPartition(image_data_pca$label, p = 0.7, list = FALSE)
train_data <- image_data_pca[train_index, ]
test_data <- image_data_pca[-train_index, ]

# Fit the logistic regression model on training data
logistic_model <- multinom(label ~ ., data = train_data)  # Using multinom for logistic regression in multi-class case

## # weights:  15 (8 variable)
## initial  value 231.807193 
## iter  10 value 171.384719
## final  value 170.351731 
## converged

# Check the summary of the model
summary(logistic_model)

## Call:
## multinom(formula = label ~ ., data = train_data)
## 
## Coefficients:
##        (Intercept)         PC1         PC2         PC3
## fruits  0.03502584  0.03595141 -0.06328572 -0.03084746
## plants -0.22233688 -0.01989788  0.06386126 -0.03402685
## 
## Std. Errors:
##        (Intercept)         PC1        PC2        PC3
## fruits   0.2117822 0.007271005 0.01818642 0.01502432
## plants   0.2385523 0.008490523 0.01881783 0.01741396
## 
## Residual Deviance: 340.7035 
## AIC: 356.7035

# Predict the probabilities for the test dataset
predicted_probs <- predict(logistic_model, newdata = test_data, type = "prob")

# Predict the class labels for the test dataset
predicted_classes <- predict(logistic_model, newdata = test_data)

# View the predicted probabilities and class labels
head(predicted_probs)

##          ani    fruits     plants
## 1  0.2077034 0.2601034 0.53219323
## 2  0.4155087 0.2920042 0.29248707
## 3  0.4195379 0.3261922 0.25426986
## 4  0.3640130 0.5448666 0.09112042
## 11 0.2057989 0.6897261 0.10447499
## 19 0.4081748 0.1428221 0.44900311

head(predicted_classes)

## [1] plants ani    ani    fruits fruits plants
## Levels: ani fruits plants

# Create a confusion matrix for the test set
confusion_matrix <- table(Actual = test_data$label, Predicted = predicted_classes)

# View the confusion matrix
print(confusion_matrix)

##         Predicted
## Actual   ani fruits plants
##   ani      5      9     10
##   fruits   6     23      5
##   plants   8      3     19

# Calculate accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Test Accuracy:", accuracy))

## [1] "Test Accuracy: 0.534090909090909"

#CLUSTERING

##Is the data clusterable

# Install and load the 'clustertend' package if you haven't already
install.packages("clustertend")

## 
## The downloaded binary packages are in
##  /var/folders/92/s_f3d4816pz3mzm229sxs9k80000gn/T//RtmpeJdrAE/downloaded_packages

library(clustertend)

## Package `clustertend` is deprecated.  Use package `hopkins` instead.

# Compute Hopkins statistic
set.seed(123)  # Set a seed for reproducibility
hopkins_stat <- hopkins(image_data_pca[, -which(names(image_data_pca) == "label")], n = nrow(image_data_pca) - 1)

## Warning: Package `clustertend` is deprecated.  Use package `hopkins` instead.

print(paste("Hopkins Statistic: ", hopkins_stat))

## [1] "Hopkins Statistic:  0.285333384588006"

##HEIRARCHIAL CLUSTERING WITH DENDROGRAM:
# Compute the distance matrix
dist_matrix <- dist(image_data_pca[, -which(names(image_data_pca) == "label")])

# Perform hierarchical clustering
hclust_result <- hclust(dist_matrix)

# Plot the dendrogram
plot(hclust_result, labels = FALSE, main = "Hierarchical Clustering Dendrogram")

# Cut the dendrogram to create clusters
image_data_pca$hclust_cluster <- cutree(hclust_result, k = 3)  # Use the same number of clusters as above

# View the first few rows with cluster assignments
head(image_data_pca)

# Set the number of clusters (k)
set.seed(123)  # For reproducibility
k <- 4  # You can also specify a different number of clusters

str(image_data_pca)# Assuming kmeans has been applied, e.g., using kmeans_result$cluster

## 'data.frame':    299 obs. of  5 variables:
##  $ PC1           : num  -8.37 -19.32 -6.19 -5.18 -44.64 ...
##  $ PC2           : num  3.406 -6.37 -2.475 -14.046 0.155 ...
##  $ PC3           : num  -22.9 3.13 7.16 10.84 6.27 ...
##  $ label         : Factor w/ 3 levels "ani","fruits",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ hclust_cluster: int  1 2 2 2 2 2 1 2 2 1 ...

kmeans_result <- kmeans(image_data_pca[,1:2], centers = 3) # Example with 3 clusters
image_data_pca$cluster <- as.factor(kmeans_result$cluster)  # Add cluster as a factor

# View the first few rows with cluster assignments
head(image_data_pca)

# Optional: Visualize the clusters in a 2D plot (using the first two PCA components)
library(ggplot2)

ggplot(image_data_pca, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.6) +
  labs(title = "K-means Clustering of Image Data",
       x = "Principal Component 1",
       y = "Principal Component 2") +
  theme_minimal()

# Check the structure of the dataset

#NEURAL NETWORKS

# Load necessary libraries
library(nnet)
library(caret)

# Set seed for reproducibility
set.seed(123)

# Function to fit a neural network model and return the accuracy
fit_nn_model <- function(hidden_neurons, train_data) {
  # Fit the neural network model
  nn_model <- nnet(label ~ ., data = train_data, size = hidden_neurons, maxit = 100, trace = FALSE)
  
  # Predict the class labels
  predicted_classes <- predict(nn_model, train_data, type = "class")
  
  # Create a confusion matrix
  confusion_matrix <- table(Actual = train_data$label, Predicted = predicted_classes)
  
  # Calculate accuracy
  accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
  
  return(accuracy)
}


# Fit models with different hidden layer sizes
neurons_list <- c(5, 10, 20)
accuracy_results <- data.frame(Hidden_Neurons = neurons_list, Accuracy = NA)

for (neurons in neurons_list) {
  accuracy <- fit_nn_model(neurons, image_data_pca)
  accuracy_results[accuracy_results$Hidden_Neurons == neurons, "Accuracy"] <- accuracy
}

# Print the results
print(accuracy_results)

##   Hidden_Neurons  Accuracy
## 1              5 0.7157191
## 2             10 0.7491639
## 3             20 0.8193980

# Printing the neurons list
print(neurons_list)

## [1]  5 10 20

SAAIRAM ASSIGNMENT

2024-10-05

#LOADING THE DATASET

#UNDERSTANDING OUR DATASET

#LOGISTIC REGRESSION

#CLUSTERING

##Is the data clusterable

#NEURAL NETWORKS