Prep Environment Package and Library Load

#install.packages("imager")
#install.packages("BiocManager") 
#BiocManager::install("EBImage")
#install.packages("ggplot2")


# Load the libraries
library(imager)
library(jpeg)
library(EBImage)
library(ggplot2)


Read the image files

# Specifies the file path to the directory where JPG images are stored.
folder_path <- "/Users/jdr2158/jpg"

# Searches the specified folder for files ending in '.jpg',
jpg_files <- list.files(path = folder_path, pattern = "\\.jpg$", full.names = TRUE)

# Creates an empty list to hold image objects once they are loaded.
image_list <- list()

# Iterates over each file path in the 'jpg_files' array, loading each image into memory one at a time.
for (i in 1:length(jpg_files)) {
  # Utilizes the 'load.image' function to read the current image file, converting it into an R image object.
  img <- load.image(jpg_files[i])
  
  # Inserts the newly loaded image object into the 'image_list', 
  # Preserves the order of images as found in the source folder.
  image_list[[i]] <- img
}

# Displays the first image from the 'image_list' on the plotting area,
plot(image_list[[1]])


Load images into array

# Defines a function to plot a 3D array as an image, utilizing the EBImage package for image manipulation.
plot_jpg_array <- function(img_array) {
  # Converts the input array into an EBImage object, specifying 'Color' as the color mode.
  img_eb <- EBImage::Image(data = img_array, colormode = 'Color')
  
  # Rotates the EBImage object by 90 degrees to ensure the image is oriented correctly for display.
  img_rotated <- EBImage::rotate(img_eb, 90)
  
  # Displays the rotated image on the screen, allowing for visual inspection or analysis.
  EBImage::display(img_rotated)
}

# Initializes parameters for resizing images, setting specific height, width, and scale values.
height <- 1200
width <- 2500
scale <- 20

# Initializes a 4D array with dimensions tailored to store resized images, 
# Based on the specified scale and the number of images.
im <- array(rep(0, length(image_list) * (height / scale) * (width / scale) * 3),
            dim = c(length(image_list), height / scale, width / scale, 3))

# Iterates through each image in the 'image_list', 
# Resizes each image and storing it within the pre-initialized 4D array.
for (i in 1:length(image_list)) {
  # Retrieves the file path for the current image from 'jpg_files' and 
  # reads the image using the 'readJPEG' function.
  tmp <- jpg_files[i]
  img <- readJPEG(tmp)
  
  # Resizes the current image to match the specified dimensions and stores the resized image in the 4D array.
  temp <- EBImage::resize(as.Image(img), height / scale, width / scale)
  im[i, , , ] <- array(temp@.Data, dim = c(1, height / scale, width / scale, 3))
}


Plot the shoes

# Configures plotting parameters to arrange the output in a 3x3 grid, 
# Allows for multiple images to be displayed simultaneously.
par(mfrow=c(3,3))

# Sets the margins around each plot in the grid to ensure each image has adequate spacing and is clearly visible.
par(mai=c(0.3,0.3,0.3,0.3))

# Iterates through the images, up to a maximum of 9 (the capacity of the 3x3 grid), 
# Calls the 'plot_jpg_array' function for each to display
for (i in 1:min(length(image_list), dim(im)[1])) {
  plot_jpg_array(im[i, , , ])
}


Perform Principle Components Analysis

# Sets the parameters for image dimensions and scaling factor for the analysis.
height = 1200
width = 2500
scale = 20

# Prepares data for PCA by assigning the 4D array of resized images to 'newdata' and reshapes to a 2D matrix.
newdata = im
dim(newdata) = c(length(jpg_files), height * width * 3 / scale^2)

# Executes the PCA on the transposed matrix of 'newdata', 
# Calculates scores for each principal component and standardizing variables.
mypca = princomp(t(as.matrix(newdata)), scores = TRUE, cor = TRUE)

# Calculates and verifies that the sum of the squared standard deviations (variances) of the principal components 
# equals 1.
sum(mypca$sdev^2 / sum(mypca$sdev^2))
## [1] 1


Results

The first 2 components nearly make up 80% of the variability. The first 5 components make up almost 90% of the variability.

# Calculates the proportion of variance explained by each principal component relative to the total variance.
# This step quantifies the importance of each component in capturing the variability in the dataset.
mycomponents = mypca$sdev^2 / sum(mypca$sdev^2)

# Summarizes the cumulative variance explained by the first two principal components;
# The first two components make up 80% of the variability
sum(mycomponents[1:2])
## [1] 0.7940449
# First 5 components account for 90% of the variance.
sum(mycomponents[1:5])
## [1] 0.8913841


Scatterplot:Distribution of Images in Principal Component Space

# Create a data frame from the PCA scores for the first two components
pca_scores_df <- data.frame(PC1 = mypca$scores[, 1], PC2 = mypca$scores[, 2])

# Generate a scatter plot of the first two principal components
ggplot(pca_scores_df, aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.5) + # Use semi-transparent points for better visibility of data density
  theme_minimal() + # Apply a minimal theme for a clean and clear visualization
  labs(title = "Distribution of Images in Principal Component Space",
       subtitle = "Visualizing the spread and clustering of images based on the first two principal components",
       x = "Principal Component 1 (PC1)",
       y = "Principal Component 2 (PC2)",
       caption = "Each point represents an image projected onto the first two principal components.") +
  geom_hline(yintercept = 0, linetype = "dashed") + # Add a horizontal dashed line at y=0 for reference
  geom_vline(xintercept = 0, linetype = "dashed") # Add a vertical dashed line at x=0 for reference


Scree Plot: Variance Explained by Each Principal Component

# Generate a Scree Plot to visualize the proportion of variance explained by each principal component
plot(mypca$sdev^2 / sum(mypca$sdev^2), 
     xlab = "Principal Component", 
     ylab = "Proportion of Variance Explained", 
     type = 'b', # Use both points and lines
     main = "Scree Plot: Variance Explained by Each Principal Component",
     sub = "Identifies the optimal number of components to retain")

# Add a horizontal line at y=0.01 to highlight smaller contributions of variance
abline(h = 0.01, col = "red", lty = 2) # Dashed red line as a threshold indicator


Cumulative Plot: Cumulative Variance Explained by PCA Components

# Calculate the cumulative variance explained by the principal components
cum_var_explained <- cumsum(mypca$sdev^2 / sum(mypca$sdev^2))

# Generate a plot to visualize the cumulative proportion of variance explained by the principal components
plot(cum_var_explained, 
     xlab = "Number of Principal Components", 
     ylab = "Cumulative Proportion of Variance Explained", 
     type = 'b', # Use both points and lines for visualization
     main = "Cumulative Variance Explained by PCA Components",
     sub = "Assessing the amount of information retained across components")


Heatmap of PCA Loadings: Variable Contributions to Components

# Generate a heatmap to visualize the loadings of each principal component
heatmap(t(mypca$loadings), 
        Rowv = NA, 
        Colv = NA, 
        scale = "column", 
        main = "PCA Loadings: Variable Contributions to Components",
        sub = "The influence of original variables on principal components")


Loadings for 3 Components with 80% Variance

# Calculates the proportion of variance explained by each principal component 
# by dividing the square of each singular value (sdev^2) by the sum of all squared singular values. 
# This step quantifies how much of the total dataset variability each principal component captures.
var_explained <- mypca$sdev^2 / sum(mypca$sdev^2)

# Calculates the cumulative variance explained by summing the variance explained by each component up to that point. 
# This cumulative measure helps to understand how much of the total dataset variability 
# is captured as more components are considered.
cum_var_explained <- cumsum(var_explained)

# Identifies the minimum number of principal components required to reach or exceed 80% of the total variance. 
# Used for dimensionality reduction, aiming to retain most of the information while using fewer components.
num_components_80pct <- which(cum_var_explained >= 0.8)[1]

# Extracts the loadings for the principal components that contribute to at least 80% of the total variance. 
# Loadings indicate the contribution of each original variable (e.g., pixel intensity in image analysis) to the principal component
loadings_80pct <- mypca$loadings[, 1:num_components_80pct]

# Prints the loadings for the principal components that together account for almost 80% variability. 
# Visualizes the variables' contributions to the identified components.
print(loadings_80pct)
##          Comp.1      Comp.2      Comp.3
##  [1,] 0.2515577  0.05962807  0.14114605
##  [2,] 0.2564669 -0.22970932  0.09482706
##  [3,] 0.1974907  0.34526438  0.24576573
##  [4,] 0.2391458 -0.30516320  0.13606194
##  [5,] 0.2525203 -0.23895414  0.06096558
##  [6,] 0.2096918  0.34776361  0.42324640
##  [7,] 0.2220439  0.32176935  0.36923615
##  [8,] 0.2597468 -0.13861061  0.27362524
##  [9,] 0.2242754 -0.39008169  0.17677165
## [10,] 0.2523894 -0.26939880 -0.02645111
## [11,] 0.2504276 -0.23813195 -0.14578328
## [12,] 0.2541524  0.16064493 -0.16973475
## [13,] 0.2374627  0.25443032 -0.18739393
## [14,] 0.2431988  0.17131145 -0.34992145
## [15,] 0.2531910  0.06188346 -0.32463869
## [16,] 0.2571186  0.11980858 -0.24383184
## [17,] 0.2513643  0.10507094 -0.30609685


Eigen Shoes

# Transposes the PCA scores matrix to align with the original image data structure, 
mypca2 = t(mypca$scores)

# Reshapes the transposed PCA scores to match the original dimensions of the images, adjusted by the scale factor.
dim(mypca2) = c(length(jpg_files), height / scale, width / scale, 3)

# Prepares the plotting area to display the first 17 principal components as images in a 5x5 grid, leaving some grid spaces empty
par(mfrow = c(5, 5))
par(mai = c(0.001, 0.001, 0.001, 0.001)) # Sets the margins around each plot to minimal values for a compact display.

# Iterates through the first 17 principal components to plot them as images. 
# This loop visualizes the components, revealing the patterns or features that each component captures.
for (i in 1:17) {
  
  # Rescales each principal component's image data to a [0, 1] range
  pc_image = mypca2[i, , , ]
  min_val = min(pc_image)
  max_val = max(pc_image)
  rescaled_pc_image = (pc_image - min_val) / (max_val - min_val)
  
  # Plots the rescaled principal component as an image, using a custom plotting function designed for this purpose. 
  # This visualization allows for the examination of what image features or patterns are most prominent.
  plot_jpg_array(rescaled_pc_image)
}