Getting started: Load the libraries

# Set working directory 
setwd("/Users/heleinefouda/Desktop/jpg")
image_directory <- "/Users/heleinefouda/Desktop/jpg"

Import the image files et define parameters

path_files <-"/Users/heleinefouda/Desktop/jpg"

# List JPEG files in the directory:
jpeg_files <- list.files(image_directory, pattern = "\\.jpg$", full.names = TRUE)

# Define the paramaters: width, height and scale
 height <- 1200
 scale <- 20
 width <- 2500
 # Set the scaling factor
scale <- 20

# Calculate new dimensions after scaling
new_height <- height / scale
new_width <- width / scale

Define the functions to be used

# Function to ...
height=1200; width=2500;scale=20
plot_jpeg = function(path, add=FALSE)
{ jpg = readJPEG(path, native=T) # read the file
  res = dim(jpg)[2:1] # get the resolution, [x, y]
  if (!add) # initialize an empty plot area if add==FALSE
    plot(1,1,xlim=c(1,res[1]),ylim=c(1,res[2]),asp=1,type='n',xaxs='i',yaxs='i',xaxt='n',yaxt='n',xlab='',ylab='',bty='n')
  rasterImage(jpg,1,1,res[1],res[2])
}

Read images into a list:

# Read images into a list:
image_list <- lapply(jpeg_files, jpeg::readJPEG)

Check number of files

# number of files
num <- length(jpeg_files )
num
## [1] 17

Read and plot the third shoe first, as a test.

# Read third shoe
img <- jpeg::readJPEG(paste0( path_files,"/RC_2500x1200_2014_us_53469.jpg"))
dim(img)
## [1] 1200 2500    3

Find the dimension of the matrix

filenames <- list.files(path = path_files, pattern = ".jpg")
# matrix where all the files will be stored
images_data <- matrix(0, length(filenames), prod(dim(img)))
# Show the dimensions of the matrix
dim(images_data)
## [1]      17 9000000

code gives us a 17 * 9MM matrix, which is equals to 17 shoes, each having dimensions 1200 * 2500 * 3

# Save the row,col,channels dimension
row_dim <- dim(img)[1]
col_dim <- dim(img)[2]
channel_dim <- dim(img)[3]

Read in all the images contained in the files

# Combine images into a single matrix
image_matrix <- abind(image_list, along = 3)

Load all images into a single vector

# Initialize images_data before the loop
images_data <- matrix(0, nrow = length(filenames), ncol = 2500 * 1200 * 3)  # Replace width and height

counter <- 1
for (filename in filenames) {
  print(paste("loading file: ", filename))
  
  # Read the image
  img <- jpeg::readJPEG(file.path("/Users/heleinefouda/Desktop/jpg", filename))
  
  # Extract red, green, and blue channels
  red <- as.vector(img[, , 1])
  green <- as.vector(img[, , 2])
  blue <- as.vector(img[, , 3])
  
  # Concatenate channels into a single vector and store in images_data
  images_data[counter,] <- c(red, green, blue)
  
  counter <- counter + 1
}
## [1] "loading file:  RC_2500x1200_2014_us_53446.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53455.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53469.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53626.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53632.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53649.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53655.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53663.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53697.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54018.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54067.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54106.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54130.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54148.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54157.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54165.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54172.jpg"

Plot third image(test)

library(imager)

# Plot third shoe


# Convert image to imager format
img_imager <- imager::as.cimg(img)
## Warning in as.cimg.array(img): Assuming third dimension corresponds to colour
# Plot the image using base R plotting functions
plot(as.raster(img_imager), main = "RC_2500x1200_2014_us_53469.jpg")

# Plot the original and transposed images side by side
par(mfrow = c(1, 2))
plot(as.raster(img), main = "Original Image")

Applying transposition

images_data  <- t(images_data )
dim(images_data )
## [1] 9000000      17

The transposed version gives us a 9 MM * 17

Load the Data into an Array and Vectorize

 # Create an array to store resized images
array_a <- array(rep(0, num * new_height * new_width * 3), dim = c(num, new_height, new_width, 3))

# Resize and store each image in the array
for (i in 1:num) {
    temp <- resizeImage(readJPEG(jpeg_files[i]), new_height, new_width)
    array_a[i, , , ] <- array(temp, dim = c(1, new_height, new_width, 3))
}

Flatten the array and store pixel values in a data frame

 # Flatten the array and store pixel values in a data frame
flat <- matrix(0, num, prod(dim(array_a)))

for (i in 1:num) {
    r <- as.vector(array_a[i, , , 1])
    g <- as.vector(array_a[i, , , 2])
    b <- as.vector(array_a[i, , , 3])
    flat[i, ] <- t(c(r, g, b))
}

# Convert the flattened matrix to a data frame
shoes <- as.data.frame(t(flat))

Create a Function to Plot the Images

# Define a function to plot JPEG images
plot_jpeg <- function(path, add = FALSE) {
    jpg <- readJPEG(path, native = TRUE)
    res <- dim(jpg)[2:1]
    if (!add) {
        plot(1, 1, xlim = c(1, res[1]), ylim = c(1, res[2]), asp = 1, type = "n",
            xaxs = "i", yaxs = "i", xaxt = "n", yaxt = "n", xlab = "", ylab = "",
            bty = "n")
    }
    rasterImage(jpg, 1, 1, res[1], res[2])
}

# Plot resized images
par(mfrow = c(3, 3))
par(mai = c(0.3, 0.3, 0.3, 0.3))
for (i in 1:num) {
    plot_jpeg(writeJPEG(array_a[i, , , ]))
}

Standardize the Pixel Values

# Standardize the pixel values
scaled <- scale(shoes, center = TRUE, scale = TRUE)

# Extract mean and standard deviation of standardized pixel values
mean.shoe <- attr(scaled, "scaled:center")
std.shoe <- attr(scaled, "scaled:scale")

Calculate the Correlation Matrix

# Calculate the correlation matrix
Sigma <- cor(scaled)
sigma
## function (object, ...) 
## UseMethod("sigma")
## <bytecode: 0x10f268390>
## <environment: namespace:stats>

# Compute eigenvalues of the correlation matrix

# Compute eigenvalues of the correlation matrix
myeigen <- eigen(Sigma)
eigenvalues <- myeigen$values
eigenvalues
##  [1] 11.61745316  1.70394885  0.87429472  0.47497591  0.33549455  0.28761630
##  [7]  0.24989310  0.21531927  0.17872717  0.16921863  0.15382298  0.14492412
## [13]  0.14248337  0.12123246  0.11947244  0.11496463  0.09615834

Extract eigenvectors of the correlation matrix

# Extract eigenvectors of the correlation matrix
eigenvectors <- myeigen$vectors

# Display the first few rows of eigenvectors in a nice table format
knitr::kable(head(eigenvectors), format = "markdown")
-0.2525635 -0.0575731 -0.1386619 0.3612579 0.3367958 0.0771434 0.5101649 0.4590323 0.1004874 -0.0066637 0.0399923 0.2091592 0.0802011 -0.0279929 0.1473921 -0.3173218 0.0876285
-0.2568621 0.2285982 -0.0981094 0.2282754 -0.0291860 0.1245893 0.1998114 0.1230211 -0.2442517 0.0694919 -0.0229789 -0.2066317 0.2766962 0.0141970 -0.1019089 0.7341851 -0.1212779
-0.1969535 -0.3465766 -0.2315417 0.6676894 -0.1570505 0.2328825 -0.4447300 -0.2179298 0.0839843 -0.0133999 -0.0233552 -0.0477726 -0.0002179 0.0037787 -0.0300553 -0.0703296 0.0396419
-0.2402368 0.3049035 -0.1377157 -0.1120160 -0.3214305 0.3240779 0.0642133 -0.1110570 -0.0661180 -0.0363408 0.3765760 0.4259917 -0.2934239 -0.2116585 -0.1245587 0.0376213 0.3408931
-0.2538292 0.2390470 -0.0601217 -0.0198136 0.3055220 -0.1282683 -0.2885175 0.0737153 0.1689848 0.2884454 0.2680998 -0.1259732 -0.3649987 0.4875641 0.2938786 0.1177442 0.0726146
-0.2082231 -0.3477848 -0.4348510 -0.3317499 0.0020995 -0.1940209 0.0210512 0.0688897 0.1548276 -0.2519525 -0.2612493 -0.1494615 -0.0409279 -0.0791256 0.1059181 0.2341891 0.4915866

Processing matrix with a singular Value Decomposition (SVD)

scaled_data <- scale(images_data)
scaled_data [is.nan(scaled_data)] = 0
svd_decomp <- svd(scaled_data)

Computing variance

# Calculate cumulative variance explained by each eigenshoe
cumulative_variance <- cumsum(myeigen$values) / sum(myeigen$values)
cumulative_variance
##  [1] 0.6833796 0.7836119 0.8350410 0.8629807 0.8827157 0.8996343 0.9143339
##  [8] 0.9269998 0.9375131 0.9474672 0.9565156 0.9650405 0.9734219 0.9805532
## [15] 0.9875810 0.9943436 1.0000000

Plotting variance

plot(svd_decomp$d^2 / sum(svd_decomp$d^2), type = "b", ylab = "Prop. of Variance Explained", pch = 19)

# Read and plot a sample JPEG image

testing_img <- readJPEG(jpeg_files[4])
plot(1:2, type = "n", main = "")
rasterImage(testing_img, 1, 1, 2, 2)

Determine the number of principal components explaining at least 80% of the variance

# Determine the number of principal components explaining at least 80% of the variance
num_comp <- which(cumulative_variance >= 0.8)[1]
num_comp
## [1] 3

Plot cumulative variance explained by eigenshoes

# Calculate cumulative variance explained by each eigenshoe
cumulative_variance <- cumsum(myeigen$values) / sum(myeigen$values)
cumulative_variance
##  [1] 0.6833796 0.7836119 0.8350410 0.8629807 0.8827157 0.8996343 0.9143339
##  [8] 0.9269998 0.9375131 0.9474672 0.9565156 0.9650405 0.9734219 0.9805532
## [15] 0.9875810 0.9943436 1.0000000
# Plot cumulative variance explained by eigenshoes
ggplot(as.data.frame(cumulative_variance), aes(x = 1:num, cumulative_variance)) +
    geom_line() + geom_point() + labs(x = "Number of eigenshoes", y = "Cumulative Variance") +
    scale_x_continuous(breaks = seq(1, 17, by = 2)) + theme_minimal()

## Determine the number of principal components explaining at least 80% of the variance

# Determine the number of principal components explaining at least 80% of the variance
num_comp <- which(cumulative_variance >= 0.8)[1]
num_comp
## [1] 3

Compare different levels of variance

### 80%

var_pct <- 0.8

vectors <- which(cumsum(svd_decomp$d^2 / sum(svd_decomp$d^2)) >= var_pct)[1]
print(paste0("Vectors to use: ", vectors))
## [1] "Vectors to use: 3"

70%

var_pct <- 0.7
vectors <- which(cumsum(svd_decomp$d^2/sum(svd_decomp$d^2)) >= var_pct)[1]
print(paste0("Vectors to use: ", vectors))
## [1] "Vectors to use: 2"

90%

var_pct <- 0.9
vectors <- which(cumsum(svd_decomp$d^2/sum(svd_decomp$d^2)) >= var_pct)[1]
print(paste0("Vectors to use: ", vectors))
## [1] "Vectors to use: 7"

95%

var_pct <- 0.95
vectors <- which(cumsum(svd_decomp$d^2/sum(svd_decomp$d^2)) >= var_pct)[1]
print(paste0("Vectors to use: ", vectors))
## [1] "Vectors to use: 11"

99%

var_pct <- 0.99
vectors <- which(cumsum(svd_decomp$d^2/sum(svd_decomp$d^2)) >= var_pct)[1]
print(paste0("Vectors to use: ", vectors))
## [1] "Vectors to use: 16"