Sys.setenv('R_MAX_VSIZE'=32000000000)  #  Getting Error: vector memory exhausted (limit reached) when running all shoes

rm(list = ls())
library(jpeg)
library(OpenImageR)
## Warning: package 'OpenImageR' was built under R version 4.0.5
library(Matrix)
# Initialization
# Set the file path directory of the jpg images
v_file_path <- "/Users/Audiorunner13/CUNY MSDS Course Work/DATA605 Fall 2022/Week 4/jpg"

# Test using the first shoe image
v_shoe_test <- 1
#  Read and Plot the first image (.jpg)
v_image <- jpeg::readJPEG(paste0(v_file_path,"/RC_2500x1200_2014_us_53446.jpg"))

# Get the dimension for the first image
dim(v_image)
## [1] 1200 2500    3
# Save the row,col,channels dimension
v_row_dim <- dim(v_image)[1]
v_col_dim <- dim(v_image)[2]
v_channel_dim <- dim(v_image)[3]

print(v_row_dim); print(v_col_dim); print(v_channel_dim)
## [1] 1200
## [1] 2500
## [1] 3
# plot the image
imageShow(v_image)

# Read in all image files
# Read to a list all filenames in folder

v_filenames <- list.files(path=v_file_path,pattern = ".jpg")

# Initialize the matrix where all files will be stored
v_all_images_data <- matrix(0, length(v_filenames), prod(dim(v_image)))

# Show the dimensions of the matrix
# Matrix is 17 x 9MM.  That is equals to 17 shoes, each having dimensions 1200 x 2500 x 3

dim(v_all_images_data)
## [1]      20 9000000
# Function to Plot 1d Image Files

plot_shoe <- function(v_img_1d) {
  v_img3d <- array(v_img_1d,c(v_row_dim, v_col_dim, v_channel_dim))
  imageShow(v_img_3d)
  }
# Read all images into a single matrix

v_counter <- 1

for (v_filename in v_filenames) {
  print(paste("loading file: ", v_filename))
  v_img <- jpeg::readJPEG(paste0("/Users/Audiorunner13/CUNY MSDS Course Work/DATA605 Fall 2022/Week 4/jpg/", v_filename))
  v_red  <- as.vector(v_img[,,1])
  v_green  <- as.vector(v_img[,,2])
  v_blue  <- as.vector(v_img[,,3])
  
  #Images are stored a single vector red-green-blue  
  v_all_images_data[v_counter,] <- t(c(v_red, v_green, v_blue))  

  v_counter <- v_counter+1
}
## [1] "loading file:  RC_2500x1200_2014_us_53446_2.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53446.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53455.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53469.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53626.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53632.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53649_2.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53649.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53655.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53663.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_53697.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54018.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54067.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54106.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54130.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54148.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54157.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54165.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54172_2.jpg"
## [1] "loading file:  RC_2500x1200_2014_us_54172.jpg"
# Transpose the matrix to get each image in a column
v_all_images_data <- t(v_all_images_data)
dim(v_all_images_data)
## [1] 9000000      20
# Test plotting chosen shoe image
# plot_shoe(all_images_data[,shoe_test])
imageShow(array(v_all_images_data[,v_shoe_test],c(v_row_dim, v_col_dim, v_channel_dim)))

Process the matrix with SVD (Singular Value Decomposition) by using the svd function to get U, Sigma and V matrices

# Scale the image data before applying the singular value decomposition (svd) function
v_scaled_data <- scale(v_all_images_data)

v_scaled_data[is.nan(v_scaled_data)] = 0 # if a value is null then set equal to 0

v_svd_decomp <- svd(v_scaled_data)

Compute the variance explained by each component.

plot(v_svd_decomp$d^2/sum(v_svd_decomp$d^2), type="b",xlab = "Column", ylab = "Prop. of variance explained", pch = 19)

As the above plot shows, roughly 80% of the variance is shown by 3 of the vectors and about 90% by 6 vectors.

Next, calculate number of vectors needed to explain 80% of variance

v_var_pct <- 0.8
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: 3"

I am plotting only one shoe image using only a few vectors for 80% variance. Although I have all shoe data in the v_all_images_data array when trying to plot and process them, I am getting a vector memory exhausted (limit reached?) error.

# Reconstruct all shoes using only a subset of columns which explain X% of variability.

v_newimage <- v_svd_decomp$u[, 1:v_vectors] %*% diag(v_svd_decomp$d[1:v_vectors]) %*% t(v_svd_decomp$v[,1:v_vectors])

# Select ONE shoe of the reconstructed matrix and plot it.
imageShow(array(v_newimage[,v_shoe_test],c(v_row_dim, v_col_dim, v_channel_dim)))

Here I am getting a vector memory exhausted (limit reached?) error.

# Select all shoes of the reconstructed matrix and plot it.
# imageShow(array(newimage[,all_images_data],c(row_dim,col_dim,channel_dim)))

Calculating number of vectors needed to explain 90% of variance

v_var_pct <- 0.9
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: 6"
# Reconstruct all shoes using only a subset of columns which explain X% of variability.

v_newimage <- v_svd_decomp$u[, 1:v_vectors] %*% diag(v_svd_decomp$d[1:v_vectors]) %*% t(v_svd_decomp$v[,1:v_vectors])

# Select ONE shoe of the reconstructed matrix and plot it.
imageShow(array(v_newimage[,v_shoe_test],c(v_row_dim, v_col_dim, v_channel_dim)))

Calculating number of vectors needed to explain 90% of variance

v_var_pct <- 0.95
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: 10"
# Reconstruct all shoes using only a subset of columns which explain X% of variability.

v_newimage <- v_svd_decomp$u[, 1:v_vectors] %*% diag(v_svd_decomp$d[1:v_vectors]) %*% t(v_svd_decomp$v[,1:v_vectors])

# Select ONE shoe of the reconstructed matrix and plot it.
imageShow(array(v_newimage[,v_shoe_test],c(v_row_dim, v_col_dim, v_channel_dim)))

Interpretation:

You will notice that the higher the percentage of variance the more clear the image becomes and that is because as you add more vectors the cumulative variance gets closer to 100%.

Also, one can see that 80% of their shoe business is driven off of 3 of their shoe products. Perhaps, their focus can be to reduce the number of shoe type inventory and focus on their biggest sellers.