The below code finds the “eigenshoes” from a group of images of shoes. This process makes sense to me on a procedural level- we want to find the principal components that describe the majority (or 80%) of these shoes. I did get a little lost trying to follow along what the code was doing, but I will try my best to explain each step.

library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(foreach)
library(jpeg)
## if (!require("BiocManager", quietly = TRUE))
##    install.packages("BiocManager")

## BiocManager::install("EBImage")
library(EBImage)
files=list.files(path='/Users/aelsaeyed/Downloads/jpg',pattern="\\.jpg")
files
##  [1] "RC_2500x1200_2014_us_53446.jpg" "RC_2500x1200_2014_us_53455.jpg"
##  [3] "RC_2500x1200_2014_us_53469.jpg" "RC_2500x1200_2014_us_53626.jpg"
##  [5] "RC_2500x1200_2014_us_53632.jpg" "RC_2500x1200_2014_us_53649.jpg"
##  [7] "RC_2500x1200_2014_us_53655.jpg" "RC_2500x1200_2014_us_53663.jpg"
##  [9] "RC_2500x1200_2014_us_53697.jpg" "RC_2500x1200_2014_us_54018.jpg"
## [11] "RC_2500x1200_2014_us_54067.jpg" "RC_2500x1200_2014_us_54106.jpg"
## [13] "RC_2500x1200_2014_us_54130.jpg" "RC_2500x1200_2014_us_54148.jpg"
## [15] "RC_2500x1200_2014_us_54157.jpg" "RC_2500x1200_2014_us_54165.jpg"
## [17] "RC_2500x1200_2014_us_54172.jpg"

This is a function that plots a jpeg image given a path to that image.

height=1200
width=2500
scale=1 # Removed scaling to get crispy images 
plot_jpeg = function(path, add=FALSE) #initialize function
{
  ##require('jpeg')
  jpg = readJPEG(path, native=T) # read the file
  res = dim(jpg)[2:1] # get the resolution, [x is 2, y is 1]
  if (!add) # initialize an empty plot area if add==FALSE
    plot(1,1,xlim=c(1,res[1]),ylim=c(1,res[2]), #set the X Limits by size
         asp=1, #aspect ratio
         type='n', #don't plot
         xaxs='i',yaxs='i',#prevents expanding axis windows +6% as normal
         xaxt='n',yaxt='n',xlab='',ylab='', # no axes or labels
         bty='n') # no box around graph
  rasterImage(jpg,1,1,res[1],res[2]) #image, xleft,ybottom,xright,ytop
}

‘im’ is a 4D array that stores images. The first dimension is the number of images. Each image is stored as a 3D array of height, width, and color (represented 0-255). ‘im’ is initialized with 0’s for all values and is then populated with each image’s information in the for loop.

im[i,,,] refers to the image at position i (17 in total)

im=array(rep(0, length(files) * height/scale * width/scale * 3),
         #set dimension to N, x, y, 3 colors, 4D array)
         dim=c(length(files), height/scale, width/scale, 3)) 

for (i in 1:length(files)){
  #define file to be read
  tmp=paste0("/Users/aelsaeyed/Downloads/jpg/", files[i])
  #read the file
  temp=EBImage::resize(readJPEG(tmp),height/scale, width/scale)
  #assign to the array
  im[i,,,]=array(temp,dim=c(1, height/scale, width/scale,3))
}
dim(im[1,,,])
## [1] 1200 2500    3
par(mfrow=c(3,3)) #set graphics to 3 x 3 table
par(mai=c(.3,.3,.3,.3)) #set margins 
for (i in 1:length(files)){  #plot all 17 images
plot_jpeg(writeJPEG(im[i,,,]))
}

Finding principal components of these images is the same as calculating the eigenvalues/vectors of the covariance matrix.

#height=1200
#width=2500
#scale=10
# commenting these out because they are repeated. 
newdata=im
dim(newdata)=c(length(files),height*width*3/scale^2)
mypca=princomp(t(as.matrix(newdata)), scores=TRUE, cor=TRUE)
sum(mypca$sdev^2/sum(mypca$sdev^2)) #verify that sum of variance=1
## [1] 1

Interrogating the result of the PCA analysis, we see that the first 3 components are what we need:

mycomponents=mypca$sdev^2/sum(mypca$sdev^2)
mycomponents
##      Comp.1      Comp.2      Comp.3      Comp.4      Comp.5      Comp.6 
## 0.683313840 0.099160161 0.052878788 0.027574171 0.019576999 0.017105915 
##      Comp.7      Comp.8      Comp.9     Comp.10     Comp.11     Comp.12 
## 0.014862472 0.012713283 0.010260604 0.009839721 0.008899992 0.008610455 
##     Comp.13     Comp.14     Comp.15     Comp.16     Comp.17 
## 0.008460707 0.007167084 0.006979611 0.006747343 0.005848854
sum(mycomponents[1:2]) #first 2 components account for 78% of variability
## [1] 0.782474
sum(mycomponents[1:3]) #first 3 components account for 86% of variability
## [1] 0.8353528
sum(mycomponents[1:4]) #first 4 components account for 87% of variability
## [1] 0.862927
sum(mycomponents[1:5]) #first 5 components account for 88% of variability
## [1] 0.882504

We can plot all the components to visualize them:

mypca2=t(mypca$scores)
dim(mypca2)=c(length(files),height/scale,width/scale,3)
par(mfrow=c(5,5))
par(mai=c(.001,.001,.001,.001))
for (i in 1:17){ 
plot_jpeg(writeJPEG(mypca2[i,,,], quality=1,bg="white"))
}

By saying that the first 3 components represent 80% of the variability, we are saying that the information preserved by these components is enough to recreate 80% of the original images.

These 3 components are represented visually below:

mypca2=t(mypca$scores)
dim(mypca2)=c(length(files),height/scale,width/scale,3)
par(mfrow=c(5,5))
par(mai=c(.001,.001,.001,.001))
for (i in 1:3){ 
plot_jpeg(writeJPEG(mypca2[i,,,], quality=1,bg="white"))
}