#tidyverse is used for a ton of stuff
library(tidyverse)
#utils is used for unzip functionality
library(utils)
#jpeg is used for both reading and writing images
library(jpeg)
#OpenImageR is used mainly for displaying images through imageShow
library(OpenImageR)
#EBImage is used for its rescaling capabilities
library(EBImage)
With the attached data file, build and visualize eigenimagery that accounts for 80% of the variability. Provide full R code and discussion.
The first step is to load in the images that we will end up vectorizing. I have uploaded the .zip file with all the images to OneDrive and will be extracting the zip file within R.
# Remote location of the zipped image set
url <- r"(https://spsmailcuny-my.sharepoint.com/:u:/g/personal/taha_ahmad25_spsmail_cuny_edu/EfrxkM_QAvFIrBvXaco5S7EB_NmVnIpx5jKOWH9uFdMzag?e=Oe2Aqx&download=1)"
# Local name of the downloaded archive and the directory it is extracted into
zipfilename <- "eigenjpg.zip"
unzippath <- "eigenjpg"
# mode = "wb" writes the download in binary mode; the archive is then
# extracted into `unzippath`
download.file(url = url, destfile = zipfilename, mode = "wb")
unzip(zipfile = zipfilename, exdir = unzippath)
After extracting the .zip file I will store all the extracted files within R by getting all the files within our newly created directory. After getting all the files, I tried to test out imageShow on a valid directory name. The command goes through, but silently fails. Thinking the error was due to the test image being too big, I have attempted to rescale the individual image. However, imageShow does not work in this case either, so I will be processing the data without showing the images at first.
# Full paths of every file found in the extraction directory
names <- list.files(unzippath, full.names = TRUE)
# Disabled experiment: read the first image and display it with imageShow
# test <- readJPEG(names[1])
# imageShow(test)
# Shrink factor and the two base dimensions it divides before resizing
# (presumably the source images are 1200 x 2500 pixels -- TODO confirm)
scale <- 20
width <- 2500
height <- 1200
The next step is to create a matrix that holds the RGB values of each image as a vector, with the images scaled down for fear of ending up with a matrix that is too big. Then we flatten the matrix and scale it. After scaling, the covariance matrix is calculated and I attempted to derive the eigenvalues from it. I say attempted because, following this method, Sigma_ ended up being a 4GB matrix, and calculating its eigenvalues was just something my computer was not up to. So we will have to move on to another method of calculation.
# image1 <- EBImage::resize(readJPEG(names[1]), height/scale, width/scale)
# imageShow(image1)
# data1 <- matrix(0, length(names), prod(dim(image1)))
# for (i in 1:length(names)) {
# im <- resize(readJPEG(names[i]), height/scale, width/scale)
# r <- as.vector(im[,,1])
# g <- as.vector(im[,,2])
# b <- as.vector(im[,,3])
# data1[i,] <- t(c(r, g, b))
# }
# faces <- as.data.frame(t(data1))
# scaled <- scale(faces, center = TRUE, scale = TRUE)
# mean.face <- attr(scaled, "scaled:center")
# std.face <- attr(scaled, "scaled:scale")
# Sigma_ <- scaled%*%t(scaled) / (nrow(scaled)-1)
# eig <- eigen(Sigma_)
# eigenvalues <- eig$values
# eigenvectors <- eig$vectors
Following a heavily trodden path, with help from our professor's example (https://rpubs.com/R-Minator/eigenshoes), we create an empty initial array in which we store our images after they have been read and resized to a more size-friendly scale of 60x125. A fourth dimension of the array holds the three RGB channels.
# Size of each downscaled image along the first and second array dimensions
img_rows <- height / scale
img_cols <- width / scale
# Pre-allocate a 4-D array indexed as [image, row, column, channel];
# array() recycles the single 0, so no explicit rep() vector is needed
im <- array(0, dim = c(length(names), img_rows, img_cols, 3))
# seq_along() skips the loop when no files were found, whereas
# 1:length(names) would iterate over c(1, 0) and fail on a missing path
for (i in seq_along(names)) {
  # Read the JPEG and shrink it to img_rows x img_cols
  temp <- EBImage::resize(jpeg::readJPEG(names[i]), img_rows, img_cols)
  # Store the resized image in slot i of the array
  im[i, , , ] <- array(temp, dim = c(1, img_rows, img_cols, 3))
}
Now that our images have been stored within an array, we can create a function that rasterizes images within R on a plot. We loop through each image stored in the array with this function in order to see that we have been successful in converting our images to numeric values stored in arrays.
#' Draw a JPEG on a base-graphics plot
#'
#' @param path Passed straight to `readJPEG()` as the image source.
#' @param add If `FALSE` (the default), an empty, axis-free plot region sized
#'   to the image resolution is created first. If `TRUE`, the image is drawn
#'   onto the current plot.
#' @return The value of `rasterImage()`; called for its plotting side effect.
plot_jpeg <- function(path, add = FALSE) {
  # TRUE rather than T: T is an ordinary variable that can be reassigned
  jpg <- readJPEG(path, native = TRUE) # read the file
  res <- dim(jpg)[2:1] # get the resolution, [x, y]
  # Only the plot() call is conditional; the image is always drawn below
  if (!add) { # initialize an empty plot area if add == FALSE
    plot(
      1, 1,
      xlim = c(1, res[1]), ylim = c(1, res[2]), asp = 1, type = "n",
      xaxs = "i", yaxs = "i", xaxt = "n", yaxt = "n",
      xlab = "", ylab = "", bty = "n"
    )
  }
  rasterImage(jpg, 1, 1, res[1], res[2])
}
# Lay the images out on a 3 x 6 grid with near-zero margins, keeping the
# previous graphics settings so they can be restored afterwards
old_par <- par(mfrow = c(3, 6), mai = c(.01, .01, .01, .01))
# seq_along() is safe when `names` is empty; every stored image is plotted
for (i in seq_along(names)) {
  # writeJPEG() is called without a target and its result is passed to
  # plot_jpeg(), which hands it to readJPEG()
  plot_jpeg(writeJPEG(im[i, , , ]))
}
# Restore the graphics parameters that were in effect before this block
par(old_par)
We now flatten the previous array into a matrix that holds the jpegs broken down into their RGB values, one row per image. We then transpose that matrix to end up with 17 columns, one for each image, that each contain 382,500 observations.
# One row per image. The row length is prod(dim(im)), i.e. the element count
# of the whole image array, not of a single image.
# NOTE(review): because of that, each image's pixel vector is shorter than a
# row and is repeated to fill it. Allocating prod(dim(im)[-1]) columns would
# store every image exactly once; the layout is kept here because the
# downstream scaling uses nrow(scaled) of this layout.
flat <- matrix(0, length(names), prod(dim(im)))
# seq_along() avoids iterating over c(1, 0) when `names` is empty. The
# original also re-read every JPEG from disk into an unused variable; that
# read is removed because the pixels are already held in `im`.
for (i in seq_along(names)) {
  # Flattening the [row, column, channel] slice yields all values of
  # channel 1, then channel 2, then channel 3 (R, G, B)
  pixels <- as.vector(im[i, , , ])
  # Make the fill-by-repetition explicit instead of relying on silent recycling
  flat[i, ] <- rep_len(pixels, ncol(flat))
}
# Transpose so that each image becomes one column of the data frame
shoes <- as.data.frame(t(flat))
Now that we have a flattened data set, the first thing we should do is center and scale the data so that each image is standardized to zero mean and unit variance before creating eigenimagery. After standardizing the data, we calculate the covariance matrix, which for standardized data is the same as the correlation matrix: a square matrix containing the correlation between every pair of images. Finally, we can calculate the eigenvalues of the correlation matrix, which are the variances of the principal components.
# Center and scale every column of `shoes`
scaled <- scale(x = shoes, center = TRUE, scale = TRUE)
# Correlation matrix between the columns of the standardized data
Sigma_ <- cor(x = scaled)
# Eigenvalues and eigenvectors of that correlation matrix
myeigen <- eigen(x = Sigma_)
What we do next is essentially generate linear combinations of the images, weighted by the eigenvectors. Here we can see that the first three eigenvectors are needed to account for at least 80% of the variability in how these shoes look. Because the eigenvectors above come from the small image-by-image correlation matrix, we multiply the scaled data by them, rescaling by the eigenvalues, to obtain the corresponding eigenvectors in the pixel space of the flattened shoe matrix.
# Running share of the eigenvalue total covered by the leading eigenvalues
cum_var <- cumsum(myeigen$values) / sum(myeigen$values)
print(cum_var)
## [1] 0.6928202 0.7940449 0.8451073 0.8723847 0.8913841 0.9076338 0.9216282
## [8] 0.9336889 0.9433872 0.9524455 0.9609037 0.9688907 0.9765235 0.9832209
## [15] 0.9894033 0.9953587 1.0000000
# Position of the first entry whose cumulative share reaches 80%
thresh_var <- which(cum_var >= 0.80)[1]
print(thresh_var)
## [1] 3
# Diagonal matrix of eigenvalue^(-1/2), divided by sqrt(n - 1).
# nrow = thresh_var is required: with a single value, diag(x) would return an
# identity matrix of size floor(x) instead of a 1 x 1 matrix holding x.
scaling <- diag(
  myeigen$values[seq_len(thresh_var)]^(-1 / 2),
  nrow = thresh_var
) / sqrt(nrow(scaled) - 1)
# drop = FALSE keeps the selected eigenvectors as a matrix even when only one
# component is retained
eigenshoes <- scaled %*%
  myeigen$vectors[, seq_len(thresh_var), drop = FALSE] %*%
  scaling
As the initial goal was to visualize eigenimagery that accounts for 80% of variability, if we generate images from the first three eigenshoes then we have shown the three shoes which together account for about 84.5% of the variability between these 17 images.
The shoe that accounts for 69.3% of variability
# Reshape the first eigenshoe to [row, column, channel]; the image size is
# derived from height/width/scale instead of repeating the literals 60 and 125
imageShow(array(eigenshoes[, 1], c(height / scale, width / scale, 3)))
The shoe that accounts for 10.1% of variability
# Reshape the second eigenshoe to [row, column, channel]; the image size is
# derived from height/width/scale instead of repeating the literals 60 and 125
imageShow(array(eigenshoes[, 2], c(height / scale, width / scale, 3)))
The shoe that accounts for 5.1% of variability
# Reshape the third eigenshoe to [row, column, channel]; the image size is
# derived from height/width/scale instead of repeating the literals 60 and 125
imageShow(array(eigenshoes[, 3], c(height / scale, width / scale, 3)))