#Task ##With the attached data file, build and visualize eigenimagery that accounts for 80% of the variability. Provide full R code and discussion
#Packages used
library(doParallel)
## Warning: package 'doParallel' was built under R version 4.3.2
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.3.2
## Loading required package: iterators
## Warning: package 'iterators' was built under R version 4.3.2
## Loading required package: parallel
library(foreach)
library(jpeg)
library(iterators)
library(parallel)
library(EBImage)
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.3.2
## Loading required package: ggplot2
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.3.2
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(corrplot)
## corrplot 0.92 loaded
library(OpenImageR)
## Warning: package 'OpenImageR' was built under R version 4.3.2
##
## Attaching package: 'OpenImageR'
## The following objects are masked from 'package:EBImage':
##
## readImage, writeImage
rm(list = ls())
files=list.files(path='C:/Users/Jonathan Burns/Documents/DATA 605/jpeg',pattern="\\.jpg")
#Read the Images into R
height=1200
width=2500
scale=20
plot_jpeg = function(path, add=FALSE) #initialize function
{
require('jpeg')
jpg = readJPEG(path, native=T) # read the file
res = dim(jpg)[2:1] # get the resolution, [x is 2, y is 1]
if (!add) # initialize an empty plot area if add==FALSE
plot(1,1,xlim=c(1,res[1]),ylim=c(1,res[2]), #set the X Limits by size
asp=1, #aspect ratio
type='n', #don't plot
xaxs='i',yaxs='i',#prevents expanding axis windows +6% as normal
xaxt='n',yaxt='n',xlab='',ylab='', # no axes or labels
bty='n') # no box around graph
rasterImage(jpg,1,1,res[1],res[2]) #image, xleft,ybottom,xright,ytop
}
#Resizing the Images in ‘Files’ ##Putting the resized images in array ‘im’
im=array(rep(0,length(files)*height/scale*width/scale*3),
#set dimension to N, x, y, 3 colors, 4D array)
dim=c(length(files), height/scale, width/scale,3))
for (i in 1:length(files)){
#define file to be read
tmp=paste0("C:/Users/Jonathan Burns/Documents/DATA 605/jpeg/", files[i])
#read the file
temp=EBImage::resize(readJPEG(tmp),height/scale, width/scale)
#assign to the array
im[i,,,]=array(temp,dim=c(1, height/scale, width/scale,3))
}
#Visualize Original JPGs from ‘files’
par(mfrow=c(3,3)) #set graphics to 3 x 3 table
par(mai=c(.3,.3,.3,.3)) #set margins
for (i in 1:17){ #plot the first images only
plot_jpeg(writeJPEG(im[i,,,]))
}
#Vectorize
flat=matrix(0, 17, prod(dim(im)))
for (i in 1:17) {
newim <- readJPEG(paste0("C:/Users/Jonathan Burns/Documents/DATA 605/jpeg/", files[i]))
r=as.vector(im[i,,,1]); g=as.vector(im[i,,,2]);b=as.vector(im[i,,,3])
flat[i,] <- t(c(r, g, b))
}
shoes=as.data.frame(t(flat))
#Eigencomponents
#################################################
scaled=scale(shoes, center = TRUE, scale = TRUE)
mean.shoe=attr(scaled, "scaled:center") #saving for classification
std.shoe=attr(scaled, "scaled:scale") #saving for classification...later
#################################################
#Calculate Co-Variance
cov_shoes <- cor(scaled)
myeigen <- eigen(cov_shoes)
cumsum(myeigen$values) / sum(myeigen$values)
## [1] 0.6928202 0.7940449 0.8451073 0.8723847 0.8913841 0.9076338 0.9216282
## [8] 0.9336889 0.9433872 0.9524455 0.9609037 0.9688907 0.9765235 0.9832209
## [15] 0.9894033 0.9953587 1.0000000
#Eigenshoe
windows()
scaling=diag(myeigen$values[1:5]^(-1/2)) / (sqrt(nrow(scaled)-1))
eigenshoes=scaled%*%myeigen$vectors[,1:5]%*%scaling
par(mfrow=c(2,3))
imageShow(array(eigenshoes[,1], c(60,125,3)))
Sys.sleep(10)
#Generate Principal Components
height=1200
width=2500
scale=20
newdata=im
dim(newdata)=c(length(files),height*width*3/scale^2)
mypca=princomp(t(as.matrix(newdata)), scores=TRUE, cor=TRUE)
sum(mypca$sdev^2/sum(mypca$sdev^2)) #verify that sum of variance=1
## [1] 1
mycomponents=mypca$sdev^2/sum(mypca$sdev^2)
sum(mycomponents[1:3]) #first 3 components account for 84% of variability
## [1] 0.8451073
#Scree Plot to visualize the distrubution of variance in the factors For the purposes of this assignment, building and visualizing eigenimagery that accounts for 80% of the variability of the original set of images. Reading the scree plot, 80% or more variability is achieved by using PC[1:3]. PC[1:2] is just under, coming in at 79.4%
fviz_eig(mypca, addlabels = TRUE, ylim = c(0,90))
vars <- get_pca_var(mypca)
head(vars$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## [1,] 0.8633211 0.0782201 0.13150542 0.25847126 0.186628097 0.03340035
## [2,] 0.8801689 -0.3013327 0.08835013 0.15705547 -0.018550879 0.06224184
## [3,] 0.6777686 0.4529178 0.22897931 0.44409552 -0.097596000 0.12862747
## [4,] 0.8207247 -0.4003131 0.12676857 -0.08388422 -0.185993060 0.17726256
## [5,] 0.8666247 -0.3134600 0.05680148 -0.01030995 0.181998134 -0.07047314
## [6,] 0.7196414 0.4561963 0.39433761 -0.22535845 0.002947831 -0.10748617
## Dim.7 Dim.8 Dim.9 Dim.10 Dim.11 Dim.12
## [1,] 0.25593785 0.20954720 0.032847670 0.01125784 0.031348603 0.021979810
## [2,] 0.09114030 0.04647299 -0.090485403 -0.05759761 0.001141095 -0.132043680
## [3,] -0.21564782 -0.10320265 0.034392492 0.02236158 -0.012272042 -0.008579068
## [4,] 0.04550352 -0.04920257 -0.008196665 -0.01687250 0.146359488 0.191228394
## [5,] -0.15058659 0.03398895 0.125413916 -0.12293550 0.029373623 0.094804386
## [6,] 0.01604161 0.03747101 0.013477629 0.14280999 -0.070295718 -0.011238270
## Dim.13 Dim.14 Dim.15 Dim.16 Dim.17
## [1,] 0.06263059 0.007171977 0.110194949 0.005853876 0.01657369
## [2,] -0.01577211 0.001714993 -0.223485892 -0.088782342 -0.01796484
## [3,] -0.00229346 -0.002558053 0.018429573 0.014827152 0.01181976
## [4,] 0.03267434 0.062477971 -0.025550435 0.034884099 0.08581751
## [5,] -0.11913717 -0.148156996 0.008984605 -0.090933598 0.01625364
## [6,] -0.06092234 0.027210395 -0.043282381 -0.061368691 0.13694393
head(vars$cos2)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## [1,] 0.7453232 0.006118384 0.017293676 0.066807391 3.483005e-02 0.001115584
## [2,] 0.7746974 0.090801387 0.007805746 0.024666420 3.441351e-04 0.003874047
## [3,] 0.4593702 0.205134544 0.052431527 0.197220830 9.524979e-03 0.016545027
## [4,] 0.6735890 0.160250572 0.016070271 0.007036563 3.459342e-02 0.031422015
## [5,] 0.7510383 0.098257195 0.003226408 0.000106295 3.312332e-02 0.004966464
## [6,] 0.5178838 0.208115074 0.155502154 0.050786431 8.689706e-06 0.011553278
## Dim.7 Dim.8 Dim.9 Dim.10 Dim.11
## [1,] 0.0655041850 0.043910030 1.078969e-03 0.0001267390 9.827349e-04
## [2,] 0.0083065549 0.002159738 8.187608e-03 0.0033174849 1.302098e-06
## [3,] 0.0465039829 0.010650787 1.182843e-03 0.0005000404 1.506030e-04
## [4,] 0.0020705699 0.002420893 6.718531e-05 0.0002846811 2.142110e-02
## [5,] 0.0226763220 0.001155249 1.572865e-02 0.0151131375 8.628097e-04
## [6,] 0.0002573334 0.001404077 1.816465e-04 0.0203946920 4.941488e-03
## Dim.12 Dim.13 Dim.14 Dim.15 Dim.16
## [1,] 0.0004831120 3.922591e-03 5.143725e-05 1.214293e-02 3.426787e-05
## [2,] 0.0174355334 2.487595e-04 2.941202e-06 4.994594e-02 7.882304e-03
## [3,] 0.0000736004 5.259957e-06 6.543636e-06 3.396492e-04 2.198444e-04
## [4,] 0.0365682986 1.067613e-03 3.903497e-03 6.528247e-04 1.216900e-03
## [5,] 0.0089878716 1.419366e-02 2.195050e-02 8.072313e-05 8.268919e-03
## [6,] 0.0001262987 3.711532e-03 7.404056e-04 1.873364e-03 3.766116e-03
## Dim.17
## [1,] 0.0002746872
## [2,] 0.0003227356
## [3,] 0.0001397067
## [4,] 0.0073646451
## [5,] 0.0002641807
## [6,] 0.0187536393
head(vars$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## [1,] 6.328127 0.3555507 1.9922207 14.40692085 10.783637363 0.4038409
## [2,] 6.577526 5.2766371 0.8992171 5.31927924 0.106546749 1.4024036
## [3,] 3.900258 11.9207491 6.0400794 42.53039758 2.949003276 5.9892942
## [4,] 5.719071 9.3124581 1.8512852 1.51742500 10.710375508 11.3747587
## [5,] 6.376650 5.7099080 0.3716802 0.02292237 10.255222500 1.7978582
## [6,] 4.397064 12.0939532 17.9137518 10.95202312 0.002690397 4.1822826
## Dim.7 Dim.8 Dim.9 Dim.10 Dim.11 Dim.12
## [1,] 27.5336967 21.4161548 0.65443599 0.08230263 6.834547e-01 0.35580566
## [2,] 3.4915351 1.0533651 4.96609582 2.15433033 9.055597e-04 12.84104120
## [3,] 19.5472482 5.1946878 0.71743957 0.32471957 1.047387e-01 0.05420573
## [4,] 0.8703328 1.1807375 0.04075045 0.18486812 1.489756e+01 26.93207140
## [5,] 9.5316501 0.5634472 9.54002476 9.81426944 6.000513e-01 6.61944931
## [6,] 0.1081662 0.6848076 0.11017550 13.24404029 3.436616e+00 0.09301734
## Dim.13 Dim.14 Dim.15 Dim.16 Dim.17
## [1,] 3.023017285 0.045177536 11.5536709 0.03384725 0.3481392
## [2,] 0.191711084 0.002583269 47.5222333 7.78555318 0.4090358
## [3,] 0.004053683 0.005747300 0.3231671 0.21714597 0.1770645
## [4,] 0.822775436 3.428456374 0.6211453 1.20196352 9.3339675
## [5,] 10.938609648 19.279205015 0.0768059 8.16742265 0.3348232
## [6,] 2.860360427 0.650301090 1.7824563 3.71988916 23.7684041
head(vars$coord, 3)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## [1,] 0.8633211 0.0782201 0.13150542 0.2584713 0.18662810 0.03340035
## [2,] 0.8801689 -0.3013327 0.08835013 0.1570555 -0.01855088 0.06224184
## [3,] 0.6777686 0.4529178 0.22897931 0.4440955 -0.09759600 0.12862747
## Dim.7 Dim.8 Dim.9 Dim.10 Dim.11 Dim.12
## [1,] 0.2559379 0.20954720 0.03284767 0.01125784 0.031348603 0.021979810
## [2,] 0.0911403 0.04647299 -0.09048540 -0.05759761 0.001141095 -0.132043680
## [3,] -0.2156478 -0.10320265 0.03439249 0.02236158 -0.012272042 -0.008579068
## Dim.13 Dim.14 Dim.15 Dim.16 Dim.17
## [1,] 0.06263059 0.007171977 0.11019495 0.005853876 0.01657369
## [2,] -0.01577211 0.001714993 -0.22348589 -0.088782342 -0.01796484
## [3,] -0.00229346 -0.002558053 0.01842957 0.014827152 0.01181976
Now that I think about it, this visualization doesnt make much sense, since the originals are jpg and do not have any defined variables to compare, so this is just plotting explained variance of the 17 shoes, not exactly helpful.
fviz_pca_var(mypca, col.var = "black")
A coorplot might be a better option to visualize which shoes are most
important. This reinforces the finding from above, but now its spread
across all of the variables. The first dimension for most of the shoes
represents the largest variability. With some shoes having less dominant
1st dimensions like 3 and 6 for example.
corrplot(vars$cos2, is.corr = FALSE)
#Eigenshoes
mypca2=t(mypca$scores)
dim(mypca2)=c(length(files),height/scale,width/scale,3)
par(mfrow=c(5,5))
par(mai=c(.001,.001,.001,.001))
for (i in 1:17){ # plotting 17 eigenshoes
plot_jpeg(writeJPEG(mypca2[i,,,], quality=1,bg="white"))
}
a=round(mypca$sdev[1:17]^2/ sum(mypca$sdev^2),3)
cumsum(a)
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
## 0.693 0.794 0.845 0.872 0.891 0.907 0.921 0.933 0.943 0.952
## Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17
## 0.960 0.968 0.976 0.983 0.989 0.995 1.000
#New Data Set
shoes_v2 <- t(t(eigenshoes)%*%scaled)