Requirement: With the attached data file, build and visualize eigenimagery that accounts for 80% of the variability. Provide full R code and discussion.
The provided data contains 17 images.
# Image discovery and processing parameters ----
# List the JPEG files in the image directory (file names only, no directory
# part). The pattern is anchored with `$` so that only names ENDING in ".jpg"
# are returned; an unanchored "\\.jpg" would also match e.g. "photo.jpg.bak".
files <- list.files(
  path = "/Users/linda/Desktop/elearning/CUNY-MSDS/data 605/w4/jpg",
  pattern = "\\.jpg$"
)
# Parameters for image processing: `height` and `width` are divided by `scale`
# to obtain the size the images are resized to further down the script.
height <- 1200
width <- 2500
scale <- 20
# Function to plot a JPEG image
#
# Args:
#   path: JPEG source handed to readJPEG() -- a file name, or a raw vector of
#         JPEG data such as the value of writeJPEG() called without a target.
#   add:  if FALSE (the default), a new empty plot region without axes, labels
#         or box is opened first, sized to the image with a 1:1 aspect ratio;
#         if TRUE, the image is drawn onto the current plot.
#
# Called for its side effect of drawing the image.
plot_jpeg <- function(path, add = FALSE) {
  # library() rather than require(): a missing package stops here with a clear
  # error instead of returning FALSE and failing later. The package is still
  # attached, as before, so unqualified readJPEG()/writeJPEG() calls keep
  # resolving.
  library("jpeg")
  jpg <- readJPEG(path, native = TRUE)
  # dim() is c(rows, columns); reversed it becomes c(width, height).
  res <- dim(jpg)[2:1]
  if (!add) {
    plot(
      1, 1,
      xlim = c(1, res[1]), ylim = c(1, res[2]),
      asp = 1, type = "n", xaxs = "i", yaxs = "i",
      xaxt = "n", yaxt = "n", xlab = "", ylab = "",
      bty = "n"
    )
  }
  rasterImage(jpg, 1, 1, res[1], res[2])
}
# Pre-allocate a zero-filled 4-D array for the resized images, indexed as
# (image, height / scale, width / scale, colour channel).
im <- array(
  0,
  dim = c(length(files), height / scale, width / scale, 3)
)
# Load and resize images
# Target size of the first and second array dimension after resizing.
small_rows <- height / scale
small_cols <- width / scale
for (i in seq_along(files)) {
  img_path <- paste0("/Users/linda/Desktop/elearning/CUNY-MSDS/data 605/w4/jpg/", files[i])
  # resize() receives the target sizes for dimension 1 and dimension 2, in
  # that order.
  resized <- EBImage::resize(readJPEG(img_path), small_rows, small_cols)
  # Store the resized pixels as the i-th slice of the image stack.
  im[i, , , ] <- array(resized, dim = c(1, small_rows, small_cols, 3))
}
# Set up plotting parameters: a 3 x 3 grid of panels with 0.3-inch margins
par(mfrow = c(3, 3))
par(mai = c(0.3, 0.3, 0.3, 0.3))
# Plot the first 9 images only -- or all of them when fewer than 9 were
# loaded, so the loop never indexes past the last image in `im`.
for (i in seq_len(min(9, length(files)))) {
  # writeJPEG() without a target returns the encoded image as a raw vector,
  # which plot_jpeg() decodes and draws.
  plot_jpeg(writeJPEG(im[i, , , ]))
}
# Prepare the data for PCA: flatten every image into a single row, turning the
# 4-D stack into an (images x pixel values) matrix.
values_per_image <- height * width * 3 / scale^2
dim(im) <- c(length(files), values_per_image)
# Transpose so that each image is a column (variable) and each pixel value a
# row (observation); cor = TRUE bases the PCA on the correlation matrix.
pixel_matrix <- t(as.matrix(im))
mypca <- princomp(pixel_matrix, cor = TRUE, scores = TRUE)
# Proportion of the total variance carried by each principal component
# (squared standard deviation of the component over the sum of all of them).
component_variance <- mypca$sdev^2
variance_share <- component_variance / sum(component_variance)
cat("Variance explained by each component:\n")
## Variance explained by each component:
print(variance_share)
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## 0.692820228 0.101224662 0.051062366 0.027277491 0.018999399 0.016249609
## Comp.7 Comp.8 Comp.9 Comp.10 Comp.11 Comp.12
## 0.013994442 0.012060722 0.009698242 0.009058322 0.008458196 0.007987044
## Comp.13 Comp.14 Comp.15 Comp.16 Comp.17
## 0.007632793 0.006697401 0.006182362 0.005955453 0.004641268
# Running total of the per-component variance proportions
eigen_variance <- mypca$sdev^2
cumulative_variance <- cumsum(eigen_variance / sum(eigen_variance))
cat("Cumulative variance explained:\n")
## Cumulative variance explained:
print(cumulative_variance)
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## 0.6928202 0.7940449 0.8451073 0.8723847 0.8913841 0.9076338 0.9216282 0.9336889
## Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16
## 0.9433872 0.9524455 0.9609037 0.9688907 0.9765235 0.9832209 0.9894033 0.9953587
## Comp.17
## 1.0000000
# Position of the first component at which the cumulative proportion of
# variance reaches 80%
reaches_80 <- cumulative_variance >= 0.80
num_components_80 <- which(reaches_80)[1]
cat("Number of components explaining at least 80% variance: ", num_components_80, "\n")
## Number of components explaining at least 80% variance: 3
# Reshape the PCA scores to visualize the eigenimages: after transposing, each
# row holds the scores of one component, which are folded back into the
# (component, height / scale, width / scale, colour channel) layout.
mypca2 <- t(mypca$scores)
dim(mypca2) <- c(length(files), height / scale, width / scale, 3)
# Set up plotting parameters for eigenimages: one row of panels, one panel per
# retained component, with practically no margins
par(mfrow = c(1, num_components_80))
par(mai = c(0.001, 0.001, 0.001, 0.001))
# Plot the eigenimages explaining 80% variance.
# PCA scores are centred and therefore not confined to [0, 1], while
# writeJPEG() clips everything outside that range. Each eigenimage is linearly
# rescaled to [0, 1] for display only; `mypca2` itself is left untouched.
for (i in seq_len(num_components_80)) {
  eigenimage <- mypca2[i, , , ]
  value_range <- range(eigenimage)
  value_span <- diff(value_range)
  # A constant image has zero span; skip rescaling to avoid dividing by zero.
  if (value_span > 0) {
    eigenimage <- (eigenimage - value_range[1]) / value_span
  }
  plot_jpeg(writeJPEG(eigenimage, quality = 1, bg = "white"))
}
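The analysis finds that only three principal components are needed to explain at least 80% of the variance in the image dataset: the first component alone accounts for about 69.3%, the first two for about 79.4% (just short of the threshold), and the first three for about 84.5%. This suggests that a substantial portion of the variability in the 17 images can be represented by a relatively small number of components.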