Assignment4

# Load necessary libraries
library(stats)    # For PCA
library(jpeg)    # For reading JPEG images
library(abind)   # For array binding
library(ggplot2) # For plotting
library(stats)   # For PCA
library(reshape2)
library(magick)    # For image processing

## Linking to ImageMagick 6.9.12.98
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11

# Set the number of images to read
num <- 20

# List the first 'num' JPEG files in the specified directory
# 'list.files' function is used to retrieve the names of files in the directory
# 'pattern' argument specifies to only list files that end with '.jpg'
# '[1:num]' selects only the first 'num' files from the list
path = "C:/Users/RemoteUser/Downloads/jpg/"
files <- list.files(path, pattern="\\.jpg")[1:num]

# Set the target height, width, and scale for the images
height = 1200; width = 2500; scale = 20

# Define a function to plot JPEG images in an R plotting window
plot_jpeg = function(path, add = FALSE) {
  # Read the JPEG file from the given path
  # 'native = T' uses the native raster if possible for speed
  jpg = readJPEG(path, native = TRUE)
  
  # Extract the resolution of the image, flipping dimensions to [y, x] for plotting
  res = dim(jpg)[2:1]
  
  # If 'add' is FALSE, set up a new plot area
  # 'plot' initializes an empty plot with specified limits and aspect ratio
  # 'type = 'n'' creates a plot without plotting any data
  # 'xaxs = 'i'' and 'yaxs = 'i'' set the style of axis interval calculation
  # 'xaxt = 'n'' and 'yaxt = 'n'' hide the x and y axis
  # 'xlab = ''', 'ylab = ''', and 'bty = 'n'' remove labels and box around the plot
  if (!add) 
    plot(1, 1, xlim = c(1, res[1]), ylim = c(1, res[2]), asp = 1, type = 'n', xaxs = 'i', yaxs = 'i', xaxt = 'n', yaxt = 'n', xlab = '', ylab = '', bty = 'n')
  
  # Plot the image within the dimensions specified by 'res'
  rasterImage(jpg, 1, 1, res[1], res[2])
}

# Load necessary library
# if (!require("magick")) install.packages("magick")
library(magick)

# Parameters for preprocessing
height <- 1200
width <- 2500
scale_factor <- 20

# Set the working directory
setwd(path)

# List JPEG files in the directory
jpeg_files <- list.files(pattern = "\\.jpg$")

# Initialize the array to store preprocessed images
# Calculate new dimensions
new_height <- height / scale_factor
new_width <- width / scale_factor

# Assuming 'num' is the number of JPEG files
num <- length(jpeg_files)
# Initialize the array to store preprocessed images as numeric
# if you plan to convert raw values to numeric for processing
im <- array(dim = c(num, new_height, new_width, 3))

# Loop to read, resize, and store each image
for (i in seq_along(jpeg_files)) {
  file_path <- paste0(getwd(), "/", jpeg_files[i])
  image <- image_read(file_path)
  image <- image_resize(image, paste0(new_width, "x", new_height))
  
  # Within your loop, after resizing the image
  image_data <- magick::image_data(image)
  
  # Convert raw data to numeric if necessary, before storing it in the array
  # This step is crucial if `preprocessed_images` is initialized as a numeric array
  numeric_image_data <- as.numeric(image_data)

 
  im[i,,,] <- numeric_image_data
}

# 'preprocessed_images' now contains all the resized images in an array format

# Initialize a matrix 'flat' with zeros. This matrix will have 20 rows (one for each image)
# and a number of columns equal to the total number of elements in a single flattened image
# 'prod(dim(im))' calculates the total number of elements in the 3D array of a single image
flat = matrix(0, 20, prod(dim(im)))

# Loop through the first 20 image files
for (i in 1:20) {
  # Read the JPEG image anew; this step seems redundant if 'im' already contains the processed images
  # newim <- readJPEG(paste0(path, files[i]))
  
  # Extract and flatten each color channel from the preprocessed image stored in 'im'
  # It's unclear why 'newim' is read but not used; possibly an oversight or meant to use 'im' directly
  r = as.vector(im[i,,,1]) # Red channel
  g = as.vector(im[i,,,2]) # Green channel
  b = as.vector(im[i,,,3]) # Blue channel
  
  # Combine the flattened color channels and assign to the ith row of 'flat'
  # 't()' is used to transpose the combined vector, ensuring it fits as a row in the 'flat' matrix
  flat[i,] <- t(c(r, g, b))
}

# Transpose 'flat' and convert it into a data frame 'shoes'
# This transformation prepares the data for analysis, with each column representing a feature (pixel intensity)
shoes = as.data.frame(t(flat))

# Set up plotting parameters for a 5x4 grid layout
# This allows displaying 20 images in a grid of 5 rows and 4 columns
par(mfrow = c(5, 4))

# Set the margins around each plot to be small so that the images are closer together
par(mai = c(.1, .1, .1, .1))

# Loop through the first 20 images
for (i in 1:20) {  
  # Intended to plot the i-th image using the custom 'plot_jpeg' function
  # 'writeJPEG' is incorrectly used here; it should be replaced with direct image plotting logic
  plot_jpeg(writeJPEG(im[i,,,]))
}

# Scale the 'shoes' data frame
# 'scale' function standardizes the dataset such that each feature (column) has a mean of 0 and a standard deviation of 1
# This is a common preprocessing step in many data analysis and machine learning tasks
scaled = scale(shoes, center = TRUE, scale = TRUE)

# Retrieve and save the original means of the features from the 'shoes' dataset
# 'attr' function is used to access specific attributes of the result from 'scale' function
# "scaled:center" contains the mean values of each feature before scaling
mean.shoe = attr(scaled, "scaled:center")

# Retrieve and save the original standard deviations of the features from the 'shoes' dataset
# "scaled:scale" contains the standard deviation values of each feature before scaling
std.shoe = attr(scaled, "scaled:scale")

# Calculate the correlation matrix of the 'scaled' dataset
# The 'cor' function computes pairwise correlation coefficients between all columns (features) in the dataset
# 'scaled' is expected to be a standardized dataset where each feature has mean 0 and standard deviation 1
# The resulting correlation matrix 'Sigma_' provides insight into the linear relationships between the features

Sigma_ = cor(scaled)

# Perform eigenvalue decomposition on the correlation matrix 'Sigma_'
# 'eigen' function computes eigenvalues and eigenvectors of a square matrix
myeigen = eigen(Sigma_)

# Calculate the cumulative sum of eigenvalues
# This represents the cumulative variance explained by the principal components (eigenvectors)
# Dividing the cumulative sum by the total sum of eigenvalues normalizes these values
# The result shows the proportion of total variance explained as we include more components
cumulative_variance_explained = cumsum(myeigen$values) / sum(myeigen$values)

# Compute a scaling matrix from the first 5 eigenvalues
# 'diag(myeigen$values[1:5]^(-1/2))' creates a diagonal matrix with the inverse square root of the top 5 eigenvalues
# This scaling is part of creating a whitening transformation, which normalizes the variance along principal components
scaling = diag(myeigen$values[1:5]^(-1/2)) / (sqrt(nrow(scaled) - 1))

# Project the 'scaled' data onto the subspace defined by the first 5 eigenvectors, then scale
# This results in 'eigenshoes', which are the coordinates in the new space, potentially for visualization or further analysis
eigenshoes = scaled %*% myeigen$vectors[, 1:5] %*% scaling

# Set up the plotting area to display multiple images (2 rows by 3 columns)
par(mfrow = c(2, 3))

# Define the dimensions for the images and the scaling factor
height = 1200
width = 2500
scale = 20

# Prepare the image data for PCA
# 'newdata' is assigned the value of 'im', presumably a 4-dimensional array of image data
newdata = im

# Reshape 'newdata' to a 2-dimensional matrix
# Each row represents an image, and each column represents a pixel value (RGB channels are flattened)
# The total number of columns is the product of height and width, adjusted by 'scale^2' to reduce dimensionality
dim(newdata) = c(length(files), height * width * 3 / scale^2)

# Perform PCA on the transposed matrix of 'newdata'
# 't(as.matrix(newdata))' transposes the data so that variables (pixels) become rows and observations (images) become columns
# This is required because 'princomp' expects variables as columns
# 'scores=TRUE' requests that principal component scores be calculated
# 'cor=TRUE' indicates that PCA should be done on the correlation matrix, not the covariance matrix, standardizing variables
mypca = princomp(t(as.matrix(newdata)), scores = TRUE, cor = TRUE)

###################Eigenshoes###################################
mypca2=t(mypca$scores)
dim(mypca2)=c(length(files),height/scale,width/scale,3)
par(mfrow=c(5,5))
par(mai=c(.001,.001,.001,.001))
for (i in 1:20){#plot the first 25 Eigenshoes only
plot_jpeg(writeJPEG(mypca2[i,,,], bg="white"))  #complete without reduction
}
################################################################

#################################################
a=round(mypca$sdev[1:20]^2/ sum(mypca$sdev^2),3)
cumsum(a)

##  Comp.1  Comp.2  Comp.3  Comp.4  Comp.5  Comp.6  Comp.7  Comp.8  Comp.9 Comp.10 
##   0.735   0.839   0.886   0.910   0.927   0.943   0.954   0.962   0.969   0.975 
## Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17 Comp.18 Comp.19 Comp.20 
##   0.980   0.985   0.989   0.993   0.996   0.999   1.000   1.000   1.000   1.000

# Assuming 'a' is a vector of the variances explained by each PC and is defined previously.
# Cumulative proportion of variance explained
var_explained <- cumsum(a)

# Number of components
components <- 1:length(var_explained)

# Plot the scree plot
plot(components, var_explained, type = 'b', 
     xlab = "Principal Components", ylab = "Cumulative Proportion of Variance Explained",
     main = "Scree Plot")

# Draw a line at the 80% threshold
abline(h = 0.8, col = 'red', lty = 2)

# Find the component where the cumulative proportion first reaches 0.8
threshold <- which(var_explained >= 0.8)[1]

# Draw a vertical line at the 80% threshold component
abline(v = threshold, col = 'blue', lty = 2)

# Add a point to indicate the exact location where 80% is reached
points(threshold, var_explained[threshold], col = 'blue', pch = 19)

# Annotate the plot with the component number where 80% variance is explained
text(threshold, 0.8, labels = paste("PC", threshold, "- 80%"), pos = 3, col = 'blue')

x = t(t(eigenshoes) %*% scaled)

# Display 'x' to observe the outcome of the operation:
x

##          [,1]       [,2]       [,3]        [,4]       [,5]
## V1  -595.8520  -51.92918 -111.26809  132.363385 -105.94882
## V2  -608.1582  202.70389  -75.28684   76.390479  -17.51057
## V3  -608.1582  202.70389  -75.28684   76.390479  -17.51057
## V4  -464.2602 -282.33284 -208.43546  263.608240  -43.19245
## V5  -563.0754  254.16556  -79.44150  -97.137565   94.80717
## V6  -592.9561  200.19157  -19.24560  -30.175694  -73.09495
## V7  -492.6311 -301.56216 -277.10971 -113.401823   38.71332
## V8  -524.9344 -280.82976 -236.47672  -84.021547   24.16985
## V9  -610.8360  108.82335 -179.55330 -103.316274   67.18046
## V10 -532.9606  335.45856  -92.54863 -106.993292   74.49034
## V11 -598.0217  227.61765   28.25038   19.223262  -75.51201
## V12 -596.8751  208.86604  109.78719   34.450274  -72.78675
## V13 -596.8751  208.86604  109.78719   34.450274  -72.78675
## V14 -600.6475 -150.50918   66.73737  101.153170  105.42053
## V15 -562.6368 -263.61091  125.20834 -133.401536 -133.85844
## V16 -562.6368 -263.61091  125.20834 -133.401536 -133.85844
## V17 -571.9670 -175.33254  192.73867    9.088176  142.34677
## V18 -593.7684  -72.37707  176.80126   54.254915  151.10760
## V19 -603.6793 -138.86705  150.57790  -68.839131  -26.61154
## V20 -592.6462 -112.98911  160.94186   66.488320   77.99205

Assignment4

Fomba Kassoh

2024-02-25