# Load necessary libraries
library(stats) # For PCA
library(jpeg) # For reading and writing JPEG images
library(abind) # For array binding
library(ggplot2) # For plotting
library(reshape2) # For reshaping data
library(magick) # For image processing
## Linking to ImageMagick 6.9.12.98
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11
# Set the number of images to read
num <- 20
# List the first 'num' JPEG files in the specified directory
# 'list.files' function is used to retrieve the names of files in the directory
# 'pattern' argument specifies to only list files that end with '.jpg'
# '[1:num]' selects only the first 'num' files from the list
path = "C:/Users/RemoteUser/Downloads/jpg/"
files <- list.files(path, pattern = "\\.jpg$")[1:num]
# Set the target height, width, and scale for the images
height = 1200; width = 2500; scale = 20
# Define a function to plot JPEG images in an R plotting window
plot_jpeg = function(path, add = FALSE) {
# Read the JPEG file from the given path
# 'native = TRUE' returns a native raster, which draws faster
jpg = readJPEG(path, native = TRUE)
# Get the resolution of the image, flipping dimensions from [height, width] to [x, y] for plotting
res = dim(jpg)[2:1]
# If 'add' is FALSE, set up a new plot area
# 'plot' initializes an empty plot with the specified limits and a 1:1 aspect ratio
# 'type = 'n'' sets up the plot without drawing any data
# 'xaxs = 'i'' and 'yaxs = 'i'' make the axes span the data range exactly, with no padding
# 'xaxt = 'n'' and 'yaxt = 'n'' suppress the x and y axes
# 'xlab = ''', 'ylab = ''', and 'bty = 'n'' remove the labels and the box around the plot
if (!add)
plot(1, 1, xlim = c(1, res[1]), ylim = c(1, res[2]), asp = 1, type = 'n', xaxs = 'i', yaxs = 'i', xaxt = 'n', yaxt = 'n', xlab = '', ylab = '', bty = 'n')
# Plot the image within the dimensions specified by 'res'
rasterImage(jpg, 1, 1, res[1], res[2])
}
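# A quick check of 'plot_jpeg' (a minimal sketch, assuming 'path' and 'files'
# above point at readable JPEG files): draw the first image on its own
plot_jpeg(paste0(path, files[1]))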
# Load necessary library
# if (!require("magick")) install.packages("magick")
library(magick)
# Parameters for preprocessing
height <- 1200
width <- 2500
scale_factor <- 20
# Set the working directory
setwd(path)
# List JPEG files in the directory
jpeg_files <- list.files(pattern = "\\.jpg$")
# Initialize the array to store preprocessed images
# Calculate new dimensions
new_height <- height / scale_factor
new_width <- width / scale_factor
# Redefine 'num' as the number of JPEG files actually found
num <- length(jpeg_files)
# Initialize a numeric array to store the preprocessed images
# Dimensions: [image, height, width, channel]
im <- array(dim = c(num, new_height, new_width, 3))
# Loop to read, resize, and store each image
for (i in seq_along(jpeg_files)) {
file_path <- file.path(getwd(), jpeg_files[i])
image <- image_read(file_path)
# The trailing '!' forces the exact geometry; by default magick preserves the
# aspect ratio, which could leave the image smaller than the array slice
image <- image_resize(image, paste0(new_width, "x", new_height, "!"))
# 'image_data' returns a raw array with dimensions [channel, width, height]
image_data <- magick::image_data(image, channels = "rgb")
# Convert the raw bytes to numeric in [0, 1], restore the dimensions,
# and reorder them to [height, width, channel] to match 'im'
numeric_image_data <- as.numeric(image_data) / 255
dim(numeric_image_data) <- dim(image_data)
im[i,,,] <- aperm(numeric_image_data, c(3, 2, 1))
}
# 'im' now contains all the resized images in a single 4-D array
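# Sanity check on the preprocessed array (no new assumptions): confirm the
# dimensions and that the pixel values landed in [0, 1]
dim(im) # expected: num x new_height x new_width x 3
range(im) # expected: within [0, 1]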
# Initialize a matrix 'flat' with zeros: one row per image and one column for
# each element of a single flattened image (height x width x 3 channels)
# 'prod(dim(im)[-1])' counts the elements in one image, excluding the image index
flat = matrix(0, num, prod(dim(im)[-1]))
# Loop through the images
for (i in 1:num) {
# Extract and flatten each color channel of the ith preprocessed image in 'im'
r = as.vector(im[i,,,1]) # Red channel
g = as.vector(im[i,,,2]) # Green channel
b = as.vector(im[i,,,3]) # Blue channel
# Concatenate the flattened channels and assign them to the ith row of 'flat'
flat[i,] <- c(r, g, b)
}
# Transpose 'flat' and convert it into a data frame 'shoes'
# After the transpose, each column represents an image and each row a pixel intensity
shoes = as.data.frame(t(flat))
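# Verify the reshaped data (no new assumptions): rows should equal the number
# of values per image and columns the number of images
dim(shoes) # expected: (new_height * new_width * 3) x num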
# Set up plotting parameters for a 5x4 grid layout
# This allows displaying 20 images in a grid of 5 rows and 4 columns
par(mfrow = c(5, 4))
# Set the margins around each plot to be small so that the images are closer together
par(mai = c(.1, .1, .1, .1))
# Loop through the images
for (i in 1:num) {
# 'writeJPEG' with no target returns the compressed image as a raw vector,
# which 'readJPEG' inside 'plot_jpeg' can decode directly
plot_jpeg(writeJPEG(im[i,,,]))
}

# Scale the 'shoes' data frame
# 'scale' standardizes each column (here, each image vector) to mean 0 and standard deviation 1
# This is a common preprocessing step before PCA
scaled = scale(shoes, center = TRUE, scale = TRUE)
# Retrieve and save the original column means of 'shoes'
# 'attr' accesses attributes stored by 'scale'; "scaled:center" holds the pre-scaling means
mean.shoe = attr(scaled, "scaled:center")
# Retrieve and save the original column standard deviations
# "scaled:scale" holds the pre-scaling standard deviations
std.shoe = attr(scaled, "scaled:scale")
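# These attributes make the standardization invertible, which is needed later to
# map reconstructions back to pixel intensities. A small check using only
# objects defined above:
unscaled <- sweep(sweep(scaled, 2, std.shoe, "*"), 2, mean.shoe, "+")
all.equal(as.matrix(shoes), unscaled, check.attributes = FALSE) # expected: TRUE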
# Calculate the correlation matrix of the 'scaled' dataset
# 'cor' computes pairwise correlations between all columns (here, the 20 images)
# Since 'scaled' is already standardized, this equals its covariance matrix
# The resulting 'Sigma_' captures the linear relationships between the images
Sigma_ = cor(scaled)
# Perform eigenvalue decomposition on the correlation matrix 'Sigma_'
# 'eigen' function computes eigenvalues and eigenvectors of a square matrix
myeigen = eigen(Sigma_)
# Calculate the cumulative sum of eigenvalues
# This represents the cumulative variance explained by the principal components (eigenvectors)
# Dividing the cumulative sum by the total sum of eigenvalues normalizes these values
# The result shows the proportion of total variance explained as we include more components
cumulative_variance_explained = cumsum(myeigen$values) / sum(myeigen$values)
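# How many components are needed for a given share of the variance? A small,
# assumption-free check on the vector just computed:
which(cumulative_variance_explained >= 0.8)[1] # first component count reaching 80%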
# Compute a scaling matrix from the first 5 eigenvalues
# 'diag(myeigen$values[1:5]^(-1/2))' creates a diagonal matrix with the inverse square root of the top 5 eigenvalues
# This scaling is part of creating a whitening transformation, which normalizes the variance along principal components
scaling = diag(myeigen$values[1:5]^(-1/2)) / (sqrt(nrow(scaled) - 1))
# Project the 'scaled' data onto the subspace defined by the first 5 eigenvectors, then scale
# This results in 'eigenshoes', which are the coordinates in the new space, potentially for visualization or further analysis
eigenshoes = scaled %*% myeigen$vectors[, 1:5] %*% scaling
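# Because of the whitening step, the eigenshoe coordinates should be
# uncorrelated with unit scale. A quick verification using only objects
# defined above: the cross-product should be close to the 5 x 5 identity
round(t(eigenshoes) %*% eigenshoes, 4)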
# Set up the plotting area to display multiple images (2 rows by 3 columns)
par(mfrow = c(2, 3))
# Define the dimensions for the images and the scaling factor
height = 1200
width = 2500
scale = 20
# Prepare the image data for PCA
# 'im' is the 4-dimensional array of preprocessed images built above
newdata = im
# Reshape 'newdata' into a 2-dimensional matrix
# Each row represents an image; each column one pixel value (RGB channels flattened)
# Each image was downscaled by 'scale' in both dimensions, so the number of
# columns is (height / scale) * (width / scale) * 3 = height * width * 3 / scale^2
dim(newdata) = c(length(files), height * width * 3 / scale^2)
# Perform PCA on the transposed matrix of 'newdata'
# 't(as.matrix(newdata))' makes the pixels the observations (rows) and the images
# the variables (columns), since 'princomp' expects variables as columns and
# requires more observations than variables
# 'scores = TRUE' requests the principal component scores
# 'cor = TRUE' runs PCA on the correlation matrix rather than the covariance matrix, standardizing the variables
mypca = princomp(t(as.matrix(newdata)), scores = TRUE, cor = TRUE)
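# Shape check on the PCA output (no new assumptions): scores have one row per
# pixel and one column per component
dim(mypca$scores) # expected: (height * width * 3 / scale^2) x length(files)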
###################Eigenshoes###################################
mypca2 = t(mypca$scores)
dim(mypca2) = c(length(files), height / scale, width / scale, 3)
par(mfrow = c(5, 4))
par(mai = c(.001, .001, .001, .001))
for (i in 1:20) { # plot all 20 eigenshoes
plot_jpeg(writeJPEG(mypca2[i,,,], bg = "white")) # complete without reduction
}
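# 'writeJPEG' expects values in [0, 1], and raw principal-component scores can
# fall outside that range. A hedged sketch of a min-max rescale in case the
# call above errors on out-of-range values ('rescale01' is a hypothetical
# helper, not part of the original pipeline):
rescale01 <- function(v) (v - min(v)) / (max(v) - min(v))
# plot_jpeg(writeJPEG(rescale01(mypca2[1,,,]), bg = "white"))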
################################################################

#################################################
a = round(mypca$sdev[1:20]^2 / sum(mypca$sdev^2), 3)
cumsum(a)
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
## 0.735 0.839 0.886 0.910 0.927 0.943 0.954 0.962 0.969 0.975
## Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17 Comp.18 Comp.19 Comp.20
## 0.980 0.985 0.989 0.993 0.996 0.999 1.000 1.000 1.000 1.000
# 'a' (computed above) holds the proportion of total variance explained by each
# of the first 20 principal components
# Cumulative proportion of variance explained
var_explained <- cumsum(a)
# Number of components
components <- seq_along(var_explained)
# Plot the scree plot
plot(components, var_explained, type = 'b',
xlab = "Principal Components", ylab = "Cumulative Proportion of Variance Explained",
main = "Scree Plot")
# Draw a line at the 80% threshold
abline(h = 0.8, col = 'red', lty = 2)
# Find the component where the cumulative proportion first reaches 0.8
threshold <- which(var_explained >= 0.8)[1]
# Draw a vertical line at the 80% threshold component
abline(v = threshold, col = 'blue', lty = 2)
# Add a point to indicate the exact location where 80% is reached
points(threshold, var_explained[threshold], col = 'blue', pch = 19)
# Annotate the plot with the component number where 80% variance is explained
text(threshold, 0.8, labels = paste("PC", threshold, "- 80%"), pos = 3, col = 'blue')
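# The same scree plot drawn with ggplot2 (loaded above); a minimal sketch using
# only 'components' and 'var_explained' from this section:
scree_df <- data.frame(component = components, cumulative = var_explained)
ggplot(scree_df, aes(x = component, y = cumulative)) +
geom_line() +
geom_point() +
geom_hline(yintercept = 0.8, linetype = "dashed", color = "red") +
labs(x = "Principal Components", y = "Cumulative Proportion of Variance Explained",
title = "Scree Plot")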

# Project the scaled data onto the 5-dimensional eigenshoe basis
# Each row of 'x' holds the coordinates of one image (V1..V20) in that basis
x = t(t(eigenshoes) %*% scaled)
# Display 'x' to observe the outcome of the operation:
x
## [,1] [,2] [,3] [,4] [,5]
## V1 -595.8520 -51.92918 -111.26809 132.363385 -105.94882
## V2 -608.1582 202.70389 -75.28684 76.390479 -17.51057
## V3 -608.1582 202.70389 -75.28684 76.390479 -17.51057
## V4 -464.2602 -282.33284 -208.43546 263.608240 -43.19245
## V5 -563.0754 254.16556 -79.44150 -97.137565 94.80717
## V6 -592.9561 200.19157 -19.24560 -30.175694 -73.09495
## V7 -492.6311 -301.56216 -277.10971 -113.401823 38.71332
## V8 -524.9344 -280.82976 -236.47672 -84.021547 24.16985
## V9 -610.8360 108.82335 -179.55330 -103.316274 67.18046
## V10 -532.9606 335.45856 -92.54863 -106.993292 74.49034
## V11 -598.0217 227.61765 28.25038 19.223262 -75.51201
## V12 -596.8751 208.86604 109.78719 34.450274 -72.78675
## V13 -596.8751 208.86604 109.78719 34.450274 -72.78675
## V14 -600.6475 -150.50918 66.73737 101.153170 105.42053
## V15 -562.6368 -263.61091 125.20834 -133.401536 -133.85844
## V16 -562.6368 -263.61091 125.20834 -133.401536 -133.85844
## V17 -571.9670 -175.33254 192.73867 9.088176 142.34677
## V18 -593.7684 -72.37707 176.80126 54.254915 151.10760
## V19 -603.6793 -138.86705 150.57790 -68.839131 -26.61154
## V20 -592.6462 -112.98911 160.94186 66.488320 77.99205
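# To see what these 5 coordinates preserve, an image can be approximately
# reconstructed from them. A minimal sketch using only objects defined above;
# 'idx' is an arbitrary image index chosen for illustration:
idx <- 1
approx_scaled <- eigenshoes %*% x[idx, ] # back-project from the 5-D coordinates
approx_pixels <- approx_scaled * std.shoe[idx] + mean.shoe[idx] # undo the standardization
recon <- array(approx_pixels, dim = dim(im)[2:4]) # reshape to height x width x 3
recon <- pmin(pmax(recon, 0), 1) # clamp into [0, 1] for writeJPEG
plot_jpeg(writeJPEG(recon))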