# Specifies the file path to the directory where JPG images are stored.
folder_path <- "/Users/jdr2158/jpg"
# Searches the specified folder for files ending in '.jpg',
jpg_files <- list.files(path = folder_path, pattern = "\\.jpg$", full.names = TRUE)
# Creates an empty list to hold image objects once they are loaded.
image_list <- list()
# Iterates over each file path in the 'jpg_files' array, loading each image into memory one at a time.
for (i in 1:length(jpg_files)) {
# Utilizes the 'load.image' function to read the current image file, converting it into an R image object.
img <- load.image(jpg_files[i])
# Inserts the newly loaded image object into the 'image_list',
# Preserves the order of images as found in the source folder.
image_list[[i]] <- img
}
# Displays the first image from the 'image_list' on the plotting area,
plot(image_list[[1]])
# Defines a function to plot a 3D array as an image, utilizing the EBImage package for image manipulation.
plot_jpg_array <- function(img_array) {
# Converts the input array into an EBImage object, specifying 'Color' as the color mode.
img_eb <- EBImage::Image(data = img_array, colormode = 'Color')
# Rotates the EBImage object by 90 degrees to ensure the image is oriented correctly for display.
img_rotated <- EBImage::rotate(img_eb, 90)
# Displays the rotated image on the screen, allowing for visual inspection or analysis.
EBImage::display(img_rotated)
}
# Initializes parameters for resizing images, setting specific height, width, and scale values.
height <- 1200
width <- 2500
scale <- 20
# Initializes a 4D array with dimensions tailored to store resized images,
# Based on the specified scale and the number of images.
im <- array(rep(0, length(image_list) * (height / scale) * (width / scale) * 3),
dim = c(length(image_list), height / scale, width / scale, 3))
# Iterates through each image in the 'image_list',
# Resizes each image and storing it within the pre-initialized 4D array.
for (i in 1:length(image_list)) {
# Retrieves the file path for the current image from 'jpg_files' and
# reads the image using the 'readJPEG' function.
tmp <- jpg_files[i]
img <- readJPEG(tmp)
# Resizes the current image to match the specified dimensions and stores the resized image in the 4D array.
temp <- EBImage::resize(as.Image(img), height / scale, width / scale)
im[i, , , ] <- array(temp@.Data, dim = c(1, height / scale, width / scale, 3))
}
# Configures plotting parameters to arrange the output in a 3x3 grid,
# Allows for multiple images to be displayed simultaneously.
par(mfrow=c(3,3))
# Sets the margins around each plot in the grid to ensure each image has adequate spacing and is clearly visible.
par(mai=c(0.3,0.3,0.3,0.3))
# Iterates through the images, up to a maximum of 9 (the capacity of the 3x3 grid),
# Calls the 'plot_jpg_array' function for each to display
for (i in 1:min(length(image_list), dim(im)[1])) {
plot_jpg_array(im[i, , , ])
}
# Sets the parameters for image dimensions and scaling factor for the analysis.
height = 1200
width = 2500
scale = 20
# Prepares data for PCA by assigning the 4D array of resized images to 'newdata' and reshapes to a 2D matrix.
newdata = im
dim(newdata) = c(length(jpg_files), height * width * 3 / scale^2)
# Executes the PCA on the transposed matrix of 'newdata',
# Calculates scores for each principal component and standardizing variables.
mypca = princomp(t(as.matrix(newdata)), scores = TRUE, cor = TRUE)
# Calculates and verifies that the sum of the squared standard deviations (variances) of the principal components
# equals 1.
sum(mypca$sdev^2 / sum(mypca$sdev^2))
## [1] 1
The first 2 components nearly make up 80% of the variability. The first 5 components make up almost 90% of the variability.
# Calculates the proportion of variance explained by each principal component relative to the total variance.
# This step quantifies the importance of each component in capturing the variability in the dataset.
mycomponents = mypca$sdev^2 / sum(mypca$sdev^2)
# Summarizes the cumulative variance explained by the first two principal components;
# The first two components make up 80% of the variability
sum(mycomponents[1:2])
## [1] 0.7940449
## [1] 0.8913841
# Create a data frame from the PCA scores for the first two components
pca_scores_df <- data.frame(PC1 = mypca$scores[, 1], PC2 = mypca$scores[, 2])
# Generate a scatter plot of the first two principal components
ggplot(pca_scores_df, aes(x = PC1, y = PC2)) +
geom_point(alpha = 0.5) + # Use semi-transparent points for better visibility of data density
theme_minimal() + # Apply a minimal theme for a clean and clear visualization
labs(title = "Distribution of Images in Principal Component Space",
subtitle = "Visualizing the spread and clustering of images based on the first two principal components",
x = "Principal Component 1 (PC1)",
y = "Principal Component 2 (PC2)",
caption = "Each point represents an image projected onto the first two principal components.") +
geom_hline(yintercept = 0, linetype = "dashed") + # Add a horizontal dashed line at y=0 for reference
geom_vline(xintercept = 0, linetype = "dashed") # Add a vertical dashed line at x=0 for reference
# Generate a Scree Plot to visualize the proportion of variance explained by each principal component
plot(mypca$sdev^2 / sum(mypca$sdev^2),
xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = 'b', # Use both points and lines
main = "Scree Plot: Variance Explained by Each Principal Component",
sub = "Identifies the optimal number of components to retain")
# Add a horizontal line at y=0.01 to highlight smaller contributions of variance
abline(h = 0.01, col = "red", lty = 2) # Dashed red line as a threshold indicator
# Calculate the cumulative variance explained by the principal components
cum_var_explained <- cumsum(mypca$sdev^2 / sum(mypca$sdev^2))
# Generate a plot to visualize the cumulative proportion of variance explained by the principal components
plot(cum_var_explained,
xlab = "Number of Principal Components",
ylab = "Cumulative Proportion of Variance Explained",
type = 'b', # Use both points and lines for visualization
main = "Cumulative Variance Explained by PCA Components",
sub = "Assessing the amount of information retained across components")
# Generate a heatmap to visualize the loadings of each principal component
heatmap(t(mypca$loadings),
Rowv = NA,
Colv = NA,
scale = "column",
main = "PCA Loadings: Variable Contributions to Components",
sub = "The influence of original variables on principal components")
# Calculates the proportion of variance explained by each principal component
# by dividing the square of each singular value (sdev^2) by the sum of all squared singular values.
# This step quantifies how much of the total dataset variability each principal component captures.
var_explained <- mypca$sdev^2 / sum(mypca$sdev^2)
# Calculates the cumulative variance explained by summing the variance explained by each component up to that point.
# This cumulative measure helps to understand how much of the total dataset variability
# is captured as more components are considered.
cum_var_explained <- cumsum(var_explained)
# Identifies the minimum number of principal components required to reach or exceed 80% of the total variance.
# Used for dimensionality reduction, aiming to retain most of the information while using fewer components.
num_components_80pct <- which(cum_var_explained >= 0.8)[1]
# Extracts the loadings for the principal components that contribute to at least 80% of the total variance.
# Loadings indicate the contribution of each original variable (e.g., pixel intensity in image analysis) to the principal component
loadings_80pct <- mypca$loadings[, 1:num_components_80pct]
# Prints the loadings for the principal components that together account for almost 80% variability.
# Visualizes the variables' contributions to the identified components.
print(loadings_80pct)
## Comp.1 Comp.2 Comp.3
## [1,] 0.2515577 0.05962807 0.14114605
## [2,] 0.2564669 -0.22970932 0.09482706
## [3,] 0.1974907 0.34526438 0.24576573
## [4,] 0.2391458 -0.30516320 0.13606194
## [5,] 0.2525203 -0.23895414 0.06096558
## [6,] 0.2096918 0.34776361 0.42324640
## [7,] 0.2220439 0.32176935 0.36923615
## [8,] 0.2597468 -0.13861061 0.27362524
## [9,] 0.2242754 -0.39008169 0.17677165
## [10,] 0.2523894 -0.26939880 -0.02645111
## [11,] 0.2504276 -0.23813195 -0.14578328
## [12,] 0.2541524 0.16064493 -0.16973475
## [13,] 0.2374627 0.25443032 -0.18739393
## [14,] 0.2431988 0.17131145 -0.34992145
## [15,] 0.2531910 0.06188346 -0.32463869
## [16,] 0.2571186 0.11980858 -0.24383184
## [17,] 0.2513643 0.10507094 -0.30609685
# Transposes the PCA scores matrix to align with the original image data structure,
mypca2 = t(mypca$scores)
# Reshapes the transposed PCA scores to match the original dimensions of the images, adjusted by the scale factor.
dim(mypca2) = c(length(jpg_files), height / scale, width / scale, 3)
# Prepares the plotting area to display the first 17 principal components as images in a 5x5 grid, leaving some grid spaces empty
par(mfrow = c(5, 5))
par(mai = c(0.001, 0.001, 0.001, 0.001)) # Sets the margins around each plot to minimal values for a compact display.
# Iterates through the first 17 principal components to plot them as images.
# This loop visualizes the components, revealing the patterns or features that each component captures.
for (i in 1:17) {
# Rescales each principal component's image data to a [0, 1] range
pc_image = mypca2[i, , , ]
min_val = min(pc_image)
max_val = max(pc_image)
rescaled_pc_image = (pc_image - min_val) / (max_val - min_val)
# Plots the rescaled principal component as an image, using a custom plotting function designed for this purpose.
# This visualization allows for the examination of what image features or patterns are most prominent.
plot_jpg_array(rescaled_pc_image)
}