# Load needed libraries
library(tidyverse)   # data manipulation and ggplot2 plotting
library(readr)       # fast CSV import
library(knitr)       # report rendering
library(OpenImageR)  # image display (imageShow)
library(Matrix)      # matrix utilities
Go to Kaggle.com and create a free account if you do not already have one. Then go to https://www.kaggle.com/c/digit-recognizer/overview, accept the competition rules, and download the data. You will not be required to submit work to Kaggle, but you do need the data.
# filename <- "/Users/Audiorunner13/CUNY MSDS Course Work/DATA605 Fall 2022/Week 11/HR Tracker.csv"
filename <- tempfile()
download.file("https://raw.githubusercontent.com/audiorunner13/Masters-Coursework/main/DATA605%20Fall%202022/FinalExam/FinalProblem2/digit-recognizer/train.csv",filename)
train_pixel_src <- read.csv(filename)
df_label_src <- data.frame(train_pixel_src)
df_label_labels <- df_label_src[1]           # first column holds the digit label
df_label_div <- (df_label_src[2:783]/255)    # rescale pixel intensities to [0, 1]
df_label <- cbind(df_label_labels,df_label_div)
# Distribution of labels in the first 50 training rows
df_label_50_plot <- head(df_label_src,50)
ggplot(df_label_50_plot,aes(x=label)) + geom_freqpoly(bins=10)
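Because the labels are the discrete digits 0 through 9, a bar chart over the full training set may show the class balance more directly than a frequency polygon of the first 50 rows. A minimal alternative sketch, using the same df_label_src object:
# Count of training images per digit label
ggplot(df_label_src, aes(x = factor(label))) +
  geom_bar() +
  labs(x = "digit", y = "count")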
(df_label_grp <- df_label %>% group_by(label) %>% summarize(grp_mean=sum(df_label[2:783])/42000) %>% as.data.frame())
## label grp_mean
## 1 0 102.716
## 2 1 102.716
## 3 2 102.716
## 4 3 102.716
## 5 4 102.716
## 6 5 102.716
## 7 6 102.716
## 8 7 102.716
## 9 8 102.716
## 10 9 102.716
At first glance this suggests the average pixel intensity is the same for every digit, but the identical values are an artifact of the calculation: sum(df_label[2:783])/42000 sums the entire data frame rather than the rows belonging to each group, so every label is assigned the same number. A genuine per-group summary is needed before drawing any conclusion about the digits.
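A corrected per-digit comparison would aggregate within each group. A minimal sketch using the objects defined above (the quantity computed here, the mean total pixel intensity per image for each label, is one reasonable choice, not the only one):
# Total intensity of each image, then the average of those totals within each label
v_row_sums <- rowSums(df_label[, -1])
data.frame(label = df_label$label, row_sum = v_row_sums) %>%
  group_by(label) %>%
  summarize(grp_mean = mean(row_sum))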
# Transpose so that each column is one image (label in the first row, pixel values below)
all_images_data <- t(df_label)
dim(all_images_data)
## [1] 783 42000
# dim(all_images_data)
# head(all_images_data)
# Compute a scaled copy of the image data (the pixel values were already rescaled
# to [0, 1]; note the SVD below is applied to the unscaled matrix)
v_scaled_data <- scale(all_images_data)
# v_scaled_data[is.nan(v_scaled_data)] <- 0 # replace NaNs from zero-variance columns with 0
v_svd_decomp <- svd(all_images_data)
plot(v_svd_decomp$d^2/sum(v_svd_decomp$d^2), type="b",xlab = "Column", ylab = "Prop. of variance explained", pch = 19)
v_var_pct <- 0.95
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: 81"
v_var_pct <- 1.0
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: NA"
v_var_pct <- .999999999999
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: 705"
A threshold of exactly 1.0 returns NA because, owing to floating-point rounding, the cumulative proportion of variance never quite reaches 1. Backing the threshold off to 0.999999999999 selects 705 components, and adding further nines does not change that count, so 705 components effectively capture all of the variance.
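A quick check of the top of the cumulative-variance curve makes this concrete (v_cum_var is a new name introduced here, not part of the code above):
# Cumulative proportion of variance explained by the singular vectors
v_cum_var <- cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2))
max(v_cum_var)                          # in this run it falls just short of 1, hence the NA at 1.0
which(v_cum_var >= 0.999999999999)[1]   # the same search as above, returning 705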
# Extract the first training image (one row of pixel values) and try to plot it
v_image <- df_label[1,2:783]
# Get the dimension for the first image
dim(v_image)
## [1] 1 782
# Save the row,col,channels dimension
v_row_dim <- dim(v_image)[1]
v_col_dim <- dim(v_image)[2]
# v_channel_dim <- 0 #dim(v_image)[3]
print(v_row_dim); print(v_col_dim); # print(v_channel_dim)
## [1] 1
## [1] 782
# plot the image
# imageShow(v_image)
Calling imageShow(v_image) fails with the error `Error in Array_range(image, 1) : Error converting object to arma::Cube`. The issue is the shape of the input rather than a missing third dimension: v_image is a 1 x 782 data frame row, while imageShow() needs image-shaped data, i.e. a rows x cols matrix (or a rows x cols x channels array). Each digit is a 28 x 28 grayscale image, so the 784 pixel values (columns 2:785 of train.csv) have to be reshaped into a 28 x 28 matrix before they can be displayed.
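A minimal sketch of that reshaping (not run as part of the analysis above; it assumes the full pixel range, columns 2:785, so the 784 values fill a square matrix):
# Reshape the 784 pixel values of the first image into a 28 x 28 matrix.
# MNIST pixels are stored row by row, so the matrix is filled by row.
v_image_vec <- as.numeric(unlist(df_label_src[1, 2:785])) / 255
v_image_mat <- matrix(v_image_vec, nrow = 28, ncol = 28, byrow = TRUE)
imageShow(v_image_mat)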
# Reconstruct all images using only the subset of columns that explains X% of the variability.
v_newimage <- v_svd_decomp$u[, 1:v_vectors] %*% diag(v_svd_decomp$d[1:v_vectors]) %*% t(v_svd_decomp$v[,1:v_vectors])
# Select ONE image from the reconstructed matrix and plot it.
# imageShow(array(v_newimage[,1],c(v_row_dim, v_col_dim, v_channel_dim)))
The reconstructed values cannot be plotted this way either; imageShow() raises `Error in Array_range(image, 1) : Cube::slice(): index out of bounds`. The cause is again the shape being passed in: v_row_dim and v_col_dim are 1 and 782 (the dimensions of a data frame row, not of the 28 x 28 image), and v_channel_dim was never given a valid value (its definition is commented out above), so the array handed to imageShow() has no displayable slice.
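A sketch of how one reconstructed digit could be displayed instead, assuming the decomposition is run on a matrix whose rows are the 784 pixel values only (no label row), so that each reconstructed column reshapes into 28 x 28; the names mirror the code above but this was not part of the original run:
# First column of the rank-k reconstruction, displayed as a 28 x 28 image
v_recon_col <- v_newimage[, 1]
v_recon_mat <- matrix(v_recon_col, nrow = 28, ncol = 28, byrow = TRUE)
v_recon_mat <- pmin(pmax(v_recon_mat, 0), 1)   # clip reconstruction noise back into [0, 1]
imageShow(v_recon_mat)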
# Repeat the analysis using only the images labeled 8
df_label_8s <- df_label_src[df_label_src$label==8,]
df_label_8s_label <- df_label_8s[1]            # label column
df_label_8s_div <- (df_label_8s[2:783]/255)    # rescale pixel intensities to [0, 1]
df_label_8s <- cbind(df_label_8s_label,df_label_8s_div)
all_images_data_8s <- t(df_label_8s)
dim(all_images_data_8s)
## [1] 783 4063
# dim(all_images_data)
# head(all_images_data)
# Compute a scaled copy of the image data (as before, the SVD below is applied
# to the unscaled matrix)
v_scaled_data <- scale(all_images_data_8s)
# v_scaled_data[is.nan(v_scaled_data)] <- 0 # replace NaNs from zero-variance columns with 0
v_svd_decomp <- svd(all_images_data_8s)
plot(v_svd_decomp$d^2/sum(v_svd_decomp$d^2), type="b",xlab = "Column", ylab = "Prop. of variance explained", pch = 19)
v_var_pct <- 0.95
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: 47"
v_var_pct <- 1.0
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: NA"
v_var_pct <- .99999999999
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: 538"
Even with the much smaller training set of only the 8s, the behavior is essentially the same: the proportion-of-variance plot has the same shape, a small number of components (47, versus 81 for the full data set) captures 95% of the variance, and the cumulative proportion again never reaches exactly 1.0, topping out at 538 components.
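The repeated threshold search could also be wrapped in a small helper function (hypothetical, not part of the original code) so the full data set and the 8s-only subset can be compared with a single call per threshold:
# Number of singular vectors needed to reach a given share of the variance
n_components <- function(svd_obj, var_pct) {
  prop_var <- svd_obj$d^2 / sum(svd_obj$d^2)
  which(cumsum(prop_var) >= var_pct)[1]
}
# Example usage (v_svd_decomp currently holds the decomposition of the 8s)
n_components(v_svd_decomp, 0.95)            # 47 in the run above
n_components(v_svd_decomp, 0.99999999999)   # 538 in the run above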