# Load needed libraries
library(tidyverse)
library(readr)
library(knitr)
library(OpenImageR)
library(Matrix)
  1. Go to Kaggle.com and build an account if you do not already have one. It is free.

  2. Go to https://www.kaggle.com/c/digit-recognizer/overview, accept the rules of the competition, and download the data. You will not be required to submit work to Kaggle, but you do need the data.

# filename <- "/Users/Audiorunner13/CUNY MSDS Course Work/DATA605 Fall 2022/Week 11/HR Tracker.csv"
filename <- tempfile()
download.file("https://raw.githubusercontent.com/audiorunner13/Masters-Coursework/main/DATA605%20Fall%202022/FinalExam/FinalProblem2/digit-recognizer/train.csv",filename)
train_pixel_src <- read.csv(filename)
  1. Using the training.csv file, plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.) (5 points)
df_label_src <- data.frame(train_pixel_src)
df_label_labels <- df_label_src[1]
df_label_div <- (df_label_src[2:783]/255)
df_label <- cbind(df_label_labels,df_label_div)
  1. What is the frequency distribution of the numbers in the dataset? (5 points)
df_label_50_plot <- head(df_label_src,50)
ggplot(df_label_50_plot,aes(x=label)) + geom_freqpoly(bins=10)

  1. For each number, provide the mean pixel intensity. What does this tell you? (5 points)
(df_label_grp <- df_label %>% group_by(label) %>% summarize(grp_mean=sum(df_label[2:783])/42000) %>% as.data.frame())
##    label grp_mean
## 1      0  102.716
## 2      1  102.716
## 3      2  102.716
## 4      3  102.716
## 5      4  102.716
## 6      5  102.716
## 7      6  102.716
## 8      7  102.716
## 9      8  102.716
## 10     9  102.716

I can only surmise that the pixel intensity for all number images is the same.

  1. Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why? (5 points)
all_images_data <- t(df_label)
dim(all_images_data)
## [1]   783 42000
# dim(all_images_data)
# head(all_images_data)
# Scale the image data before applying the singular value decomposition (svd) function
v_scaled_data <- scale(all_images_data)

# v_scaled_data[is.nan(df_label)] = 0 # if a value is null then set equal to 0

v_svd_decomp <- svd(all_images_data)
plot(v_svd_decomp$d^2/sum(v_svd_decomp$d^2), type="b",xlab = "Column", ylab = "Prop. of variance explained", pch = 19)

v_var_pct <- 0.95
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: 81"
v_var_pct <- 1.0
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: NA"
v_var_pct <- .999999999999
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: 705"

The closest that we can get to 100% is 705 components. The further to the right of the decimal point the number of vectors does not change from 705.

#  Read and Plot the first image (.jpg)
v_image <- df_label[1,2:783]

# Get the dimension for the first image
dim(v_image)
## [1]   1 782
  1. Plot the first 10 images generated by PCA. They will appear to be noise. Why? (5 points)
# Save the row,col,channels dimension
v_row_dim <- dim(v_image)[1]
v_col_dim <- dim(v_image)[2]
# v_channel_dim <- 0 #dim(v_image)[3]

print(v_row_dim); print(v_col_dim); # print(v_channel_dim)
## [1] 1
## [1] 782
# plot the image
# imageShow(v_image)

I get the following error but I do not know how to determine the 3rd dimension; Error in Array_range(image, 1) : Error converting object to arma::Cube: Input array must have exactly 3 dimensions.

# Reconstruct all shoes using only a subset of columns which explain X% of variability.

v_newimage <- v_svd_decomp$u[, 1:v_vectors] %*% diag(v_svd_decomp$d[1:v_vectors]) %*% t(v_svd_decomp$v[,1:v_vectors])

# Select ONE shoe of the reconstructed matrix and plot it.
# imageShow(array(v_newimage[,1],c(v_row_dim, v_col_dim, v_channel_dim)))

I am not able to plot the values. I am getting the error: Error in Array_range(image, 1) : Cube::slice(): index out of bounds. It related to not being able to determine the 3rd dimension.

  1. Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see? (5 points)
df_label_8s <- df_label_src[df_label_src$label==8,]
df_label_8s_label <- df_label_8s[1]
df_label_8s_div <- (df_label_8s[2:783]/255)
df_label_8s <- cbind(df_label_8s_label,df_label_8s_div)
all_images_data_8s <- t(df_label_8s)
dim(all_images_data_8s)
## [1]  783 4063
# dim(all_images_data)
# head(all_images_data)
# Scale the image data before applying the singular value decomposition (svd) function
v_scaled_data <- scale(all_images_data_8s)

# v_scaled_data[is.nan(df_label)] = 0 # if a value is null then set equal to 0

v_svd_decomp <- svd(all_images_data_8s)
plot(v_svd_decomp$d^2/sum(v_svd_decomp$d^2), type="b",xlab = "Column", ylab = "Prop. of variance explained", pch = 19)

v_var_pct <- 0.95
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: 47"
v_var_pct <- 1.0
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: NA"
v_var_pct <- .99999999999
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))
## [1] "Vectors to use: 538"

Even with a smaller training dataset the results are almost identical as you can see from the plot of the variances explained.

  1. An incorrect approach to predicting the images would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal. (10 points)