Final Problem 2

# Load needed libraries
library(tidyverse)
library(readr)
library(knitr)
library(OpenImageR)
library(Matrix)

Go to Kaggle.com and build an account if you do not already have one. It is free.
Go to https://www.kaggle.com/c/digit-recognizer/overview, accept the rules of the competition, and download the data. You will not be required to submit work to Kaggle, but you do need the data.

# filename <- "/Users/Audiorunner13/CUNY MSDS Course Work/DATA605 Fall 2022/Week 11/HR Tracker.csv"
filename <- tempfile()
download.file("https://raw.githubusercontent.com/audiorunner13/Masters-Coursework/main/DATA605%20Fall%202022/FinalExam/FinalProblem2/digit-recognizer/train.csv",filename)
train_pixel_src <- read.csv(filename)

Using the training.csv file, plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.) (5 points)

df_label_src <- data.frame(train_pixel_src)

df_label_labels <- df_label_src[1]
df_label_div <- (df_label_src[2:783]/255)
df_label <- cbind(df_label_labels,df_label_div)

What is the frequency distribution of the numbers in the dataset? (5 points)

df_label_50_plot <- head(df_label_src,50)
ggplot(df_label_50_plot,aes(x=label)) + geom_freqpoly(bins=10)

For each number, provide the mean pixel intensity. What does this tell you? (5 points)

(df_label_grp <- df_label %>% group_by(label) %>% summarize(grp_mean=sum(df_label[2:783])/42000) %>% as.data.frame())

##    label grp_mean
## 1      0  102.716
## 2      1  102.716
## 3      2  102.716
## 4      3  102.716
## 5      4  102.716
## 6      5  102.716
## 7      6  102.716
## 8      7  102.716
## 9      8  102.716
## 10     9  102.716

I can only surmise that the pixel intensity for all number images is the same.

Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why? (5 points)

all_images_data <- t(df_label)
dim(all_images_data)

## [1]   783 42000

# dim(all_images_data)
# head(all_images_data)

# Scale the image data before applying the singular value decomposition (svd) function
v_scaled_data <- scale(all_images_data)

# v_scaled_data[is.nan(df_label)] = 0 # if a value is null then set equal to 0

v_svd_decomp <- svd(all_images_data)

plot(v_svd_decomp$d^2/sum(v_svd_decomp$d^2), type="b",xlab = "Column", ylab = "Prop. of variance explained", pch = 19)

v_var_pct <- 0.95
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))

## [1] "Vectors to use: 81"

v_var_pct <- 1.0
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))

## [1] "Vectors to use: NA"

v_var_pct <- .999999999999
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))

## [1] "Vectors to use: 705"

The closest that we can get to 100% is 705 components. The further to the right of the decimal point the number of vectors does not change from 705.

#  Read and Plot the first image (.jpg)
v_image <- df_label[1,2:783]

# Get the dimension for the first image
dim(v_image)

## [1]   1 782

Plot the first 10 images generated by PCA. They will appear to be noise. Why? (5 points)

# Save the row,col,channels dimension
v_row_dim <- dim(v_image)[1]
v_col_dim <- dim(v_image)[2]
# v_channel_dim <- 0 #dim(v_image)[3]

print(v_row_dim); print(v_col_dim); # print(v_channel_dim)

## [1] 1

## [1] 782

# plot the image
# imageShow(v_image)

I get the following error but I do not know how to determine the 3rd dimension; Error in Array_range(image, 1) : Error converting object to arma::Cube: Input array must have exactly 3 dimensions.

# Reconstruct all shoes using only a subset of columns which explain X% of variability.

v_newimage <- v_svd_decomp$u[, 1:v_vectors] %*% diag(v_svd_decomp$d[1:v_vectors]) %*% t(v_svd_decomp$v[,1:v_vectors])

# Select ONE shoe of the reconstructed matrix and plot it.
# imageShow(array(v_newimage[,1],c(v_row_dim, v_col_dim, v_channel_dim)))

I am not able to plot the values. I am getting the error: Error in Array_range(image, 1) : Cube::slice(): index out of bounds. It related to not being able to determine the 3rd dimension.

Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see? (5 points)

df_label_8s <- df_label_src[df_label_src$label==8,]
df_label_8s_label <- df_label_8s[1]
df_label_8s_div <- (df_label_8s[2:783]/255)
df_label_8s <- cbind(df_label_8s_label,df_label_8s_div)

all_images_data_8s <- t(df_label_8s)
dim(all_images_data_8s)

## [1]  783 4063

# dim(all_images_data)
# head(all_images_data)

# Scale the image data before applying the singular value decomposition (svd) function
v_scaled_data <- scale(all_images_data_8s)

# v_scaled_data[is.nan(df_label)] = 0 # if a value is null then set equal to 0

v_svd_decomp <- svd(all_images_data_8s)

plot(v_svd_decomp$d^2/sum(v_svd_decomp$d^2), type="b",xlab = "Column", ylab = "Prop. of variance explained", pch = 19)

v_var_pct <- 0.95
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))

## [1] "Vectors to use: 47"

v_var_pct <- 1.0
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))

## [1] "Vectors to use: NA"

v_var_pct <- .99999999999
v_vectors <- which(cumsum(v_svd_decomp$d^2/sum(v_svd_decomp$d^2)) >= v_var_pct)[1]
print(paste0("Vectors to use: ", v_vectors))

## [1] "Vectors to use: 538"

Even with a smaller training dataset the results are almost identical as you can see from the plot of the variances explained.

An incorrect approach to predicting the images would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal. (10 points)

Final Problem 2

Peter Gatica

2022-12-18