library(jpeg)         # for readJPEG
library(OpenImageR)   # for flipImage
library(ggplot2)      # for qplot
library(tidyverse)    # for %>%
library(caret)        # for predict
library(nnet)         # for multinom





Principal Component Analysis and Confusion Matrix





Prepare Data



Bring in the Training and Test Data Sets.


#  The first column of the Training set is the label, "zero" through "nine"

# Note: file sizes are too big for free github accounts

base_path="C:\\Users\\arono\\source\\R\\Data605\\Final\\Problem 2 PCA Images\\"
pathnpic<-paste0(base_path,"train.csv" )
train_df=read.csv(pathnpic, stringsAsFactors=F)  #read in the raw data

train_labels=train_df[,1] 
train_df[,1]=NULL  


pathnpic<-paste0(base_path,"test.csv" )
test_df=read.csv(pathnpic, stringsAsFactors=F)


train_vec=as.vector(unlist(train_df)) 
train_arr=array(train_vec, dim=c(nrow(train_df),28,28))

test_vec=as.vector(unlist(test_df)) 
test_arr=array(test_vec, dim=c(nrow(test_df),28,28))

# free memory by removing objects that are no longer needed

rm(test_vec)
rm(train_df)
rm(test_df)
rm(test_arr)
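rm() only removes the bindings; to encourage R to actually return the freed memory, an optional gc() call (not in the original script) can follow:


gc()  # trigger a garbage collection so the freed memory is reclaimed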


3. Using the training.csv file, plot representations of the first 10 images to understand the data format.


# dim(train_arr)    # 42000 28 28

par(mfrow=c(2,5))

for (i in 1:10){ 
  image(1:28, 1:28, flipImage(train_arr[i,,]))
}


3 (cont.). Go ahead and divide all pixels by 255 to produce values between 0 and 1.


train_arr<-train_arr/255

Analyze Data



4. What is the frequency distribution of the numbers in the dataset?


hist(train_vec, breaks = 100)


We can create a frequency table with the table() function. Let's look at the most frequent intensities.


train_freq<-table(train_vec)

tail(sort(train_freq))
## train_vec
##      251      255      252      254      253        0 
##    78319   223310   465950   560067  1066447 26621312

rm(train_vec)  # intensities no longer needed; free the memory
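The question can also be read as asking for the distribution of the digit labels themselves; as a supplementary sketch, that is a one-liner on the saved labels:


table(train_labels)              # count of each digit, 0 through 9
prop.table(table(train_labels))  # the same counts as proportions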


If we remove the pure black background pixels (intensity 0), we can get a better histogram of the remaining values.


logInd<-train_arr>0

hist(train_arr[logInd], breaks = 100, freq = FALSE)


5. For each number, provide the mean pixel intensity. What does this tell you?


mean(train_arr)
## [1] 0.1310153


A lower value indicates darker images. Pure black (0) is the absence of color and pure white (255, or 1.0 after scaling) is full intensity.

These images are fairly dark overall.

By contrast, the eigenshoe images we worked with earlier were filled with pure white (RGB values of 255), and their mean intensity was around 0.8.
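The question asks for the mean per digit, not just overall; a minimal sketch averages each image and then groups those averages by the saved labels:


img_means <- apply(train_arr, 1, mean)           # mean intensity of each image
round(tapply(img_means, train_labels, mean), 3)  # average of those, per digit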



PCA



6. Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why?


PCA can generate at most 784 components, because each of the 28 x 28 = 784 pixel positions is a variable and the number of components cannot exceed the number of variables.


We can either use prcomp() or do the decomposition by hand with scale() and eigen(). Below we center the data with scale(), build the covariance matrix with a matrix product, and pass it to eigen().

# convert this to 784 rows (pixels) and 42000 columns (images)
train_flat_mat<-t(matrix(train_arr, ncol=28*28, byrow = FALSE))

raw_scaled<-scale(train_flat_mat, scale = FALSE)
raw_cor<-raw_scaled %*% t(raw_scaled) / (nrow(raw_scaled)-1)
raw_eigen <- eigen(raw_cor)

cum.var  <- cumsum(raw_eigen$values) / sum(raw_eigen$values)
thres    <- min(which(cum.var > .95))

sprintf("Saving off the top %d principal components",thres)
top_loadings<-raw_eigen$rotation[1:thres,]
## [1] "Saving off the top 119 principal components"


Here we use prcomp() and display a scree plot of just the first 50 components to get a visual sense of how many eigenvalues are close to zero.


train_pca<-prcomp(train_flat_mat, scale = TRUE)

var_explained = train_pca$sdev^2 / sum(train_pca$sdev^2)

#create scree plot


qplot(c(1:50), var_explained[1:50]) + 
  geom_line() + 
  xlab("Principal Component") + 
  ylab("Variance Explained") +
  ggtitle("Scree Plot First 50 Only") 


7. Plot the first 10 images generated by PCA. They will appear to be noise. Why?


par(mfrow=c(2,5))

for (i in 1:10) {
  image(1:28,1:28,array(train_pca$x[,i], dim=c(28,28)))
}

Each principal component captures variance shared across all 10 digits at once, so the images look smudgy.


You can sort of see that the patches are darker in the spots that most digits pass through.
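To see that these noisy components still carry digit structure, one can reconstruct a single image from just its first k components. A minimal sketch, remembering that prcomp() here treats the 42000 images as variables, so image 1 corresponds to row 1 of $rotation:


k <- 50
recon <- train_pca$x[, 1:k] %*% train_pca$rotation[1, 1:k]  # 784 pixel values
recon <- recon * train_pca$scale[1] + train_pca$center[1]   # undo the scaling
image(1:28, 1:28, array(recon, dim=c(28,28)))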


8. Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see?

eights<-which(train_labels == 8)
dim(train_arr)
train_eights<-train_arr[eights,,]
train_flat_mat<-t(matrix(train_eights, ncol=28*28, byrow = FALSE))
pca<-prcomp(train_flat_mat, scale = TRUE)

par(mfrow=c(2,5))

for (i in 1:10) {
  image(1:28,1:28,array(pca$x[,i], dim=c(28,28)))
}

## [1] 42000    28    28


They look like 8s. Each successive image gets blurrier; more precisely, later components add contrast in different spots, capturing finer and finer variations among the 8s.
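For contrast with the component images, the pixel-wise mean of all the 8s (a supplementary sketch, not part of the assignment) shows the "average eight" that these components perturb:


mean_eight <- apply(train_eights, c(2, 3), mean)  # pixel-wise average image
image(1:28, 1:28, flipImage(mean_eight))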



Predictions and Confusion Matrix



9. An incorrect approach to predicting the images would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal.


Create model


# convert this to 784 rows (pixels) and 42000 columns (images)
train_flat_mat<-t(matrix(train_arr, ncol=28*28, byrow = FALSE))


# add labels to train_flat_mat and reduce the size to 7K to prevent memory issues
train_flat_new<-as.data.frame(cbind(train_labels, t(train_flat_mat)  ))
train_flat_new<-train_flat_new[1:7000,]




digit_model <- nnet::multinom(train_labels ~ ., data = train_flat_new, MaxNWts = 10000000)



# summary(digit_model) does not return in a reasonable time on a model
# this large, so it is left commented out
# summary(digit_model)
## # weights:  7860 (7065 variable)
## initial  value 16118.095651 
## iter  10 value 4210.098486
## iter  20 value 3170.440865
## iter  30 value 2780.391793
## iter  40 value 2469.949179
## iter  50 value 2098.440621
## iter  60 value 1438.861688
## iter  70 value 1014.932193
## iter  80 value 802.695313
## iter  90 value 622.340950
## iter 100 value 236.584221
## final  value 236.584221 
## stopped after 100 iterations


Put test matrix together


# put the test matrix together
# NOTE: train_flat_new has only 7000 rows, so the 7:14000 slice returns
# rows 7:7000 (6994 rows that overlap the training data) plus 7000
# all-NA rows; the NA rows produce the NA predictions summarized below
test_flat_new<-train_flat_new[7:14000,]
test_flat_new_na<-test_flat_new[,-1]

dim(test_flat_new)
## [1] 13994   785


Display the summary of predict() and the success rate based on the known labels. Note that the non-NA test rows (7:7000) overlap the training data, so the accuracy below is optimistic.


# Make predictions
predicted.classes <- digit_model %>% predict(test_flat_new)

summary(predicted.classes)
##    0    1    2    3    4    5    6    7    8    9 NA's 
##  686  776  750  681  668  627  697  733  673  703 7000
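The 7000 NA predictions come from the all-NA padding rows noted above; if desired, they could be dropped up front (a hypothetical cleanup, not part of the original run):


keep <- complete.cases(test_flat_new)     # flags the 6994 real rows
test_flat_clean <- test_flat_new[keep, ]  # NA-free copy for scoring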


Display the success rate.


true_or_false<-(predicted.classes == test_flat_new$train_labels)

results<-table(predicted.classes == test_flat_new$train_labels)  # table() drops the NA comparisons

success_rate<-results[2]/(results[1]+results[2])

sprintf("The success rate is %.4f", success_rate)
## [1] "The success rate is 0.9983"


Print out the confusion matrix. The name refers to the off-diagonal cells, which show where the classifier confuses one digit for another; the grid shows the predicted vs. actual classification of each image.


confusionMatrix(
  factor(predicted.classes),
  factor(test_flat_new$train_labels))  
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 686   0   0   0   0   0   0   0   0   0
##          1   0 776   0   0   0   0   0   0   0   0
##          2   0   0 750   0   0   0   0   0   0   0
##          3   0   0   0 680   0   0   0   0   1   0
##          4   0   0   0   0 668   0   0   0   0   0
##          5   0   0   0   0   0 627   0   0   0   0
##          6   0   0   0   0   0   0 697   0   0   0
##          7   0   0   0   0   0   0   0 733   0   0
##          8   0   1   1   1   0   3   0   1 665   1
##          9   0   0   0   1   0   0   0   2   0 700
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9983         
##                  95% CI : (0.997, 0.9991)
##     No Information Rate : 0.1111         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9981         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           1.00000   0.9987   0.9987  0.99707  1.00000  0.99524
## Specificity           1.00000   1.0000   1.0000  0.99984  1.00000  1.00000
## Pos Pred Value        1.00000   1.0000   1.0000  0.99853  1.00000  1.00000
## Neg Pred Value        1.00000   0.9998   0.9998  0.99968  1.00000  0.99953
## Prevalence            0.09808   0.1111   0.1074  0.09751  0.09551  0.09008
## Detection Rate        0.09808   0.1110   0.1072  0.09723  0.09551  0.08965
## Detection Prevalence  0.09808   0.1110   0.1072  0.09737  0.09551  0.08965
## Balanced Accuracy     1.00000   0.9994   0.9993  0.99845  1.00000  0.99762
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           1.00000   0.9959  0.99850   0.9986
## Specificity           1.00000   1.0000  0.99874   0.9995
## Pos Pred Value        1.00000   1.0000  0.98811   0.9957
## Neg Pred Value        1.00000   0.9995  0.99984   0.9998
## Prevalence            0.09966   0.1052  0.09522   0.1002
## Detection Rate        0.09966   0.1048  0.09508   0.1001
## Detection Prevalence  0.09966   0.1048  0.09623   0.1005
## Balanced Accuracy     1.00000   0.9980  0.99862   0.9990