library(jpeg) # for readJPEG
library(OpenImageR) # for flipImage
library(ggplot2) # for qplot
library(tidyverse) # for %>%
library(caret) # for predict
library(nnet) # for multinom

Principal Component Analysis and Confusion Matrix
Bring in the Training and Test Data Sets.
# The first column of the Training set is the label, "zero" through "nine"
# Note: file sizes are too big for free github accounts
base_path="C:\\Users\\arono\\source\\R\\Data605\\Final\\Problem 2 PCA Images\\"
pathnpic<-paste0(base_path,"train.csv" )
train_df=read.csv(pathnpic, stringsAsFactors=F) #read in the raw data
train_labels=train_df[,1]
train_df[,1]=NULL
pathnpic<-paste0(base_path,"test.csv" )
test_df=read.csv(pathnpic, stringsAsFactors=F)
train_vec=as.vector(unlist(train_df))
train_arr=array(train_vec, dim=c(nrow(train_df),28,28))
test_vec=as.vector(unlist(test_df))
test_arr=array(test_vec, dim=c(nrow(test_df),28,28))
# remove objects we no longer need to free memory
rm(test_vec)
rm(train_df)
rm(test_df)
rm(test_arr)

3. Using the training.csv file, plot representations of the first 10 images to understand the data format.
# dim(train_arr) # 42000 28 28
par(mfrow=c(2,5))
for (i in 1:10){
image(1:28, 1:28, flipImage(train_arr[i,,]))
}
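As a quick sanity check on the data format, the same loop can title each panel with its known label. A minimal sketch, assuming train_arr and train_labels from the chunks above are still in memory:

par(mfrow=c(2,5))
for (i in 1:10){
  # same plot as above, with the known digit label as the panel title
  image(1:28, 1:28, flipImage(train_arr[i,,]), main=train_labels[i])
}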
3. Go ahead and divide all pixels by 255 to produce values between 0 and 1.

train_arr<-train_arr/255

4. What is the frequency distribution of the numbers in the dataset?
hist(train_vec, breaks = 100)

We can create a frequency table with the table() function. Let's look at the most frequent intensities.
train_freq<-table(train_vec)
tail(sort(train_freq))
rm(train_vec)

## train_vec
##   251    255    252    254     253        0
## 78319 223310 465950 560067 1066447 26621312
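If "numbers" is read as the digit labels rather than the pixel intensities, the label distribution takes one line. A minimal sketch, assuming train_labels is still in memory:

# frequency of each digit label 0-9
barplot(table(train_labels), main="Digit Label Frequencies")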
Returning to the pixel intensities: if we remove the pure black background pixels (intensity 0), we get a better histogram of the remaining values.
logInd<-train_arr>0
hist(train_arr[logInd], breaks = 100, freq = FALSE)

5. For each number, provide the mean pixel intensity. What does this tell you?
mean(train_arr)

## [1] 0.1310153
A lower value indicates darker images: pure black (0) is the absence of color, and pure white (1, after scaling) is full intensity. These images are fairly dark overall. By contrast, the eigen shoe images we worked with earlier were filled with pure white (RGB values of 255), and their mean intensity was around 0.8.
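The question asks for the mean per digit; a minimal sketch of that breakdown, assuming train_arr and train_labels are still in memory:

# mean pixel intensity for each digit 0-9
digit_means<-sapply(0:9, function(d) mean(train_arr[train_labels == d,,]))
names(digit_means)<-0:9
round(digit_means, 4)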
6. Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why?
PCA can generate at most 784 components, because each image has 28 x 28 = 784 pixels: every pixel position is one variable, and the number of principal components cannot exceed the number of variables.
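A toy check of that claim on a hypothetical 5 x 3 matrix: prcomp() returns one component per variable when there are at least as many observations as variables.

set.seed(1)
toy<-matrix(rnorm(15), nrow=5, ncol=3)
ncol(prcomp(toy)$rotation) # 3 components, one per variable (column)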
We can either use prcomp() or build the decomposition by hand with scale() and eigen(). Below we center the data with scale(), form the covariance matrix manually, and pass it to eigen().
# convert this to 784 rows (pixels) and 42000 columns (images)
train_flat_mat<-t(matrix(train_arr, ncol=28*28, byrow = FALSE))
raw_scaled<-scale(train_flat_mat, scale = FALSE) # center each column (image), no rescaling
raw_cov<-raw_scaled %*% t(raw_scaled) / (ncol(raw_scaled)-1) # 784 x 784 covariance across the 42000 images
raw_eigen <- eigen(raw_cov)
cum.var <- cumsum(raw_eigen$values) / sum(raw_eigen$values)
thres <- min(which(cum.var > .95))
sprintf("Saving off the top %d principal components",thres)
top_loadings<-raw_eigen$vectors[,1:thres] # eigen() stores eigenvectors in $vectors, one per column

## [1] "Saving off the top 119 principal components"
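As a quick check that the retained components preserve the digits, we can project one image onto the saved loadings and reconstruct it. A minimal sketch, assuming raw_scaled, top_loadings, and train_flat_mat from the chunks above are still in memory:

centers<-attr(raw_scaled, "scaled:center") # per-image means removed by scale()
scores1<-t(top_loadings) %*% raw_scaled[,1] # scores for the first image
recon1<-top_loadings %*% scores1 + centers[1] # reconstruct and un-center
par(mfrow=c(1,2))
image(1:28, 1:28, flipImage(array(train_flat_mat[,1], dim=c(28,28))))
image(1:28, 1:28, flipImage(array(recon1, dim=c(28,28))))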
Here we use prcomp() and display a scree plot of just the first 50 components to get a visual sense of how quickly the eigenvalues fall toward zero.
train_pca<-prcomp(train_flat_mat, scale = TRUE)
var_explained = train_pca$sdev^2 / sum(train_pca$sdev^2)
#create scree plot
qplot(c(1:50), var_explained[1:50]) +
geom_line() +
xlab("Principal Component") +
ylab("Variance Explained") +
ggtitle("Scree Plot First 50 Only") 7. Plot the first 10 images generated by PCA. They will appear to be noise. Why?
7. Plot the first 10 images generated by PCA. They will appear to be noise. Why?

par(mfrow=c(2,5))
for (i in 1:10) {
image(1:28,1:28,array(train_pca$x[,i], dim=c(28,28)))
}

Each component mixes variance from all 10 digits, so the images look smudgy. You can just make out darker patches in the regions that most digits pass through.
8. Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see?
eights<-which(train_labels == 8)
dim(train_arr)
## [1] 42000 28 28
train_eights<-train_arr[eights,,]
train_flat_mat<-t(matrix(train_eights, ncol=28*28, byrow = FALSE))
pca<-prcomp(train_flat_mat, scale = TRUE)
par(mfrow=c(2,5))
for (i in 1:10) {
image(1:28,1:28,array(pca$x[,i], dim=c(28,28)))
}
These look like 8s. Each successive component image is blurrier; more precisely, the contrast increases and shifts to different spots as later components capture finer variation among the 8s.
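One way to quantify that observation: within a single digit class, variance concentrates in the leading components. A minimal sketch, assuming pca from the chunk above:

# share of total variance captured by the first 10 components of the 8s-only PCA
eight_var<-pca$sdev^2 / sum(pca$sdev^2)
round(cumsum(eight_var)[1:10], 3)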
9. An incorrect approach to predicting the images would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal.
Create model
# convert this to 784 rows (pixels) and 42000 columns (images)
train_flat_mat<-t(matrix(train_arr, ncol=28*28, byrow = FALSE))
# add labels to train_flat_mat and reduce the size to 7K to prevent memory issues
train_flat_new<-as.data.frame(cbind(train_labels, t(train_flat_mat) ))
train_flat_new<-train_flat_new[1:7000,]
digit_model <- nnet::multinom(train_labels ~., data = train_flat_new, MaxNWts =10000000)
# Summarize the model -- summary() is too slow to return on a model this large
# summary(digit_model)

## # weights:  7860 (7065 variable)
## initial value 16118.095651
## iter 10 value 4210.098486
## iter 20 value 3170.440865
## iter 30 value 2780.391793
## iter 40 value 2469.949179
## iter 50 value 2098.440621
## iter 60 value 1438.861688
## iter 70 value 1014.932193
## iter 80 value 802.695313
## iter 90 value 622.340950
## iter 100 value 236.584221
## final value 236.584221
## stopped after 100 iterations
Put test matrix together
# put the test matrix together
# Note: train_flat_new was already cut to 7000 rows above, so rows 7:7000 here
# overlap the training data and rows 7001:14000 are NA padding; this is why
# 7000 predictions below come back NA
test_flat_new<-train_flat_new[7:14000,]
test_flat_new_na<-test_flat_new[,-1] # copy without the label column (not used below)
dim(test_flat_new)

## [1] 13994   785
Display the summary of predict() and the success rate based on the known labels.
# Make predictions
predicted.classes <- digit_model %>% predict(test_flat_new)
summary(predicted.classes)

##    0    1    2    3    4    5    6    7    8    9 NA's
##  686  776  750  681  668  627  697  733  673  703 7000
Display the success rate.
true_or_false<-(predicted.classes == test_flat_new$train_labels)
results<-table(true_or_false) # table() drops the NAs from the padded rows
success_rate<-results["TRUE"]/sum(results)
sprintf("The success rate is %.4f", success_rate)

## [1] "The success rate is 0.9983"
Print out the confusion matrix. It is called a confusion matrix because the off-diagonal cells show where the model confuses one class for another; the grid compares the predicted vs. actual classification of each image.
confusionMatrix(
factor(predicted.classes),
factor(test_flat_new$train_labels))

## Confusion Matrix and Statistics
##
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 686   0   0   0   0   0   0   0   0   0
##          1   0 776   0   0   0   0   0   0   0   0
##          2   0   0 750   0   0   0   0   0   0   0
##          3   0   0   0 680   0   0   0   0   1   0
##          4   0   0   0   0 668   0   0   0   0   0
##          5   0   0   0   0   0 627   0   0   0   0
##          6   0   0   0   0   0   0 697   0   0   0
##          7   0   0   0   0   0   0   0 733   0   0
##          8   0   1   1   1   0   3   0   1 665   1
##          9   0   0   0   1   0   0   0   2   0 700
##
## Overall Statistics
##
## Accuracy : 0.9983
## 95% CI : (0.997, 0.9991)
## No Information Rate : 0.1111
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9981
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           1.00000   0.9987   0.9987  0.99707  1.00000  0.99524
## Specificity           1.00000   1.0000   1.0000  0.99984  1.00000  1.00000
## Pos Pred Value        1.00000   1.0000   1.0000  0.99853  1.00000  1.00000
## Neg Pred Value        1.00000   0.9998   0.9998  0.99968  1.00000  0.99953
## Prevalence            0.09808   0.1111   0.1074  0.09751  0.09551  0.09008
## Detection Rate        0.09808   0.1110   0.1072  0.09723  0.09551  0.08965
## Detection Prevalence  0.09808   0.1110   0.1072  0.09737  0.09551  0.08965
## Balanced Accuracy     1.00000   0.9994   0.9993  0.99845  1.00000  0.99762
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           1.00000   0.9959  0.99850   0.9986
## Specificity           1.00000   1.0000  0.99874   0.9995
## Pos Pred Value        1.00000   1.0000  0.98811   0.9957
## Neg Pred Value        1.00000   0.9995  0.99984   0.9998
## Prevalence            0.09966   0.1052  0.09522   0.1002
## Detection Rate        0.09966   0.1048  0.09508   0.1001
## Detection Prevalence  0.09966   0.1048  0.09623   0.1005
## Balanced Accuracy     1.00000   0.9980  0.99862   0.9990