library(jpeg)
library(OpenImageR)
library(imager)
library(EBImage)
library(nnet)
library(tidyverse)

Using the training.csv file, plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.) (5 points)

df <- as.data.frame(read.csv('train.csv'))
#Divide all pixels by 255
dfn <- df[,2:ncol(df)]/255
#Add back the label column
dfn$label = df$label
#Create df without labels for pca
df2 <- subset(dfn, select = -label)
#List to store images
imgs = list()

#Build a 28 x 28 pixel matrix for each of the first 10 images and store it in the imgs list;
#flip() (EBImage) reflects the matrix vertically so each digit is oriented correctly
for(i in 1:10)
{
  img <- flip(matrix(unlist(df2[i,]), nrow=28, ncol=28))
  imgs[[i]] <- img
}
par(mfrow=c(2,5))
par(mai=c(.001,.001,.001,.001))
for(i in 1:10)
{
  image(as.matrix(imgs[[i]]),axes=FALSE,col=grey(seq(0,1,length=180)))
}

What is the frequency distribution of the numbers in the dataset? (5 points)

freq <- dfn %>% dplyr::group_by(label) %>% dplyr::summarise(Frequency = n())
freq
## # A tibble: 10 x 2
##    label Frequency
##    <int>     <int>
##  1     0      4132
##  2     1      4684
##  3     2      4177
##  4     3      4351
##  5     4      4072
##  6     5      3795
##  7     6      4137
##  8     7      4401
##  9     8      4063
## 10     9      4188
#Labels are discrete digits, so plot counts with geom_bar rather than binning a histogram
ggplot(df, aes(label)) + geom_bar(fill = 'lightgreen', color = 'black') + xlab('Digits') + scale_x_continuous(breaks = 0:9)

For each number, provide the mean pixel intensity. What does this tell you? (5 points)

The mean pixel intensity is an indication of an image's overall brightness. Comparing the MPI across digits shows which digits are, on average, drawn with more ink: digits whose strokes cover more of the 28 x 28 canvas have a higher mean intensity, while sparsely drawn digits have a lower one.

#Mean pixel intensity for each digit: average over the 784 pixel columns within each label group
#(across() excludes the grouping column, so only pixel values enter the mean)
mpi <- dfn %>% group_by(label) %>% dplyr::summarise(MPI = mean(as.matrix(across(everything()))), n = n())
mpi
## # A tibble: 10 x 3
##    label   MPI     n
##    <int> <dbl> <int>
##  1     0  1.39  4132
##  2     1  1.23  4684
##  3     2  1.37  4177
##  4     3  1.32  4351
##  5     4  1.41  4072
##  6     5  1.51  3795
##  7     6  1.39  4137
##  8     7  1.30  4401
##  9     8  1.41  4063
## 10     9  1.37  4188
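
A complementary view of the same information, sketched below using only objects already defined above, is to average all images of each digit pixel by pixel and plot the resulting mean image; digits with a higher MPI produce visibly brighter mean images.

#Sketch (not part of the original question): pixel-wise mean image for each digit
par(mfrow=c(2,5))
par(mai=c(.001,.001,.001,.001))
for(d in 0:9)
{
  avg <- colMeans(df2[dfn$label == d, ])
  image(flip(matrix(avg, nrow=28, ncol=28)), axes=FALSE, col=grey(seq(0,1,length=180)))
}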

Reduce the data by using principal components that account for 95% of the variance. How many components did you generate?

Using ‘prcomp’, principal components were generated from the transposed pixel matrix (pixels as rows, images as columns). The first 124 components account for 95% of the variance in the data.

#Transpose so pixels are rows and images are columns; prcomp with scale. = TRUE
#would otherwise fail on pixel columns that are constant (always 0)
df3 <- t(df2)
pca <- prcomp(df3, scale. = TRUE)
#Cumulative proportion of variance explained by the components
var <- cumsum(pca$sdev^2) / sum(pca$sdev^2)
#Smallest number of components reaching 95% of the variance
v_95 <- min(which(var >= 0.95))
v_95
## [1] 124
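
The code above only counts the components; as a minimal sketch of the reduction itself (using the transposed orientation fitted above), the reduced data are simply the scores on the first v_95 components, and an approximate reconstruction can be obtained by undoing the scaling and centering that prcomp applied.

#Sketch: keep only the scores on the first v_95 components (the reduced representation)
df_reduced <- pca$x[, 1:v_95]
#Approximate reconstruction from the retained components,
#reversing the scaling and centering applied by prcomp
recon <- df_reduced %*% t(pca$rotation[, 1:v_95])
recon <- sweep(recon, 2, pca$scale, '*')
recon <- sweep(recon, 2, pca$center, '+')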

Use PCA to generate all possible components (100% of the variance). How many components are possible? Why?

A total of 784 components are possible: each image is described by 28 x 28 = 784 pixels, so the data have at most 784 independent dimensions, and PCA can produce at most that many components, which together account for 100% of the variance. The cumulative-variance vector already reaches 1.0 (to numerical precision) by component 704, since the trailing components contribute essentially no additional variance, while length(var) confirms that 784 components exist in total.

v_100 <- min(which(var >= 1.0))
v_100
## [1] 704
length(var)
## [1] 784
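
A quick way to visualise both answers at once, using the var vector computed above, is to plot the cumulative variance curve and mark the 95% cutoff.

#Sketch: cumulative variance explained, with the 95% threshold and v_95 marked
plot(var, type = 'l', xlab = 'Number of components', ylab = 'Cumulative proportion of variance')
abline(h = 0.95, lty = 2)
abline(v = v_95, lty = 2)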

Plot the first 10 images generated by PCA. They will appear to be noise. Why? (5 points)

Principal component analysis is a data-reduction technique: each component is a direction of maximal remaining variance across the whole dataset. Because the components here are computed over all ten digit classes mixed together, each one blends pixel patterns from many different digits, so when reshaped into a 28 x 28 image it does not resemble any single digit and appears to be noise.

par(mfrow=c(2,5))
par(mai=c(.001,.001,.001,.001))
for(i in 1:10)
{
  image(flip(matrix(pca$x[,i], nrow=28, ncol=28)),axes=FALSE,col=grey(seq(0,1,length=180)))
}

Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see? (5 points)

With only the 8s included, the components reshaped into images no longer look like noise: they resemble blurred, overlapping 8s, because every direction of variance is now a variation on the same underlying digit shape.

df_8 <- dfn %>% dplyr::filter(label==8)
df_8 <- subset(df_8, select = -label)
df_8 <- t(df_8)
pca_8 <- prcomp(df_8, scale. = TRUE)
par(mfrow=c(2,5))
par(mai=c(.001,.001,.001,.001))
for(i in 1:10)
{
  image(flip(matrix(pca_8$x[,i], nrow=28, ncol=28)),axes=FALSE,col=grey(seq(0,1,length=180)))
}

Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal.

model <- multinom(label ~., data = df, MaxNWts = 42000)
## # weights:  7860 (7065 variable)
## initial  value 96708.573906 
## iter  10 value 27932.238987
## iter  20 value 22901.936560
## iter  30 value 21677.526948
## iter  40 value 21254.278400
## iter  50 value 21077.662091
## iter  60 value 21002.266892
## iter  70 value 20928.779026
## iter  80 value 20866.883697
## iter  90 value 20792.605325
## iter 100 value 20727.274628
## final  value 20727.274628 
## stopped after 100 iterations
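
The trace shows the optimiser stopped at the default cap of 100 iterations rather than converging. If a tighter fit is wanted, the cap can be raised through nnet's maxit argument, which multinom passes through; the sketch below uses 300 as an arbitrary illustrative value and a separate object name so the results reported here are unaffected.

#Sketch: allow more optimisation iterations (maxit is passed through to nnet::nnet)
model_longer <- multinom(label ~ ., data = df, MaxNWts = 42000, maxit = 300)
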
#Predict on the training data itself to measure in-sample accuracy
train_pred <- predict(model, df[,2:ncol(df)], type = 'class')
confusion_matrix <- as.matrix(table('Label' = df$label, 'Prediction' = train_pred))
confusion_matrix
##      Prediction
## Label    0    1    2    3    4    5    6    7    8    9
##     0 3953   24   17    9    8   27   10   35   38   11
##     1    0 4605   12   13    4   13    2   13   22    0
##     2   10  146 3639   73   26   21   13  111  111   27
##     3    5   96   76 3762    3  183    6   91   91   38
##     4   11   83   36   11 3448   55   24   18   47  339
##     5   24   77   18   88   12 3283   44   51  141   57
##     6   43   54   72    3   17  101 3769   10   41   27
##     7    6  113   38    7   21   13    4 4053    7  139
##     8   16  397   21   66   18  178   13   35 3247   72
##     9   13   83   14   51   63   46    0  179   25 3714
accuracy <- sum(diag(confusion_matrix))/nrow(df)
accuracy
## [1] 0.8922143
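
Note that 0.892 is in-sample (training) accuracy, since the model is evaluated on the same rows it was fit on. Per-digit accuracy can also be read off the confusion matrix by dividing each diagonal entry by its row total, as sketched below.

#Sketch: per-digit accuracy (correct predictions for each true label over that label's row total)
round(diag(confusion_matrix) / rowSums(confusion_matrix), 3)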