1. Libraries

# =================================================================================================
#
#install.packages("plot.matrix")
library(plot.matrix)
library(data.table)
library(ggplot2)
library(fst)
library(magrittr)
library(kableExtra)
library(leaps)
library(png)
library(knitr)
library(grid)
library(gridExtra)

library(devtools)
#install.packages("tensorflow", type="source")
library(tensorflow)
#install_tensorflow()
#install.packages("keras", type="source")
library(keras)
#install_keras()

library(reticulate)

 
 
 

2. ETL

#
########################################################################
#
# Extract, Transform, Load 
# Read data, investigate dimensions, type of variables, values taken 
# (distribution or frequency), missing values.
#
########################################################################
#
#


# ==========================================================
data <- "Data"
rawData <- file.path(data, "raw")
rData <- file.path(data, "r")
list.files(rawData)
 [1] "KDigits_train_02.csv"   "KDigits_train_03.csv"   "KDigits_train_04.csv"  
 [4] "KDigits_train_05.csv"   "KDigits_train_06.csv"   "KDigits_train_07.csv"  
 [7] "KDigits_train_08.csv"   "KDigits_train_09.csv"   "KDigits_train_10.csv"  
[10] "KDigits_train_data.csv" "Titanic_test.csv"       "Titanic_train.csv"     

 

2.1 - Converting .csv to .fst

#
#
# Each .csv file listed above is read once and written back out as .fst. 
# The commands are identical for every file, so the (already-run) block 
# below is shown as a single loop. 
#
#



# All training data
#
# =========  --  DO NOT USE    -    only for reference
#data.train <- read.csv(file.path(rawData, "KDigits_train_data.csv"), stringsAsFactors = T)
#write_fst(data.train, file.path(rData, "data.train.fst"))
# =========



# Split training data
#
# ==========
#for(i in 1:10){
#  nm  <- sprintf("s%02d", i)
#  csv <- read.csv(file.path(rawData, sprintf("KDigits_train_%02d.csv", i)),
#                  stringsAsFactors = T)
#  write_fst(csv, file.path(rData, paste0(nm, ".fst")))
#  assign(nm, csv)   # keeps s01..s10 in the workspace, as the original one-off commands did
#}
# ==========



#

 

2.2 - Reading .fst files

#
# This is the final product of the work above. 
#

# ==========================================================
data.train <- read_fst(file.path(rData, "data.train.fst")) 


s01 <- read_fst(file.path(rData, "s01.fst"))
s02 <- read_fst(file.path(rData, "s02.fst"))
s03 <- read_fst(file.path(rData, "s03.fst"))
s04 <- read_fst(file.path(rData, "s04.fst"))
s05 <- read_fst(file.path(rData, "s05.fst"))
s06 <- read_fst(file.path(rData, "s06.fst"))
s07 <- read_fst(file.path(rData, "s07.fst"))
s08 <- read_fst(file.path(rData, "s08.fst"))
s09 <- read_fst(file.path(rData, "s09.fst"))
s10 <- read_fst(file.path(rData, "s10.fst"))

#

 

2.3 - Investigating files

 

Missing Values, Variable Types, Dimensions

#
#
sum(is.na(data.train))
[1] 0
#
# There are no missing/NA values present in the data.


typeof(data.train)
[1] "list"
#
# The dataset as a whole is of type 'list' (a data.frame is stored 
# internally as a list of columns). 

typeof(data.train[,3])
[1] "integer"
#
# A single column is of type 'integer'.

type.list <- character(ncol(data.train))
for(i in 1:ncol(data.train)){
  type.list[i] <- typeof(data.train[,i])
}
length(unique(type.list)) 
[1] 1
#
# If a data type other than integer were present among the 
# columns, 'type.list' would hold more than one unique value. 
#
# Since the length equals 1, the variables composing the data are all of type 'integer'.
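#
# A more concise equivalent of the loop above (a sketch using base R): sapply() 
# applies typeof() to every column and table() tallies the results. 
#
# table(sapply(data.train, typeof))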


dim(data.train)
[1] 41897   786
#
# The training data, as a whole, consists of 41,897
# observations and 786 variables. 


hist(data.train$label, xlab="Label", main = "Label Distribution")

#
# The ten digit labels occur with roughly equal frequency, i.e. the label 
# distribution is close to uniform.  
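#
# The exact class counts behind the histogram can be checked directly with: 
#
# table(data.train$label)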

 

2.4 - Further exploration

Subset comparison

#
#
# library(vtable)
# st(data.train)  --  This is the function that inspired the computations below. 
#
#


# This function calculates the absolute difference in each descriptive statistic 
# (those computed inside the function) between a subset of the data and the whole.  
#
# ==========================================================
summary.compare <- function(list1,  list2){
  d_mean = abs(mean(list1) - mean(list2))
  d_st = abs(sd(list1) - sd(list2))
  d_med = abs(median(list1) - median(list2))
  d_25 = abs(quantile(list1, 0.25) - quantile(list2, 0.25))
  d_75 = abs(quantile(list1, 0.75) - quantile(list2, 0.75))
  d_min = abs(min(list1) - min(list2))
  d_max = abs(max(list1) - max(list2))
  
  vals <- c(d_mean, d_st, d_med, d_25, d_75, d_min, d_max)
  return(vals)
}




#
#
dim(s01)
[1] 4167  787
#
#
### The following is only to give insight into the current computations. 
##
# For each of the 786 columns of the training data, the corresponding subset 
# column is compared against it to surface any variation. The differences 
# populate the 'diff' matrix inside the loop.  
#
diff <- matrix(0, nrow=7, ncol=ncol(data.train))
for(i in 1:ncol(data.train)){
  diff[, i] <- round(summary.compare(data.train[,i], s01[,i]), digits=3)
}
diff[, 1:14] # NOTE: this is only for subset 's01'
      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
[1,] 0.039    0    0    0    0    0    0    0    0     0     0     0     0
[2,] 0.023    0    0    0    0    0    0    0    0     0     0     0     0
[3,] 0.000    0    0    0    0    0    0    0    0     0     0     0     0
[4,] 0.000    0    0    0    0    0    0    0    0     0     0     0     0
[5,] 0.000    0    0    0    0    0    0    0    0     0     0     0     0
[6,] 0.000    0    0    0    0    0    0    0    0     0     0     0     0
[7,] 0.000    0    0    0    0    0    0    0    0     0     0     0     0
       [,14]
[1,]   0.003
[2,]   0.567
[3,]   0.000
[4,]   0.000
[5,]   0.000
[6,]   0.000
[7,] 116.000
sum(diff[1,]) # mean
[1] 569.161
sum(diff[2,]) # st dev
[1] 752.783
sum(diff[3,]) # median
[1] 817

 

Boxplots for all descriptive statistics that populate the ‘diff’ variable above.

NOTE: these plots are only for subset s01. It is interesting that most summary statistics deviate very little from the training set, while a small number of variables deviate substantially.
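A minimal sketch of how these boxplots can be recreated from the 'diff' matrix computed above (the statistic names are assumptions matching the row order of 'summary.compare'):

# one box per summary statistic, over the 786 per-variable differences
stat.names <- c("Mean", "Sd", "Median", "25th", "75th", "Min", "Max")
boxplot(as.data.frame(t(diff)), names = stat.names,
        main = "Per-variable Differences: s01 vs. Training Set")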

 
 
 

The above computation is done again for the remaining subsets.

#
# Commented out because it takes some time to run. For every subset, this 
# accumulates the running total of the differences from the full training 
# data for each summary statistic.   
#
#  
# diff.mat <- matrix(0, nrow=7, ncol=10)
# for(i in 1:ncol(data.train)){                      # top loop creates temporary 'diff' variable
#  d01 <- summary.compare(data.train[,i], s01[,i])
#  d02 <- summary.compare(data.train[,i], s02[,i])
#  d03 <- summary.compare(data.train[,i], s03[,i])
#  d04 <- summary.compare(data.train[,i], s04[,i])
#  d05 <- summary.compare(data.train[,i], s05[,i])
#  d06 <- summary.compare(data.train[,i], s06[,i])
#  d07 <- summary.compare(data.train[,i], s07[,i])
#  d08 <- summary.compare(data.train[,i], s08[,i])
#  d09 <- summary.compare(data.train[,i], s09[,i])
#  d10 <- summary.compare(data.train[,i], s10[,i])
 
#  for(r in 1:7){                                   # loop 2 populates matrix with 'diff' variable
#    diff.mat[r,1] <- diff.mat[r,1] + d01[r]
#    diff.mat[r,2] <- diff.mat[r,2] + d02[r]
#    diff.mat[r,3] <- diff.mat[r,3] + d03[r]
#    diff.mat[r,4] <- diff.mat[r,4] + d04[r]
#    diff.mat[r,5] <- diff.mat[r,5] + d05[r]
#    diff.mat[r,6] <- diff.mat[r,6] + d06[r]
#    diff.mat[r,7] <- diff.mat[r,7] + d07[r]
#    diff.mat[r,8] <- diff.mat[r,8] + d08[r]
#    diff.mat[r,09] <- diff.mat[r,09] + d09[r]
#    diff.mat[r,10] <- diff.mat[r,10] + d10[r]
   
#  }
# }
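
# A more compact equivalent of the block above (sketch; assumes the s01..s10 
# objects loaded in section 2.2):
#
# subsets <- list(s01, s02, s03, s04, s05, s06, s07, s08, s09, s10)
# diff.mat <- matrix(0, nrow=7, ncol=10)
# for(i in 1:ncol(data.train)){
#   for(s in 1:10){
#     diff.mat[,s] <- diff.mat[,s] + summary.compare(data.train[,i], subsets[[s]][,i])
#   }
# }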



#
# Renaming matrix column and row names for table presentation. 
#
#colnames(diff.mat) <- c("s01", "s02", "s03", "s04", "s05", "s06", "s07", "s08", "s09", "s10")
#rownames(diff.mat) <- c("Mean", "Sd", "Median", "25th", "75th", "Min", "Max")

#write.table(diff.mat, "diff.txt")
diff.mat <- read.table("diff.txt")

kbl(diff.mat, caption="Subset Variation from Training Set", booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "hold_position"), stripe_index = c(1,2, 5:6))
Subset Variation from Training Set

               s01        s02        s03        s04        s05        s06        s07        s08        s09        s10
Mean      569.1622   622.5883   599.7935   497.8024   861.6378   407.6950    622.258   735.4372   705.6742   665.5293
Sd        752.7789   837.5735   616.3175   643.8175   757.7040   617.0945    648.084   747.1020   808.5846   750.9476
Median    817.0000   827.0000   830.0000   673.0000  1094.0000   588.0000   1254.500  1129.0000   792.0000  1014.0000
25th      305.0000   272.0000   163.0000   166.2500   178.0000   214.5000    365.500    55.5000    79.0000    53.0000
75th     1219.5000  1278.0000  1026.0000   917.5000  1464.0000   856.0000    907.000  1260.7500  1343.0000  1137.0000
Min         0.0000    18.0000     8.0000    10.0000     3.0000     2.0000     17.000     4.0000     6.0000    34.0000
Max     13201.0000 14139.0000 13331.0000 13539.0000 11345.0000 11670.0000  14066.000 14029.0000 12878.0000 12547.0000

#
# Checking Frequency and Distribution of Subset Differences
#
row.names <- rownames(diff.mat)
plot.mat <- cbind(diff.mat, row.names)
p.m.see <- melt(plot.mat[1:6,])  # rows 1-6: the lower-magnitude statistics
p.m.up <- melt(plot.mat[7,])     # row 7 ('Max') is kept separate; its scale dwarfs the rest
plot.mat <- melt(plot.mat)

hist(plot.mat$value, xlab="Difference in Sum", main = "Total Subset Differences")

g1 <- ggplot(data=p.m.up, aes(x=variable, y=value)) +
  geom_bar(stat="identity", position=position_dodge()) + 
  ggtitle("Max") + xlab("Subset")

g2 <- ggplot(data=p.m.see, aes(x=variable, y=value, fill=row.names)) +
  geom_bar(stat="identity") + 
  facet_wrap(~ row.names) + 
  scale_x_discrete(breaks = unique(p.m.see$variable)[seq(2,10,2)]) + 
  theme(legend.position = "none",
    axis.text=element_text(size=12)) + xlab("Subset")


grid.arrange(g1,g2)

 
 
 

3. Data Preparation

#
########################################################################
#
# Separate the outcome ("label") variable into a separate dataset (to 
# prevent model from 'peeking' into the result).
#
# Make sure model only uses intended predictors of the form "X20x31" 
# and similar (you will have to drop some columns like "id", "cut", etc.).
#
########################################################################
#
#

 

3.1 - Creating pure subsets

# Columns to drop:
#                   - label
#                   - id
#                   - cut
#
#

# ==========================================================
s01.t <- s01[,2:785]
s02.t <- s02[,2:785]
s03.t <- s03[,2:785]
s04.t <- s04[,2:785]
s05.t <- s05[,2:785]
s06.t <- s06[,2:785]
s07.t <- s07[,2:785]
s08.t <- s08[,2:785]
s09.t <- s09[,2:785]
s10.t <- s10[,2:785]
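
# The same drop can be written as a loop over the subset names (sketch):
#
# for(nm in sprintf("s%02d", 1:10)){
#   assign(paste0(nm, ".t"), get(nm)[, 2:785])
# }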

#

 

3.2 - Exploring pure subsets

#
# Now that the subsets contain only raw 8-bit grayscale values, individual rows 
# can be plotted. First, one row is converted into an integer vector and transposed 
# so it can be folded into a 28x28 matrix. The resulting matrix of numbers is 
# interesting in its own right. 
#
# Next, the matrix was saved as a .csv file and opened in MATLAB, whose 'mesh()' 
# function produced the surface plots in the following screenshots. 
#
one.img <- s02.t[1,]
one.img <- as.vector(t(one.img))
one.img <- matrix(one.img, nrow=28, ncol=28)
#write.csv(one.img, "one.img.csv")

one.img.2 <- s02.t[2,]
one.img.2 <- as.vector(t(one.img.2))
one.img.2 <- matrix(one.img.2, nrow=28, ncol=28)
#write.csv(one.img.2, "one.img.2.csv")
# The four mesh screenshots are read back in for display (the file names below 
# are illustrative placeholders for the saved screenshots):
# img1 <- readPNG("mesh1.png"); img2 <- readPNG("mesh2.png")
# img3 <- readPNG("mesh3.png"); img4 <- readPNG("mesh4.png")
grid.arrange(rasterGrob(img1), rasterGrob(img2), 
             rasterGrob(img3), rasterGrob(img4),
             ncol=2, nrow=2)

#
# It is also possible to do this in R with the 'plot.matrix' library. 
#
brk = 50
plot(one.img.2, border=NA, breaks=brk, col=heat.colors(brk), 
     key=NULL)

 

3.3 - Functions

# 
# ======= FUNCTION 1 ===========================================================
# 
# Prepare Test Sets
#
test.prep.x <- function(pure){
  x_test <- as.matrix(pure)
  x_test <- x_test / 255   # scale 8-bit pixel values into [0,1]
  
  return(x_test)
}

test.prep.y <- function(excluded){
  y_test <- to_categorical(excluded, 10)   # one-hot encode the 0-9 labels
  
  return(y_test)
}

# ======= FUNCTION 2 ===========================================================
#
# Makes Predictions
# Calls my.prediction function to evaluate
#
my.model.preds <- function(x_test, y_test, id){
  predictions <- modelnn %>% predict(x_test) %>% k_argmax()
  
  # Sending to other function for eval
  final.pred <- my.prediction(predictions, y_test, id)
  
  return(final.pred)
}

# ======= FUNCTION 3 ===========================================================
#
# Receives Test Set Predictions
# Organizes for Evaluation
#
#
my.prediction <- function(preds, actual, id){
  test.pred <- t(t(as.numeric(preds[])))                   # column vector of predicted labels
  test.pred <- cbind(id, test.pred, rep(0, nrow(actual)))  # columns: id, predicted, actual (filled below)
  
  count = 1
  for(i in 1:nrow(actual)){
    for(t in 1:ncol(actual)){
      if((actual[i,t] == 1) && (count < nrow(test.pred))){
        test.pred[count,3] = t
        count = count + 1
        break
      }else if((actual[i,t] == 1) && (count == nrow(test.pred))){
        test.pred[count,3] = t
        break
      }
    }
  }
  test.pred[,3] <- test.pred[,3]-1
  return(test.pred)
}
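
# NOTE: the decoding loop above can be replaced by a one-line vectorized 
# equivalent: max.col() returns, per row, the column index of the 1 in the 
# one-hot matrix, and subtracting 1 recovers the 0-9 label. 
#
# test.pred[,3] <- max.col(actual) - 1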

3.4 - Data Prep w/ Multiple Test Sets

#
# ========================================================== Training Evaluation Set
exclude <- rbind(s01[,c(1,786,787)], s02[,c(1,786,787)], s03[,c(1,786,787)],
                 s05[,c(1,786,787)], s06[,c(1,786,787)], s08[,c(1,786,787)], s10[,c(1,786,787)])

exclude.2 <- rbind(s09[,c(1,786,787)])
exclude.3 <- rbind(s07[,c(1,786,787)])
exclude.4 <- rbind(s04[,c(1,786,787)])

# ========================================================== Training Set
x_train <- rbind(s01.t, s02.t, s03.t, s05.t, s06.t, s08.t, s10.t)
x_train <- x_train / 255
g_train <- exclude[,1]
x_train <- as.matrix(x_train)
y_train <- to_categorical(g_train, 10)

# ========================================================== Multiple Test Sets
x_test1 <- test.prep.x(s04.t)
y_test1 <- test.prep.y(exclude.4[,1])
x_test2 <- test.prep.x(s07.t)
y_test2 <- test.prep.y(exclude.3[,1])
x_test3 <- test.prep.x(s09.t)
y_test3 <- test.prep.y(exclude.2[,1])

dim(x_train)
[1] 29224   784
dim(x_test1)
[1] 4254  784
dim(x_test2)
[1] 4230  784
dim(x_test3)
[1] 4189  784

 
 
 

4. Model Creation

#
########################################################################
#
# Model and parameter choices.
#
# Include comments showing you are aware of which parameters you set, 
# or may be implicit (default) in the model.
#
########################################################################
#
#

 

4.1 - Model 1

Multilayer Neural Network

#
# 
# Multilayer Neural Network
#
#
modelnn <- keras_model_sequential()

modelnn %>%                                    # first layer: 28x28=784 input units 
  layer_dense(units = 256, activation = "relu",  # -> hidden layer of 256
              input_shape = c(784)) %>%  # ^ layer_dense takes the modelnn object as input, returns it
  layer_dropout(rate = 0.4) %>% # ^ object piped through for dropout regularization
  layer_dense(units = 128, activation = "relu") %>% # second hidden layer
  layer_dropout(rate = 0.3) %>% # dropout layer
  layer_dense(units = 10, activation = "softmax") # maps the second hidden layer
                                                  # to class probabilities

# This model has four layers in total: 
# (1) Input layer
# (2) Hidden layer 1
# (3) Hidden layer 2
# (4) Output layer
#
#
# -The input layer and its weights into the first hidden layer have 
#  785 x 256 = 200,960 elements (784 inputs plus a bias unit).  
# -The first hidden layer feeds the second through a weight matrix 
#  of dimension 257 x 128 = 32,896 (256 units plus a bias). 
# -Then comes the output layer with 10 responses; the weights of this 
#  final layer number 129 x 10 = 1,290. 
#
#
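# A quick arithmetic check of the totals reported by summary() below 
# (weights plus biases for each dense layer):
256*(784+1) + 128*(256+1) + 10*(128+1)
[1] 235146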

summary(modelnn)
Model: "sequential"
________________________________________________________________________________
 Layer (type)                       Output Shape                    Param #     
================================================================================
 dense_2 (Dense)                    (None, 256)                     200960      
                                                                                
 dropout_1 (Dropout)                (None, 256)                     0           
                                                                                
 dense_1 (Dense)                    (None, 128)                     32896       
                                                                                
 dropout (Dropout)                  (None, 128)                     0           
                                                                                
 dense (Dense)                      (None, 10)                      1290        
                                                                                
================================================================================
Total params: 235,146
Trainable params: 235,146
Non-trainable params: 0
________________________________________________________________________________
# Fit the model by minimizing the cross-entropy loss.
#
# Since the response is qualitative, we look for coefficient estimates that minimize the 
# negative multinomial log-likelihood, also known as the cross-entropy. 
#
# If the response were quantitative, we would instead minimize squared-error loss. 
# 
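# Explicitly set here: the loss and the accuracy metric. Left implicit at their 
# keras defaults: optimizer_rmsprop()'s learning rate (0.001) and, later in 
# fit(), the between-epoch shuffling of the training data.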
modelnn %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_rmsprop(), 
  metrics = c("accuracy")
)

 
 
 

5. Model Evaluation

#
########################################################################
#
# Test the performance of your model(s) on a different split of data 
# (this means that you should NOT have used all the splits of the data 
# in your training model).
#
########################################################################
#
#

5.1 - Assessing Fit

Model 1 - Multilayer Neural Network

#
# Final Step
# Supply training data and fit model. 
system.time(
  history <- modelnn %>%
    fit(x_train, y_train, epochs = 30, batch_size = 128,
        validation_split = 0.2)
)
   user  system elapsed 
  66.17    1.83   18.86 
plot(history, smooth = FALSE)

# This model starts overfitting at around epoch 7: validation loss plateaus while 
# training loss keeps falling. One way to prevent this is to stop training near epoch 7. 
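#
# A sketch of that idea using keras' early-stopping callback (not run here), 
# which halts training once validation loss stops improving and keeps the 
# best weights:
#
# history <- modelnn %>%
#   fit(x_train, y_train, epochs = 30, batch_size = 128,
#       validation_split = 0.2,
#       callbacks = list(callback_early_stopping(monitor = "val_loss",
#                                                patience = 3,
#                                                restore_best_weights = TRUE)))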


# Test Error
modelnn %>% evaluate(x_test1, y_test1)
     loss  accuracy 
0.1117804 0.9774330 
#
### Making predictions
##
#
#
# predicting the softmax probabilities
#
# This is a probability assigned to each of the 10 classes. The classifier then
# assigns the image to the class with the highest probability. 
#
predictions <- modelnn %>% predict(x_test1)
round(head(predictions), 3)
     [,1] [,2] [,3] [,4]  [,5] [,6] [,7]  [,8] [,9] [,10]
[1,]    0    0    0    0 0.000    1    0 0.000    0 0.000
[2,]    0    0    0    1 0.000    0    0 0.000    0 0.000
[3,]    0    0    0    0 0.000    0    0 0.000    1 0.000
[4,]    0    0    0    0 0.000    0    0 0.001    0 0.999
[5,]    0    0    0    0 0.037    0    0 0.001    0 0.962
[6,]    1    0    0    0 0.000    0    0 0.000    0 0.000
#
#
# Directly predicting label
label_pred <- modelnn %>% predict(x_test1) %>% k_argmax()
label_pred[1:20]
tf.Tensor([5 3 8 9 9 0 9 4 1 2 7 4 5 9 7 9 5 4 0 3], shape=(20,), dtype=int64)
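#
# k_argmax() returns a 0-based tensorflow tensor; as in my.prediction() earlier, 
# it can be pulled back into plain R with as.numeric(label_pred[]). 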
#
#

 

Confusion Tables and Model Accuracy

#
#
# Subset 04
# (test eval #1)
#
final.preds1 <- my.model.preds(x_test1, y_test1, exclude.4$id)
tab1 <- table(final.preds1[,2], final.preds1[,3])
tab1
   
      0   1   2   3   4   5   6   7   8   9
  0 439   0   3   0   0   0   1   0   1   0
  1   0 469   1   1   3   0   0   0   3   0
  2   0   1 428   5   1   1   0   2   2   0
  3   0   0   2 425   0   4   0   1   1   1
  4   0   0   0   1 432   0   2   0   0  10
  5   2   0   0   1   0 362   1   0   2   2
  6   2   1   1   0   2   0 394   0   1   0
  7   0   3   2   2   3   1   0 402   0   6
  8   0   0   0   1   0   0   0   0 381   1
  9   2   0   0   0   8   2   0   0   3 426
sum(diag(tab1))/sum(tab1)
[1] 0.977433
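#
# Per-class accuracy (recall by true label) can be read off the same table, 
# e.g. for subset 04:
# round(diag(tab1) / colSums(tab1), 3)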
#
#
# Subset 07
# (test eval #2)
#
final.preds2 <- my.model.preds(x_test2, y_test2, exclude.3$id)
tab2 <- table(final.preds2[,2], final.preds2[,3])
tab2
   
      0   1   2   3   4   5   6   7   8   9
  0 407   0   2   0   1   0   2   0   1   0
  1   0 465   0   3   2   0   0   1   4   1
  2   1   1 407   2   2   0   0   1   1   1
  3   0   0   2 424   0   9   0   1   3   0
  4   0   0   0   0 396   0   2   0   1   4
  5   0   0   0   2   2 380   4   0   2   1
  6   0   0   0   0   1   3 401   0   1   1
  7   0   2   1   1   2   1   0 449   0   4
  8   0   1   1   4   0   1   0   0 399   3
  9   0   1   1   0   5   0   0   4   4 401
sum(diag(tab2))/sum(tab2)
[1] 0.9761229
#
#
# Subset 09
# (test eval #3)
#
final.preds3 <- my.model.preds(x_test3, y_test3, exclude.2$id)
tab3 <- table(final.preds3[,2], final.preds3[,3])
tab3
   
      0   1   2   3   4   5   6   7   8   9
  0 405   0   0   1   0   0   0   1   2   0
  1   0 483   1   0   0   0   1   2   4   2
  2   1   0 381   4   2   0   0   3   1   0
  3   0   0   1 422   0   6   0   0   7   2
  4   0   2   0   0 410   0   1   1   0   2
  5   0   0   0   4   0 381   1   0   5   4
  6   2   1   0   1   0   2 400   0   1   0
  7   0   2   2   3   1   1   0 446   1   8
  8   1   1   2   3   1   1   1   0 392   3
  9   0   0   0   2   4   1   0   0   3 360
sum(diag(tab3))/sum(tab3)
[1] 0.9739795

 
 
 

6. Misprediction Inspection

#
########################################################################
#
# Identify examples (data inputs) that were mispredicted, and plot them. 
# hint: create a function from the sample code included in RCode/plot_kdigits.R
#
########################################################################
#
#
#
# isolate incorrect values
#
my.wrong.finder <- function(examine){
  # one row per mismatch; columns: id, predicted, actual, |predicted - actual|
  wrong_id <- matrix(0, nrow=nrow(examine) - sum(examine[,2] == examine[,3]), ncol=4)
  
  count = 1
  for(i in 1:nrow(examine)){
    if(examine[i,2] != examine[i,3]){
      wrong_id[count,1] = examine[i,1]
      wrong_id[count,2] = examine[i,2]
      wrong_id[count,3] = examine[i,3]
      wrong_id[count,4] = abs(examine[i,2] - examine[i,3])
      
      count = count + 1
    }
  }
  return(wrong_id)
}
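
# A vectorized sketch of the same idea, without the explicit loop:
#
# bad <- examine[,2] != examine[,3]
# wrong_id <- cbind(examine[bad, 1:3, drop=FALSE],
#                   abs(examine[bad,2] - examine[bad,3]))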

#
# image prep
#
my.pixel.printer <- function(img.loc){
  img.loc <- as.vector(t(img.loc))
  img.loc <- matrix(img.loc, nrow=28, ncol=28)
  
  # brk = 50
  # p <- plot(img, border=NA, breaks=brk, col=heat.colors(brk), 
  #           key=NULL)
  return(img.loc)
}
#
wrong.1 <- my.wrong.finder(final.preds1)
sum(wrong.1[,4])  # total |predicted - actual| across all errors
[1] 354
nrow(wrong.1)     # number of mispredictions
[1] 96
#
#
wrong.2 <- my.wrong.finder(final.preds2)
sum(wrong.2[,4])
[1] 334
nrow(wrong.2)
[1] 101
#
#
wrong.3 <- my.wrong.finder(final.preds3)
sum(wrong.3[,4])
[1] 413
nrow(wrong.3)
[1] 109
#

brk = 50
img1 <- my.pixel.printer(data.train[wrong.1[3,1]+1,])
img2 <- my.pixel.printer(data.train[wrong.1[45,1]+1,])
img3 <- my.pixel.printer(data.train[wrong.1[69,1]+1,])
img4 <- my.pixel.printer(data.train[wrong.1[74,1]+1,])



wrong.1[3,2] # Predicted Value
[1] 7
wrong.1[3,3] # Actual Value
[1] 9
plot(img1, border=NA, breaks=brk, col=heat.colors(brk), 
     key=NULL)

wrong.1[45,2] # Predicted Value
[1] 8
wrong.1[45,3] # Actual Value
[1] 9
plot(img2, border=NA, breaks=brk, col=heat.colors(brk), 
     key=NULL)

wrong.1[69,2] # Predicted Value
[1] 7
wrong.1[69,3] # Actual Value
[1] 2
plot(img3, border=NA, breaks=brk, col=heat.colors(brk), 
     key=NULL)

wrong.1[74,2] # Predicted Value
[1] 8
wrong.1[74,3] # Actual Value
[1] 3
plot(img4, border=NA, breaks=brk, col=heat.colors(brk), 
     key=NULL)

Mispredictions tend to occur when the written digit is nearly illegible. Judging by the incorrect examples inspected so far, the model can hardly be blamed for many of its errors.