1. Libraries

# =================================================================================================
#
#install.packages("plot.matrix")
library(plot.matrix)
library(data.table)
library(ggplot2)
library(fst)
library(magrittr)
library(kableExtra)
library(leaps)
library(png)
library(knitr)
library(grid)
library(gridExtra)

library(devtools)
#install.packages("tensorflow", type="source")
library(tensorflow)
#install_tensorflow()
#install.packages("keras", type="source")
library(keras)
#install_keras()

library(reticulate)

 
 
 

2. ETL

#
########################################################################
#
# Extract, Transform, Load 
# Read data, investigate dimensions, type of variables, values taken 
# (distribution or frequency), missing values.
#
########################################################################
#
#


# ==========================================================
data <- "Data"
rawData <- file.path(data, "raw")
rData <- file.path(data, "r")
list.files(rawData)
 [1] "KDigits_train_02.csv"   "KDigits_train_03.csv"   "KDigits_train_04.csv"  
 [4] "KDigits_train_05.csv"   "KDigits_train_06.csv"   "KDigits_train_07.csv"  
 [7] "KDigits_train_08.csv"   "KDigits_train_09.csv"   "KDigits_train_10.csv"  
[10] "KDigits_train_data.csv" "Titanic_test.csv"       "Titanic_train.csv"     

 

2.1 - Converting .csv to .fst

#
#
# Each .csv file listed above is read once and written back out as .fst. 
# The commands are identical for every file, so the (already-run) block 
# below is shown as a single loop. 
#
#



# All training data
#
# =========  --  DO NOT USE    -    only for reference
#data.train <- read.csv(file.path(rawData, "KDigits_train_data.csv"), stringsAsFactors = T)
#write_fst(data.train, file.path(rData, "data.train.fst"))
# =========



# Split training data
#
# ==========
#for(i in 1:10){
#  nm  <- sprintf("s%02d", i)
#  csv <- read.csv(file.path(rawData, sprintf("KDigits_train_%02d.csv", i)),
#                  stringsAsFactors = T)
#  write_fst(csv, file.path(rData, paste0(nm, ".fst")))
#  assign(nm, csv)   # keeps s01..s10 in the workspace, as the original one-off commands did
#}
# ==========



#

 

2.2 - Reading .fst files

#
# This is the final product of the work above. 
#

# ==========================================================
data.train <- read_fst(file.path(rData, "data.train.fst")) 


s01 <- read_fst(file.path(rData, "s01.fst"))
s02 <- read_fst(file.path(rData, "s02.fst"))
s03 <- read_fst(file.path(rData, "s03.fst"))
s04 <- read_fst(file.path(rData, "s04.fst"))
s05 <- read_fst(file.path(rData, "s05.fst"))
s06 <- read_fst(file.path(rData, "s06.fst"))
s07 <- read_fst(file.path(rData, "s07.fst"))
s08 <- read_fst(file.path(rData, "s08.fst"))
s09 <- read_fst(file.path(rData, "s09.fst"))
s10 <- read_fst(file.path(rData, "s10.fst"))

#

 

2.3 - Investigating files

 

Missing Values, Variable Types, Dimensions

#
#
sum(is.na(data.train))
[1] 0
#
# There are no missing/NA values present in the data.


typeof(data.train)
[1] "list"
#
# The dataset as a whole is of type 'list' (a data.frame is stored 
# internally as a list of columns). 

typeof(data.train[,3])
[1] "integer"
#
# A single column is of type 'integer'.

type.list <- character(ncol(data.train))
for(i in 1:ncol(data.train)){
  type.list[i] <- typeof(data.train[,i])
}
length(unique(type.list)) 
[1] 1
#
# If a data type other than integer were present among the 
# columns, 'type.list' would hold more than one unique value. 
#
# Since the length equals 1, the variables composing the data are all of type 'integer'.
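#
# A more concise equivalent of the loop above (a sketch using base R): sapply() 
# applies typeof() to every column and table() tallies the results. 
#
# table(sapply(data.train, typeof))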


dim(data.train)
[1] 41897   786
#
# The training data, as a whole, consists of 41,897
# observations and 786 variables. 


hist(data.train$label, xlab="Label", main = "Label Distribution")

#
# The ten digit labels occur with roughly equal frequency, i.e. the label 
# distribution is close to uniform.  
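#
# The exact class counts behind the histogram can be checked directly with: 
#
# table(data.train$label)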

 

2.4 - Further exploration

Subset comparison

#
#
# library(vtable)
# st(data.train)  --  This is the function that inspired the computations below. 
#
#


# This function calculates the absolute difference in each descriptive statistic 
# (those computed inside the function) between a subset of the data and the whole.  
#
# ==========================================================
summary.compare <- function(list1,  list2){
  d_mean = abs(mean(list1) - mean(list2))
  d_st = abs(sd(list1) - sd(list2))
  d_med = abs(median(list1) - median(list2))
  d_25 = abs(quantile(list1, 0.25) - quantile(list2, 0.25))
  d_75 = abs(quantile(list1, 0.75) - quantile(list2, 0.75))
  d_min = abs(min(list1) - min(list2))
  d_max = abs(max(list1) - max(list2))
  
  vals <- c(d_mean, d_st, d_med, d_25, d_75, d_min, d_max)
  return(vals)
}




#
#
dim(s01)
[1] 4167  787
#
#
### The following is only to give insight into the current computations. 
##
# For each of the 786 columns of the training data, the corresponding subset 
# column is compared against it to surface any variation. The differences 
# populate the 'diff' matrix inside the loop.  
#
diff <- matrix(0, nrow=7, ncol=ncol(data.train))
for(i in 1:ncol(data.train)){
  diff[, i] <- round(summary.compare(data.train[,i], s01[,i]), digits=3)
}
diff[, 1:14] # NOTE: this is only for subset 's01'
      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
[1,] 0.039    0    0    0    0    0    0    0    0     0     0     0     0
[2,] 0.023    0    0    0    0    0    0    0    0     0     0     0     0
[3,] 0.000    0    0    0    0    0    0    0    0     0     0     0     0
[4,] 0.000    0    0    0    0    0    0    0    0     0     0     0     0
[5,] 0.000    0    0    0    0    0    0    0    0     0     0     0     0
[6,] 0.000    0    0    0    0    0    0    0    0     0     0     0     0
[7,] 0.000    0    0    0    0    0    0    0    0     0     0     0     0
       [,14]
[1,]   0.003
[2,]   0.567
[3,]   0.000
[4,]   0.000
[5,]   0.000
[6,]   0.000
[7,] 116.000
sum(diff[1,]) # mean
[1] 569.161
sum(diff[2,]) # st dev
[1] 752.783
sum(diff[3,]) # median
[1] 817

 

Boxplots for all descriptive statistics that populate the ‘diff’ variable above.

NOTE: these plots are only for subset s01. It is interesting that most summary statistics deviate very little from the training set, while a small number of variables deviate substantially.
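A minimal sketch of how these boxplots can be recreated from the 'diff' matrix computed above (the statistic names are assumptions matching the row order of 'summary.compare'):

# one box per summary statistic, over the 786 per-variable differences
stat.names <- c("Mean", "Sd", "Median", "25th", "75th", "Min", "Max")
boxplot(as.data.frame(t(diff)), names = stat.names,
        main = "Per-variable Differences: s01 vs. Training Set")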

 
 
 

The above computation is done again for the remaining subsets.

#
# Commented out because it takes some time to run. For every subset, this 
# accumulates the running total of the differences from the full training 
# data for each summary statistic.   
#
#  
# diff.mat <- matrix(0, nrow=7, ncol=10)
# for(i in 1:ncol(data.train)){                      # top loop creates temporary 'diff' variable
#  d01 <- summary.compare(data.train[,i], s01[,i])
#  d02 <- summary.compare(data.train[,i], s02[,i])
#  d03 <- summary.compare(data.train[,i], s03[,i])
#  d04 <- summary.compare(data.train[,i], s04[,i])
#  d05 <- summary.compare(data.train[,i], s05[,i])
#  d06 <- summary.compare(data.train[,i], s06[,i])
#  d07 <- summary.compare(data.train[,i], s07[,i])
#  d08 <- summary.compare(data.train[,i], s08[,i])
#  d09 <- summary.compare(data.train[,i], s09[,i])
#  d10 <- summary.compare(data.train[,i], s10[,i])
 
#  for(r in 1:7){                                   # loop 2 populates matrix with 'diff' variable
#    diff.mat[r,1] <- diff.mat[r,1] + d01[r]
#    diff.mat[r,2] <- diff.mat[r,2] + d02[r]
#    diff.mat[r,3] <- diff.mat[r,3] + d03[r]
#    diff.mat[r,4] <- diff.mat[r,4] + d04[r]
#    diff.mat[r,5] <- diff.mat[r,5] + d05[r]
#    diff.mat[r,6] <- diff.mat[r,6] + d06[r]
#    diff.mat[r,7] <- diff.mat[r,7] + d07[r]
#    diff.mat[r,8] <- diff.mat[r,8] + d08[r]
#    diff.mat[r,09] <- diff.mat[r,09] + d09[r]
#    diff.mat[r,10] <- diff.mat[r,10] + d10[r]
   
#  }
# }
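
# A more compact equivalent of the block above (sketch; assumes the s01..s10 
# objects loaded in section 2.2):
#
# subsets <- list(s01, s02, s03, s04, s05, s06, s07, s08, s09, s10)
# diff.mat <- matrix(0, nrow=7, ncol=10)
# for(i in 1:ncol(data.train)){
#   for(s in 1:10){
#     diff.mat[,s] <- diff.mat[,s] + summary.compare(data.train[,i], subsets[[s]][,i])
#   }
# }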



#
# Renaming matrix column and row names for table presentation. 
#
#colnames(diff.mat) <- c("s01", "s02", "s03", "s04", "s05", "s06", "s07", "s08", "s09", "s10")
#rownames(diff.mat) <- c("Mean", "Sd", "Median", "25th", "75th", "Min", "Max")

#write.table(diff.mat, "diff.txt")
diff.mat <- read.table("diff.txt")

kbl(diff.mat, caption="Subset Variation from Training Set", booktabs = T, linesep = "") %>%
  kable_styling(latex_options = c("striped", "hold_position"), stripe_index = c(1,2, 5:6))
Subset Variation from Training Set

               s01        s02        s03        s04        s05        s06        s07        s08        s09        s10
Mean      569.1622   622.5883   599.7935   497.8024   861.6378   407.6950    622.258   735.4372   705.6742   665.5293
Sd        752.7789   837.5735   616.3175   643.8175   757.7040   617.0945    648.084   747.1020   808.5846   750.9476
Median    817.0000   827.0000   830.0000   673.0000  1094.0000   588.0000   1254.500  1129.0000   792.0000  1014.0000
25th      305.0000   272.0000   163.0000   166.2500   178.0000   214.5000    365.500    55.5000    79.0000    53.0000
75th     1219.5000  1278.0000  1026.0000   917.5000  1464.0000   856.0000    907.000  1260.7500  1343.0000  1137.0000
Min         0.0000    18.0000     8.0000    10.0000     3.0000     2.0000     17.000     4.0000     6.0000    34.0000
Max     13201.0000 14139.0000 13331.0000 13539.0000 11345.0000 11670.0000  14066.000 14029.0000 12878.0000 12547.0000

#
# Checking Frequency and Distribution of Subset Differences
#
row.names <- rownames(diff.mat)
plot.mat <- cbind(diff.mat, row.names)
p.m.see <- melt(plot.mat[1:6,])  # rows 1-6: the lower-magnitude statistics
p.m.up <- melt(plot.mat[7,])     # row 7 ('Max') is kept separate; its scale dwarfs the rest
plot.mat <- melt(plot.mat)

hist(plot.mat$value, xlab="Difference in Sum", main = "Total Subset Differences")

g1 <- ggplot(data=p.m.up, aes(x=variable, y=value)) +
  geom_bar(stat="identity", position=position_dodge()) + 
  ggtitle("Max") + xlab("Subset")

g2 <- ggplot(data=p.m.see, aes(x=variable, y=value, fill=row.names)) +
  geom_bar(stat="identity") + 
  facet_wrap(~ row.names) + 
  scale_x_discrete(breaks = unique(p.m.see$variable)[seq(2,10,2)]) + 
  theme(legend.position = "none",
    axis.text=element_text(size=12)) + xlab("Subset")


grid.arrange(g1,g2)

 
 
 

3. Data Preparation

#
########################################################################
#
# Separate the outcome ("label") variable into a separate dataset (to 
# prevent model from 'peeking' into the result).
#
# Make sure model only uses intended predictors of the form "X20x31" 
# and similar (you will have to drop some columns like "id", "cut", etc.).
#
########################################################################
#
#

 

3.1 - Creating pure subsets

# Columns to drop:
#                   - label
#                   - id
#                   - cut
#
#

# ==========================================================
s01.t <- s01[,2:785]
s02.t <- s02[,2:785]
s03.t <- s03[,2:785]
s04.t <- s04[,2:785]
s05.t <- s05[,2:785]
s06.t <- s06[,2:785]
s07.t <- s07[,2:785]
s08.t <- s08[,2:785]
s09.t <- s09[,2:785]
s10.t <- s10[,2:785]
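
# The same drop can be written as a loop over the subset names (sketch):
#
# for(nm in sprintf("s%02d", 1:10)){
#   assign(paste0(nm, ".t"), get(nm)[, 2:785])
# }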

#

 

3.2 - Exploring pure subsets

#
# Now that the subsets contain only raw 8-bit grayscale values, individual rows 
# can be plotted. First, one row is converted into an integer vector and transposed 
# so it can be folded into a 28x28 matrix. The resulting matrix of numbers is 
# interesting in its own right. 
#
# Next, the matrix was saved as a .csv file and opened in MATLAB, whose 'mesh()' 
# function produced the surface plots in the following screenshots. 
#
one.img <- s02.t[1,]
one.img <- as.vector(t(one.img))
one.img <- matrix(one.img, nrow=28, ncol=28)
#write.csv(one.img, "one.img.csv")

one.img.2 <- s02.t[2,]
one.img.2 <- as.vector(t(one.img.2))
one.img.2 <- matrix(one.img.2, nrow=28, ncol=28)
#write.csv(one.img.2, "one.img.2.csv")
# The four mesh screenshots are read back in for display (the file names below 
# are illustrative placeholders for the saved screenshots):
# img1 <- readPNG("mesh1.png"); img2 <- readPNG("mesh2.png")
# img3 <- readPNG("mesh3.png"); img4 <- readPNG("mesh4.png")
grid.arrange(rasterGrob(img1), rasterGrob(img2), 
             rasterGrob(img3), rasterGrob(img4),
             ncol=2, nrow=2)

#
# It is also possible to do this in R with the 'plot.matrix' library. 
#
brk = 50
plot(one.img.2, border=NA, breaks=brk, col=heat.colors(brk), 
     key=NULL)

 

3.3 - Functions

# 
# ======= FUNCTION 1 ===========================================================
# 
# Prepare Test Sets
#
test.prep.x <- function(pure){
  x_test <- as.matrix(pure)
  x_test <- x_test / 255   # scale 8-bit pixel values into [0,1]
  
  return(x_test)
}

test.prep.y <- function(excluded){
  y_test <- to_categorical(excluded, 10)   # one-hot encode the 0-9 labels
  
  return(y_test)
}

# ======= FUNCTION 2 ===========================================================
#
# Makes Predictions
# Calls my.prediction function to evaluate
#
my.model.preds <- function(x_test, y_test, id){
  predictions <- modelnn %>% predict(x_test) %>% k_argmax()
  
  # Sending to other function for eval
  final.pred <- my.prediction(predictions, y_test, id)
  
  return(final.pred)
}

# ======= FUNCTION 3 ===========================================================
#
# Receives Test Set Predictions
# Organizes for Evaluation
#
#
my.prediction <- function(preds, actual, id){
  test.pred <- t(t(as.numeric(preds[])))                   # column vector of predicted labels
  test.pred <- cbind(id, test.pred, rep(0, nrow(actual)))  # columns: id, predicted, actual (filled below)
  
  count = 1
  for(i in 1:nrow(actual)){
    for(t in 1:ncol(actual)){
      if((actual[i,t] == 1) && (count < nrow(test.pred))){
        test.pred[count,3] = t
        count = count + 1
        break
      }else if((actual[i,t] == 1) && (count == nrow(test.pred))){
        test.pred[count,3] = t
        break
      }
    }
  }
  test.pred[,3] <- test.pred[,3]-1
  return(test.pred)
}
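
# NOTE: the decoding loop above can be replaced by a one-line vectorized 
# equivalent: max.col() returns, per row, the column index of the 1 in the 
# one-hot matrix, and subtracting 1 recovers the 0-9 label. 
#
# test.pred[,3] <- max.col(actual) - 1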

3.4 - Data Prep w/ Multiple Test Sets

#
# ========================================================== Training Evaluation Set
exclude <- rbind(s01[,c(1,786,787)], s02[,c(1,786,787)], s03[,c(1,786,787)],
                 s05[,c(1,786,787)], s06[,c(1,786,787)], s08[,c(1,786,787)], s10[,c(1,786,787)])

exclude.2 <- rbind(s09[,c(1,786,787)])
exclude.3 <- rbind(s07[,c(1,786,787)])
exclude.4 <- rbind(s04[,c(1,786,787)])

# ========================================================== Training Set
x_train <- rbind(s01.t, s02.t, s03.t, s05.t, s06.t, s08.t, s10.t)
x_train <- x_train / 255
g_train <- exclude[,1]
x_train <- as.matrix(x_train)
y_train <- to_categorical(g_train, 10)

# ========================================================== Multiple Test Sets
x_test1 <- test.prep.x(s04.t)
y_test1 <- test.prep.y(exclude.4[,1])
x_test2 <- test.prep.x(s07.t)
y_test2 <- test.prep.y(exclude.3[,1])
x_test3 <- test.prep.x(s09.t)
y_test3 <- test.prep.y(exclude.2[,1])

dim(x_train)
[1] 29224   784
dim(x_test1)
[1] 4254  784
dim(x_test2)
[1] 4230  784
dim(x_test3)
[1] 4189  784

 
 
 

4. Model Creation

#
########################################################################
#
# Model and parameter choices.
#
# Include comments showing you are aware of which parameters you set, 
# or may be implicit (default) in the model.
#
########################################################################
#
#

 

4.1 - Model 1

Multilayer Neural Network

#
# 
# Multilayer Neural Network
#
#
modelnn <- keras_model_sequential()

modelnn %>%                                    # first layer: 28x28=784 input units 
  layer_dense(units = 256, activation = "relu",  # -> hidden layer of 256
              input_shape = c(784)) %>%  # ^ layer_dense takes the modelnn object as input, returns it
  layer_dropout(rate = 0.4) %>% # ^ object piped through for dropout regularization
  layer_dense(units = 128, activation = "relu") %>% # second hidden layer
  layer_dropout(rate = 0.3) %>% # dropout layer
  layer_dense(units = 10, activation = "softmax") # maps the second hidden layer
                                                  # to class probabilities

# This model has four layers in total: 
# (1) Input layer
# (2) Hidden layer 1
# (3) Hidden layer 2
# (4) Output layer
#
#
# -The input layer and its weights into the first hidden layer have 
#  785 x 256 = 200,960 elements (784 inputs plus a bias unit).  
# -The first hidden layer feeds the second through a weight matrix 
#  of dimension 257 x 128 = 32,896 (256 units plus a bias). 
# -Then comes the output layer with 10 responses; the weights of this 
#  final layer number 129 x 10 = 1,290. 
#
#
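# A quick arithmetic check of the totals reported by summary() below 
# (weights plus biases for each dense layer):
256*(784+1) + 128*(256+1) + 10*(128+1)
[1] 235146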

summary(modelnn)
Model: "sequential"
________________________________________________________________________________
 Layer (type)                       Output Shape                    Param #     
================================================================================
 dense_2 (Dense)                    (None, 256)                     200960      
                                                                                
 dropout_1 (Dropout)                (None, 256)                     0           
                                                                                
 dense_1 (Dense)                    (None, 128)                     32896       
                                                                                
 dropout (Dropout)                  (None, 128)                     0           
                                                                                
 dense (Dense)                      (None, 10)                      1290        
                                                                                
================================================================================
Total params: 235,146
Trainable params: 235,146
Non-trainable params: 0
________________________________________________________________________________
# Fit the model by minimizing the cross-entropy loss.
#
# Since the response is qualitative, we look for coefficient estimates that minimize the 
# negative multinomial log-likelihood, also known as the cross-entropy. 
#
# If the response were quantitative, we would instead minimize squared-error loss. 
# 
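# Explicitly set here: the loss and the accuracy metric. Left implicit at their 
# keras defaults: optimizer_rmsprop()'s learning rate (0.001) and, later in 
# fit(), the between-epoch shuffling of the training data.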
modelnn %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_rmsprop(), 
  metrics = c("accuracy")
)

 
 
 

5. Model Evaluation

#
########################################################################
#
# Test the performance of your model(s) on a different split of data 
# (this means that you should NOT have used all the splits of the data 
# in your training model).
#
########################################################################
#
#

5.1 - Assessing Fit

Model 1 - Multilayer Neural Network

#
# Final Step
# Supply training data and fit model. 
system.time(
  history <- modelnn %>%
    fit(x_train, y_train, epochs = 30, batch_size = 128,
        validation_split = 0.2)
)
   user  system elapsed 
  66.17    1.83   18.86 
plot(history, smooth = FALSE)

# This model starts overfitting at around epoch 7: validation loss plateaus while 
# training loss keeps falling. One way to prevent this is to stop training near epoch 7. 
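#
# A sketch of that idea using keras' early-stopping callback (not run here), 
# which halts training once validation loss stops improving and keeps the 
# best weights:
#
# history <- modelnn %>%
#   fit(x_train, y_train, epochs = 30, batch_size = 128,
#       validation_split = 0.2,
#       callbacks = list(callback_early_stopping(monitor = "val_loss",
#                                                patience = 3,
#                                                restore_best_weights = TRUE)))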


# Test Error
modelnn %>% evaluate(x_test1, y_test1)
     loss  accuracy 
0.1117804 0.9774330 
#
### Making predictions
##
#
#
# predicting the softmax probabilities
#
# This is a probability assigned to each of the 10 classes. The classifier then
# assigns the image to the class with the highest probability. 
#
predictions <- modelnn %>% predict(x_test1)
round(head(predictions), 3)
     [,1] [,2] [,3] [,4]  [,5] [,6] [,7]  [,8] [,9] [,10]
[1,]    0    0    0    0 0.000    1    0 0.000    0 0.000
[2,]    0    0    0    1 0.000    0    0 0.000    0 0.000
[3,]    0    0    0    0 0.000    0    0 0.000    1 0.000
[4,]    0    0    0    0 0.000    0    0 0.001    0 0.999
[5,]    0    0    0    0 0.037    0    0 0.001    0 0.962
[6,]    1    0    0    0 0.000    0    0 0.000    0 0.000
#
#
# Directly predicting label
label_pred <- modelnn %>% predict(x_test1) %>% k_argmax()
label_pred[1:20]
tf.Tensor([5 3 8 9 9 0 9 4 1 2 7 4 5 9 7 9 5 4 0 3], shape=(20,), dtype=int64)
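#
# k_argmax() returns a 0-based tensorflow tensor; as in my.prediction() earlier, 
# it can be pulled back into plain R with as.numeric(label_pred[]). 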
#
#

 

Confusion Tables and Model Accuracy

#
#
# Subset 04
# (test eval #1)
#
final.preds1 <- my.model.preds(x_test1, y_test1, exclude.4$id)
tab1 <- table(final.preds1[,2], final.preds1[,3])
tab1
   
      0   1   2   3   4   5   6   7   8   9
  0 439   0   3   0   0   0   1   0   1   0
  1   0 469   1   1   3   0   0   0   3   0
  2   0   1 428   5   1   1   0   2   2   0
  3   0   0   2 425   0   4   0   1   1   1
  4   0   0   0   1 432   0   2   0   0  10
  5   2   0   0   1   0 362   1   0   2   2
  6   2   1   1   0   2   0 394   0   1   0
  7   0   3   2   2   3   1   0 402   0   6
  8   0   0   0   1   0   0   0   0 381   1
  9   2   0   0   0   8   2   0   0   3 426
sum(diag(tab1))/sum(tab1)
[1] 0.977433
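#
# Per-class accuracy (recall by true label) can be read off the same table, 
# e.g. for subset 04:
# round(diag(tab1) / colSums(tab1), 3)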
#
#
# Subset 07
# (test eval #2)
#
final.preds2 <- my.model.preds(x_test2, y_test2, exclude.3$id)
tab2 <- table(final.preds2[,2], final.preds2[,3])
tab2
   
      0   1   2   3   4   5   6   7   8   9
  0 407   0   2   0   1   0   2   0   1   0
  1   0 465   0   3   2   0   0   1   4   1
  2   1   1 407   2   2   0   0   1   1   1
  3   0   0   2 424   0   9   0   1   3   0
  4   0   0   0   0 396   0   2   0   1   4
  5   0   0   0   2   2 380   4   0   2   1
  6   0   0   0   0   1   3 401   0   1   1
  7   0   2   1   1   2   1   0 449   0   4
  8   0   1   1   4   0   1   0   0 399   3
  9   0   1   1   0   5   0   0   4   4 401
sum(diag(tab2))/sum(tab2)
[1] 0.9761229
#
#
# Subset 09
# (test eval #3)
#
final.preds3 <- my.model.preds(x_test3, y_test3, exclude.2$id)
tab3 <- table(final.preds3[,2], final.preds3[,3])
tab3
   
      0   1   2   3   4   5   6   7   8   9
  0 405   0   0   1   0   0   0   1   2   0
  1   0 483   1   0   0   0   1   2   4   2
  2   1   0 381   4   2   0   0   3   1   0
  3   0   0   1 422   0   6   0   0   7   2
  4   0   2   0   0 410   0   1   1   0   2
  5   0   0   0   4   0 381   1   0   5   4
  6   2   1   0   1   0   2 400   0   1   0
  7   0   2   2   3   1   1   0 446   1   8
  8   1   1   2   3   1   1   1   0 392   3
  9   0   0   0   2   4   1   0   0   3 360
sum(diag(tab3))/sum(tab3)
[1] 0.9739795

 
 
 

6. Misprediction Inspection

#
########################################################################
#
# Identify examples (data inputs) that were mispredicted, and plot them. 
# hint: create a function from the sample code included in RCode/plot_kdigits.R
#
########################################################################
#
#
#
# isolate incorrect values
#
my.wrong.finder <- function(examine){
  # one row per mismatch; columns: id, predicted, actual, |predicted - actual|
  wrong_id <- matrix(0, nrow=nrow(examine) - sum(examine[,2] == examine[,3]), ncol=4)
  
  count = 1
  for(i in 1:nrow(examine)){
    if(examine[i,2] != examine[i,3]){
      wrong_id[count,1] = examine[i,1]
      wrong_id[count,2] = examine[i,2]
      wrong_id[count,3] = examine[i,3]
      wrong_id[count,4] = abs(examine[i,2] - examine[i,3])
      
      count = count + 1
    }
  }
  return(wrong_id)
}
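
# A vectorized sketch of the same idea, without the explicit loop:
#
# bad <- examine[,2] != examine[,3]
# wrong_id <- cbind(examine[bad, 1:3, drop=FALSE],
#                   abs(examine[bad,2] - examine[bad,3]))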

#
# image prep
#
my.pixel.printer <- function(img.loc){
  img.loc <- as.vector(t(img.loc))
  img.loc <- matrix(img.loc, nrow=28, ncol=28)
  
  # brk = 50
  # p <- plot(img, border=NA, breaks=brk, col=heat.colors(brk), 
  #           key=NULL)
  return(img.loc)
}
#
wrong.1 <- my.wrong.finder(final.preds1)
sum(wrong.1[,4])  # total |predicted - actual| across all errors
[1] 354
nrow(wrong.1)     # number of mispredictions
[1] 96
#
#
wrong.2 <- my.wrong.finder(final.preds2)
sum(wrong.2[,4])
[1] 334
nrow(wrong.2)
[1] 101
#
#
wrong.3 <- my.wrong.finder(final.preds3)
sum(wrong.3[,4])
[1] 413
nrow(wrong.3)
[1] 109
#

brk = 50
img1 <- my.pixel.printer(data.train[wrong.1[3,1]+1,])
img2 <- my.pixel.printer(data.train[wrong.1[45,1]+1,])
img3 <- my.pixel.printer(data.train[wrong.1[69,1]+1,])
img4 <- my.pixel.printer(data.train[wrong.1[74,1]+1,])



wrong.1[3,2] # Predicted Value
[1] 7
wrong.1[3,3] # Actual Value
[1] 9
plot(img1, border=NA, breaks=brk, col=heat.colors(brk), 
     key=NULL)

wrong.1[45,2] # Predicted Value
[1] 8
wrong.1[45,3] # Actual Value
[1] 9
plot(img2, border=NA, breaks=brk, col=heat.colors(brk), 
     key=NULL)

wrong.1[69,2] # Predicted Value
[1] 7
wrong.1[69,3] # Actual Value
[1] 2
plot(img3, border=NA, breaks=brk, col=heat.colors(brk), 
     key=NULL)

wrong.1[74,2] # Predicted Value
[1] 8
wrong.1[74,3] # Actual Value
[1] 3
plot(img4, border=NA, breaks=brk, col=heat.colors(brk), 
     key=NULL)

Mispredictions tend to occur when the written digit is nearly illegible. Judging by the incorrect examples inspected so far, the model can hardly be blamed for many of its errors.