1 Intro

What is deep learning? Deep learning is a branch of machine learning inspired by how the human brain works. Every node in a layer receives input from the previous layer, computes the dot product of the inputs and its weights, adds a bias, and passes the result through an activation function; this repeats layer by layer until we arrive at the output layer. I will be using Keras, which in R runs on top of TensorFlow, but there are other frameworks for building deep learning models, such as PyTorch.
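As a minimal illustration of what a single dense layer computes (plain base R, just for intuition; the variable names here are made up):

x <- c(0.5, -1.2, 0.3)                    # input from the previous layer
W <- matrix(rnorm(6), nrow = 2, ncol = 3) # one row of weights per unit
b <- c(0.1, -0.1)                         # one bias per unit
relu <- function(z) pmax(z, 0)            # the activation function
relu(W %*% x + b)                         # dot product plus bias, then activation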

2 Setup and Data

# load keras and point it at the conda environment with TensorFlow installed
library(keras)
use_condaenv("r-tensorflow")

# read the Fashion MNIST CSVs: first column is the label, the remaining 784 are pixels
train <- read.csv('fashion-mnist_train.csv')
test <- read.csv('fashion-mnist_test.csv')

The data we will be using is the popular Fashion MNIST dataset. It contains 10 classes of clothing items, for a total of 60,000 training images and 10,000 test images, each a 28x28 grayscale picture.
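A quick sanity check on the dimensions (each row is one image: 1 label column plus 28 x 28 = 784 pixel columns):

dim(train) # 60000   785
dim(test)  # 10000   785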

3 Exploratory Data Analysis

Check Target Distribution

table(train$label)
#> 
#>    0    1    2    3    4    5    6    7    8    9 
#> 6000 6000 6000 6000 6000 6000 6000 6000 6000 6000

Check one random image

# draw one random row, drop the label column, and coerce to a numeric 28x28 matrix
fashionmatrix <- matrix(as.numeric(train[sample(1:nrow(train), 1), -1]),
                        nrow = 28, ncol = 28)
fashionmatrix <- t(apply(fashionmatrix, 1, rev)) # rotate so image() draws it upright
image(fashionmatrix)

Let’s take a broader look at our data before we move on to the next step. To do that, we just wrap the code above in a loop.

vizTrain <- function(input){
  
  dimmax <- sqrt(ncol(input) - 1)   # image side length: sqrt(784) = 28
  
  dimn <- ceiling(sqrt(nrow(input)))
  par(mfrow = c(dimn, dimn), mar = c(.1, .1, .1, .1))
  
  for (i in 1:nrow(input)){
      m1 <- matrix(as.numeric(input[i, -1]), nrow = dimmax, byrow = TRUE)
      m1 <- t(apply(m1, 2, rev))    # rotate so image() draws it upright
      
      image(1:dimmax, 1:dimmax, m1, col = grey.colors(255), xaxt = 'n', yaxt = 'n')
      text(2, 20, col = "white", cex = 1.2, input[i, 1])  # overlay the class label
  }
  
}

vizTrain(train[1:100,])

4 Data Pre-Processing

Separate the features and the labels.

mtrain <- data.matrix(train)
mtest <- data.matrix(test)

train_x <- mtrain[,-1]
train_y <- mtrain[,1]

test_x <- mtest[,-1]
test_y <- mtest[,1]

Turn our matrices into arrays of shape (n samples, 784) so we can feed them to our Keras model later.

train_x_keras <- array_reshape(train_x, dim = c(nrow(train_x), ncol(train_x)))
test_x_keras <- array_reshape(test_x, dim = c(nrow(test_x), ncol(test_x)))
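A side note: array_reshape() fills the array row-major (NumPy-style), unlike R's native dim<-, which fills column-major. With the shape unchanged at (n, 784) it makes no difference here, but it matters whenever you actually reshape, e.g. to 28x28 images. A tiny illustration:

m <- matrix(1:6, nrow = 2, byrow = TRUE)  # 1 2 3 / 4 5 6
array_reshape(m, c(3, 2))                 # row-major: 1 2 / 3 4 / 5 6
# dim(m) <- c(3, 2) would instead refill column-by-column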

Rescale the features so they range between 0 and 1 (the raw pixel values range from 0 to 255).

train_x_keras <- train_x_keras/255
test_x_keras <- test_x_keras/255
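A quick check that the rescaling worked (this should print 0 and 1):

range(train_x_keras)  # 0 1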

One-hot encode the labels.

train_y_keras <- to_categorical(train_y, 10)
test_y_keras <- to_categorical(test_y, 10)

head(train_y_keras)
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> [1,]    0    0    1    0    0    0    0    0    0     0
#> [2,]    0    0    0    0    0    0    0    0    0     1
#> [3,]    0    0    0    0    0    0    1    0    0     0
#> [4,]    1    0    0    0    0    0    0    0    0     0
#> [5,]    0    0    0    1    0    0    0    0    0     0
#> [6,]    0    0    0    0    1    0    0    0    0     0

5 Create Model

Create the architecture

model <- keras_model_sequential()
model %>% layer_dense(units = 256, activation = "relu", input_shape = c(784)) %>% 
    layer_dense(units = 128, activation = "relu") %>% 
    layer_dropout(rate=0.3) %>%
    layer_dense(units = 10, activation = "softmax")
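Before compiling, it can be useful to check the layer shapes and parameter counts; the dense layers give 784 x 256 + 256 = 200,960, 256 x 128 + 128 = 32,896, and 128 x 10 + 10 = 1,290 weights, about 235K parameters in total:

summary(model)  # layer output shapes and parameter counts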

Compile the model

model %>% compile(loss = "categorical_crossentropy", 
            optimizer = optimizer_adam(lr = 0.001), metrics = c("accuracy"))
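For intuition: with one-hot labels, categorical cross-entropy is just the negative log of the probability the softmax assigns to the true class, averaged over the batch. A toy calculation (note that on newer versions of keras, the lr argument to optimizer_adam() is spelled learning_rate):

y_true <- c(0, 0, 1, 0)           # one-hot encoded true label
y_pred <- c(0.1, 0.2, 0.6, 0.1)   # softmax output of the model
-sum(y_true * log(y_pred))        # -log(0.6), about 0.51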

Train the model

model %>% fit(train_x_keras, train_y_keras, epochs = 10, 
                           batch_size = 256, validation_split = 0.15)
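fit() also returns a history object; assigning it lets you re-plot the learning curves later (an optional extra, not required for the rest of the tutorial):

history <- model %>% fit(train_x_keras, train_y_keras, epochs = 10,
                         batch_size = 256, validation_split = 0.15)
plot(history)  # training vs. validation loss and accuracy per epoch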

6 Predict and Evaluate

Predict on test set

predclass <- predict_classes(model, test_x_keras)
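Note that predict_classes() was removed in later versions of keras/TensorFlow; if it is unavailable in your setup, an equivalent is to take the argmax of the predicted probabilities yourself:

# same result without predict_classes(): argmax per row, shifted to 0-based labels
predclass <- apply(predict(model, test_x_keras), 1, which.max) - 1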

Decode the class for easier interpretation

predclass <- sapply(as.character(predclass), switch,
                       "0" = "Tshirt",
                       "1" = "Trouser",
                       "2" = "Pullover",
                       "3" = "Dress",
                       "4" = "Coat",
                       "5" = "Sandal",
                       "6" = "Shirt",
                       "7" = "Sneaker",
                       "8" = "Bag",
                      "9" = "Boot")

test_y <- sapply(as.character(test_y), switch,
                       "0" = "Tshirt",
                       "1" = "Trouser",
                       "2" = "Pullover",
                       "3" = "Dress",
                       "4" = "Coat",
                       "5" = "Sandal",
                       "6" = "Shirt",
                       "7" = "Sneaker",
                       "8" = "Bag",
                      "9" = "Boot")
caret::confusionMatrix(as.factor(predclass), as.factor(test_y))
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction Bag Boot Coat Dress Pullover Sandal Shirt Sneaker Trouser Tshirt
#>   Bag      985    3    4     1        5      6    15       3       1     10
#>   Boot       0  950    0     0        0     17     0      32       0      0
#>   Coat       2    0  892    28      124      0   104       0       1      1
#>   Dress      2    0   23   932       15      0    32       0       8     30
#>   Pullover   5    0   56     7      809      0    95       0       1     17
#>   Sandal     0    8    0     0        0    945     0      16       0      1
#>   Shirt      3    1   23    12       28      0   569       0       0     54
#>   Sneaker    1   38    0     0        0     32     0     949       0      0
#>   Trouser    1    0    2    10        0      0     1       0     986      2
#>   Tshirt     1    0    0    10       19      0   184       0       3    885
#> 
#> Overall Statistics
#>                                                
#>                Accuracy : 0.8902               
#>                  95% CI : (0.8839, 0.8963)     
#>     No Information Rate : 0.1                  
#>     P-Value [Acc > NIR] : < 0.00000000000000022
#>                                                
#>                   Kappa : 0.878                
#>                                                
#>  Mcnemar's Test P-Value : NA                   
#> 
#> Statistics by Class:
#> 
#>                      Class: Bag Class: Boot Class: Coat Class: Dress
#> Sensitivity              0.9850      0.9500      0.8920       0.9320
#> Specificity              0.9947      0.9946      0.9711       0.9878
#> Pos Pred Value           0.9535      0.9510      0.7743       0.8944
#> Neg Pred Value           0.9983      0.9944      0.9878       0.9924
#> Prevalence               0.1000      0.1000      0.1000       0.1000
#> Detection Rate           0.0985      0.0950      0.0892       0.0932
#> Detection Prevalence     0.1033      0.0999      0.1152       0.1042
#> Balanced Accuracy        0.9898      0.9723      0.9316       0.9599
#>                      Class: Pullover Class: Sandal Class: Shirt Class: Sneaker
#> Sensitivity                   0.8090        0.9450       0.5690         0.9490
#> Specificity                   0.9799        0.9972       0.9866         0.9921
#> Pos Pred Value                0.8172        0.9742       0.8246         0.9304
#> Neg Pred Value                0.9788        0.9939       0.9537         0.9943
#> Prevalence                    0.1000        0.1000       0.1000         0.1000
#> Detection Rate                0.0809        0.0945       0.0569         0.0949
#> Detection Prevalence          0.0990        0.0970       0.0690         0.1020
#> Balanced Accuracy             0.8944        0.9711       0.7778         0.9706
#>                      Class: Trouser Class: Tshirt
#> Sensitivity                  0.9860        0.8850
#> Specificity                  0.9982        0.9759
#> Pos Pred Value               0.9840        0.8031
#> Neg Pred Value               0.9984        0.9871
#> Prevalence                   0.1000        0.1000
#> Detection Rate               0.0986        0.0885
#> Detection Prevalence         0.1002        0.1102
#> Balanced Accuracy            0.9921        0.9304

Not bad for such a simple model: we reached roughly 88% accuracy during training and are not overfitting, with validation and test accuracy landing in the same range (about 89% on the test set, per the confusion matrix above).
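Keras can also report the test loss and accuracy directly (this should agree with the confusion matrix above up to rounding):

model %>% evaluate(test_x_keras, test_y_keras)  # test loss and accuracy (~0.89)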

7 Conclusion

Even though we did not reach above 90% accuracy, this is still a decent model. With only two hidden dense layers plus a dropout layer, it is not only simple but also computationally light. There is, of course, room for improvement: we could run a hyperparameter search to find the best configuration for our model, although that requires substantial computational power; we could build a deeper model; or we could switch to a CNN. It is important to remember, though, that a deeper model will not always yield a better result, but it will almost certainly take longer to train.
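For reference, a minimal convolutional variant of the kind mentioned above might look like the sketch below. This is just an illustration, not something trained in this post, and the inputs would first need reshaping to (n, 28, 28, 1):

# e.g. train_x_cnn <- array_reshape(train_x / 255, c(nrow(train_x), 28, 28, 1))
cnn <- keras_model_sequential() %>%
  layer_conv_2d(filters = 32, kernel_size = c(3, 3), activation = "relu",
                input_shape = c(28, 28, 1)) %>% # learn local pixel patterns
  layer_max_pooling_2d(pool_size = c(2, 2)) %>% # downsample the feature maps
  layer_flatten() %>%
  layer_dense(units = 128, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")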