Libraries

library(stringr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(gridExtra)
library(caret)
library(conflicted)
library(factoextra)
library(purrr)
library(scales)

# conflict_scout()
conflict_prefer('filter', 'dplyr')

Import

# CSV FILES DOWNLOADED FROM 
# https://www.kaggle.com/zalando-research/fashionmnist

file_path <- 'C:/downloads/docs/data622/final/'

train_fn <- 'fashion-mnist_train.csv'
test_fn <- 'fashion-mnist_test.csv'

train_path <- str_c(file_path, train_fn)
test_path <- str_c(file_path, test_fn)

train <- read.csv(train_path)
test <- read.csv(test_path)

train$label <- factor(train$label, labels = c('T-shirt/top',
                                              'Trouser',
                                              'Pullover',
                                              'Dress',
                                              'Coat', 
                                              'Sandal',
                                              'Shirt',
                                              'Sneaker',
                                              'Bag',
                                              'Ankle boot'))

test$label <- factor(test$label, labels = c('T-shirt/top',
                                            'Trouser',
                                            'Pullover',
                                            'Dress',
                                            'Coat', 
                                            'Sandal',
                                            'Shirt',
                                            'Sneaker',
                                            'Bag',
                                            'Ankle boot'))

We also imported the data in csv format, downloaded from kaggle. Each observation is a picture and each variable is one pixel within each picture. The labels have been converted to factors.

### These functions are used to manipulate the structure of the raw data tables

# convert number to 6 digit string
num_to_str_5 <- function(num) {
  
  str <- as.character(num)
  str <- paste0('0000', str)
  str <- str_sub(str, -5, -1)
  
  return(str)
}


# convert dataframe to grid with x and y coordinates
as_grid_vars <- function(data, row_names = NA) {
  
  # dataframe in parameter
  df <- data
  
  # store label
  label <- df$label
  
  # remove label
  df <- df %>%
    select(-label)
  
  # change field names to seq 1 to 784
  colnames(df) <- seq_len(28*28)
  
  # add back label
  df$label <- label
  
  # add row index/name
  if (is.na(row_names)) {
    
    df$row <- seq_len(nrow(df)) %>%
      num_to_str_5()
    
  } else {
    
    df$row <- row_names %>%
      num_to_str_5()
    
  }
  
  # gather fields into rows
  df2 <- gather(df, var, val, -row, -label)
  
  # convert col name variable into integer
  df2$var <- as.integer(df2$var)
  
  # calculate grid
  df2$y <- ceiling(df2$var / 28)
  df2$x <- df2$var - 28 * (df2$y - 1)
  
  # reverse y
  df2$y <- 29 - df2$y

  # sort
  df2 <- df2 %>%
    arrange(row, x, y) %>%
    select(label, row, pixel = var, val, x, y)
  
  return(df2)
    
}


# plot based on row index
plot_row <- function(row, data = train) {
  
  # select row from dataframe
  df <- data[row, ]
  
  # convert to grid format dataframe
  df2 <- as_grid_vars(df, row)
  
  df2$id <- paste0(df2$row, ': ', df2$label)
  
  # plot
  p <- ggplot(df2, aes(x = x, y = y, fill = val)) +
    geom_tile() +
    scale_fill_gradient(low = "white", high = "black", na.value = NA) +
    theme_bw() +
    theme(aspect.ratio = 1,
          legend.position = 'none') +
    scale_x_continuous(breaks = seq(0,28,4)) +
    scale_y_continuous(breaks = seq(0,28,4)) +
    labs(x = element_blank(),
         y = element_blank()) +
    facet_wrap(~id)
  
  return(p)
}


# create vector of pixels applying to a specific column or x value
col_index_list <- function(x) {
  
  cols <- c()
  
  for (i in 0:27) {
    
    xx <- x + (i * 28)
    cols <- append(cols, xx)
  
  }
  
  return(cols)
  
}

# create vector of pixels applying to a specific row or y value
row_index_list <- function(y) {
  
  y_start <- (28 - y) * 28 + 1
  y_end <- y_start + 27
  rows <- seq(y_start, y_end) 
  
  return(rows)
  
}

EDA

Long Table

# use a smaller training set, 10% or 6k rows
set.seed(101)
tenth <- createDataPartition(train$label, p = 0.1, list = F)
train2 <- train[tenth,]

# convert to grid format
train2_grid <- as_grid_vars(train2)

head(train2_grid)
##         label   row pixel val x y
## 1 T-shirt/top 00001   757   0 1 1
## 2 T-shirt/top 00001   729   0 1 2
## 3 T-shirt/top 00001   701   0 1 3
## 4 T-shirt/top 00001   673   0 1 4
## 5 T-shirt/top 00001   645   0 1 5
## 6 T-shirt/top 00001   617   0 1 6

Using a function, we have gathered each of the pixel variables into a long format table and have converted each pixel to x and y coordinates, with the origin point in the lower left. Each row has also been labeled as a 5 digit number in string format. This grid location conversion was done to facilitate the feature engineering done below, allowing us to see each picture using the commonly known two dimensional chart, with x on the horizontal axis and y on the vertical axis. Additionally, this makes plotting significantly easier.

For the charts below, we’re using a sample of 10% or 6,000 observations to explore and visualize what the data contains.

Plot 4x4 Observations

# plot random 4x4
plot_row(seq(101,116,1))

# plot another random 4x4
plot_row(seq(201,216,1))

Above, we can see 16 observations from rows 101 - 116 of the sample set. At first glance, there are some clear differences between some classes. Shoes, shirts, and bags are clearly different. The differences between ankle boots, sneakers, and sandals vary by details rather than overall shape. Also, all types of tops, including dresses, have very similar shapes. It’s very difficult to tell the differences between shirts, pullovers, and coats.

Most Used Pixels

# see which pixels are used the most
train2_grid %>%
  group_by(x, y) %>%
  summarize(not_white = sum(ifelse(val > 0, 1, 0)),
            .groups = 'drop') %>%
  ggplot(aes(x = x, y = y, fill = not_white)) +
  geom_tile() +
  theme(aspect.ratio = 1)

The above chart above is a heat map of the pixels that are used in the sample set, counting only any non-white pixel. Overall, we can see that the center is the most likely to have some color. Stepping back, we can see the outline of a shirt, a shoe, and the space between trousers. For modeling purposes, we can see that the corners aren’t used at all, so its likely safe to leave those points out.

Most Used Pixels by Label

# see which pixels are used the most by label
train2_grid %>%
  group_by(x, y, label) %>%
  summarize(not_white = sum(ifelse(val > 0, 1, 0)),
            .groups = 'drop') %>%
  ggplot(aes(x = x, y = y, fill = not_white)) +
  geom_tile() +
  scale_x_continuous(breaks = seq(0,28,4)) +
  scale_y_continuous(breaks = seq(0,28,4)) +
  theme(aspect.ratio = 1) +
  facet_wrap(~label)

The visualization above is similar to the previous chart, showing most used pixels for each label. Here we are attempting to generalize the shapes and pixels used by each of the different fashion items. Looking at the shoes, we can see that there are full horizontal rows that are completely white, which is also common to bags. As we look at the differences and similarities of each of the classes, the shape of different sections allows us to classify the labels in our minds, which we’ll try to replicate with feature engineering below. If we can create new variables to try and capture the differences in shape, we can see if those variables will be useful in contributing to the accuracy of the models we train.

Feature Engineering

One of the methods that can be used for dimensionality reduction is feature engineering. Instead of processing \(28^2\) or 784 individual variables, we will use feature engineering to process the data into fewer variables that attempt to summarize information within the original dataset. If resources were unlimited, we could use both the original pixel variables along with the features below. Since we’re using consumer grade personal computers, we’ll train the models in the next step using the calculated features in this section. Below is a list of the calculated variables:

  • % of pixels that are different shades of grey – 0, 50, 87, 167, 209, 255
    • white: 0
    • grey1: 1-46
    • grey2: 47-93
    • grey3: 94-168
    • grey4: 169-205
    • black: 206-255
  • % of non-white pixels in each of the 28 rows
  • % of non-white pixels in each of the 28 columns
  • % of rows that are completely white
  • % of columns that are completely white
  • 10 custom blocks that are subsections of the grids, showing percent non-white as well as percent light grey (1 - 122).

The features listed were determined by visually analyzing the faceted chart in the EDA section that showed the most used pixels for each of the types of fashion item.

# # get the pixels for each coordinate
# grid_ref <- train2_grid %>%
#   filter(row == '00001') %>%
#   select(x, y, pixel)
# 
# # list the pixels for column 1 or x = 1
# col1_check <- grid_ref %>%
#   filter(x == 1) %>%
#   arrange(pixel)
# 
# # list the pixels for row 1 or y = 1
# row1_check <- grid_ref %>%
#   filter(y == 1) %>%
#   arrange(pixel)

# return pixel based on coordinates
find_pixel <- function(x, y) {
  
  pixel <- (28 - y) * 28 + x
  
}

# list pixels within box using lower left and top right coordinates
box_pixel_list <- function(xstart, ystart, xend, yend) {
  
  # initialize empty vector
  pixels <- c()
  
  # loop through entire range
  for (y in ystart:yend) {
    for (x in xstart:xend) {
      pixels <- append(pixels, find_pixel(x, y))
    }
  }
  
  return(pixels)
  
}

# ptest <- box_pixel_list(16,6,22,10)




# # for function building purposes only
# data <- train[1:10,]
# grid_check <- as_grid_vars(data)

# convert a dataframe into a new dataframe with feature variables only
calculate_features <- function(data) {

  # initialize dataframe
  df <- data
  
  # remove label if it exists in first column
  if (is.factor(data[,1])) {
    labels <- df$label
    df <- df %>%
      select(-label)
  }
  
  # count columns = 28 * 28 = 784
  col_count <- ncol(df)
  
  
  ### pct different shades of grey
  temp_df <- df
  
  # percent white 
  x <- df == 0
  x[x == T] <- 1
  temp_df$pct_white <- rowSums(x) / 784
  
  # percent grey1
  x <- df > 0 & df <= 46
  x[x == T] <- 1
  temp_df$pct_grey1 <- rowSums(x) / 784
  
  # percent grey2
  x <- df > 46 & df <= 93
  x[x == T] <- 1
  temp_df$pct_grey2 <- rowSums(x) / 784
  
  # percent grey3
  x <- df > 93 & df <= 168
  x[x == T] <- 1
  temp_df$pct_grey3 <- rowSums(x) / 784
  
  # percent grey4
  x <- df > 168 & df <= 205 
  x[x == T] <- 1
  temp_df$pct_grey4 <- rowSums(x) / 784
  
  # percent black
  x <- df > 205 
  x[x == T] <- 1
  temp_df$pct_black <- rowSums(x) / 784
  
  # keep shades only
  y_start <- col_count + 1
  y_end <- ncol(temp_df)
  shades <- temp_df[,y_start:y_end]
  
  rm(temp_df)
  
  ### calculate non-white pixels for row and column strips
  
  # convert non zeros to 1
  bkup_df <- df
  df[df > 0] <- 1
  
  # calculate column strips as % non-white
  df$pw_x1 <- df[,row_index_list(1)] %>% rowSums() / 28
  df$pw_x2 <- df[,row_index_list(2)] %>% rowSums() / 28
  df$pw_x3 <- df[,row_index_list(3)] %>% rowSums() / 28
  df$pw_x4 <- df[,row_index_list(4)] %>% rowSums() / 28
  df$pw_x5 <- df[,row_index_list(5)] %>% rowSums() / 28
  df$pw_x6 <- df[,row_index_list(6)] %>% rowSums() / 28
  df$pw_x7 <- df[,row_index_list(7)] %>% rowSums() / 28
  df$pw_x8 <- df[,row_index_list(8)] %>% rowSums() / 28
  df$pw_x9 <- df[,row_index_list(9)] %>% rowSums() / 28
  df$pw_x10 <- df[,row_index_list(10)] %>% rowSums() / 28
  df$pw_x11 <- df[,row_index_list(11)] %>% rowSums() / 28
  df$pw_x12 <- df[,row_index_list(12)] %>% rowSums() / 28
  df$pw_x13 <- df[,row_index_list(13)] %>% rowSums() / 28
  df$pw_x14 <- df[,row_index_list(14)] %>% rowSums() / 28
  df$pw_x15 <- df[,row_index_list(15)] %>% rowSums() / 28
  df$pw_x16 <- df[,row_index_list(16)] %>% rowSums() / 28
  df$pw_x17 <- df[,row_index_list(17)] %>% rowSums() / 28
  df$pw_x18 <- df[,row_index_list(18)] %>% rowSums() / 28
  df$pw_x19 <- df[,row_index_list(19)] %>% rowSums() / 28
  df$pw_x20 <- df[,row_index_list(20)] %>% rowSums() / 28
  df$pw_x21 <- df[,row_index_list(21)] %>% rowSums() / 28
  df$pw_x22 <- df[,row_index_list(22)] %>% rowSums() / 28
  df$pw_x23 <- df[,row_index_list(23)] %>% rowSums() / 28
  df$pw_x24 <- df[,row_index_list(24)] %>% rowSums() / 28
  df$pw_x25 <- df[,row_index_list(25)] %>% rowSums() / 28
  df$pw_x26 <- df[,row_index_list(26)] %>% rowSums() / 28
  df$pw_x27 <- df[,row_index_list(27)] %>% rowSums() / 28
  df$pw_x28 <- df[,row_index_list(28)] %>% rowSums() / 28
  
  # calculate row strips as % non-white
  df$pw_y1 <- df[,col_index_list(1)] %>% rowSums() / 28
  df$pw_y2 <- df[,col_index_list(2)] %>% rowSums() / 28
  df$pw_y3 <- df[,col_index_list(3)] %>% rowSums() / 28
  df$pw_y4 <- df[,col_index_list(4)] %>% rowSums() / 28
  df$pw_y5 <- df[,col_index_list(5)] %>% rowSums() / 28
  df$pw_y6 <- df[,col_index_list(6)] %>% rowSums() / 28
  df$pw_y7 <- df[,col_index_list(7)] %>% rowSums() / 28
  df$pw_y8 <- df[,col_index_list(8)] %>% rowSums() / 28
  df$pw_y9 <- df[,col_index_list(9)] %>% rowSums() / 28
  df$pw_y10 <- df[,col_index_list(10)] %>% rowSums() / 28
  df$pw_y11 <- df[,col_index_list(11)] %>% rowSums() / 28
  df$pw_y12 <- df[,col_index_list(12)] %>% rowSums() / 28
  df$pw_y13 <- df[,col_index_list(13)] %>% rowSums() / 28
  df$pw_y14 <- df[,col_index_list(14)] %>% rowSums() / 28
  df$pw_y15 <- df[,col_index_list(15)] %>% rowSums() / 28
  df$pw_y16 <- df[,col_index_list(16)] %>% rowSums() / 28
  df$pw_y17 <- df[,col_index_list(17)] %>% rowSums() / 28
  df$pw_y18 <- df[,col_index_list(18)] %>% rowSums() / 28
  df$pw_y19 <- df[,col_index_list(19)] %>% rowSums() / 28
  df$pw_y20 <- df[,col_index_list(20)] %>% rowSums() / 28
  df$pw_y21 <- df[,col_index_list(21)] %>% rowSums() / 28
  df$pw_y22 <- df[,col_index_list(22)] %>% rowSums() / 28
  df$pw_y23 <- df[,col_index_list(23)] %>% rowSums() / 28
  df$pw_y24 <- df[,col_index_list(24)] %>% rowSums() / 28
  df$pw_y25 <- df[,col_index_list(25)] %>% rowSums() / 28
  df$pw_y26 <- df[,col_index_list(26)] %>% rowSums() / 28
  df$pw_y27 <- df[,col_index_list(27)] %>% rowSums() / 28
  df$pw_y28 <- df[,col_index_list(28)] %>% rowSums() / 28

  ### calculate pct of non-white pixels in select grids
  ### also pct of lightly shaded colors in box
  temp_df <- bkup_df > 0 & bkup_df <= 122
  temp_df[temp_df == T] <- 1
  
  # custom grid 1, collar area, 12,26 : 18,28
  pixel_list <- box_pixel_list(12, 26, 18, 28)
  pixel_area <- length(pixel_list)
  df$pw_collar <- df[,pixel_list] %>% rowSums() / pixel_area
  df$plg_collar <- temp_df[,pixel_list] %>% rowSums() / pixel_area
  
  # custom grid 2, tshirt sleeve area
  pixel_list <- box_pixel_list(2, 14, 12, 20)
  pixel_area <- length(pixel_list)
  df$pw_ts_sleeve <- df[,pixel_list] %>% rowSums() / pixel_area
  df$plg_ts_sleeve <- temp_df[,pixel_list] %>% rowSums() / pixel_area
  
  # custom grid 3, trouser legs
  pixel_list <- box_pixel_list(12, 4, 18, 20)
  pixel_area <- length(pixel_list)
  df$pw_trouser_legs <- df[,pixel_list] %>% rowSums() / pixel_area
  df$plg_trouser_legs <- temp_df[,pixel_list] %>% rowSums() / pixel_area
  
  # custom grid 4, shoulder area
  pixel_list <- box_pixel_list(2, 23, 10, 28)
  pixel_area <- length(pixel_list)
  df$pw_shoulder <- df[,pixel_list] %>% rowSums() / pixel_area
  df$plg_shoulder <- temp_df[,pixel_list] %>% rowSums() / pixel_area
  
  # custom grid 5, sneaker top
  pixel_list <- box_pixel_list(19, 17, 27, 25)
  pixel_area <- length(pixel_list)
  df$pw_sneaker_top <- df[,pixel_list] %>% rowSums() / pixel_area
  df$plg_sneaker_top <- temp_df[,pixel_list] %>% rowSums() / pixel_area
  
  # custom grid 6, center box
  pixel_list <- box_pixel_list(7, 7, 21, 21)
  pixel_area <- length(pixel_list)
  df$pw_center <- df[,pixel_list] %>% rowSums() / pixel_area
  df$plg_center <- temp_df[,pixel_list] %>% rowSums() / pixel_area
  
  # custom grid 7, bag_handle
  pixel_list <- box_pixel_list(10, 16, 20, 24)
  pixel_area <- length(pixel_list)
  df$pw_bag_handle <- df[,pixel_list] %>% rowSums() / pixel_area
  df$plg_bag_handle <- temp_df[,pixel_list] %>% rowSums() / pixel_area
  
  # custom grid 8, footwear area
  pixel_list <- box_pixel_list(1, 6, 28, 22)
  pixel_area <- length(pixel_list)
  df$pw_footwear <- df[,pixel_list] %>% rowSums() / pixel_area
  df$plg_footwear <- temp_df[,pixel_list] %>% rowSums() / pixel_area
  
  # custom grid 9, sleeve
  pixel_list <- box_pixel_list(4, 1, 10, 16)
  pixel_area <- length(pixel_list)
  df$pw_sleeve <- df[,pixel_list] %>% rowSums() / pixel_area
  df$plg_sleeve <- temp_df[,pixel_list] %>% rowSums() / pixel_area
  
  # custom grid 10, heel
  pixel_list <- box_pixel_list(16, 4, 24, 10)
  pixel_area <- length(pixel_list)
  df$pw_heel <- df[,pixel_list] %>% rowSums() / pixel_area
  df$plg_heel <- temp_df[,pixel_list] %>% rowSums() / pixel_area
  
  ### exclude single pixel variables
  
  ystart <- col_count + 1
  yend <- ncol(df)
  
  strips <- df[,ystart:yend]
  rm(df, temp_df, bkup_df)
  
  ### calculate percent of non-white strips for rows and columsn
  
  # pct of non-white rows
  temp_df <- strips[,1:28]
  x <- temp_df == 1
  x[x == T] <- 1
  strips$pw_rows <- rowSums(x) / 28
  
  # pct of non-white cols
  temp_df <- strips[,29:56]
  x <- temp_df == 1
  x[x == T] <- 1
  strips$pw_cols <- rowSums(x) / 28
  
  rm(temp_df)
  
  ### combine all new variables
  final <- bind_cols(shades, strips)
  
  # add back labels if exists
  if (is.factor(data[,1])) {
    final <- bind_cols(label = labels, final)
  } 
  
  rm(shades, strips)
  gc()
  return(final)
    
}

Supervised Models

This section uses 10 fold cross validation to train the following models:

  • Random forest
  • Support vector machine using a radial kernel
  • KNN nearest neighbor
  • Multinomial logistic regression
  • Naive bayes

The following models will be trained exclusively on the variables created in the earlier feature engineering section. Additionally, the training data will only use a fraction of the original training dataset in order to deal with the hardware limitations of personal computers. The training set used below is a stratified sample and accounts for 16.7% or 10,010 observations of the original training set.

# this will determine if the models are retrained or if the stored files are referenced
full_run <- F

# local path where models will be saved and loaded from
model_path <- 'C:/downloads/docs/data622/final/feature_engineering_models/'

rf_path <- str_c(model_path, 'fe_rf_mod.rds')
svm_path <- str_c(model_path, 'fe_svm_mod.rds')
knn_path <- str_c(model_path, 'fe_knn_mod.rds')
multinom_path <- str_c(model_path, 'fe_multinom_mod.rds')
nb_path <- str_c(model_path, 'fe_nb_mod.rds')

duration_path <- str_c(model_path, 'fe_run_duration.csv')

# create new dataframes using only feature engineering variables
test2 <- calculate_features(test)
# use a new training set, about 10k rows
set.seed(101)
trainIndex <- createDataPartition(train$label, p = 0.167, list = F)
train3 <- train[trainIndex,] %>%
  calculate_features()


# 10 fold cross validation
ctrl <- trainControl(method = 'cv', number = 10)


# random forest
mark_0 <- Sys.time()
  
grid <-  expand.grid(.mtry = sqrt(ncol(train3)))  
rf <- train(label ~ .,
            data = train3,
            method = 'rf',
            tuneGrid = grid,
            trControl = ctrl
            )
  

mark_1 <- Sys.time()

# svm radial 
svm <- train(label ~ .,
             data = train3,
             method = 'svmRadial',
             preProc = c('center', 'scale'),
             tuneLength = 8,
             trControl = trainControl(method = 'cv')
             )

mark_2 <- Sys.time()

# knn classifier
knn <- train(label ~ .,
             data = train3,
             method = "knn",
             preProc = c("center", "scale"),
             tuneLength = 10,
             trControl = ctrl
             )

mark_3 <- Sys.time()

# multinomial regression
multinom <- train(label ~ .,
                  data = train3,
                  method = 'multinom',
                  tuneLength = 8,
                  trControl = ctrl
                  )

mark_4 <- Sys.time()

# naive bayes
nb <- train(label ~ .,
            data = train3,
            method = 'naive_bayes',
            tuneLength = 8,
            trControl = ctrl
            )


mark_5 <- Sys.time()

# multinomial regression output test
test <- train(label ~ .,
                  data = train3,
                  method = 'multinom',
                  tuneLength = 8,
                  trControl = ctrl
                  )


# save rds
saveRDS(rf, file = rf_path)
saveRDS(svm, file = svm_path)
saveRDS(knn, file = knn_path)
saveRDS(multinom, file = multinom_path)
saveRDS(nb, file = nb_path)
# load rds
rf <- readRDS(rf_path)
svm <- readRDS(svm_path)
knn <- readRDS(knn_path)
multinom <- readRDS(multinom_path)
nb <- readRDS(nb_path)

Model Training Durations

# calculate durations
rf_dur <- difftime(mark_1, mark_0, units = 'mins') %>% round(1)
svm_dur <- difftime(mark_2, mark_1, units = 'mins') %>% round(1)
knn_dur <- difftime(mark_3, mark_2, units = 'mins') %>% round(1)
mn_dur <- difftime(mark_4, mark_3, units = 'mins') %>% round(1)
nb_dur <- difftime(mark_5, mark_4, units = 'mins') %>% round(1)

# show all training durations
all_dur <- data.frame(
  model_type = c('Random Forest', 'SVM Radial', 'KNN', 'Multinomial Logisitic Regression', 'Naive Bayes'),
  minutes = c(rf_dur, svm_dur, knn_dur, mn_dur, nb_dur)
) %>%
  arrange(desc(minutes))

all_dur

write.csv(all_dur, duration_path, row.names = F)
all_dur <- read.csv(duration_path)

all_dur
##                         model_type minutes
## 1                       SVM Radial    18.4
## 2 Multinomial Logisitic Regression     9.1
## 3                    Random Forest     5.3
## 4                              KNN     2.5
## 5                      Naive Bayes     0.2

Resample Models

# resampling  results
rs <- resamples(list(rf = rf, svm = svm, knn = knn, multinomial = multinom, naive_bayes = nb))
summary(rs)
## 
## Call:
## summary.resamples(object = rs)
## 
## Models: rf, svm, knn, multinomial, naive_bayes 
## Number of resamples: 10 
## 
## Accuracy 
##                  Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## rf          0.8394816 0.8436800 0.8542914 0.8539328 0.8628569 0.8703888    0
## svm         0.8424726 0.8530412 0.8614890 0.8605165 0.8697941 0.8754980    0
## knn         0.7838645 0.7913896 0.7995025 0.7988018 0.8034419 0.8179104    0
## multinomial 0.8183633 0.8206732 0.8300949 0.8292152 0.8369084 0.8393214    0
## naive_bayes 0.7018943 0.7093433 0.7151394 0.7149591 0.7218345 0.7295409    0
## 
## Kappa 
##                  Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## rf          0.8216422 0.8263135 0.8381031 0.8377030 0.8476204 0.8559873    0
## svm         0.8249695 0.8367125 0.8460997 0.8450178 0.8553257 0.8616613    0
## knn         0.7598584 0.7682045 0.7772228 0.7764455 0.7815983 0.7976759    0
## multinomial 0.7981815 0.8007464 0.8112138 0.8102383 0.8187884 0.8214679    0
## naive_bayes 0.6687861 0.6770715 0.6834892 0.6832878 0.6909125 0.6994767    0

Based on the results above, SVM looks to be the most accurate model.

test2$rf <- predict(rf, test2)
test2$svm <- predict(svm, test2)
test2$knn <- predict(knn, test2)
test2$mn <- predict(multinom, test2)
test2$nb <- predict(nb, test2)

rf_acc <- sum(test2$label == test2$rf) / nrow(test2)
svm_acc <- sum(test2$label == test2$svm) / nrow(test2)
knn_acc <- sum(test2$label == test2$knn) / nrow(test2)
mn_acc <- sum(test2$label == test2$mn) / nrow(test2)
nb_acc <- sum(test2$label == test2$nb) / nrow(test2)

# show accuracy
data.frame(
  model_type = c('Random Forest', 'SVM Radial', 'KNN', 'Multinomial Logisitic Regression', 'Naive Bayes'),
  accuracy = c(rf_acc, svm_acc, knn_acc, mn_acc, nb_acc)
) %>%
  arrange(desc(accuracy))
##                         model_type accuracy
## 1                       SVM Radial   0.8564
## 2                    Random Forest   0.8549
## 3 Multinomial Logisitic Regression   0.8240
## 4                              KNN   0.7965
## 5                      Naive Bayes   0.7134

After calculating the prediction accuracy on the test set, the support vector machine model using a radial kernel has a minor lead over the random forest model model.

Classification Evaluation

# confusion matrix for SVM Radial
cm <- confusionMatrix(test2$svm, test2$label)

cm$table
##              Reference
## Prediction    T-shirt/top Trouser Pullover Dress Coat Sandal Shirt Sneaker Bag
##   T-shirt/top         824       9       14    32    1      0   167       0   4
##   Trouser               2     961        1    11    1      0     4       0   2
##   Pullover             17      11      738    11   84      1    99       0  17
##   Dress                35      15        7   891   38      1    35       0   0
##   Coat                  1       2      129    24  796      0    83       0   6
##   Sandal                0       0        0     0    0    934     0      15   2
##   Shirt               112       2      104    31   77      0   603       0  19
##   Sneaker               1       0        0     0    0     45     0     926   4
##   Bag                   8       0        7     0    3      6     9       0 946
##   Ankle boot            0       0        0     0    0     13     0      59   0
##              Reference
## Prediction    Ankle boot
##   T-shirt/top          0
##   Trouser              0
##   Pullover             1
##   Dress                0
##   Coat                 0
##   Sandal               2
##   Shirt                0
##   Sneaker             47
##   Bag                  5
##   Ankle boot         945

As expected, the errors in the supervised models trained using only variables summarizing subsections or shapes of the original training data led to specific items of clothing to be confused for others. Similarly shaped items show the most misclassifications.

The SVM model commonly misclassifies pullovers as coats and shirts as t-shirts/tops or pullovers. Additionally, for each of the three types of footwear, the errors predicted by the model are most likely one of other two footwear types.

# overall accuracy
overall_accuracy <-cm$overall[1]

# data frame of results by class
byclass <- cm$byClass %>% 
  as.data.frame()

# add in label as variable instead of row name
byclass$label <- row.names(byclass) %>%
  str_replace('Class: ','')

# plot
ggplot(byclass, aes(x = `Pos Pred Value`, y = reorder(label, `Pos Pred Value`))) +
  geom_col() +
  geom_vline(xintercept = overall_accuracy, lty = 2, color = 'blue') +
  scale_x_continuous(labels = percent_format(accuracy=1)) +
  labs(y = '', title = 'SVM Accuracy by Label') +
  theme_bw()

The above plot shows the model’s accuracy by label, with the blue vertical line representing the overall accuracy. We can see that shirts, pullovers, coats, and T-shirt/tops were the classes that performed the poorest. The similar shapes of the items suggest that the derived variables that were used did not adequately pick up the variance among the items.