library(stringr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(gridExtra)
library(caret)
library(conflicted)
library(factoextra)
library(purrr)
library(scales)
# conflict_scout()
conflict_prefer('filter', 'dplyr')# CSV FILES DOWNLOADED FROM
# https://www.kaggle.com/zalando-research/fashionmnist
file_path <- 'C:/downloads/docs/data622/final/'
train_fn <- 'fashion-mnist_train.csv'
test_fn <- 'fashion-mnist_test.csv'
train_path <- str_c(file_path, train_fn)
test_path <- str_c(file_path, test_fn)
train <- read.csv(train_path)
test <- read.csv(test_path)
train$label <- factor(train$label, labels = c('T-shirt/top',
'Trouser',
'Pullover',
'Dress',
'Coat',
'Sandal',
'Shirt',
'Sneaker',
'Bag',
'Ankle boot'))
test$label <- factor(test$label, labels = c('T-shirt/top',
'Trouser',
'Pullover',
'Dress',
'Coat',
'Sandal',
'Shirt',
'Sneaker',
'Bag',
'Ankle boot'))We also imported the data in csv format, downloaded from kaggle. Each observation is a picture and each variable is one pixel within each picture. The labels have been converted to factors.
### These functions are used to manipulate the structure of the raw data tables
# convert number to 6 digit string
num_to_str_5 <- function(num) {
str <- as.character(num)
str <- paste0('0000', str)
str <- str_sub(str, -5, -1)
return(str)
}
# convert dataframe to grid with x and y coordinates
as_grid_vars <- function(data, row_names = NA) {
# dataframe in parameter
df <- data
# store label
label <- df$label
# remove label
df <- df %>%
select(-label)
# change field names to seq 1 to 784
colnames(df) <- seq_len(28*28)
# add back label
df$label <- label
# add row index/name
if (is.na(row_names)) {
df$row <- seq_len(nrow(df)) %>%
num_to_str_5()
} else {
df$row <- row_names %>%
num_to_str_5()
}
# gather fields into rows
df2 <- gather(df, var, val, -row, -label)
# convert col name variable into integer
df2$var <- as.integer(df2$var)
# calculate grid
df2$y <- ceiling(df2$var / 28)
df2$x <- df2$var - 28 * (df2$y - 1)
# reverse y
df2$y <- 29 - df2$y
# sort
df2 <- df2 %>%
arrange(row, x, y) %>%
select(label, row, pixel = var, val, x, y)
return(df2)
}
# plot based on row index
plot_row <- function(row, data = train) {
# select row from dataframe
df <- data[row, ]
# convert to grid format dataframe
df2 <- as_grid_vars(df, row)
df2$id <- paste0(df2$row, ': ', df2$label)
# plot
p <- ggplot(df2, aes(x = x, y = y, fill = val)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "black", na.value = NA) +
theme_bw() +
theme(aspect.ratio = 1,
legend.position = 'none') +
scale_x_continuous(breaks = seq(0,28,4)) +
scale_y_continuous(breaks = seq(0,28,4)) +
labs(x = element_blank(),
y = element_blank()) +
facet_wrap(~id)
return(p)
}
# create vector of pixels applying to a specific column or x value
col_index_list <- function(x) {
cols <- c()
for (i in 0:27) {
xx <- x + (i * 28)
cols <- append(cols, xx)
}
return(cols)
}
# create vector of pixels applying to a specific row or y value
row_index_list <- function(y) {
y_start <- (28 - y) * 28 + 1
y_end <- y_start + 27
rows <- seq(y_start, y_end)
return(rows)
}# use a smaller training set, 10% or 6k rows
set.seed(101)
tenth <- createDataPartition(train$label, p = 0.1, list = F)
train2 <- train[tenth,]
# convert to grid format
train2_grid <- as_grid_vars(train2)
head(train2_grid)## label row pixel val x y
## 1 T-shirt/top 00001 757 0 1 1
## 2 T-shirt/top 00001 729 0 1 2
## 3 T-shirt/top 00001 701 0 1 3
## 4 T-shirt/top 00001 673 0 1 4
## 5 T-shirt/top 00001 645 0 1 5
## 6 T-shirt/top 00001 617 0 1 6
Using a function, we have gathered each of the pixel variables into a long format table and have converted each pixel to x and y coordinates, with the origin point in the lower left. Each row has also been labeled as a 5 digit number in string format. This grid location conversion was done to facilitate the feature engineering done below, allowing us to see each picture using the commonly known two dimensional chart, with x on the horizontal axis and y on the vertical axis. Additionally, this makes plotting significantly easier.
For the charts below, we’re using a sample of 10% or 6,000 observations to explore and visualize what the data contains.
# plot random 4x4
plot_row(seq(101,116,1))# plot another random 4x4
plot_row(seq(201,216,1))Above, we can see 16 observations from rows 101 - 116 of the sample set. At first glance, there are some clear differences between some classes. Shoes, shirts, and bags are clearly different. The differences between ankle boots, sneakers, and sandals vary by details rather than overall shape. Also, all types of tops, including dresses, have very similar shapes. It’s very difficult to tell the differences between shirts, pullovers, and coats.
# see which pixels are used the most
train2_grid %>%
group_by(x, y) %>%
summarize(not_white = sum(ifelse(val > 0, 1, 0)),
.groups = 'drop') %>%
ggplot(aes(x = x, y = y, fill = not_white)) +
geom_tile() +
theme(aspect.ratio = 1)The above chart above is a heat map of the pixels that are used in the sample set, counting only any non-white pixel. Overall, we can see that the center is the most likely to have some color. Stepping back, we can see the outline of a shirt, a shoe, and the space between trousers. For modeling purposes, we can see that the corners aren’t used at all, so its likely safe to leave those points out.
# see which pixels are used the most by label
train2_grid %>%
group_by(x, y, label) %>%
summarize(not_white = sum(ifelse(val > 0, 1, 0)),
.groups = 'drop') %>%
ggplot(aes(x = x, y = y, fill = not_white)) +
geom_tile() +
scale_x_continuous(breaks = seq(0,28,4)) +
scale_y_continuous(breaks = seq(0,28,4)) +
theme(aspect.ratio = 1) +
facet_wrap(~label)The visualization above is similar to the previous chart, showing most used pixels for each label. Here we are attempting to generalize the shapes and pixels used by each of the different fashion items. Looking at the shoes, we can see that there are full horizontal rows that are completely white, which is also common to bags. As we look at the differences and similarities of each of the classes, the shape of different sections allows us to classify the labels in our minds, which we’ll try to replicate with feature engineering below. If we can create new variables to try and capture the differences in shape, we can see if those variables will be useful in contributing to the accuracy of the models we train.
One of the methods that can be used for dimensionality reduction is feature engineering. Instead of processing \(28^2\) or 784 individual variables, we will use feature engineering to process the data into fewer variables that attempt to summarize information within the original dataset. If resources were unlimited, we could use both the original pixel variables along with the features below. Since we’re using consumer grade personal computers, we’ll train the models in the next step using the calculated features in this section. Below is a list of the calculated variables:
The features listed were determined by visually analyzing the faceted chart in the EDA section that showed the most used pixels for each of the types of fashion item.
# # get the pixels for each coordinate
# grid_ref <- train2_grid %>%
# filter(row == '00001') %>%
# select(x, y, pixel)
#
# # list the pixels for column 1 or x = 1
# col1_check <- grid_ref %>%
# filter(x == 1) %>%
# arrange(pixel)
#
# # list the pixels for row 1 or y = 1
# row1_check <- grid_ref %>%
# filter(y == 1) %>%
# arrange(pixel)
# return pixel based on coordinates
find_pixel <- function(x, y) {
pixel <- (28 - y) * 28 + x
}
# list pixels within box using lower left and top right coordinates
box_pixel_list <- function(xstart, ystart, xend, yend) {
# initialize empty vector
pixels <- c()
# loop through entire range
for (y in ystart:yend) {
for (x in xstart:xend) {
pixels <- append(pixels, find_pixel(x, y))
}
}
return(pixels)
}
# ptest <- box_pixel_list(16,6,22,10)
# # for function building purposes only
# data <- train[1:10,]
# grid_check <- as_grid_vars(data)
# convert a dataframe into a new dataframe with feature variables only
calculate_features <- function(data) {
# initialize dataframe
df <- data
# remove label if it exists in first column
if (is.factor(data[,1])) {
labels <- df$label
df <- df %>%
select(-label)
}
# count columns = 28 * 28 = 784
col_count <- ncol(df)
### pct different shades of grey
temp_df <- df
# percent white
x <- df == 0
x[x == T] <- 1
temp_df$pct_white <- rowSums(x) / 784
# percent grey1
x <- df > 0 & df <= 46
x[x == T] <- 1
temp_df$pct_grey1 <- rowSums(x) / 784
# percent grey2
x <- df > 46 & df <= 93
x[x == T] <- 1
temp_df$pct_grey2 <- rowSums(x) / 784
# percent grey3
x <- df > 93 & df <= 168
x[x == T] <- 1
temp_df$pct_grey3 <- rowSums(x) / 784
# percent grey4
x <- df > 168 & df <= 205
x[x == T] <- 1
temp_df$pct_grey4 <- rowSums(x) / 784
# percent black
x <- df > 205
x[x == T] <- 1
temp_df$pct_black <- rowSums(x) / 784
# keep shades only
y_start <- col_count + 1
y_end <- ncol(temp_df)
shades <- temp_df[,y_start:y_end]
rm(temp_df)
### calculate non-white pixels for row and column strips
# convert non zeros to 1
bkup_df <- df
df[df > 0] <- 1
# calculate column strips as % non-white
df$pw_x1 <- df[,row_index_list(1)] %>% rowSums() / 28
df$pw_x2 <- df[,row_index_list(2)] %>% rowSums() / 28
df$pw_x3 <- df[,row_index_list(3)] %>% rowSums() / 28
df$pw_x4 <- df[,row_index_list(4)] %>% rowSums() / 28
df$pw_x5 <- df[,row_index_list(5)] %>% rowSums() / 28
df$pw_x6 <- df[,row_index_list(6)] %>% rowSums() / 28
df$pw_x7 <- df[,row_index_list(7)] %>% rowSums() / 28
df$pw_x8 <- df[,row_index_list(8)] %>% rowSums() / 28
df$pw_x9 <- df[,row_index_list(9)] %>% rowSums() / 28
df$pw_x10 <- df[,row_index_list(10)] %>% rowSums() / 28
df$pw_x11 <- df[,row_index_list(11)] %>% rowSums() / 28
df$pw_x12 <- df[,row_index_list(12)] %>% rowSums() / 28
df$pw_x13 <- df[,row_index_list(13)] %>% rowSums() / 28
df$pw_x14 <- df[,row_index_list(14)] %>% rowSums() / 28
df$pw_x15 <- df[,row_index_list(15)] %>% rowSums() / 28
df$pw_x16 <- df[,row_index_list(16)] %>% rowSums() / 28
df$pw_x17 <- df[,row_index_list(17)] %>% rowSums() / 28
df$pw_x18 <- df[,row_index_list(18)] %>% rowSums() / 28
df$pw_x19 <- df[,row_index_list(19)] %>% rowSums() / 28
df$pw_x20 <- df[,row_index_list(20)] %>% rowSums() / 28
df$pw_x21 <- df[,row_index_list(21)] %>% rowSums() / 28
df$pw_x22 <- df[,row_index_list(22)] %>% rowSums() / 28
df$pw_x23 <- df[,row_index_list(23)] %>% rowSums() / 28
df$pw_x24 <- df[,row_index_list(24)] %>% rowSums() / 28
df$pw_x25 <- df[,row_index_list(25)] %>% rowSums() / 28
df$pw_x26 <- df[,row_index_list(26)] %>% rowSums() / 28
df$pw_x27 <- df[,row_index_list(27)] %>% rowSums() / 28
df$pw_x28 <- df[,row_index_list(28)] %>% rowSums() / 28
# calculate row strips as % non-white
df$pw_y1 <- df[,col_index_list(1)] %>% rowSums() / 28
df$pw_y2 <- df[,col_index_list(2)] %>% rowSums() / 28
df$pw_y3 <- df[,col_index_list(3)] %>% rowSums() / 28
df$pw_y4 <- df[,col_index_list(4)] %>% rowSums() / 28
df$pw_y5 <- df[,col_index_list(5)] %>% rowSums() / 28
df$pw_y6 <- df[,col_index_list(6)] %>% rowSums() / 28
df$pw_y7 <- df[,col_index_list(7)] %>% rowSums() / 28
df$pw_y8 <- df[,col_index_list(8)] %>% rowSums() / 28
df$pw_y9 <- df[,col_index_list(9)] %>% rowSums() / 28
df$pw_y10 <- df[,col_index_list(10)] %>% rowSums() / 28
df$pw_y11 <- df[,col_index_list(11)] %>% rowSums() / 28
df$pw_y12 <- df[,col_index_list(12)] %>% rowSums() / 28
df$pw_y13 <- df[,col_index_list(13)] %>% rowSums() / 28
df$pw_y14 <- df[,col_index_list(14)] %>% rowSums() / 28
df$pw_y15 <- df[,col_index_list(15)] %>% rowSums() / 28
df$pw_y16 <- df[,col_index_list(16)] %>% rowSums() / 28
df$pw_y17 <- df[,col_index_list(17)] %>% rowSums() / 28
df$pw_y18 <- df[,col_index_list(18)] %>% rowSums() / 28
df$pw_y19 <- df[,col_index_list(19)] %>% rowSums() / 28
df$pw_y20 <- df[,col_index_list(20)] %>% rowSums() / 28
df$pw_y21 <- df[,col_index_list(21)] %>% rowSums() / 28
df$pw_y22 <- df[,col_index_list(22)] %>% rowSums() / 28
df$pw_y23 <- df[,col_index_list(23)] %>% rowSums() / 28
df$pw_y24 <- df[,col_index_list(24)] %>% rowSums() / 28
df$pw_y25 <- df[,col_index_list(25)] %>% rowSums() / 28
df$pw_y26 <- df[,col_index_list(26)] %>% rowSums() / 28
df$pw_y27 <- df[,col_index_list(27)] %>% rowSums() / 28
df$pw_y28 <- df[,col_index_list(28)] %>% rowSums() / 28
### calculate pct of non-white pixels in select grids
### also pct of lightly shaded colors in box
temp_df <- bkup_df > 0 & bkup_df <= 122
temp_df[temp_df == T] <- 1
# custom grid 1, collar area, 12,26 : 18,28
pixel_list <- box_pixel_list(12, 26, 18, 28)
pixel_area <- length(pixel_list)
df$pw_collar <- df[,pixel_list] %>% rowSums() / pixel_area
df$plg_collar <- temp_df[,pixel_list] %>% rowSums() / pixel_area
# custom grid 2, tshirt sleeve area
pixel_list <- box_pixel_list(2, 14, 12, 20)
pixel_area <- length(pixel_list)
df$pw_ts_sleeve <- df[,pixel_list] %>% rowSums() / pixel_area
df$plg_ts_sleeve <- temp_df[,pixel_list] %>% rowSums() / pixel_area
# custom grid 3, trouser legs
pixel_list <- box_pixel_list(12, 4, 18, 20)
pixel_area <- length(pixel_list)
df$pw_trouser_legs <- df[,pixel_list] %>% rowSums() / pixel_area
df$plg_trouser_legs <- temp_df[,pixel_list] %>% rowSums() / pixel_area
# custom grid 4, shoulder area
pixel_list <- box_pixel_list(2, 23, 10, 28)
pixel_area <- length(pixel_list)
df$pw_shoulder <- df[,pixel_list] %>% rowSums() / pixel_area
df$plg_shoulder <- temp_df[,pixel_list] %>% rowSums() / pixel_area
# custom grid 5, sneaker top
pixel_list <- box_pixel_list(19, 17, 27, 25)
pixel_area <- length(pixel_list)
df$pw_sneaker_top <- df[,pixel_list] %>% rowSums() / pixel_area
df$plg_sneaker_top <- temp_df[,pixel_list] %>% rowSums() / pixel_area
# custom grid 6, center box
pixel_list <- box_pixel_list(7, 7, 21, 21)
pixel_area <- length(pixel_list)
df$pw_center <- df[,pixel_list] %>% rowSums() / pixel_area
df$plg_center <- temp_df[,pixel_list] %>% rowSums() / pixel_area
# custom grid 7, bag_handle
pixel_list <- box_pixel_list(10, 16, 20, 24)
pixel_area <- length(pixel_list)
df$pw_bag_handle <- df[,pixel_list] %>% rowSums() / pixel_area
df$plg_bag_handle <- temp_df[,pixel_list] %>% rowSums() / pixel_area
# custom grid 8, footwear area
pixel_list <- box_pixel_list(1, 6, 28, 22)
pixel_area <- length(pixel_list)
df$pw_footwear <- df[,pixel_list] %>% rowSums() / pixel_area
df$plg_footwear <- temp_df[,pixel_list] %>% rowSums() / pixel_area
# custom grid 9, sleeve
pixel_list <- box_pixel_list(4, 1, 10, 16)
pixel_area <- length(pixel_list)
df$pw_sleeve <- df[,pixel_list] %>% rowSums() / pixel_area
df$plg_sleeve <- temp_df[,pixel_list] %>% rowSums() / pixel_area
# custom grid 10, heel
pixel_list <- box_pixel_list(16, 4, 24, 10)
pixel_area <- length(pixel_list)
df$pw_heel <- df[,pixel_list] %>% rowSums() / pixel_area
df$plg_heel <- temp_df[,pixel_list] %>% rowSums() / pixel_area
### exclude single pixel variables
ystart <- col_count + 1
yend <- ncol(df)
strips <- df[,ystart:yend]
rm(df, temp_df, bkup_df)
### calculate percent of non-white strips for rows and columsn
# pct of non-white rows
temp_df <- strips[,1:28]
x <- temp_df == 1
x[x == T] <- 1
strips$pw_rows <- rowSums(x) / 28
# pct of non-white cols
temp_df <- strips[,29:56]
x <- temp_df == 1
x[x == T] <- 1
strips$pw_cols <- rowSums(x) / 28
rm(temp_df)
### combine all new variables
final <- bind_cols(shades, strips)
# add back labels if exists
if (is.factor(data[,1])) {
final <- bind_cols(label = labels, final)
}
rm(shades, strips)
gc()
return(final)
}This section uses 10 fold cross validation to train the following models:
The following models will be trained exclusively on the variables created in the earlier feature engineering section. Additionally, the training data will only use a fraction of the original training dataset in order to deal with the hardware limitations of personal computers. The training set used below is a stratified sample and accounts for 16.7% or 10,010 observations of the original training set.
# this will determine if the models are retrained or if the stored files are referenced
full_run <- F
# local path where models will be saved and loaded from
model_path <- 'C:/downloads/docs/data622/final/feature_engineering_models/'
rf_path <- str_c(model_path, 'fe_rf_mod.rds')
svm_path <- str_c(model_path, 'fe_svm_mod.rds')
knn_path <- str_c(model_path, 'fe_knn_mod.rds')
multinom_path <- str_c(model_path, 'fe_multinom_mod.rds')
nb_path <- str_c(model_path, 'fe_nb_mod.rds')
duration_path <- str_c(model_path, 'fe_run_duration.csv')
# create new dataframes using only feature engineering variables
test2 <- calculate_features(test)# use a new training set, about 10k rows
set.seed(101)
trainIndex <- createDataPartition(train$label, p = 0.167, list = F)
train3 <- train[trainIndex,] %>%
calculate_features()
# 10 fold cross validation
ctrl <- trainControl(method = 'cv', number = 10)
# random forest
mark_0 <- Sys.time()
grid <- expand.grid(.mtry = sqrt(ncol(train3)))
rf <- train(label ~ .,
data = train3,
method = 'rf',
tuneGrid = grid,
trControl = ctrl
)
mark_1 <- Sys.time()
# svm radial
svm <- train(label ~ .,
data = train3,
method = 'svmRadial',
preProc = c('center', 'scale'),
tuneLength = 8,
trControl = trainControl(method = 'cv')
)
mark_2 <- Sys.time()
# knn classifier
knn <- train(label ~ .,
data = train3,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl
)
mark_3 <- Sys.time()
# multinomial regression
multinom <- train(label ~ .,
data = train3,
method = 'multinom',
tuneLength = 8,
trControl = ctrl
)
mark_4 <- Sys.time()
# naive bayes
nb <- train(label ~ .,
data = train3,
method = 'naive_bayes',
tuneLength = 8,
trControl = ctrl
)
mark_5 <- Sys.time()
# multinomial regression output test
test <- train(label ~ .,
data = train3,
method = 'multinom',
tuneLength = 8,
trControl = ctrl
)
# save rds
saveRDS(rf, file = rf_path)
saveRDS(svm, file = svm_path)
saveRDS(knn, file = knn_path)
saveRDS(multinom, file = multinom_path)
saveRDS(nb, file = nb_path)# load rds
rf <- readRDS(rf_path)
svm <- readRDS(svm_path)
knn <- readRDS(knn_path)
multinom <- readRDS(multinom_path)
nb <- readRDS(nb_path)# calculate durations
rf_dur <- difftime(mark_1, mark_0, units = 'mins') %>% round(1)
svm_dur <- difftime(mark_2, mark_1, units = 'mins') %>% round(1)
knn_dur <- difftime(mark_3, mark_2, units = 'mins') %>% round(1)
mn_dur <- difftime(mark_4, mark_3, units = 'mins') %>% round(1)
nb_dur <- difftime(mark_5, mark_4, units = 'mins') %>% round(1)
# show all training durations
all_dur <- data.frame(
model_type = c('Random Forest', 'SVM Radial', 'KNN', 'Multinomial Logisitic Regression', 'Naive Bayes'),
minutes = c(rf_dur, svm_dur, knn_dur, mn_dur, nb_dur)
) %>%
arrange(desc(minutes))
all_dur
write.csv(all_dur, duration_path, row.names = F)all_dur <- read.csv(duration_path)
all_dur## model_type minutes
## 1 SVM Radial 18.4
## 2 Multinomial Logisitic Regression 9.1
## 3 Random Forest 5.3
## 4 KNN 2.5
## 5 Naive Bayes 0.2
# resampling results
rs <- resamples(list(rf = rf, svm = svm, knn = knn, multinomial = multinom, naive_bayes = nb))
summary(rs)##
## Call:
## summary.resamples(object = rs)
##
## Models: rf, svm, knn, multinomial, naive_bayes
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rf 0.8394816 0.8436800 0.8542914 0.8539328 0.8628569 0.8703888 0
## svm 0.8424726 0.8530412 0.8614890 0.8605165 0.8697941 0.8754980 0
## knn 0.7838645 0.7913896 0.7995025 0.7988018 0.8034419 0.8179104 0
## multinomial 0.8183633 0.8206732 0.8300949 0.8292152 0.8369084 0.8393214 0
## naive_bayes 0.7018943 0.7093433 0.7151394 0.7149591 0.7218345 0.7295409 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rf 0.8216422 0.8263135 0.8381031 0.8377030 0.8476204 0.8559873 0
## svm 0.8249695 0.8367125 0.8460997 0.8450178 0.8553257 0.8616613 0
## knn 0.7598584 0.7682045 0.7772228 0.7764455 0.7815983 0.7976759 0
## multinomial 0.7981815 0.8007464 0.8112138 0.8102383 0.8187884 0.8214679 0
## naive_bayes 0.6687861 0.6770715 0.6834892 0.6832878 0.6909125 0.6994767 0
Based on the results above, SVM looks to be the most accurate model.
test2$rf <- predict(rf, test2)
test2$svm <- predict(svm, test2)
test2$knn <- predict(knn, test2)
test2$mn <- predict(multinom, test2)
test2$nb <- predict(nb, test2)
rf_acc <- sum(test2$label == test2$rf) / nrow(test2)
svm_acc <- sum(test2$label == test2$svm) / nrow(test2)
knn_acc <- sum(test2$label == test2$knn) / nrow(test2)
mn_acc <- sum(test2$label == test2$mn) / nrow(test2)
nb_acc <- sum(test2$label == test2$nb) / nrow(test2)
# show accuracy
data.frame(
model_type = c('Random Forest', 'SVM Radial', 'KNN', 'Multinomial Logisitic Regression', 'Naive Bayes'),
accuracy = c(rf_acc, svm_acc, knn_acc, mn_acc, nb_acc)
) %>%
arrange(desc(accuracy))## model_type accuracy
## 1 SVM Radial 0.8564
## 2 Random Forest 0.8549
## 3 Multinomial Logisitic Regression 0.8240
## 4 KNN 0.7965
## 5 Naive Bayes 0.7134
After calculating the prediction accuracy on the test set, the support vector machine model using a radial kernel has a minor lead over the random forest model model.
# confusion matrix for SVM Radial
cm <- confusionMatrix(test2$svm, test2$label)
cm$table## Reference
## Prediction T-shirt/top Trouser Pullover Dress Coat Sandal Shirt Sneaker Bag
## T-shirt/top 824 9 14 32 1 0 167 0 4
## Trouser 2 961 1 11 1 0 4 0 2
## Pullover 17 11 738 11 84 1 99 0 17
## Dress 35 15 7 891 38 1 35 0 0
## Coat 1 2 129 24 796 0 83 0 6
## Sandal 0 0 0 0 0 934 0 15 2
## Shirt 112 2 104 31 77 0 603 0 19
## Sneaker 1 0 0 0 0 45 0 926 4
## Bag 8 0 7 0 3 6 9 0 946
## Ankle boot 0 0 0 0 0 13 0 59 0
## Reference
## Prediction Ankle boot
## T-shirt/top 0
## Trouser 0
## Pullover 1
## Dress 0
## Coat 0
## Sandal 2
## Shirt 0
## Sneaker 47
## Bag 5
## Ankle boot 945
As expected, the errors in the supervised models trained using only variables summarizing subsections or shapes of the original training data led to specific items of clothing to be confused for others. Similarly shaped items show the most misclassifications.
The SVM model commonly misclassifies pullovers as coats and shirts as t-shirts/tops or pullovers. Additionally, for each of the three types of footwear, the errors predicted by the model are most likely one of other two footwear types.
# overall accuracy
overall_accuracy <-cm$overall[1]
# data frame of results by class
byclass <- cm$byClass %>%
as.data.frame()
# add in label as variable instead of row name
byclass$label <- row.names(byclass) %>%
str_replace('Class: ','')
# plot
ggplot(byclass, aes(x = `Pos Pred Value`, y = reorder(label, `Pos Pred Value`))) +
geom_col() +
geom_vline(xintercept = overall_accuracy, lty = 2, color = 'blue') +
scale_x_continuous(labels = percent_format(accuracy=1)) +
labs(y = '', title = 'SVM Accuracy by Label') +
theme_bw()The above plot shows the model’s accuracy by label, with the blue vertical line representing the overall accuracy. We can see that shirts, pullovers, coats, and T-shirt/tops were the classes that performed the poorest. The similar shapes of the items suggest that the derived variables that were used did not adequately pick up the variance among the items.