library(tidyverse)
#library(keras)
library(magrittr)
library(ggplot2)
library(caret)
## 1. Function to set panel, axis, legend, and strip properties for all calls to ggplot in the rest of the document
gg_theme <- function(){
  # theme_bw() must be combined with theme() using `+`, otherwise its result is discarded
  theme_bw() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.background = element_blank(),
      axis.title = element_text(face = "bold"),
      axis.text = element_text(face = "bold"),
      axis.line = element_line(color = "#17212d"),
      legend.title = element_text(size = rel(.75), face = "bold"),
      legend.text = element_text(size = rel(.65), face = "bold"),
      legend.key = element_rect(size = rel(.5), fill = "transparent", color = NA),
      legend.key.size = unit(1, "lines"),
      legend.background = element_rect(fill = "transparent", color = NA),
      strip.background = element_rect(fill = "#17212d", color = "#17212d"),
      strip.text = element_text(color = "white", face = "bold")
    )
}
# this command will set the theme for the rest of the document
theme_set(gg_theme())
## _________________________________________________________
## 2. This function takes in a named list of models (of class: train) and the test data (no labels)
## It outputs a dataframe with one row per test observation, where each column is a vector of one model's predictions
## Save the output to a variable and use it as input to the other functions:
##   plot.confusion.matrix()
##   get.overall.acc()
##   get.byclass.acc()
get.prediction.df <- function(mod.list, test.data){
  df <- as.data.frame(matrix(NA, ncol = length(mod.list), nrow = nrow(test.data)))
  for(i in seq_along(mod.list)){
    # predicted class for each test observation from model i
    df[, i] <- predict(mod.list[[i]], test.data)
  }
  colnames(df) <- names(mod.list)
  return(df)
}
##_____________________________________________________________
## 3. Function for confusion matrix visualization (See results section)
## Arguments: vector of model predictions, vector of test set labels, and a string naming the model
## Output is a confusion matrix heatmap, with 'prop' giving the proportion of each predicted class relative to the total count of its reference class
plot.confusion.matrix <- function(mod.predict, test.data.label, mod.type){
  # tabulate confusion matrix as a dataframe
  tab <- data.frame(confusionMatrix(mod.predict, test.data.label)$table)
  # clothing categories
  label_cats = c('Top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Boot')
  # group by Reference class to calculate per class proportions for each predicted value
  plot.tab <- tab %>%
    group_by(Reference) %>%
    mutate(prop = Freq/sum(Freq))
  # make a heatmap of the confusion matrix
  ggplot(data = plot.tab, mapping = aes(x = Reference, y = Prediction, fill = prop)) +
    geom_tile() +
    geom_text(aes(label = Freq), vjust = .5, fontface = "bold") +
    scale_fill_viridis_c(direction = -1, option = "plasma") +
    # reverse the x-axis and relabel it in a single scale call;
    # a separate xlim() would conflict with scale_x_discrete()
    scale_x_discrete(limits = rev(levels(tab$Reference)), labels = rev(label_cats)) +
    scale_y_discrete(labels = label_cats) +
    labs(x = "Actual",
         y = "Predicted",
         title = paste0(mod.type, " Confusion Matrix Heatmap"))
}
## ___________________________________________________
## 4. Function to compare overall accuracy and Kappa Score results for each model, returns a bar graph
get.overall.acc <- function(model.prediction.df, test.set.labels){
  df <- data.frame()
  names <- c()
  for(i in 1:ncol(model.prediction.df)){
    names[i] <- colnames(model.prediction.df)[i]
    # one row of overall metrics (Accuracy, Kappa, ...) per model
    df <- rbind(df, confusionMatrix(model.prediction.df[,i], test.set.labels)$overall %>%
                  data.frame() %>%
                  t())
  }
  df %>%
    dplyr::select(Accuracy, Kappa) %>%
    mutate(model = names) %>%
    pivot_longer(-model) %>%
    ggplot(aes(x = model, y = value)) +
    geom_col(col = "#17212d", fill = "#17212d") +
    geom_text(aes(label = round(value, 3)), size = 4, hjust = 0.5, vjust = 3, col = "white") +
    facet_wrap(~name)
}
###_____________________________________________________________
## 5. Plots by-class accuracy results for all models
get.byclass.acc <- function(model.prediction.df, test.set.labels){
  df <- data.frame()
  # clothing categories
  label_cats = c('Top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Boot')
  for(i in 1:ncol(model.prediction.df)){
    model <- colnames(model.prediction.df)[i]
    # per-class Sensitivity and Balanced Accuracy for model i
    df <- rbind(confusionMatrix(model.prediction.df[,i], test.set.labels)$byClass %>%
                  data.frame() %>%
                  dplyr::select(Sensitivity, Accuracy = Balanced.Accuracy) %>%
                  rownames_to_column(var = "Class") %>%
                  pivot_longer(-Class) %>%
                  mutate(model = model),
                df)
  }
  df %>%
    ggplot(aes(y = value, x = Class, fill = model)) +
    geom_col(position = "dodge", width = 0.25) +
    scale_fill_viridis_d(option = "plasma") +
    scale_x_discrete(labels = label_cats) +
    labs(x = "", y = "", title = "By-Class Accuracy Metrics") +
    facet_wrap(~name, nrow = 2)
}
Principal Component Analysis (PCA) was used to reduce the number of dimensions. The dataset was reduced from 784 pixel predictors to 10 principal components, which together explain 99% of the variance in the dataset. The reduced dataset requires fewer computing resources while retaining the majority of the information. PCA reduces the dimensionality of a set of correlated variables by transforming them into a smaller number of linearly uncorrelated variables; each principal component is a linear combination of the original variables.
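The reduction step looks roughly like the sketch below, using caret's preProcess; the object names train.pixels and test.pixels are placeholders, since the exact preprocessing script is not reproduced in this report.
## Sketch only: PCA reduction via caret, assuming `train.pixels` and
## `test.pixels` hold the raw pixel columns (placeholders, not the actual script)
pca.pre   <- preProcess(train.pixels, method = "pca", pcaComp = 10)  # keep 10 components
train.pca <- predict(pca.pre, train.pixels)  # PC scores, one PC1..PC10 row per image
test.pca  <- predict(pca.pre, test.pixels)   # the same rotation must be applied to the test set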
Three models were trained, and the quality of their predictions was evaluated. For each model, hyperparameters were tuned using 10-fold cross-validation on the training set to find the combination that maximized Accuracy. Because of insufficient computing power for the full Fashion MNIST training set, only the first 10,000 observations were used.
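The cross-validation setup shared by the three models would look roughly like this (a sketch of the common caret pattern, not the exact training script; the fitted models are loaded from .rds files below):
## Sketch of the shared tuning setup; the fitted models are loaded from
## .rds files below rather than retrained here
ctrl <- trainControl(method = "cv", number = 10)  # 10-fold cross-validation
## generic pattern for each model (method and tuneGrid vary):
# fit <- train(label ~ ., data = train.pca, method = "...",
#              metric = "Accuracy", trControl = ctrl, tuneGrid = ...)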
Gradient boosting is an ensemble machine learning technique with applications in both regression and classification, typically built on decision trees. It is premised on building a sequence of weak models, where at each iteration a new tree is fit to the errors of the previous ones. For this application, a stochastic gradient boosting (SGB) model was used. In SGB, a random subsample of the training observations (drawn without replacement) is used to fit each new tree, which reduces the correlation between successive trees.
Four hyperparameters were tuned for this model; the candidate values for each were supplied as a grid to the cross-validation procedure.
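In caret's gbm implementation those four parameters are n.trees (boosting iterations), interaction.depth (maximum tree depth), shrinkage (learning rate), and n.minobsinnode (minimum observations in a terminal node). A grid of the form below is what train() expects; these particular values are illustrative, not the ones used for the saved model:
## Illustrative tuning grid for method = "gbm" (values are placeholders)
gbm.grid <- expand.grid(n.trees = c(100, 300, 500),
                        interaction.depth = c(1, 3, 5),
                        shrinkage = c(0.01, 0.1),
                        n.minobsinnode = 10)
# gbm.mod <- train(label ~ ., data = train.pca, method = "gbm",
#                  metric = "Accuracy", trControl = ctrl,
#                  tuneGrid = gbm.grid, verbose = FALSE)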
# from github repo
gbm.mod <- readRDS("gbm.rds")
Random forest is an ensemble decision tree method in which a series of decision trees, grown independently of one another on bootstrapped samples, each consider a random subset of the predictor space. The results of the bootstrapped trees are then aggregated into a single response. The rf implementation of the random forest algorithm used here has only one tunable hyperparameter, mtry, the number of predictors randomly sampled as candidates at each split. The final value used for the model was mtry = 3.
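A sketch of the corresponding tuning call; the mtry candidates shown are illustrative:
## Illustrative mtry grid for method = "rf" (values are placeholders)
rf.grid <- expand.grid(mtry = c(2, 3, 5, 7))
# rf.mod <- train(label ~ ., data = train.pca, method = "rf",
#                 metric = "Accuracy", trControl = ctrl, tuneGrid = rf.grid)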
rf.mod <- readRDS("rf.rds")
A model-averaged neural network (avNNet), which averages several single-hidden-layer networks fit from different random seeds, was trained. The hyperparameters size (number of hidden units) and decay (weight decay regularization) were tuned using cross-validation. The final values used for the model were size = 12 and decay = 0.01.
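A sketch of the corresponding call; grid values are illustrative, and bag is the third avNNet parameter, controlling whether each network is also fit to a bootstrap sample:
## Illustrative grid for method = "avNNet" (values are placeholders)
nn.grid <- expand.grid(size = c(5, 8, 12), decay = c(0.001, 0.01, 0.1), bag = FALSE)
# nn.mod <- train(label ~ ., data = train.pca, method = "avNNet",
#                 metric = "Accuracy", trControl = ctrl,
#                 tuneGrid = nn.grid, trace = FALSE)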
nn.mod <- readRDS("nn.mod.rds")
## these are the test set data I used, based on the 10 principal components,
## along with each of the three models
download.file("https://raw.githubusercontent.com/robertwelk/DATA622/main/test.data.pca.rds","test.data.pca.rds", method="curl")
test.data <- readRDS("test.data.pca.rds")
download.file("https://raw.githubusercontent.com/robertwelk/DATA622/main/rf.rds","rf.rds", method="curl")
rf.mod <- readRDS("rf.rds")
download.file("https://raw.githubusercontent.com/robertwelk/DATA622/main/gbm.rds","gbm.rds", method="curl")
gbm.mod <- readRDS("gbm.rds")
######## input all model object to this list ###########
# The functions above use this list to generate result summary visuals
mod.list <- list(random.forest=rf.mod,
gradient.boosted=gbm.mod,
neural.net=nn.mod
)
##############################################
## here, a dataframe that stores predictions for all models is made
## Please note: this function must be re-run whenever a test set with a different number (or different names) of predictors is used, so separate mod.lists may be needed for different test sets
model.pred.df <- get.prediction.df(mod.list, test.data)
## example of merging together different test set data:
## model.pred.df <- model.pred.df %>% bind_cols(get.prediction.df(mod.list2, test.data2) )
A high-level overview of the models based on the metrics Accuracy and Cohen's Kappa, without considering the individual clothing classes.
Accuracy describes how often the classifier is correct and is defined as the number of correct predictions divided by the total number of observations. It was determined to be an appropriate metric for overall model performance since the test set classes are balanced.
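In the multiclass setting this is simply the fraction of test observations whose predicted class matches the true label:

$$\text{Accuracy} = \frac{1}{N}\sum_{i=1}^{N}\mathbf{1}\left(\hat{y}_i = y_i\right)$$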
Cohen's Kappa measures how well the model predictions match the labeled test set while controlling for the accuracy that a random classifier would achieve. As a standalone metric, kappa can be difficult to interpret, but it is useful for between-model comparisons.
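Concretely, with $p_o$ the observed agreement (the accuracy above) and $p_e$ the agreement expected by chance given the marginal class frequencies:

$$\kappa = \frac{p_o - p_e}{1 - p_e}$$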
get.overall.acc(model.pred.df, test.data$label)
Next, model results are evaluated by class category. The following metrics were calculated for each class:

- Accuracy (here, the per-class balanced accuracy)
- Sensitivity, the true positive rate

The resulting graph allows a comparison of model performance for each article of clothing.
get.byclass.acc(model.pred.df, test.data$label)
The confusion matrix heatmaps below show predictive power by class for each model.
plot.confusion.matrix(model.pred.df$random.forest, test.data$label, "Random Forest")
plot.confusion.matrix(model.pred.df$gradient.boosted, test.data$label, "Gradient Boosted")
plot.confusion.matrix(model.pred.df$neural.net, test.data$label, "Neural Net")