Today’s Challenge

Today’s challenge is to create an algorithm that identifies handwritten digits 0-9. Boosted trees aren’t the only (or even the best) algorithm for this kind of thing (and it’s a task at which humans can still outperform algorithms!), but let’s see what they can do.

The Packages and The Data

You’ll need the following packages, and you can load the data either on AWS or locally on your CRC computer.

library(rlang)
library(readr)
library(ggplot2)
library(caret)
library(Matrix)
library(xgboost)
library(grid)


# on AWS:
TRAIN <- read.csv("/home/rstudio/data/digits/train.csv")
TEST <- read.csv("/home/rstudio/data/digits/test.csv")


# locally
TRAIN <- read.csv("Data_Science_Data/digits/train.csv")
TEST <- read.csv("Data_Science_Data/digits/test.csv")

# separating features (used for prediction) and "labels" (what we're trying to predict)
labels   <- TRAIN[,1]
features <- TRAIN[,-1]

# converting the training and test sets to matrices (xgboost expects matrices, not data frames)
TRAIN <- as.matrix(TRAIN)
TEST <- as.matrix(TEST)
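
A quick sanity check on the converted objects can't hurt. The column counts below assume the standard Kaggle digit-recognizer files: a label column plus 784 pixel columns in train.csv, and just the 784 pixel columns in test.csv.

# the training matrix should have 785 columns (label + pixels), the test matrix 784
dim(TRAIN)
dim(TEST)

# how many of each digit do we have?
table(labels)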

A Quick Look at the Data

Each row in the training set records the digit’s label as well as the shading of each of its 784 pixels (numbered 0 to 783).

#take a look at the first row
TRAIN[1,]
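
A couple of follow-up checks make that wall of numbers easier to interpret. This assumes the standard Kaggle digit-recognizer format, where pixel intensities run from 0 (blank) to 255 (fully shaded):

# the first entry of the row is the label; the rest are pixel intensities
TRAIN[1, 1]
range(TRAIN[1, -1])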

We can get a better idea of what the data look like by writing a function that reshapes this string of numbers into a 28x28 matrix and converts each intensity into a color. If you want to see what these colors look like, google “html color picker” and paste them in.

rowToMatrix <- function(row) {
  # scale the pixel values to [0, 1], map them to grayscale hex colors,
  # and reshape into a 28x28 matrix (transposed so the image reads correctly)
  intensity <- as.numeric(row) / max(as.numeric(row))
  return(t(matrix(rgb(intensity, intensity, intensity), 28, 28)))
}

# the first three digits:
rowToMatrix(TRAIN[1,-1])
rowToMatrix(TRAIN[2,-1])
rowToMatrix(TRAIN[3,-1])
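
Printing those matrices just gives you a grid of hex color codes. Since the grid package is already loaded, you can also render one of them as an actual image; here’s a minimal sketch for the first digit (the nicer faceted plot comes next):

# draw the first training digit as a 28x28 raster image
grid.newpage()
grid.raster(rowToMatrix(TRAIN[1, -1]), interpolate = FALSE)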

A Longer Look at the Data

Here are the first nine digits in the training set with their labels.

If you want to create an image like this, you can define the following functions and run the following code (which I won’t attempt to explain):

rowsToPlot <- 1:9

geom_digit <- function(digits) 
{
  layer(geom = GeomRasterDigit, stat = "identity", position = "identity", data = NULL, 
        params = list(digits=digits))  
}

GeomRasterDigit <- ggproto("GeomRasterDigit", 
                           ggplot2::GeomRaster, 
                           draw_panel = function(data, panel_scales, coordinates, digits = digits) {
                             if (!inherits(coordinates, "CoordCartesian")) {
                               stop("geom_digit only works with Cartesian coordinates",
                                    call. = FALSE)
                             }
                             corners <- data.frame(x = c(-Inf, Inf), y = c(-Inf, Inf))
                             bounds <- coordinates$transform(corners, panel_scales)
                             x_rng <- range(bounds$x, na.rm = TRUE)
                             y_rng <- range(bounds$y, na.rm = TRUE)
                             rasterGrob(as.raster(rowToMatrix(digits[data$rows,])), 
                                        x = mean(x_rng), y = mean(y_rng), 
                                        default.units = "native", just = c("center","center"), 
                                        interpolate = FALSE)
                           }) 

p <- ggplot(data.frame(rows=rowsToPlot, labels=labels[rowsToPlot]), 
            aes(x=0.1, y=.9, rows=rows, label=labels)) + 
  geom_blank() + xlim(0,1) + ylim(0,1) + xlab("") + ylab("") + 
  facet_wrap(~ rows, ncol=3) +
  geom_digit(features) +
  geom_text(colour="#53cfff") +
  theme(panel.background = element_rect(fill = 'black'),
        panel.border = element_rect(fill = NA, colour = "#cfff53"),
        panel.grid = element_blank(),
        strip.background = element_blank(),
        strip.text.x = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank()) +
  ggtitle("Example Handwritten Digits")

plot(p)

Fitting a Model

# Remember that you can read about xgboost parameters by typing
?xgboost

PARAM <- list(
  # Booster Parameters
  eta                = 0.10,              # default = 0.30
  max_depth          = 5,                 # default = 6
  subsample          = 0.70,              # default = 1
  colsample_bytree   = 0.95,              # default = 1
  
  # Task Parameters
  objective          = "multi:softmax",   # default = "reg:linear"
  num_class          = 10,                # default = 0
  base_score         = 0.5,               # default
  eval_metric        = "merror"           # default = "rmse"
)
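
As an aside, "multi:softmax" makes predict() return a single predicted digit per image. If you’d rather get a probability for each of the ten digits, you could swap in the "multi:softprob" objective instead; a minimal sketch, not used in the rest of this post:

# hypothetical alternative parameter list that predicts class probabilities
PARAM_PROB <- modifyList(PARAM, list(objective = "multi:softprob"))
# with this objective, predict() returns 10 numbers per image, which you
# would reshape with matrix(..., ncol = 10, byrow = TRUE)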


fit_xgboost <- xgboost(params = PARAM, data = TRAIN[, -1], label = TRAIN[, 1], nrounds = 15)
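
Fifteen rounds is just a starting point. If you want a more data-driven choice of nrounds, the xgboost package also provides xgb.cv for cross-validation; here’s a minimal sketch (nfold = 5 and nrounds = 30 are arbitrary choices, and this will take noticeably longer than the single fit above):

# 5-fold cross-validation: watch how the multiclass error rate changes per round
CV <- xgb.cv(params = PARAM, data = TRAIN[, -1], label = TRAIN[, 1],
             nrounds = 30, nfold = 5)

# per-round training and test error estimates (recent versions of the package
# store these in the evaluation_log element)
CV$evaluation_log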

Making Predictions

PRED <- predict(fit_xgboost, TEST)

# how many times did we predict each digit?
table(PRED)

# create submission file for Kaggle
SUBMIT <- data.frame(ImageId = c(1:length(PRED)), Label = PRED)
write_csv(SUBMIT, "submission.csv")
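
The Kaggle test set has no labels, so you won’t know how well you did until you submit. For a rough sense of accuracy beforehand, you can predict on the training set and use caret’s confusionMatrix; a minimal sketch (this in-sample check is optimistic, since the model has already seen these rows):

# compare in-sample predictions to the true training labels
PRED_TRAIN <- predict(fit_xgboost, TRAIN[, -1])
confusionMatrix(factor(PRED_TRAIN, levels = 0:9),
                factor(TRAIN[, 1], levels = 0:9))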