Kaggle : Digital Recognizer Submitted By: Amruta Kulkarni, Manjari Gautam, Tilottama Bandopadhay

Pixel Distinction

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(png)

# This script calculates the frequency each pixel is turned on for digits of each label.
# It then Lists pixels that are used at least 5x more frequently in for one label than they are for all others
# For each digit, it plots the pixels that are used especially frequently in writing that digit
# (i.e. darkness in image 1 represents: (pixel usage in writing the number 1) / (pixel usage in all written numbers)

# Read competition data files:
setwd("C:/Users/Manjari/Desktop/Machine learning/Final Report Manjari")
train<- data.frame(read.csv("train.csv",header=T))
binarizePixels = function(rawDf) 
  {
  output = rawDf
  output[,2:ncol(output)] = ifelse(output[,2:ncol(output)]>0,1,0)
  return(output)
}

magnify <- function(im, r)
  {
  result <- matrix(nrow=nrow(im)*r, ncol=ncol(im)*r)
  for(i in 1:nrow(im))
    {
    for(j in 1:ncol(im))
      {
      for(k in 1:r)
        {
        for(l in 1:r)
          {
          result[(i - 1) * r + (k - 1) + 1, (j - 1) * r + (l - 1) + 1] <- im[[i, j]]
        }
      }
    }
  }
  return(result)
}

binarizedTrain = binarizePixels(train)
fracPixelsOn = colMeans(binarizedTrain)
fracPixelsOnByLabel = binarizedTrain %>% group_by %>% summarise_each(funs(mean))
pixelOverrepresentationRatio = sweep(fracPixelsOnByLabel, 2, fracPixelsOn, '/')
pixelOverrepresentationRatio$label = fracPixelsOnByLabel$label  
pixelOverrepresentationRatio[is.na(pixelOverrepresentationRatio)] = 0 #replace na vals from dividing by 0 on unused pixels


## 
## 
## Pixels that are used at least 5x more frequently when the handwritten digit is 0 :
## pixel59 pixel224 pixel331 pixel332 pixel359 pixel360 pixel387 pixel388 pixel415 pixel449 pixel615 pixel643
## 
## Pixels that are used at least 5x more frequently when the handwritten digit is 1 :
## pixel364 pixel588 pixel616
## 
## Pixels that are used at least 5x more frequently when the handwritten digit is 2 :
## pixel12 pixel13 pixel14 pixel15 pixel86 pixel90 pixel91 pixel92 pixel114 pixel390 pixel391 pixel418 pixel419 pixel446 pixel447 pixel474 pixel475 pixel501 pixel502 pixel503 pixel528 pixel529 pixel530 pixel531 pixel556 pixel557 pixel558 pixel559 pixel583 pixel584 pixel585 pixel586 pixel611 pixel612 pixel613 pixel614 pixel639 pixel640 pixel641 pixel642 pixel668 pixel669 pixel698 pixel725 pixel726
## 
## Pixels that are used at least 5x more frequently when the handwritten digit is 3 :
## pixel113 pixel143 pixel144 pixel169 pixel171 pixel172 pixel450 pixel478 pixel505 pixel506 pixel533 pixel534 pixel535 pixel561 pixel562 pixel563 pixel589 pixel590 pixel591 pixel618 pixel619 pixel646 pixel647 pixel674 pixel675 pixel676 pixel702
## 
## Pixels that are used at least 5x more frequently when the handwritten digit is 4 :
## 
## 
## Pixels that are used at least 5x more frequently when the handwritten digit is 5 :
## pixel136 pixel137 pixel138 pixel164 pixel165 pixel166 pixel167 pixel192 pixel193 pixel194 pixel195 pixel220 pixel221 pixel222 pixel223 pixel248 pixel249 pixel250 pixel251 pixel277 pixel278 pixel279 pixel305 pixel306 pixel587
## 
## Pixels that are used at least 5x more frequently when the handwritten digit is 6 :
## pixel12 pixel13 pixel32 pixel33 pixel34 pixel35 pixel36 pixel37 pixel38 pixel39 pixel40 pixel41 pixel42 pixel43 pixel44 pixel45 pixel46 pixel47 pixel48 pixel49 pixel50 pixel51 pixel58 pixel60 pixel61 pixel62 pixel63 pixel64 pixel65 pixel66 pixel67 pixel68 pixel69 pixel70 pixel71 pixel72 pixel73 pixel74 pixel75 pixel76 pixel77 pixel78 pixel79 pixel80 pixel81 pixel86 pixel88 pixel98 pixel99 pixel100 pixel101 pixel102 pixel103 pixel104 pixel105 pixel106 pixel107 pixel108 pixel109 pixel110
## 
## Pixels that are used at least 5x more frequently when the handwritten digit is 7 :
## pixel197 pixel198 pixel225 pixel226 pixel227 pixel252 pixel253 pixel254 pixel255 pixel256 pixel280 pixel281 pixel282 pixel283 pixel284 pixel308 pixel309 pixel310 pixel311 pixel312 pixel336 pixel337 pixel338 pixel339 pixel365 pixel366 pixel367 pixel393 pixel394 pixel705 pixel706 pixel707 pixel733 pixel734 pixel735 pixel736 pixel737 pixel738 pixel739 pixel740 pixel741 pixel742 pixel743 pixel744 pixel753 pixel761 pixel762 pixel763 pixel764 pixel765 pixel766 pixel767 pixel768 pixel769 pixel770 pixel771 pixel772 pixel773 pixel774 pixel775 pixel776 pixel777 pixel778 pixel779
## 
## Pixels that are used at least 5x more frequently when the handwritten digit is 8 :
## pixel307 pixel335
## 
## Pixels that are used at least 5x more frequently when the handwritten digit is 9 :
## pixel504 pixel718 pixel719 pixel720 pixel721 pixel722 pixel723 pixel724 pixel725 pixel746 pixel747 pixel748 pixel749 pixel750 pixel751 pixel752

KNN Model

setwd("C:/Users/Manjari/Desktop/Machine learning/Final Report Manjari")
# Read all data
train<- data.frame(read.csv("train.csv",header=T))
# load test datasets
test <- data.frame(read.csv("test.csv", header=T))
# Its dimesions?
dim(train)
## [1] 42000   785
# Just the second row and only pixels not label
pixels<-train[2,2:785]
dim(pixels)
## [1]   1 784
# What is the label
train[2,1]
## [1] 0
# Save in a file but while saving round up [0,255] as [0,1]
write.table(matrix(round(pixels/255),28,28), file = "second.txt", sep = " ", row.names = FALSE, col.names = FALSE)
# Append also to this file label value

write.table(paste("label",train[2,1]), file = "second.txt", sep = " ", row.names = FALSE, col.names = FALSE,append=T)
# Read 2nd row and ignore label column
data<-train[2,2:785]
data<-as.matrix(data,nrow=28,ncol=28)
dim(data)
## [1]   1 784
data<-matrix(data,nrow=28,ncol=28)
dim(data)
## [1] 28 28
##Color ramp def.
colors<-c('white','black')
# Create a function to generate a continuum of colors
#  of desired number of colours from white to black
ramp_pal<-colorRampPalette(colors=colors)
# Draw an image of data over a grid of x(1:28), y(1:28)
image(1:28,1:28,data,main="IInd row. Label=0",col=ramp_pal(256))

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2

library(kknn)
## 
## Attaching package: 'kknn'
## 
## The following object is masked from 'package:caret':
## 
##     contr.dummy
# remove near-zero variance features
badCols <- nearZeroVar(train[, -1])

train <- train[, -(badCols+1)]
 
# optimize knn for k=1:15
# and kernel=triangular, rectangular, or gaussian
model <- train.kknn(as.factor(label) ~ ., train, kmax=15, kernel= c("triangular","rectangular","gaussian"))
 
# print out best parameters and prediction error
print(paste("Best parameters:", "kernel =", model$best.parameters$kernel, ", k =", model$best.parameters$k))
## [1] "Best parameters: kernel = triangular , k = 6"
print(model$MISCLASS)
##    triangular rectangular   gaussian
## 1  0.03216667  0.03216667 0.03216667
## 2  0.03216667  0.03885714 0.03216667
## 3  0.03026190  0.03197619 0.03042857
## 4  0.02923810  0.03238095 0.02909524
## 5  0.02842857  0.03247619 0.03088095
## 6  0.02764286  0.03297619 0.03045238
## 7  0.02769048  0.03369048 0.03200000
## 8  0.02769048  0.03445238 0.03192857
## 9  0.02773810  0.03485714 0.03342857
## 10 0.02795238  0.03535714 0.03359524
## 11 0.02807143  0.03571429 0.03409524
## 12 0.02828571  0.03600000 0.03428571
## 13 0.02861905  0.03711905 0.03471429
## 14 0.02897619  0.03711905 0.03519048
## 15 0.02930952  0.03785714 0.03557143
# train the optimal kknn model
model <- kknn(as.factor(label) ~ ., train, test, k=9, kernel="triangular")
results <- model$fitted.values
 
# save the class predictions in a column vector

write(as.numeric(levels(results))[results], file="kknn_submission.csv", ncolumns=1)

** Linear Regression Using Standard Deviation**

setwd("C:/Users/Manjari/Desktop/Machine learning/Final Report Manjari")
library(ggplot2)
library(proto)
data<- data.frame(read.csv("train.csv",header=T))
labels   <- data[,1]
features <- data[,-1]

means <- aggregate(features, list(labels), mean)
means[is.na(means)] <- 0.0

stds <- aggregate(features, list(labels), sd)
stds[is.na(stds)] <- 0.0

rowsToPlot <- 1:10

rowToMatrix <- function(row)
  {
  intensity <- as.numeric(row)/max(as.numeric(row))
  return(t(matrix((rgb(intensity, intensity, intensity)), 28, 28)))
}

geom_digit <- function (digits, labels) GeomRasterDigit$new(geom_params = list(digits=digits), stat = "identity", position = "identity", data = NULL, inherit.aes = TRUE)

GeomRasterDigit <- proto(ggplot2:::GeomRaster, expr={
  draw_groups <- function(., data, scales, coordinates, digits, ...) 
    {
    bounds <- coord_transform(coordinates, data.frame(x = c(-Inf, Inf), y = c(-Inf, Inf)), scales)
    x_rng <- range(bounds$x, na.rm = TRUE)
    y_rng <- range(bounds$y, na.rm = TRUE)
    rasterGrob(as.raster(rowToMatrix(digits[data$rows,])), x_rng[1], y_rng[1], diff(x_rng), diff(y_rng), 
               default.units = "native", just = c("left","bottom"), interpolate = FALSE)
  }
})

blank <- theme(strip.background = element_blank(),strip.text.x = element_blank(),axis.text.x = element_blank(),axis.text.y = element_blank(),axis.ticks = element_blank(),axis.line = element_blank())

p <- ggplot(data.frame(rows=rowsToPlot, labels=means[,1]), aes(x=.1, y=.9, rows=rows, label=labels)) + geom_blank() + xlim(0,1) + ylim(0,1) + xlab("") + ylab("") + facet_wrap(~ rows, ncol=5) +geom_digit(means[,-1]) +geom_text(colour="#53cfff") +blank + ggtitle("Pixel Means")

p <- ggplot(data.frame(rows=rowsToPlot, labels=stds[,1]), aes(x=.1, y=.9, rows=rows, label=labels)) + geom_blank() + xlim(0,1) + ylim(0,1) + xlab("") + ylab("") + facet_wrap(~ rows, ncol=5) +geom_digit(stds[,-1]) +geom_text(colour="#53cfff") +blank +ggtitle("Pixel Standard Deviations")
plot(p)

setwd("C:/Users/Manjari/Desktop/Machine learning/Final Report Manjari")
train <- data.frame(read.csv("train.csv",header=T))
train<-as.matrix(train)

##Color ramp def.
colors<-c('white','black')
cus_col<-colorRampPalette(colors=colors)

## Plot the average image of each digit
par(mfrow=c(4,3),pty='s',mar=c(1,1,1,1),xaxt='n',yaxt='n')
all_img<-array(dim=c(10,28*28))
for(di in 0:9)
{
  print(di)
  all_img[di+1,]<-apply(train[train[,1]==di,-1],2,sum)
  all_img[di+1,]<-all_img[di+1,]/max(all_img[di+1,])*255
  
  z<-array(all_img[di+1,],dim=c(28,28))
  z<-z[,28:1] ##right side up
  image(1:28,1:28,z,main=di,col=cus_col(256))
}
## [1] 0
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
pdf('train_letters.pdf')
par(mfrow=c(4,4),pty='s',mar=c(3,3,3,3),xaxt='n',yaxt='n')
for(i in 1:train[200])
{
  z<-array(train[i,-1],dim=c(28,28))
  z<-z[,28:1] ##right side up
  image(1:28,1:28,z,main=train[i,1],col=cus_col(256))
  print(i)
}
## [1] 1
## [1] 2
dev.off()
## png 
##   2

Random Forest Benchmark

setwd("C:/Users/Manjari/Desktop/Machine learning/Final Report Manjari")
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
set.seed(0)
test <- read.csv("test.csv")
train <- data.frame(read.csv("train.csv",header=T))
numTrain <- 10000
numTrees <- 25

rows <- sample(1:nrow(train), numTrain)
labels <- as.factor(train[rows,1])
train <- train[rows,-1]

rf <- randomForest(train, labels, xtest=test, ntree=numTrees)
predictions <- data.frame(ImageId=1:nrow(test), Label=levels(labels)[rf$test$predicted])
head(predictions)
##   ImageId Label
## 1       1     2
## 2       2     0
## 3       3     9
## 4       4     4
## 5       5     2
## 6       6     7

Using RandomForest proximity to visualize digits data set This script fits a random forest model, and uses “proximity” to visualize the results. Proximity between two examples is based on how often they are in a common leaf node. The examples are then embedded in R^2 using multidimensional scaling so as to make the Euclidean distances match the proximities from the RF.

setwd("C:/Users/Manjari/Desktop/Machine learning/Final Report Manjari")
set.seed(0)

library(randomForest)
library(ggplot2)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
train <- data.frame(read.csv("train.csv",header=T))
test <- data.frame(read.csv("test.csv",header=T))

numTrees <- 50
numRowsForModel <- 10000
numRowsForMDS <- 1000
numRowsToDrawAsImages <- 200

# Use only a subset of train to save time
smallTrain = train[sample(1:nrow(train), size = numRowsForModel),]
labels <- as.factor(smallTrain[[1]])
smallTrain = smallTrain[,-1]

# Make my own train/test split
inMyTrain = sample(c(TRUE,FALSE), size = numRowsForModel, replace = TRUE)
myTrain = smallTrain[inMyTrain,]
myTest = smallTrain[!inMyTrain,]
labelsMyTrain = labels[inMyTrain]
labelsMyTest = labels[!inMyTrain]

# Random forest (generates proximities)
rf <- randomForest(myTrain, labelsMyTrain, ntree = numTrees, xtest = myTest, proximity = TRUE)
predictions <- levels(labels)[rf$test$predicted]
predictionIsCorrect = labelsMyTest == predictions

cat(sprintf("Proportion correct in my test set: %f\n", mean(predictionIsCorrect)))
## Proportion correct in my test set: 0.932869

Do MDS on subset (to save time) of proximities Get the portion of the proximity matrix just for my holdout set:

setwd("C:/Users/Manjari/Desktop/Machine learning/Final Report Manjari")

prox = rf$test$proximity[,1:nrow(myTest)]

proxSmall = prox[1:numRowsForMDS,1:numRowsForMDS]

cat("Beginnging MDS (embedding data in R^2, respecting the RF proximities as much as possible:\n")
## Beginnging MDS (embedding data in R^2, respecting the RF proximities as much as possible:
embeddingSmall = isoMDS(1 - proxSmall, k = 2)
## initial  value 75.607519 
## iter   5 value 38.558179
## iter  10 value 37.029800
## iter  15 value 36.470217
## final  value 36.349417 
## converged
embeddedSubsetPredictions <- predictions[1:numRowsForMDS]

embeddedSubsetLabels <- labelsMyTest[1:numRowsForMDS]

embeddedSubsetPredictionIsCorrect <- predictionIsCorrect[1:numRowsForMDS]

makeAnnotationRaster <- function(rowNum, size, posDF,  imageDF, correct) 
  {
  row = as.numeric(imageDF[rowNum,])
  t = row/max(row)
  z = t*correct[rowNum]
  rowRGB = rgb(t,z,z)
  rowMatrix = matrix((rowRGB), 28, 28)
  pos = c(posDF[rowNum,] - size/2, posDF[rowNum,] + size/2)
  return(annotation_raster(t(rowMatrix), pos[1], pos[3], pos[2], pos[4]))
}


rowsForPlottingAsImages = sample(1:numRowsForMDS, numRowsToDrawAsImages)
ARs = Map(function(rows) makeAnnotationRaster(rows, .04, embeddingSmall$points, myTest, (predictions == labelsMyTest)),rowsForPlottingAsImages)

p <- ggplot(data.frame(embeddingSmall$points), aes(x=X1, y=X2)) + geom_point(aes(colour=embeddedSubsetLabels, shape=embeddedSubsetPredictionIsCorrect, size=embeddedSubsetPredictionIsCorrect)) +scale_shape_manual(values = c(17,16)) +scale_size_manual(values = c(3,2)) +labs(color = "True Label", size = "Classified Correctly by RF", shape = "Classified Correctly by RF")  

png(filename = "DigitsEmbedding.png", width = 960, height = 960)

Reduce("+", ARs, init = p)

invisible(dev.off())

plot(p)