**Linear Regression Using Standard Deviation**

setwd("C:/Users/Manjari/Desktop/Machine learning/Final Report Manjari")
library(ggplot2)
library(proto)
library(grid)  # rasterGrob() used below comes from grid

data <- read.csv("train.csv", header = TRUE)
labels   <- data[, 1]   # first column is the digit label (0-9)
features <- data[, -1]  # remaining 784 columns are pixel intensities

# Per-digit pixel means and standard deviations (one row per label 0-9)
means <- aggregate(features, list(labels), mean)
means[is.na(means)] <- 0.0

stds <- aggregate(features, list(labels), sd)
stds[is.na(stds)] <- 0.0
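# Illustrative sanity check (not in the original script): aggregate() returns
# one row per digit with the grouping column first, so both data frames
# should be 10 x 785.
dim(means)
dim(stds)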

rowsToPlot <- 1:10

# Convert one 784-pixel row into a 28x28 matrix of grayscale hex colors
rowToMatrix <- function(row) {
  intensity <- as.numeric(row) / max(as.numeric(row))
  return(t(matrix(rgb(intensity, intensity, intensity), 28, 28)))
}
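
# Quick illustration (not in the original script): the first training digit
# as a 28x28 matrix of gray hex values, usable with grid::grid.raster().
m <- rowToMatrix(features[1, ])
dim(m)  # 28 28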

geom_digit <- function(digits, labels) {
  GeomRasterDigit$new(geom_params = list(digits = digits), stat = "identity",
                      position = "identity", data = NULL, inherit.aes = TRUE)
}

# Custom geom that draws one digit image per facet. This uses the proto-based
# geom extension mechanism, so a pre-2.0 ggplot2 (before the ggproto rewrite)
# is assumed here.
GeomRasterDigit <- proto(ggplot2:::GeomRaster, expr = {
  draw_groups <- function(., data, scales, coordinates, digits, ...) {
    # Map the full panel extent to native coordinates
    bounds <- coord_transform(coordinates,
                              data.frame(x = c(-Inf, Inf), y = c(-Inf, Inf)),
                              scales)
    x_rng <- range(bounds$x, na.rm = TRUE)
    y_rng <- range(bounds$y, na.rm = TRUE)
    # Render this facet's digit as a raster filling the panel
    rasterGrob(as.raster(rowToMatrix(digits[data$rows, ])),
               x_rng[1], y_rng[1], diff(x_rng), diff(y_rng),
               default.units = "native", just = c("left", "bottom"),
               interpolate = FALSE)
  }
})

blank <- theme(strip.background = element_blank(),
               strip.text.x     = element_blank(),
               axis.text.x      = element_blank(),
               axis.text.y      = element_blank(),
               axis.ticks       = element_blank(),
               axis.line        = element_blank())

p <- ggplot(data.frame(rows = rowsToPlot, labels = means[, 1]),
            aes(x = .1, y = .9, rows = rows, label = labels)) +
  geom_blank() + xlim(0, 1) + ylim(0, 1) + xlab("") + ylab("") +
  facet_wrap(~ rows, ncol = 5) +
  geom_digit(means[, -1]) +
  geom_text(colour = "#53cfff") +
  blank + ggtitle("Pixel Means")
plot(p)  # draw the means figure before p is reused below

p <- ggplot(data.frame(rows = rowsToPlot, labels = stds[, 1]),
            aes(x = .1, y = .9, rows = rows, label = labels)) +
  geom_blank() + xlim(0, 1) + ylim(0, 1) + xlab("") + ylab("") +
  facet_wrap(~ rows, ncol = 5) +
  geom_digit(stds[, -1]) +
  geom_text(colour = "#53cfff") +
  blank + ggtitle("Pixel Standard Deviations")
plot(p)

setwd("C:/Users/Manjari/Desktop/Machine learning/Final Report Manjari")
train <- as.matrix(read.csv("train.csv", header = TRUE))

## Color ramp definition: white (background) through black (ink)
colors <- c('white', 'black')
cus_col <- colorRampPalette(colors = colors)
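
## Quick illustration (not in the original script): cus_col(n) returns n hex
## colors interpolated from white to black.
cus_col(3)  # three colors: white, mid-gray, black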

## Plot the average image of each digit
par(mfrow = c(4, 3), pty = 's', mar = c(1, 1, 1, 1), xaxt = 'n', yaxt = 'n')
all_img <- array(dim = c(10, 28 * 28))
for (di in 0:9) {
  print(di)
  # Sum pixel values over all training rows for this digit,
  # then rescale the result to the 0-255 intensity range
  all_img[di + 1, ] <- apply(train[train[, 1] == di, -1], 2, sum)
  all_img[di + 1, ] <- all_img[di + 1, ] / max(all_img[di + 1, ]) * 255

  z <- array(all_img[di + 1, ], dim = c(28, 28))
  z <- z[, 28:1]  ## right side up
  image(1:28, 1:28, z, main = di, col = cus_col(256))
}
## [1] 0
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
pdf('train_letters.pdf')
par(mfrow = c(4, 4), pty = 's', mar = c(3, 3, 3, 3), xaxt = 'n', yaxt = 'n')
for (i in 1:200)  # plot the first 200 training digits, 16 per page
{
  z <- array(train[i, -1], dim = c(28, 28))
  z <- z[, 28:1]  ## right side up
  image(1:28, 1:28, z, main = train[i, 1], col = cus_col(256))
  print(i)
}
## [1] 1
## [1] 2
dev.off()
## png 
##   2

**Random Forest Benchmark**

setwd("C:/Users/Manjari/Desktop/Machine learning/Final Report Manjari")
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
set.seed(0)
test  <- read.csv("test.csv")
train <- read.csv("train.csv", header = TRUE)
numTrain <- 10000
numTrees <- 25

# Random subsample of training rows; split labels from pixel features
rows <- sample(1:nrow(train), numTrain)
labels <- as.factor(train[rows, 1])
train <- train[rows, -1]

# With xtest supplied, predictions for the test set are stored in rf$test
rf <- randomForest(train, labels, xtest = test, ntree = numTrees)
predictions <- data.frame(ImageId = 1:nrow(test),
                          Label = levels(labels)[rf$test$predicted])
head(predictions)
##   ImageId Label
## 1       1     2
## 2       2     0
## 3       3     9
## 4       4     4
## 5       5     2
## 6       6     7
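
To turn these predictions into a submission file, something like the following would work (the file name here is illustrative, not from the original report):

write.csv(predictions, "rf_benchmark_submission.csv", row.names = FALSE, quote = FALSE)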

**Using Random Forest Proximity to Visualize the Digits Data Set**

This script fits a random forest model and uses "proximity" to visualize the results. The proximity between two examples is the proportion of trees in which they fall in the same leaf node. The examples are then embedded in R^2 using multidimensional scaling, so that Euclidean distances in the embedding match the proximities from the RF as closely as possible.
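
As a minimal sketch of what proximity means (illustrative only, on a toy data set rather than the digits), terminal-node assignments can be extracted with predict(..., nodes = TRUE), and the proximity of two cases is then the fraction of trees in which they share a terminal node:

library(randomForest)
fit <- randomForest(Species ~ ., data = iris, ntree = 25)
nodes <- attr(predict(fit, iris, nodes = TRUE), "nodes")  # 150 x 25 node IDs
proxByHand <- matrix(0, nrow(nodes), nrow(nodes))
for (tr in seq_len(ncol(nodes))) {
  # add 1 for every pair of rows landing in the same leaf of tree `tr`
  proxByHand <- proxByHand + outer(nodes[, tr], nodes[, tr], "==")
}
proxByHand <- proxByHand / ncol(nodes)  # fraction of trees sharing a leaf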

setwd("C:/Users/Manjari/Desktop/Machine learning/Final Report Manjari")
set.seed(0)

library(randomForest)
library(ggplot2)
library(MASS)

train <- data.frame(read.csv("train.csv",header=T))
test <- data.frame(read.csv("test.csv",header=T))

numTrees <- 50
numRowsForModel <- 10000      # subsample of train used to fit the forest
numRowsForMDS <- 1000         # holdout rows whose proximities get embedded
numRowsToDrawAsImages <- 200  # embedded points drawn as digit images

# Use only a subset of train to save time
smallTrain <- train[sample(1:nrow(train), size = numRowsForModel), ]
labels <- as.factor(smallTrain[[1]])
smallTrain <- smallTrain[, -1]

# Make my own train/test split
inMyTrain <- sample(c(TRUE, FALSE), size = numRowsForModel, replace = TRUE)
myTrain <- smallTrain[inMyTrain, ]
myTest  <- smallTrain[!inMyTrain, ]
labelsMyTrain <- labels[inMyTrain]
labelsMyTest  <- labels[!inMyTrain]

# Random forest (generates proximities)
rf <- randomForest(myTrain, labelsMyTrain, ntree = numTrees, xtest = myTest,
                   proximity = TRUE)
predictions <- levels(labels)[rf$test$predicted]
predictionIsCorrect <- labelsMyTest == predictions

cat(sprintf("Proportion correct in my test set: %f\n", mean(predictionIsCorrect)))
## Proportion correct in my test set: 0.932869
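
A quick follow-up worth making here (illustrative, not in the original script) is a confusion matrix, which shows which digits the forest tends to mix up:

print(table(True = labelsMyTest, Predicted = predictions))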

**MDS on a Subset of the Proximities**

To save time, run MDS on only a subset of the proximities. First, get the portion of the proximity matrix corresponding to the holdout set:

setwd("C:/Users/Manjari/Desktop/Machine learning/Final Report Manjari")

# With xtest supplied, rf$test$proximity holds test-vs-(test + train)
# proximities; the first nrow(myTest) columns are the test-vs-test block.
prox <- rf$test$proximity[, 1:nrow(myTest)]
proxSmall <- prox[1:numRowsForMDS, 1:numRowsForMDS]

cat("Beginnging MDS (embedding data in R^2, respecting the RF proximities as much as possible:\n")
## Beginnging MDS (embedding data in R^2, respecting the RF proximities as much as possible:
embeddingSmall = isoMDS(1 - proxSmall, k = 2)
## initial  value 75.607519 
## iter   5 value 38.558179
## iter  10 value 37.029800
## iter  15 value 36.470217
## final  value 36.349417 
## converged
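# Cheaper alternative (illustrative, not part of the original pipeline):
# classical metric MDS. isoMDS itself uses cmdscale() for its default
# starting configuration, so this is the metric-only version of the above.
embeddingClassical <- cmdscale(1 - proxSmall, k = 2)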
embeddedSubsetPredictions <- predictions[1:numRowsForMDS]
embeddedSubsetLabels <- labelsMyTest[1:numRowsForMDS]
embeddedSubsetPredictionIsCorrect <- predictionIsCorrect[1:numRowsForMDS]

# Build an annotation_raster for one digit image, centered on its MDS
# coordinates: grayscale if the RF classified it correctly, red-tinted if not.
makeAnnotationRaster <- function(rowNum, size, posDF, imageDF, correct) {
  row <- as.numeric(imageDF[rowNum, ])
  intensity <- row / max(row)
  z <- intensity * correct[rowNum]  # zeroes green/blue channels when incorrect
  rowRGB <- rgb(intensity, z, z)
  rowMatrix <- matrix(rowRGB, 28, 28)
  pos <- c(posDF[rowNum, ] - size / 2, posDF[rowNum, ] + size / 2)
  return(annotation_raster(t(rowMatrix), pos[1], pos[3], pos[2], pos[4]))
}


rowsForPlottingAsImages <- sample(1:numRowsForMDS, numRowsToDrawAsImages)
ARs <- Map(function(r) makeAnnotationRaster(r, .04, embeddingSmall$points,
                                            myTest, predictions == labelsMyTest),
           rowsForPlottingAsImages)

p <- ggplot(data.frame(embeddingSmall$points), aes(x = X1, y = X2)) +
  geom_point(aes(colour = embeddedSubsetLabels,
                 shape = embeddedSubsetPredictionIsCorrect,
                 size = embeddedSubsetPredictionIsCorrect)) +
  scale_shape_manual(values = c(17, 16)) +
  scale_size_manual(values = c(3, 2)) +
  labs(color = "True Label",
       size = "Classified Correctly by RF",
       shape = "Classified Correctly by RF")

png(filename = "DigitsEmbedding.png", width = 960, height = 960)

# Layer every sampled digit image onto the scatterplot, then render the
# combined figure to the PNG device
print(Reduce("+", ARs, init = p))

invisible(dev.off())

plot(p)  # also show the plain scatterplot, without the digit images