rm(list=ls())
t1<- Sys.time()
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(gbm)
## Loaded gbm 2.1.8
library(nnet)
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
cores<- parallel::detectCores()
registerDoParallel(cores=cores)
library(keras)
library(tensorflow)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
train <- read.csv('./Digit_Data/train.csv',header=T)
plotdigit<- function(datarow, rm1=F){
# function will produce an image of the data given
# the function takes away the first value because it is the target
title<- datarow[1] # get actual value from training data
if(rm1){
datarow<- datarow[-1] # remove the actual value
}
datarow<- as.numeric(datarow) # convert to numeric
x<- rep(0:27)/27
y<- rep(0:27)/27
z<- matrix(datarow, ncol=28, byrow=T)
rotate <- function(x) t(apply(x, 2, rev))
z<- rotate(z)
image(x,y,z, col=gray.colors(255, start=1, end=0), asp=1,
xlim=c(0,1), ylim=c(-0.1,1.1), useRaster = T, axes=F, xlab='', ylab='')
}
plotdigit(train[10,],rm1=T)
### Using the training.csv file, plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.)
par(mfrow=c(2,5))
for(i in 1:10){
plotdigit(train[i,],rm1=T)
}
### Scaled Data
scaled_var <- train[,-1]/255
par(mfrow=c(2,5))
for(i in 1:10){
plotdigit(scaled_var[i,],rm1=T)
}
### Frequency
ggplot(train,aes(x=as.factor(label),fill=label))+
geom_bar(stat="count",color="white")+
geom_text(stat='count', aes(label=..count..), vjust=+3)+
scale_fill_gradient(low="blue",high="green")+
labs(title="Digits in Train Data",x="Digits")
mypca <-prcomp(scaled_var)
It took 154 components to break the 95% variance mark.
for(i in 1:ncol(train)){
pov <-sum((mypca$sdev[1:i]^2))/sum(mypca$sdev^2)
if(pov >= .95){
print(paste0(i,' ',pov))
break
}
}
## [1] "154 0.950433238263376"
It took 704 components to break the 100% variance mark. I would assume that the remaining components don’t have anything different in them. That is why there are only 704 components that account for all the variance.
for(i in 1:ncol(train)){
pov <-sum((mypca$sdev[1:i]^2))/sum(mypca$sdev^2)
if(pov == 1){
print(paste0(i,' ',pov))
break
}
}
## [1] "704 1"
I believe it is all noise because this is made up a mix of all the numbers.
f_ten <- mypca$x%*%t(mypca$rotation)+mypca$center
image(f_ten[,1:10])
I see a darker image similar to what I saw with the first ten of all the numbers.
eights <- train %>% filter(label == 8)
eights <- eights[,2:ncol(eights)]
eights_scaled <- eights/255
eights_pca <- prcomp(eights_scaled)
for(i in 1:ncol(eights)){
pov <-sum((eights_pca$sdev[1:i]^2))/sum(eights_pca$sdev^2)
if(pov == 1){
print(paste0(i,' ',pov))
break
}
}
## [1] "537 1"
image(eights_pca$x%*%t(eights_pca$rotation)+eights_pca$center)
nrows_train<- nrow(train)*0.75
set.seed(5465485)
rows_train<- sample(1:nrow(train),nrows_train)
data_train<- train[rows_train,]
data_test<- train[-rows_train,]
y<- as.numeric(data_train[,1])
X<- data_train[, -1]
var_cols<- apply(data_train, 2, 'var')
no_var_cols<- which(var_cols==0)
X<- data_train[ , -c(no_var_cols, 1)]
pca_svd<- prcomp(X, center=F, scale=F)
X_cov<- cov(X)
pca_cov<- prcomp(X_cov)
pca_cov_percent_var<- pca_cov$sdev^2/sum(pca_cov$sdev^2)
X_loading<- pca_cov$rotation #Loadings from the PCA
X_train<- as.matrix(data_train[, -c(no_var_cols,1)]/max(data_train[,-1]))
X_train<- X_train %*% X_loading[,1:50]
y_train<- data_train[,'label']
# do the same to the holdout set
X_test<- as.matrix(data_test[ ,-c(no_var_cols,1)]/max(data_train[,-1]))
X_test<- X_test %*% X_loading[,1:50]
y_test<- data_test[,'label']
df.train<- data.frame(y=as.factor(y_train), X=X_train/255)
df.test<- data.frame(y=as.factor(y_test), X=X_test/255)
model_nnet1<- nnet(y~., data=df.train,size=250, MaxNWts=100000, maxit=1000)
## # weights: 15260
## initial value 109470.589920
## iter 10 value 76820.843000
## iter 20 value 72249.492476
## iter 30 value 71614.058817
## iter 40 value 69602.847207
## iter 50 value 65121.562804
## iter 60 value 59113.468307
## iter 70 value 49148.323303
## iter 80 value 42999.115939
## iter 90 value 34416.032805
## iter 100 value 27023.360196
## iter 110 value 21543.070942
## iter 120 value 18309.366391
## iter 130 value 15708.956633
## iter 140 value 13906.560633
## iter 150 value 12525.465631
## iter 160 value 11541.303476
## iter 170 value 10808.252490
## iter 180 value 10454.462293
## iter 190 value 10169.287713
## iter 200 value 9900.465475
## iter 210 value 9715.604684
## iter 220 value 9607.370369
## iter 230 value 9468.343161
## iter 240 value 9311.773016
## iter 250 value 9177.303073
## iter 260 value 8893.694631
## iter 270 value 8676.387485
## iter 280 value 8413.118456
## iter 290 value 8044.167337
## iter 300 value 7532.316634
## iter 310 value 7041.451903
## iter 320 value 6525.558760
## iter 330 value 6067.007650
## iter 340 value 5677.762728
## iter 350 value 5279.908655
## iter 360 value 4982.873051
## iter 370 value 4668.023911
## iter 380 value 4306.585320
## iter 390 value 3947.470119
## iter 400 value 3609.799024
## iter 410 value 3344.953722
## iter 420 value 3033.157385
## iter 430 value 2791.404654
## iter 440 value 2562.281810
## iter 450 value 2342.816180
## iter 460 value 2124.672651
## iter 470 value 1868.910855
## iter 480 value 1632.766063
## iter 490 value 1390.536487
## iter 500 value 1182.232985
## iter 510 value 975.643204
## iter 520 value 792.311094
## iter 530 value 607.908515
## iter 540 value 435.277347
## iter 550 value 286.646576
## iter 560 value 171.813830
## iter 570 value 95.598155
## iter 580 value 30.961702
## iter 590 value 11.664114
## iter 600 value 5.430211
## iter 610 value 2.866294
## iter 620 value 1.713742
## iter 630 value 1.119423
## iter 640 value 0.768320
## iter 650 value 0.534799
## iter 660 value 0.391785
## iter 670 value 0.289408
## iter 680 value 0.219501
## iter 690 value 0.177276
## iter 700 value 0.144392
## iter 710 value 0.121584
## iter 720 value 0.102887
## iter 730 value 0.088378
## iter 740 value 0.075142
## iter 750 value 0.065713
## iter 760 value 0.055225
## iter 770 value 0.048735
## iter 780 value 0.043931
## iter 790 value 0.037243
## iter 800 value 0.031775
## iter 810 value 0.028938
## iter 820 value 0.025982
## iter 830 value 0.021580
## iter 840 value 0.018571
## iter 850 value 0.017635
## iter 860 value 0.014818
## iter 870 value 0.012852
## iter 880 value 0.012006
## iter 890 value 0.010448
## iter 900 value 0.009930
## iter 910 value 0.008539
## iter 920 value 0.008276
## iter 930 value 0.006799
## iter 940 value 0.006633
## iter 950 value 0.005726
## iter 960 value 0.005626
## iter 970 value 0.004772
## iter 980 value 0.004698
## iter 990 value 0.003967
## iter1000 value 0.003866
## final value 0.003866
## stopped after 1000 iterations
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:tensorflow':
##
## train
## The following object is masked from 'package:purrr':
##
## lift
y_pred_test<- predict(model_nnet1,df.test,type ='class')
confusionMatrix(as.factor(y_pred_test),df.test$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 976 0 5 0 2 2 8 2 0 4
## 1 0 1157 6 1 3 0 1 1 4 3
## 2 2 1 1015 10 1 1 6 15 10 2
## 3 1 0 9 1046 0 9 0 5 13 8
## 4 2 1 2 3 1018 1 8 3 4 11
## 5 0 1 1 9 0 905 2 1 8 5
## 6 2 1 4 0 4 6 975 0 2 0
## 7 2 5 7 5 6 3 1 1088 5 10
## 8 2 1 6 9 2 5 3 3 952 6
## 9 8 1 3 8 17 11 0 18 9 996
##
## Overall Statistics
##
## Accuracy : 0.9646
## 95% CI : (0.9609, 0.968)
## No Information Rate : 0.1112
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9606
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.98090 0.9906 0.95936 0.95875 0.96676 0.95970
## Specificity 0.99758 0.9980 0.99492 0.99522 0.99630 0.99717
## Pos Pred Value 0.97698 0.9838 0.95484 0.95875 0.96676 0.97103
## Neg Pred Value 0.99800 0.9988 0.99544 0.99522 0.99630 0.99603
## Prevalence 0.09476 0.1112 0.10076 0.10390 0.10029 0.08981
## Detection Rate 0.09295 0.1102 0.09667 0.09962 0.09695 0.08619
## Detection Prevalence 0.09514 0.1120 0.10124 0.10390 0.10029 0.08876
## Balanced Accuracy 0.98924 0.9943 0.97714 0.97699 0.98153 0.97844
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.97112 0.9577 0.94538 0.95311
## Specificity 0.99800 0.9953 0.99610 0.99207
## Pos Pred Value 0.98089 0.9611 0.96259 0.92997
## Neg Pred Value 0.99695 0.9949 0.99422 0.99480
## Prevalence 0.09562 0.1082 0.09590 0.09952
## Detection Rate 0.09286 0.1036 0.09067 0.09486
## Detection Prevalence 0.09467 0.1078 0.09419 0.10200
## Balanced Accuracy 0.98456 0.9765 0.97074 0.97259