## Set the working directory to the folder containing the CSV files, then read them in:
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.10
## v tidyr 1.2.1 v stringr 1.4.1
## v readr 2.1.2 v forcats 0.5.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(OpenImageR)
## Warning: package 'OpenImageR' was built under R version 4.1.3
train <- read_csv("train.csv")
## Rows: 42000 Columns: 785
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (785): label, pixel0, pixel1, pixel2, pixel3, pixel4, pixel5, pixel6, pi...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv("test.csv")
## Rows: 28000 Columns: 784
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (784): pixel0, pixel1, pixel2, pixel3, pixel4, pixel5, pixel6, pixel7, p...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
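As the messages suggest, the two reads above could equally be written with an explicit column specification, or with the message silenced; a small sketch using readr's own arguments (every column in these files is numeric):
## Optional: pin down the column types up front, which also quiets the message.
train <- read_csv("train.csv", col_types = cols(.default = col_double()))
test <- read_csv("test.csv", show_col_types = FALSE)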
## Divide the pixel values by 255 to scale them to the 0-1 range.
## The label column says which digit each image shows; the first label is a 1.
label <- train[, 1]
training_data <- train[, -1] / 255
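A quick sanity check on the two objects just created (illustrative only; no output shown for it here):
## Scaled pixels should now lie between 0 and 1, and label is a
## one-column tibble holding the digit drawn in each row.
range(as.matrix(training_data))
head(label$label)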
## Each image is a 28 by 28 matrix of pixels:
## training_data already excludes the label, so take the whole row (784 pixels):
img1 <- matrix(unlist(training_data[10, ]), nrow = 28, ncol = 28, byrow = TRUE)
image(img1,useRaster=TRUE)
## The plot comes out rotated, so fix it with a 90-degree rotation:
r <- rotateFixed(img1, 90)
image(r,useRaster=TRUE)
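The reshape / rotate / plot steps above can be wrapped in a small helper so they are not repeated; show_digit is just an illustrative name, and the loop below does the same steps inline:
## Helper (hypothetical name) that reshapes a 784-pixel row, rotates it, and plots it.
show_digit <- function(pixels, title = NULL) {
  m <- matrix(unlist(pixels), nrow = 28, ncol = 28, byrow = TRUE)
  image(rotateFixed(m, 90), useRaster = TRUE, axes = FALSE, main = title)
}
show_digit(training_data[10, ], title = paste("label:", label$label[10]))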
## printing the first ten images:
for (i in 1:10){
  img1 <- matrix(unlist(training_data[i, ]), nrow = 28, ncol = 28, byrow = TRUE)
  rot <- rotateFixed(img1, 90)
  image(rot, useRaster = TRUE)
}
## Group the data by label and count the number of occurrences:
Freq_Dist <- train %>%
  group_by(label) %>%
  summarise(Occurrences = n())
Freq_Dist
## # A tibble: 10 x 2
## label Occurrences
## <dbl> <int>
## 1 0 4132
## 2 1 4684
## 3 2 4177
## 4 3 4351
## 5 4 4072
## 6 5 3795
## 7 6 4137
## 8 7 4401
## 9 8 4063
## 10 9 4188
The labels are roughly balanced across the training dataset, with each digit appearing around 4,000 times.
## The visualization
ggplot(data = Freq_Dist, aes(x = label, y = Occurrences)) +
  geom_bar(stat = "identity", fill = "red") +
  geom_text(aes(label = Occurrences))
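An equivalent, slightly more compact way to write the same chart, sketched as a stylistic alternative: geom_col is shorthand for geom_bar(stat = "identity"), treating label as a factor gives one bar per digit, and vjust just nudges the count labels above the bars.
ggplot(Freq_Dist, aes(x = factor(label), y = Occurrences)) +
  geom_col(fill = "red") +
  geom_text(aes(label = Occurrences), vjust = -0.3)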
The mean pixel intensity is the average brightness of the pixels (per the Adobe forums). With the values scaled to the 0-1 range, the averages below come out around 0.02, much closer to the dark end of the range than the light end. Note, though, that the summary below divides each label's count by 784 and by 255 rather than averaging the pixel columns themselves; a direct pixel average is sketched after the output.
train %>%
group_by(label) %>%
summarise(Avg = (n()/784)/255)
## # A tibble: 10 x 2
## label Avg
## <dbl> <dbl>
## 1 0 0.0207
## 2 1 0.0234
## 3 2 0.0209
## 4 3 0.0218
## 5 4 0.0204
## 6 5 0.0190
## 7 6 0.0207
## 8 7 0.0220
## 9 8 0.0203
## 10 9 0.0209
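As a sketch of the direct calculation (the across() selection and the name Avg_intensity are just one way to write it), the per-label average of the scaled pixel values could be computed like this; its values may well differ from the table above:
## Average scaled pixel intensity per digit, computed from the pixel columns.
train %>%
  mutate(Avg_intensity = rowMeans(across(starts_with("pixel"))) / 255) %>%
  group_by(label) %>%
  summarise(Avg_intensity = mean(Avg_intensity))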
### 6. Reduce the data by using principal components that account for 95% of the variance. For 95% of the variance I got 154 components, and for 100% of the variance I got 704 components.
### Use the training data without the labels:
train_pca <- prcomp(training_data)
## Compute the cumulative proportion of variance explained from the squared
## component standard deviations (with help from the Statology guide linked below):
var_explained <- cumsum(train_pca$sdev^2) / sum(train_pca$sdev^2)
## Number of components needed to reach at least 95% of the variance:
which.max(var_explained >= 0.95)
## [1] 154
## ...and the number needed to reach 100% of the variance:
which.max(var_explained >= 1.00)
## [1] 704
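To actually reduce the data rather than just count components, the scores on the first 154 components can be kept; prcomp stores them in train_pca$x, and the name reduced_train below is just illustrative:
## Keep the 95%-variance representation: 42000 rows by 154 principal component scores.
reduced_train <- train_pca$x[, 1:154]
dim(reduced_train)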
### 7. Plot the first 10 images generated by PCA. They will appear to be noise. Why?
PCA only reduces the dimensionality of the dataset, not its noise. Each principal component is a weighted combination of all 784 pixels chosen to capture a direction of variance rather than an actual digit, so the plotted loadings look blurry and noise-like even when the decomposition is correct.
## The principal components (loadings) are stored in the columns of train_pca$rotation, so iterate over the first ten columns:
## this took a while sheesh
for (i in 1:10)
{
image(matrix(train_pca$rotation[,i], nrow=28, ncol=28),axes=FALSE)
}
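One way to see that the noisy-looking loadings still carry the digit information is to rebuild an image from its first 154 component scores. This is a sketch reusing train_pca; prcomp centers the data by default, so the stored column means are added back before plotting.
## Reconstruct the 10th training image from its first 154 principal components.
k <- 154
approx_row <- train_pca$x[10, 1:k] %*% t(train_pca$rotation[, 1:k])
approx_row <- sweep(approx_row, 2, train_pca$center, "+")
img_back <- matrix(approx_row, nrow = 28, ncol = 28, byrow = TRUE)
image(rotateFixed(img_back, 90), useRaster = TRUE, axes = FALSE)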
## Select all rows labeled 8:
eight_label <- train %>%
  filter(label == 8)
## take out the label column:
eight_label <- eight_label[,-1]
## run the pca
eight_pca <- prcomp(eight_label)
## calculate the variance:
eight_var <- cumsum(eight_pca$sdev^2)/ sum(eight_pca$sdev^2)
## Number of components that account for 100% of the variance (537 here):
which.max(eight_var == 1.0)
## [1] 537
## The principal components are again stored in the columns of eight_pca$rotation.
## Plotting the first ten now shows recognizable eights:
for (i in 1:10)
{
image(matrix(eight_pca$rotation[,i], nrow=28, ncol=28),axes=FALSE)
}
## Split the labeled data into training (60%) and test (40%) sets.
## Tibbles do not carry meaningful row names, so sample row indices directly:
sample_train <- sample(nrow(train), size = floor(0.6 * nrow(train)))
Train <- train[sample_train, ]
Test <- train[-sample_train, ]
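Since the class counts are only roughly equal, a label-stratified split is another option. This is a sketch using dplyr's slice_sample; the row_id column, the object names, and the seed are chosen purely for illustration.
## Hypothetical stratified 60/40 split: sample 60% of the rows within each digit.
set.seed(42)
strat_id <- train %>%
  mutate(row_id = row_number()) %>%
  group_by(label) %>%
  slice_sample(prop = 0.6) %>%
  pull(row_id)
Train_strat <- train[strat_id, ]
Test_strat <- train[-strat_id, ]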
## Fit a multinomial logistic regression model with nnet:
require(nnet)
## Loading required package: nnet
## Warning: package 'nnet' was built under R version 4.1.3
model.fit <- multinom(label~.,data=Train,MaxNWts = 42000)
## # weights: 7860 (7065 variable)
## initial value 58025.144343
## iter 10 value 12350.634819
## iter 20 value 9738.070864
## iter 30 value 8714.471250
## iter 40 value 8240.324635
## iter 50 value 7979.205879
## iter 60 value 7802.035713
## iter 70 value 7689.060772
## iter 80 value 7610.629359
## iter 90 value 7557.658086
## iter 100 value 7499.396779
## final value 7499.396779
## stopped after 100 iterations
## Use the model to predict the labels of the held-out test split:
predictions <- predict(model.fit,Test,"class")
confusion_matrix <- as.matrix(table("label" = Test$label,"predictions" = predictions))
confusion_matrix
## predictions
## label 0 1 2 3 4 5 6 7 8 9
## 0 1525 0 44 2 14 15 58 10 5 1
## 1 0 1818 10 13 0 8 1 0 14 4
## 2 20 20 1457 17 33 6 45 13 21 7
## 3 6 5 61 1561 2 28 15 12 18 20
## 4 2 10 2 3 1529 0 15 0 2 41
## 5 23 8 24 82 43 1243 40 4 33 17
## 6 3 10 18 0 10 20 1582 0 4 0
## 7 3 10 15 8 35 0 1 1595 4 108
## 8 22 44 73 88 45 78 17 4 1226 52
## 9 6 7 4 32 142 9 3 37 5 1450
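Before the overall accuracy, a per-digit breakdown can be read off the confusion matrix; as a quick sketch, the diagonal divided by the row totals gives each digit's recall:
## Per-digit recall: correct predictions for a digit divided by how often it appears.
round(diag(confusion_matrix) / rowSums(confusion_matrix), 3)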
## Calculating accuracy as the sum of the diagonal elements divided by the total number of observations; it comes out to about 89%.
round((sum(diag(confusion_matrix))/sum(confusion_matrix))*100,2)
## [1] 89.2
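A possible follow-up, sketched rather than run: refit the same multinomial model on the 154 PCA scores instead of the 784 raw pixels, which shrinks the number of weights by roughly a factor of five. The names Train_scores, Test_scores, and model.pca are illustrative; train_pca (fit on the /255-scaled pixels) is reused, so the split data is scaled the same way before projecting.
## Project the split data onto the first 154 principal components and refit.
Train_scores <- as.data.frame(predict(train_pca, newdata = Train[, -1] / 255)[, 1:154])
Train_scores$label <- Train$label
model.pca <- multinom(label ~ ., data = Train_scores, MaxNWts = 2000)
Test_scores <- as.data.frame(predict(train_pca, newdata = Test[, -1] / 255)[, 1:154])
pca_predictions <- predict(model.pca, Test_scores, "class")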
References:
https://www.statology.org/principal-components-analysis-in-r/
https://datasciencebeginners.com/2018/12/20/multinomial-logistic-regression-using-r/
https://www.projectpro.io/recipes/split-data-into-train-set-and-test-set-r
https://www.datacamp.com/tutorial/pca-analysis-r