## Set the working directory to the folder containing the CSV files, then read them in:
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.10
## v tidyr 1.2.1 v stringr 1.4.1
## v readr 2.1.2 v forcats 0.5.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(OpenImageR)
## Warning: package 'OpenImageR' was built under R version 4.1.3
train <- read_csv("train.csv")
## Rows: 42000 Columns: 785
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (785): label, pixel0, pixel1, pixel2, pixel3, pixel4, pixel5, pixel6, pi...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv("test.csv")
## Rows: 28000 Columns: 784
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (784): pixel0, pixel1, pixel2, pixel3, pixel4, pixel5, pixel6, pixel7, p...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
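As the messages suggest, the two reads above could equally be written with an explicit column specification, or with the message silenced; a small sketch using readr's own arguments (every column in these files is numeric):
## Optional: pin down the column types up front, which also quiets the message.
train <- read_csv("train.csv", col_types = cols(.default = col_double()))
test <- read_csv("test.csv", show_col_types = FALSE)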
## Divide the pixel values by 255 to scale them to the 0-1 range.
## The label column says which digit each image shows; the first label is a 1.
label <- train[, 1]
training_data <- train[, -1] / 255
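A quick sanity check on the two objects just created (illustrative only; no output shown for it here):
## Scaled pixels should now lie between 0 and 1, and label is a
## one-column tibble holding the digit drawn in each row.
range(as.matrix(training_data))
head(label$label)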
## Each image is a 28 by 28 matrix of pixels:
## training_data already excludes the label, so take the whole row (784 pixels):
img1 <- matrix(unlist(training_data[10, ]), nrow = 28, ncol = 28, byrow = TRUE)
image(img1,useRaster=TRUE)
## The plot comes out rotated, so fix it with a 90-degree rotation:
r <- rotateFixed(img1, 90)
image(r,useRaster=TRUE)
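The reshape / rotate / plot steps above can be wrapped in a small helper so they are not repeated; show_digit is just an illustrative name, and the loop below does the same steps inline:
## Helper (hypothetical name) that reshapes a 784-pixel row, rotates it, and plots it.
show_digit <- function(pixels, title = NULL) {
  m <- matrix(unlist(pixels), nrow = 28, ncol = 28, byrow = TRUE)
  image(rotateFixed(m, 90), useRaster = TRUE, axes = FALSE, main = title)
}
show_digit(training_data[10, ], title = paste("label:", label$label[10]))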
## printing the first ten images:
for (i in 1:10){
  img1 <- matrix(unlist(training_data[i, ]), nrow = 28, ncol = 28, byrow = TRUE)
  rot <- rotateFixed(img1, 90)
  image(rot, useRaster = TRUE)
}
## Group the data by label and count the number of occurrences:
Freq_Dist <- train %>%
  group_by(label) %>%
  summarise(Occurrences = n())
Freq_Dist
## # A tibble: 10 x 2
## label Occurrences
## <dbl> <int>
## 1 0 4132
## 2 1 4684
## 3 2 4177
## 4 3 4351
## 5 4 4072
## 6 5 3795
## 7 6 4137
## 8 7 4401
## 9 8 4063
## 10 9 4188
The labels are roughly balanced across the training dataset, with each digit appearing around 4,000 times.
## The visualization
ggplot(data = Freq_Dist, aes(x = label, y = Occurrences)) +
  geom_bar(stat = "identity", fill = "red") +
  geom_text(aes(label = Occurrences))
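An equivalent, slightly more compact way to write the same chart, sketched as a stylistic alternative: geom_col is shorthand for geom_bar(stat = "identity"), treating label as a factor gives one bar per digit, and vjust just nudges the count labels above the bars.
ggplot(Freq_Dist, aes(x = factor(label), y = Occurrences)) +
  geom_col(fill = "red") +
  geom_text(aes(label = Occurrences), vjust = -0.3)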
The mean pixel intensity is the average brightness of the pixels (per the Adobe forums). With the values scaled to the 0-1 range, the averages below come out around 0.02, much closer to the dark end of the range than the light end. Note, though, that the summary below divides each label's count by 784 and by 255 rather than averaging the pixel columns themselves; a direct pixel average is sketched after the output.
train %>%
group_by(label) %>%
summarise(Avg = (n()/784)/255)
## # A tibble: 10 x 2
## label Avg
## <dbl> <dbl>
## 1 0 0.0207
## 2 1 0.0234
## 3 2 0.0209
## 4 3 0.0218
## 5 4 0.0204
## 6 5 0.0190
## 7 6 0.0207
## 8 7 0.0220
## 9 8 0.0203
## 10 9 0.0209
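As a sketch of the direct calculation (the across() selection and the name Avg_intensity are just one way to write it), the per-label average of the scaled pixel values could be computed like this; its values may well differ from the table above:
## Average scaled pixel intensity per digit, computed from the pixel columns.
train %>%
  mutate(Avg_intensity = rowMeans(across(starts_with("pixel"))) / 255) %>%
  group_by(label) %>%
  summarise(Avg_intensity = mean(Avg_intensity))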
### 6. Reduce the data by using principal components that account for 95% of the variance. For 95% of the variance I got 154 components, and for 100% of the variance I got 704 components.
### Use the training data without the labels:
train_pca <- prcomp(training_data)
## Compute the cumulative proportion of variance explained from the squared
## component standard deviations (with help from the Statology guide linked below):
var_explained <- cumsum(train_pca$sdev^2) / sum(train_pca$sdev^2)
## Number of components needed to reach at least 95% of the variance:
which.max(var_explained >= 0.95)
## [1] 154
## ...and the number needed to reach 100% of the variance:
which.max(var_explained >= 1.00)
## [1] 704
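To actually reduce the data rather than just count components, the scores on the first 154 components can be kept; prcomp stores them in train_pca$x, and the name reduced_train below is just illustrative:
## Keep the 95%-variance representation: 42000 rows by 154 principal component scores.
reduced_train <- train_pca$x[, 1:154]
dim(reduced_train)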
### 7. Plot the first 10 images generated by PCA. They will appear to be noise. Why?
PCA only reduces the dimensionality of the dataset, not its noise. Each principal component is a weighted combination of all 784 pixels chosen to capture a direction of variance rather than an actual digit, so the plotted loadings look blurry and noise-like even when the decomposition is correct.
## The principal components (loadings) are stored in the columns of train_pca$rotation, so iterate over the first ten columns:
## this took a while sheesh
for (i in 1:10)
{
image(matrix(train_pca$rotation[,i], nrow=28, ncol=28),axes=FALSE)
}
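One way to see that the noisy-looking loadings still carry the digit information is to rebuild an image from its first 154 component scores. This is a sketch reusing train_pca; prcomp centers the data by default, so the stored column means are added back before plotting.
## Reconstruct the 10th training image from its first 154 principal components.
k <- 154
approx_row <- train_pca$x[10, 1:k] %*% t(train_pca$rotation[, 1:k])
approx_row <- sweep(approx_row, 2, train_pca$center, "+")
img_back <- matrix(approx_row, nrow = 28, ncol = 28, byrow = TRUE)
image(rotateFixed(img_back, 90), useRaster = TRUE, axes = FALSE)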
## Select all rows labeled 8:
eight_label <- train %>%
  filter(label == 8)
## take out the label column:
eight_label <- eight_label[,-1]
## run the pca
eight_pca <- prcomp(eight_label)
## calculate the variance:
eight_var <- cumsum(eight_pca$sdev^2)/ sum(eight_pca$sdev^2)
## Number of components that account for 100% of the variance (537 here):
which.max(eight_var == 1.0)
## [1] 537
## The principal components are again stored in the columns of eight_pca$rotation.
## Plotting the first ten now shows recognizable eights:
for (i in 1:10)
{
image(matrix(eight_pca$rotation[,i], nrow=28, ncol=28),axes=FALSE)
}
## Split the labeled data into training (60%) and test (40%) sets.
## Tibbles do not carry meaningful row names, so sample row indices directly:
sample_train <- sample(nrow(train), size = floor(0.6 * nrow(train)))
Train <- train[sample_train, ]
Test <- train[-sample_train, ]
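Since the class counts are only roughly equal, a label-stratified split is another option. This is a sketch using dplyr's slice_sample; the row_id column, the object names, and the seed are chosen purely for illustration.
## Hypothetical stratified 60/40 split: sample 60% of the rows within each digit.
set.seed(42)
strat_id <- train %>%
  mutate(row_id = row_number()) %>%
  group_by(label) %>%
  slice_sample(prop = 0.6) %>%
  pull(row_id)
Train_strat <- train[strat_id, ]
Test_strat <- train[-strat_id, ]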
## Fit a multinomial logistic regression model with nnet:
require(nnet)
## Loading required package: nnet
## Warning: package 'nnet' was built under R version 4.1.3
model.fit <- multinom(label~.,data=Train,MaxNWts = 42000)
## # weights: 7860 (7065 variable)
## initial value 58025.144343
## iter 10 value 12350.634819
## iter 20 value 9738.070864
## iter 30 value 8714.471250
## iter 40 value 8240.324635
## iter 50 value 7979.205879
## iter 60 value 7802.035713
## iter 70 value 7689.060772
## iter 80 value 7610.629359
## iter 90 value 7557.658086
## iter 100 value 7499.396779
## final value 7499.396779
## stopped after 100 iterations
## Use the model to predict the labels of the held-out test split:
predictions <- predict(model.fit,Test,"class")
confusion_matrix <- as.matrix(table("label" = Test$label,"predictions" = predictions))
confusion_matrix
## predictions
## label 0 1 2 3 4 5 6 7 8 9
## 0 1525 0 44 2 14 15 58 10 5 1
## 1 0 1818 10 13 0 8 1 0 14 4
## 2 20 20 1457 17 33 6 45 13 21 7
## 3 6 5 61 1561 2 28 15 12 18 20
## 4 2 10 2 3 1529 0 15 0 2 41
## 5 23 8 24 82 43 1243 40 4 33 17
## 6 3 10 18 0 10 20 1582 0 4 0
## 7 3 10 15 8 35 0 1 1595 4 108
## 8 22 44 73 88 45 78 17 4 1226 52
## 9 6 7 4 32 142 9 3 37 5 1450
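Before the overall accuracy, a per-digit breakdown can be read off the confusion matrix; as a quick sketch, the diagonal divided by the row totals gives each digit's recall:
## Per-digit recall: correct predictions for a digit divided by how often it appears.
round(diag(confusion_matrix) / rowSums(confusion_matrix), 3)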
## Calculating accuracy as the sum of the diagonal elements divided by the total number of observations; it comes out to about 89%.
round((sum(diag(confusion_matrix))/sum(confusion_matrix))*100,2)
## [1] 89.2
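A possible follow-up, sketched rather than run: refit the same multinomial model on the 154 PCA scores instead of the 784 raw pixels, which shrinks the number of weights by roughly a factor of five. The names Train_scores, Test_scores, and model.pca are illustrative; train_pca (fit on the /255-scaled pixels) is reused, so the split data is scaled the same way before projecting.
## Project the split data onto the first 154 principal components and refit.
Train_scores <- as.data.frame(predict(train_pca, newdata = Train[, -1] / 255)[, 1:154])
Train_scores$label <- Train$label
model.pca <- multinom(label ~ ., data = Train_scores, MaxNWts = 2000)
Test_scores <- as.data.frame(predict(train_pca, newdata = Test[, -1] / 255)[, 1:154])
pca_predictions <- predict(model.pca, Test_scores, "class")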
References:
https://www.statology.org/principal-components-analysis-in-r/
https://datasciencebeginners.com/2018/12/20/multinomial-logistic-regression-using-r/
https://www.projectpro.io/recipes/split-data-into-train-set-and-test-set-r
https://www.datacamp.com/tutorial/pca-analysis-r