library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(nnet)
library(neuralnet)
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
## 
##     compute
library(ggplot2)
bc <- read_csv(
  "BreastCancerDataset.csv",
  col_types = cols(
    clump_thickness = col_double(),
    cell_shape_uniformity = col_double(),
    cell_size_uniformity = col_double(),
    marginal_adhesion = col_double(),
    single_epithelial_cell_size = col_double(),
    bare_nuclei = col_double(),
    bland_chromatin = col_double(),
    normal_nucleoli = col_double(),
    mitoses = col_double(),
    class = col_factor(c("0", "1"))
  )
)
## Warning: 16 parsing failures.
## row         col expected actual                      file
##  24 bare_nuclei a double      * 'BreastCancerDataset.csv'
##  41 bare_nuclei a double      * 'BreastCancerDataset.csv'
## 140 bare_nuclei a double      * 'BreastCancerDataset.csv'
## 146 bare_nuclei a double      * 'BreastCancerDataset.csv'
## 159 bare_nuclei a double      * 'BreastCancerDataset.csv'
## ... ........... ........ ...... .........................
## See problems(...) for more details.

The only thing that stands out in the summary is the * in the bare_nuclei column, but without the code book it is hard to know what it might represent. It is also curious that the raw column holds character values, and whether there was some reasoning behind that.
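
readr keeps the 16 parsing failures attached to the tibble, so the full list of offending rows (rather than the truncated warning above) can be pulled out directly; a minimal sketch:

# Inspect every parsing failure readr recorded while reading the file
readr::problems(bc)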

str(bc)
## tibble [699 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ clump_thickness            : num [1:699] 5 5 3 6 4 8 1 2 2 4 ...
##  $ cell_shape_uniformity      : num [1:699] 1 4 1 8 1 10 1 1 1 2 ...
##  $ cell_size_uniformity       : num [1:699] 1 4 1 8 1 10 1 2 1 1 ...
##  $ marginal_adhesion          : num [1:699] 1 5 1 1 3 8 1 1 1 1 ...
##  $ single_epithelial_cell_size: num [1:699] 2 7 2 3 2 7 2 2 2 2 ...
##  $ bare_nuclei                : num [1:699] 1 10 2 4 1 10 10 1 1 1 ...
##  $ bland_chromatin            : num [1:699] 3 3 3 3 3 9 3 3 1 2 ...
##  $ normal_nucleoli            : num [1:699] 1 2 1 7 1 7 1 1 1 1 ...
##  $ mitoses                    : num [1:699] 1 1 1 1 1 1 1 1 5 1 ...
##  $ class                      : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
##  - attr(*, "problems")= tibble [16 × 5] (S3: tbl_df/tbl/data.frame)
##   ..$ row     : int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
##   ..$ col     : chr [1:16] "bare_nuclei" "bare_nuclei" "bare_nuclei" "bare_nuclei" ...
##   ..$ expected: chr [1:16] "a double" "a double" "a double" "a double" ...
##   ..$ actual  : chr [1:16] "*" "*" "*" "*" ...
##   ..$ file    : chr [1:16] "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   clump_thickness = col_double(),
##   ..   cell_shape_uniformity = col_double(),
##   ..   cell_size_uniformity = col_double(),
##   ..   marginal_adhesion = col_double(),
##   ..   single_epithelial_cell_size = col_double(),
##   ..   bare_nuclei = col_double(),
##   ..   bland_chromatin = col_double(),
##   ..   normal_nucleoli = col_double(),
##   ..   mitoses = col_double(),
##   ..   class = col_factor(levels = c("0", "1"), ordered = FALSE, include_na = FALSE)
##   .. )
summary(bc)
##  clump_thickness  cell_shape_uniformity cell_size_uniformity marginal_adhesion
##  Min.   : 1.000   Min.   : 1.000        Min.   : 1.000       Min.   : 1.000   
##  1st Qu.: 2.000   1st Qu.: 1.000        1st Qu.: 1.000       1st Qu.: 1.000   
##  Median : 4.000   Median : 1.000        Median : 1.000       Median : 1.000   
##  Mean   : 4.418   Mean   : 3.134        Mean   : 3.207       Mean   : 2.807   
##  3rd Qu.: 6.000   3rd Qu.: 5.000        3rd Qu.: 5.000       3rd Qu.: 4.000   
##  Max.   :10.000   Max.   :10.000        Max.   :10.000       Max.   :10.000   
##                                                                               
##  single_epithelial_cell_size  bare_nuclei     bland_chromatin  normal_nucleoli 
##  Min.   : 1.000              Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 2.000              1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000  
##  Median : 2.000              Median : 1.000   Median : 3.000   Median : 1.000  
##  Mean   : 3.216              Mean   : 3.545   Mean   : 3.438   Mean   : 2.867  
##  3rd Qu.: 4.000              3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
##  Max.   :10.000              Max.   :10.000   Max.   :10.000   Max.   :10.000  
##                              NA's   :16                                        
##     mitoses       class  
##  Min.   : 1.000   0:458  
##  1st Qu.: 1.000   1:241  
##  Median : 1.000          
##  Mean   : 1.589          
##  3rd Qu.: 1.000          
##  Max.   :10.000          
## 

Determine the number of NAs in the data set and verify with unique().

colSums(is.na(bc))
##             clump_thickness       cell_shape_uniformity 
##                           0                           0 
##        cell_size_uniformity           marginal_adhesion 
##                           0                           0 
## single_epithelial_cell_size                 bare_nuclei 
##                           0                          16 
##             bland_chromatin             normal_nucleoli 
##                           0                           0 
##                     mitoses                       class 
##                           0                           0
unique(bc$bare_nuclei)
##  [1]  1 10  2  4  3  9  7 NA  5  8  6
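
Dropping the whole column is one option; an alternative sketch (not used here) is to drop just the 16 incomplete rows, which keeps the predictor at the cost of a few observations:

# Alternative (not run): keep bare_nuclei and drop the 16 rows with NA
bc_complete <- bc[!is.na(bc$bare_nuclei), ]
nrow(bc_complete)  # 683 of the original 699 rows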

Drop bare_nuclei by setting it to NULL, as we did before in the kNN walkthrough.

bc$bare_nuclei <- NULL
str(bc)
## tibble [699 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ clump_thickness            : num [1:699] 5 5 3 6 4 8 1 2 2 4 ...
##  $ cell_shape_uniformity      : num [1:699] 1 4 1 8 1 10 1 1 1 2 ...
##  $ cell_size_uniformity       : num [1:699] 1 4 1 8 1 10 1 2 1 1 ...
##  $ marginal_adhesion          : num [1:699] 1 5 1 1 3 8 1 1 1 1 ...
##  $ single_epithelial_cell_size: num [1:699] 2 7 2 3 2 7 2 2 2 2 ...
##  $ bland_chromatin            : num [1:699] 3 3 3 3 3 9 3 3 1 2 ...
##  $ normal_nucleoli            : num [1:699] 1 2 1 7 1 7 1 1 1 1 ...
##  $ mitoses                    : num [1:699] 1 1 1 1 1 1 1 1 5 1 ...
##  $ class                      : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
##  - attr(*, "problems")= tibble [16 × 5] (S3: tbl_df/tbl/data.frame)
##   ..$ row     : int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
##   ..$ col     : chr [1:16] "bare_nuclei" "bare_nuclei" "bare_nuclei" "bare_nuclei" ...
##   ..$ expected: chr [1:16] "a double" "a double" "a double" "a double" ...
##   ..$ actual  : chr [1:16] "*" "*" "*" "*" ...
##   ..$ file    : chr [1:16] "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   clump_thickness = col_double(),
##   ..   cell_shape_uniformity = col_double(),
##   ..   cell_size_uniformity = col_double(),
##   ..   marginal_adhesion = col_double(),
##   ..   single_epithelial_cell_size = col_double(),
##   ..   bare_nuclei = col_double(),
##   ..   bland_chromatin = col_double(),
##   ..   normal_nucleoli = col_double(),
##   ..   mitoses = col_double(),
##   ..   class = col_factor(levels = c("0", "1"), ordered = FALSE, include_na = FALSE)
##   .. )

Here we min-max scale each predictor to the [0, 1] range for the neural network.

# Min-max scale the eight numeric predictors to [0, 1]; column 9 is class
for (i in 1:8) {
  minbc <- min(bc[[i]])
  maxbc <- max(bc[[i]])
  bc[[i]] <- (bc[[i]] - minbc) / (maxbc - minbc)
}
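
The same scaling can be written without hard-coded column positions using dplyr's across(); an equivalent sketch (not run here):

# Equivalent min-max scaling over every numeric column; class is a factor,
# so it is left untouched
bc <- bc %>%
  mutate(across(where(is.numeric), ~ (.x - min(.x)) / (max(.x) - min(.x))))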

str(bc)
## tibble [699 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ clump_thickness            : num [1:699] 0.444 0.444 0.222 0.556 0.333 ...
##  $ cell_shape_uniformity      : num [1:699] 0 0.333 0 0.778 0 ...
##  $ cell_size_uniformity       : num [1:699] 0 0.333 0 0.778 0 ...
##  $ marginal_adhesion          : num [1:699] 0 0.444 0 0 0.222 ...
##  $ single_epithelial_cell_size: num [1:699] 0.111 0.667 0.111 0.222 0.111 ...
##  $ bland_chromatin            : num [1:699] 0.222 0.222 0.222 0.222 0.222 ...
##  $ normal_nucleoli            : num [1:699] 0 0.111 0 0.667 0 ...
##  $ mitoses                    : num [1:699] 0 0 0 0 0 ...
##  $ class                      : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
##  - attr(*, "problems")= tibble [16 × 5] (S3: tbl_df/tbl/data.frame)
##   ..$ row     : int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
##   ..$ col     : chr [1:16] "bare_nuclei" "bare_nuclei" "bare_nuclei" "bare_nuclei" ...
##   ..$ expected: chr [1:16] "a double" "a double" "a double" "a double" ...
##   ..$ actual  : chr [1:16] "*" "*" "*" "*" ...
##   ..$ file    : chr [1:16] "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   clump_thickness = col_double(),
##   ..   cell_shape_uniformity = col_double(),
##   ..   cell_size_uniformity = col_double(),
##   ..   marginal_adhesion = col_double(),
##   ..   single_epithelial_cell_size = col_double(),
##   ..   bare_nuclei = col_double(),
##   ..   bland_chromatin = col_double(),
##   ..   normal_nucleoli = col_double(),
##   ..   mitoses = col_double(),
##   ..   class = col_factor(levels = c("0", "1"), ordered = FALSE, include_na = FALSE)
##   .. )

Looking at the visualization, you can see that the classes are fairly well separated, all things considered. From what I have seen, we run into this more often with smaller data sets.

plot_bc <- GGally::ggpairs(bc, aes(color = class)) 
plot_bc

Set the seed and create the index for an 80/20 train/test split.

set.seed(1)
train_index <- caret::createDataPartition(bc$class, p = .8, list = FALSE, times = 1)

Create the training and test splits for the models we will be using.

train_bc <- bc %>% dplyr::slice(train_index)
test_bc <- bc %>% dplyr::slice(-train_index)
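
createDataPartition samples within each level of class, so the 0/1 mix should be nearly identical in the two pieces; a quick sanity check:

# Verify the stratified split preserved the class balance
prop.table(table(train_bc$class))
prop.table(table(test_bc$class))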

str(bc)
## tibble [699 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ clump_thickness            : num [1:699] 0.444 0.444 0.222 0.556 0.333 ...
##  $ cell_shape_uniformity      : num [1:699] 0 0.333 0 0.778 0 ...
##  $ cell_size_uniformity       : num [1:699] 0 0.333 0 0.778 0 ...
##  $ marginal_adhesion          : num [1:699] 0 0.444 0 0 0.222 ...
##  $ single_epithelial_cell_size: num [1:699] 0.111 0.667 0.111 0.222 0.111 ...
##  $ bland_chromatin            : num [1:699] 0.222 0.222 0.222 0.222 0.222 ...
##  $ normal_nucleoli            : num [1:699] 0 0.111 0 0.667 0 ...
##  $ mitoses                    : num [1:699] 0 0 0 0 0 ...
##  $ class                      : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
##  - attr(*, "problems")= tibble [16 × 5] (S3: tbl_df/tbl/data.frame)
##   ..$ row     : int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
##   ..$ col     : chr [1:16] "bare_nuclei" "bare_nuclei" "bare_nuclei" "bare_nuclei" ...
##   ..$ expected: chr [1:16] "a double" "a double" "a double" "a double" ...
##   ..$ actual  : chr [1:16] "*" "*" "*" "*" ...
##   ..$ file    : chr [1:16] "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   clump_thickness = col_double(),
##   ..   cell_shape_uniformity = col_double(),
##   ..   cell_size_uniformity = col_double(),
##   ..   marginal_adhesion = col_double(),
##   ..   single_epithelial_cell_size = col_double(),
##   ..   bare_nuclei = col_double(),
##   ..   bland_chromatin = col_double(),
##   ..   normal_nucleoli = col_double(),
##   ..   mitoses = col_double(),
##   ..   class = col_factor(levels = c("0", "1"), ordered = FALSE, include_na = FALSE)
##   .. )

I set the range for the hidden units starting at 5 and running up to 75. Note that nnet fits a single hidden layer, so size controls the number of neurons in that layer, not the number of layers. I wanted a good assessment of the behavior across a large range of sizes. I would be interested to know the maximum size you should use based on the size of the data set; I attempted 100 units but received a "too many weights" error.
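
That error comes from nnet's MaxNWts cap (1000 weights by default). With 8 inputs and one logistic output the weight count is size * (8 + 1) + (size + 1), which matches the "# weights" lines below (size = 5 gives 51, size = 75 gives 751); size = 100 gives 1001, just over the cap. A sketch of how the limit could be raised:

# size = 100 needs 100 * (8 + 1) + (100 + 1) = 1001 weights, so raise MaxNWts
nn100 <- nnet(class ~ ., size = 100, data = train_bc, MaxNWts = 1010)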

nn5 <- nnet(class ~ ., size = 5, data = train_bc)
## # weights:  51
## initial  value 386.443586 
## iter  10 value 51.516566
## iter  20 value 45.168037
## iter  30 value 34.588867
## iter  40 value 24.940439
## iter  50 value 20.860166
## iter  60 value 20.573241
## iter  70 value 20.528793
## iter  80 value 20.526956
## final  value 20.526953 
## converged
nn12 <- nnet(class ~ ., size = 12, data = train_bc)
## # weights:  121
## initial  value 410.721966 
## iter  10 value 54.462362
## iter  20 value 45.528674
## iter  30 value 29.878931
## iter  40 value 10.787057
## iter  50 value 2.914721
## iter  60 value 1.356872
## iter  70 value 0.014892
## iter  80 value 0.000244
## iter  90 value 0.000141
## final  value 0.000076 
## converged
nn25 <- nnet(class ~ ., size = 25, data = train_bc)
## # weights:  251
## initial  value 377.264445 
## iter  10 value 54.203263
## iter  20 value 42.132413
## iter  30 value 27.402323
## iter  40 value 10.103064
## iter  50 value 3.077495
## iter  60 value 0.200330
## iter  70 value 0.019576
## iter  80 value 0.004930
## iter  90 value 0.001218
## final  value 0.000086 
## converged
nn50 <- nnet(class ~ ., size = 50, data = train_bc)
## # weights:  501
## initial  value 365.883324 
## iter  10 value 51.912947
## iter  20 value 40.444557
## iter  30 value 27.504186
## iter  40 value 5.448492
## iter  50 value 0.030838
## iter  60 value 0.001568
## iter  70 value 0.000519
## final  value 0.000099 
## converged
nn75 <- nnet(class ~ ., size = 75, data = train_bc)
## # weights:  751
## initial  value 880.041642 
## iter  10 value 48.927995
## iter  20 value 36.645721
## iter  30 value 18.481440
## iter  40 value 5.076999
## iter  50 value 0.137728
## iter  60 value 0.009858
## iter  70 value 0.003174
## iter  80 value 0.001437
## iter  90 value 0.000647
## iter 100 value 0.000346
## final  value 0.000346 
## stopped after 100 iterations
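
One caveat worth noting: nnet draws random starting weights, so the iteration traces above will differ from run to run; seeding immediately before each fit makes them reproducible, e.g.:

# Fix the starting weights so a fit can be reproduced exactly
set.seed(1)
nn5 <- nnet(class ~ ., size = 5, data = train_bc)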

The first run on the training set makes a number of errors, which is expected with so few hidden units, and the fits improve until there are no training errors from nn12 onward. Overall the networks made relatively few errors from the beginning, which can be attributed to the fact that this is a relatively small data set and it is fairly well separated, as can be seen in the ggpairs plot. Below are the tables for the training set, followed by a short accuracy summary.

table(predict(nn5, newdata = train_bc, type = "class"), train_bc$class)
##    
##       0   1
##   0 354   0
##   1  13 193
table(predict(nn12, newdata = train_bc, type = "class"), train_bc$class)
##    
##       0   1
##   0 367   0
##   1   0 193
table(predict(nn25, newdata = train_bc, type = "class"), train_bc$class)
##    
##       0   1
##   0 367   0
##   1   0 193
table(predict(nn50, newdata = train_bc, type = "class"), train_bc$class)
##    
##       0   1
##   0 367   0
##   1   0 193
table(predict(nn75, newdata = train_bc, type = "class"), train_bc$class)
##    
##       0   1
##   0 367   0
##   1   0 193
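
The tables above can be condensed into one training-accuracy number per model; a short sketch:

# Training accuracy for each fitted network
models <- list(nn5 = nn5, nn12 = nn12, nn25 = nn25, nn50 = nn50, nn75 = nn75)
sapply(models, function(m)
  mean(predict(m, newdata = train_bc, type = "class") == train_bc$class))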

On the test set the models do not do as well as on the training data, which is expected, but they still perform very well all things considered. Note that the test errors do not fall monotonically with more hidden units: nn5 makes only 4 errors while nn12 makes 11, a hint that the larger networks may be overfitting the training data. Below are the tables for the test set, again followed by an accuracy summary.

table(predict(nn5, newdata = test_bc, type = "class"), test_bc$class)
##    
##      0  1
##   0 90  3
##   1  1 45
table(predict(nn12, newdata = test_bc, type = "class"), test_bc$class)
##    
##      0  1
##   0 89  9
##   1  2 39
table(predict(nn25, newdata = test_bc, type = "class"), test_bc$class)
##    
##      0  1
##   0 90  7
##   1  1 41
table(predict(nn50, newdata = test_bc, type = "class"), test_bc$class)
##    
##      0  1
##   0 90  5
##   1  1 43
table(predict(nn75, newdata = test_bc, type = "class"), test_bc$class)
##    
##      0  1
##   0 91 10
##   1  0 38
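
And the same summary on the test split, reusing the models list from above:

# Test accuracy for each network; note it does not improve monotonically
sapply(models, function(m)
  mean(predict(m, newdata = test_bc, type = "class") == test_bc$class))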

Here is the accuracy for the baseline, which at 65% is not good at all; it is simply the share of the majority class in the test set. It makes me very curious what the baseline would look like on a much larger data set.

caret::confusionMatrix(
  factor(rep("0", nrow(test_bc)), levels = levels(test_bc$class)),
  test_bc$class
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 91 48
##          1  0  0
##                                           
##                Accuracy : 0.6547          
##                  95% CI : (0.5694, 0.7332)
##     No Information Rate : 0.6547          
##     P-Value [Acc > NIR] : 0.5391          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.17e-11        
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.6547          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.6547          
##          Detection Rate : 0.6547          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 
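
The baseline accuracy, and the No Information Rate it equals, can be computed directly from the class prevalence; a one-line sketch:

# The baseline is just the majority-class share of the test set
max(prop.table(table(test_bc$class)))  # 91 / 139 = 0.6547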

Running the same measurement on the model we created at nn25, the accuracy jumps to 94%, an excellent gain in prediction accuracy over the baseline. I ran the measurement a few additional times, increasing the size each time; beyond nn25 the gains are minimal, with nn50 reaching roughly 96% and nn75 actually slipping back to about 93%.

caret::confusionMatrix(
  factor(predict(nn25, newdata = test_bc, type = "class"), levels = levels(test_bc$class)),
  test_bc$class
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 90  7
##          1  1 41
##                                           
##                Accuracy : 0.9424          
##                  95% CI : (0.8897, 0.9748)
##     No Information Rate : 0.6547          
##     P-Value [Acc > NIR] : 5.092e-16       
##                                           
##                   Kappa : 0.8688          
##                                           
##  Mcnemar's Test P-Value : 0.0771          
##                                           
##             Sensitivity : 0.9890          
##             Specificity : 0.8542          
##          Pos Pred Value : 0.9278          
##          Neg Pred Value : 0.9762          
##              Prevalence : 0.6547          
##          Detection Rate : 0.6475          
##    Detection Prevalence : 0.6978          
##       Balanced Accuracy : 0.9216          
##                                           
##        'Positive' Class : 0               
## 

Summary: running the different models on these different datasets is fascinating, and I look forward to running the same models on the larger data set for the group project. While the dataset is small, I felt that most of the hidden-unit settings captured the majority of the data with only a few errors. The errors tail off well before nn50, and there is very little gain from going higher than that. As was seen with the confusionMatrix, the difference between the baseline and the model accuracy was very large, but the gains in accuracy once again dissipate quickly after nn25, making the additional runs with more units almost moot.

Below, figure 2 visualizes a computed neural network. This was run using the neuralnet library, solely so I could render a visualized neural net and see how it works through the steps, for my own edification. I could not figure out how to build on it and run a comparison between the original data and the predicted data; that is something I would be very interested to learn more about if the opportunity in the course presented itself.

Our model has 5 neurons in its hidden layer, and I used mitoses as the target variable.
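
The chunk that produced the figure is not shown above; this is a hypothetical reconstruction of what the call could look like, assuming the scaled training data, the (numeric, scaled) mitoses column as the response, and the remaining numeric columns as inputs:

# Hypothetical reconstruction of the figure's code (original chunk not shown):
# five hidden neurons, mitoses as the scaled numeric response
nn_vis <- neuralnet(
  mitoses ~ clump_thickness + cell_shape_uniformity + cell_size_uniformity +
    marginal_adhesion + single_epithelial_cell_size + bland_chromatin +
    normal_nucleoli,
  data = train_bc, hidden = 5, linear.output = TRUE
)
plot(nn_vis)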

References:

Larose, Daniel T., and Chantal D. Larose. Discovering Knowledge in Data: An Introduction to Data Mining. 2nd ed., Wiley, 2014. Chapter 5.6.

Guest Blog. "Creating & Visualizing Neural Network in R." Analytics Vidhya, 17 May 2018, www.analyticsvidhya.com/blog/2017/09/creating-visualizing-neural-network-in-r/.