library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(nnet)
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(ggplot2)
bc <- read_csv(
  "BreastCancerDataset.csv",
  col_types = cols(
    clump_thickness = col_double(),
    cell_shape_uniformity = col_double(),
    cell_size_uniformity = col_double(),
    marginal_adhesion = col_double(),
    single_epithelial_cell_size = col_double(),
    bare_nuclei = col_double(),
    bland_chromatin = col_double(),
    normal_nucleoli = col_double(),
    mitoses = col_double(),
    class = col_factor(c("0", "1"))
  )
)
## Warning: 16 parsing failures.
## row col expected actual file
## 24 bare_nuclei a double * 'BreastCancerDataset.csv'
## 41 bare_nuclei a double * 'BreastCancerDataset.csv'
## 140 bare_nuclei a double * 'BreastCancerDataset.csv'
## 146 bare_nuclei a double * 'BreastCancerDataset.csv'
## 159 bare_nuclei a double * 'BreastCancerDataset.csv'
## ... ........... ........ ...... .........................
## See problems(...) for more details.
The only thing that stands out in the parsing warnings is the * in the bare_nuclei column, but without the code book it is hard to know what those values might represent. It is also curious that the column is stored as character in the raw file, and whether there was some reasoning behind that.
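The failures can be inspected directly, since readr attaches them to the tibble; a quick check, not part of the original run:
problems(bc)  # lists the 16 rows where "*" could not be parsed as a double
Reading the column with col_character() instead would have preserved the * markers for inspection.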
str(bc)
## tibble [699 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ clump_thickness : num [1:699] 5 5 3 6 4 8 1 2 2 4 ...
## $ cell_shape_uniformity : num [1:699] 1 4 1 8 1 10 1 1 1 2 ...
## $ cell_size_uniformity : num [1:699] 1 4 1 8 1 10 1 2 1 1 ...
## $ marginal_adhesion : num [1:699] 1 5 1 1 3 8 1 1 1 1 ...
## $ single_epithelial_cell_size: num [1:699] 2 7 2 3 2 7 2 2 2 2 ...
## $ bare_nuclei : num [1:699] 1 10 2 4 1 10 10 1 1 1 ...
## $ bland_chromatin : num [1:699] 3 3 3 3 3 9 3 3 1 2 ...
## $ normal_nucleoli : num [1:699] 1 2 1 7 1 7 1 1 1 1 ...
## $ mitoses : num [1:699] 1 1 1 1 1 1 1 1 5 1 ...
## $ class : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
## - attr(*, "problems")= tibble [16 × 5] (S3: tbl_df/tbl/data.frame)
## ..$ row : int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## ..$ col : chr [1:16] "bare_nuclei" "bare_nuclei" "bare_nuclei" "bare_nuclei" ...
## ..$ expected: chr [1:16] "a double" "a double" "a double" "a double" ...
## ..$ actual : chr [1:16] "*" "*" "*" "*" ...
## ..$ file : chr [1:16] "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" ...
## - attr(*, "spec")=
## .. cols(
## .. clump_thickness = col_double(),
## .. cell_shape_uniformity = col_double(),
## .. cell_size_uniformity = col_double(),
## .. marginal_adhesion = col_double(),
## .. single_epithelial_cell_size = col_double(),
## .. bare_nuclei = col_double(),
## .. bland_chromatin = col_double(),
## .. normal_nucleoli = col_double(),
## .. mitoses = col_double(),
## .. class = col_factor(levels = c("0", "1"), ordered = FALSE, include_na = FALSE)
## .. )
summary(bc)
## clump_thickness cell_shape_uniformity cell_size_uniformity marginal_adhesion
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 4.000 Median : 1.000 Median : 1.000 Median : 1.000
## Mean : 4.418 Mean : 3.134 Mean : 3.207 Mean : 2.807
## 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
##
## single_epithelial_cell_size bare_nuclei bland_chromatin normal_nucleoli
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median : 2.000 Median : 1.000 Median : 3.000 Median : 1.000
## Mean : 3.216 Mean : 3.545 Mean : 3.438 Mean : 2.867
## 3rd Qu.: 4.000 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## NA's :16
## mitoses class
## Min. : 1.000 0:458
## 1st Qu.: 1.000 1:241
## Median : 1.000
## Mean : 1.589
## 3rd Qu.: 1.000
## Max. :10.000
##
Determine the number of NAs in the data set and verify with unique().
colSums(is.na(bc))
## clump_thickness cell_shape_uniformity
## 0 0
## cell_size_uniformity marginal_adhesion
## 0 0
## single_epithelial_cell_size bare_nuclei
## 0 16
## bland_chromatin normal_nucleoli
## 0 0
## mitoses class
## 0 0
unique(bc$bare_nuclei)
## [1] 1 10 2 4 3 9 7 NA 5 8 6
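An alternative worth noting (a sketch, not used in this walkthrough): with only 16 of 699 rows affected, we could keep the column and drop the incomplete rows instead.
bc_rows <- bc[!is.na(bc$bare_nuclei), ]  # 683 complete rows, bare_nuclei retained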
Drop bare_nuclei by setting it to NULL, as we did before in the kNN walkthrough.
bc$bare_nuclei <- NULL
str(bc)
## tibble [699 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ clump_thickness : num [1:699] 5 5 3 6 4 8 1 2 2 4 ...
## $ cell_shape_uniformity : num [1:699] 1 4 1 8 1 10 1 1 1 2 ...
## $ cell_size_uniformity : num [1:699] 1 4 1 8 1 10 1 2 1 1 ...
## $ marginal_adhesion : num [1:699] 1 5 1 1 3 8 1 1 1 1 ...
## $ single_epithelial_cell_size: num [1:699] 2 7 2 3 2 7 2 2 2 2 ...
## $ bland_chromatin : num [1:699] 3 3 3 3 3 9 3 3 1 2 ...
## $ normal_nucleoli : num [1:699] 1 2 1 7 1 7 1 1 1 1 ...
## $ mitoses : num [1:699] 1 1 1 1 1 1 1 1 5 1 ...
## $ class : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
## - attr(*, "problems")= tibble [16 × 5] (S3: tbl_df/tbl/data.frame)
## ..$ row : int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## ..$ col : chr [1:16] "bare_nuclei" "bare_nuclei" "bare_nuclei" "bare_nuclei" ...
## ..$ expected: chr [1:16] "a double" "a double" "a double" "a double" ...
## ..$ actual : chr [1:16] "*" "*" "*" "*" ...
## ..$ file : chr [1:16] "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" ...
## - attr(*, "spec")=
## .. cols(
## .. clump_thickness = col_double(),
## .. cell_shape_uniformity = col_double(),
## .. cell_size_uniformity = col_double(),
## .. marginal_adhesion = col_double(),
## .. single_epithelial_cell_size = col_double(),
## .. bare_nuclei = col_double(),
## .. bland_chromatin = col_double(),
## .. normal_nucleoli = col_double(),
## .. mitoses = col_double(),
## .. class = col_factor(levels = c("0", "1"), ordered = FALSE, include_na = FALSE)
## .. )
Here we min-max scale the eight predictor columns to [0, 1] for the neural network.
for (i in 1:8) {
  minbc <- min(bc[, i])
  maxbc <- max(bc[, i])
  bc[, i] <- (bc[, i] - minbc) / (maxbc - minbc)
}
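Equivalently, the loop could be written as one vectorized step with dplyr; a sketch (bc_scaled is a hypothetical name) that produces the same result:
bc_scaled <- bc %>%
  mutate(across(where(is.numeric), ~ (.x - min(.x)) / (max(.x) - min(.x))))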
str(bc)
## tibble [699 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ clump_thickness : num [1:699] 0.444 0.444 0.222 0.556 0.333 ...
## $ cell_shape_uniformity : num [1:699] 0 0.333 0 0.778 0 ...
## $ cell_size_uniformity : num [1:699] 0 0.333 0 0.778 0 ...
## $ marginal_adhesion : num [1:699] 0 0.444 0 0 0.222 ...
## $ single_epithelial_cell_size: num [1:699] 0.111 0.667 0.111 0.222 0.111 ...
## $ bland_chromatin : num [1:699] 0.222 0.222 0.222 0.222 0.222 ...
## $ normal_nucleoli : num [1:699] 0 0.111 0 0.667 0 ...
## $ mitoses : num [1:699] 0 0 0 0 0 ...
## $ class : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
## - attr(*, "problems")= tibble [16 × 5] (S3: tbl_df/tbl/data.frame)
## ..$ row : int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## ..$ col : chr [1:16] "bare_nuclei" "bare_nuclei" "bare_nuclei" "bare_nuclei" ...
## ..$ expected: chr [1:16] "a double" "a double" "a double" "a double" ...
## ..$ actual : chr [1:16] "*" "*" "*" "*" ...
## ..$ file : chr [1:16] "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" ...
## - attr(*, "spec")=
## .. cols(
## .. clump_thickness = col_double(),
## .. cell_shape_uniformity = col_double(),
## .. cell_size_uniformity = col_double(),
## .. marginal_adhesion = col_double(),
## .. single_epithelial_cell_size = col_double(),
## .. bare_nuclei = col_double(),
## .. bland_chromatin = col_double(),
## .. normal_nucleoli = col_double(),
## .. mitoses = col_double(),
## .. class = col_factor(levels = c("0", "1"), ordered = FALSE, include_na = FALSE)
## .. )
Looking at the visualization, you can see that the classes are fairly well separated, all things considered. From what I have seen, we run into this more often with smaller data sets.
plot_bc <- GGally::ggpairs(bc, aes(color = class))
plot_bc
Set the seed and create the index for an 80/20 train/test split.
set.seed(1)
train_index <- caret::createDataPartition(bc$class, p = .8, list = FALSE, times = 1)
Create the training and test splits for the models we will be using.
train_bc <- bc %>% dplyr::slice(train_index)
test_bc <- bc %>% dplyr::slice(-train_index)
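createDataPartition stratifies on class, which a quick check (not run in the original) would confirm:
prop.table(table(train_bc$class))  # class balance in the training split
prop.table(table(test_bc$class))   # should closely match the training balance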
str(bc)
## tibble [699 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ clump_thickness : num [1:699] 0.444 0.444 0.222 0.556 0.333 ...
## $ cell_shape_uniformity : num [1:699] 0 0.333 0 0.778 0 ...
## $ cell_size_uniformity : num [1:699] 0 0.333 0 0.778 0 ...
## $ marginal_adhesion : num [1:699] 0 0.444 0 0 0.222 ...
## $ single_epithelial_cell_size: num [1:699] 0.111 0.667 0.111 0.222 0.111 ...
## $ bland_chromatin : num [1:699] 0.222 0.222 0.222 0.222 0.222 ...
## $ normal_nucleoli : num [1:699] 0 0.111 0 0.667 0 ...
## $ mitoses : num [1:699] 0 0 0 0 0 ...
## $ class : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
## - attr(*, "problems")= tibble [16 × 5] (S3: tbl_df/tbl/data.frame)
## ..$ row : int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## ..$ col : chr [1:16] "bare_nuclei" "bare_nuclei" "bare_nuclei" "bare_nuclei" ...
## ..$ expected: chr [1:16] "a double" "a double" "a double" "a double" ...
## ..$ actual : chr [1:16] "*" "*" "*" "*" ...
## ..$ file : chr [1:16] "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" "'BreastCancerDataset.csv'" ...
## - attr(*, "spec")=
## .. cols(
## .. clump_thickness = col_double(),
## .. cell_shape_uniformity = col_double(),
## .. cell_size_uniformity = col_double(),
## .. marginal_adhesion = col_double(),
## .. single_epithelial_cell_size = col_double(),
## .. bare_nuclei = col_double(),
## .. bland_chromatin = col_double(),
## .. normal_nucleoli = col_double(),
## .. mitoses = col_double(),
## .. class = col_factor(levels = c("0", "1"), ordered = FALSE, include_na = FALSE)
## .. )
I set the range for the number of hidden units (the size argument of nnet; these models all have a single hidden layer, so size controls neurons, not layers) starting at 5 and running up to 75. I wanted a good assessment of the behavior across a wide range of sizes. I would be interested to know the maximum size you should use based on the size of the data set; I attempted size = 100 but received a "too many weights" error.
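That error comes from nnet's MaxNWts guard (default 1000): with 8 predictors, a single-hidden-layer net with h units has (8 + 1) * h + (h + 1) * 1 weights, so size = 100 needs 1001, just over the cap. Raising it would allow the fit; a sketch, not run here:
nn100 <- nnet(class ~ ., size = 100, data = train_bc, MaxNWts = 1100)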
nn5 <- nnet(class ~ ., size = 5, data = train_bc)
## # weights: 51
## initial value 386.443586
## iter 10 value 51.516566
## iter 20 value 45.168037
## iter 30 value 34.588867
## iter 40 value 24.940439
## iter 50 value 20.860166
## iter 60 value 20.573241
## iter 70 value 20.528793
## iter 80 value 20.526956
## final value 20.526953
## converged
nn12 <- nnet(class ~ ., size = 12, data = train_bc)
## # weights: 121
## initial value 410.721966
## iter 10 value 54.462362
## iter 20 value 45.528674
## iter 30 value 29.878931
## iter 40 value 10.787057
## iter 50 value 2.914721
## iter 60 value 1.356872
## iter 70 value 0.014892
## iter 80 value 0.000244
## iter 90 value 0.000141
## final value 0.000076
## converged
nn25 <- nnet(class ~ ., size = 25, data = train_bc)
## # weights: 251
## initial value 377.264445
## iter 10 value 54.203263
## iter 20 value 42.132413
## iter 30 value 27.402323
## iter 40 value 10.103064
## iter 50 value 3.077495
## iter 60 value 0.200330
## iter 70 value 0.019576
## iter 80 value 0.004930
## iter 90 value 0.001218
## final value 0.000086
## converged
nn50 <- nnet(class ~ ., size = 50, data = train_bc)
## # weights: 501
## initial value 365.883324
## iter 10 value 51.912947
## iter 20 value 40.444557
## iter 30 value 27.504186
## iter 40 value 5.448492
## iter 50 value 0.030838
## iter 60 value 0.001568
## iter 70 value 0.000519
## final value 0.000099
## converged
nn75 <- nnet(class ~ ., size = 75, data = train_bc)
## # weights: 751
## initial value 880.041642
## iter 10 value 48.927995
## iter 20 value 36.645721
## iter 30 value 18.481440
## iter 40 value 5.076999
## iter 50 value 0.137728
## iter 60 value 0.009858
## iter 70 value 0.003174
## iter 80 value 0.001437
## iter 90 value 0.000647
## iter 100 value 0.000346
## final value 0.000346
## stopped after 100 iterations
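One detail worth noting: the nn75 run reports "stopped after 100 iterations" rather than "converged" because nnet's optimizer defaults to maxit = 100. Allowing more iterations would let it finish; a sketch (nn75_long is a hypothetical name), not run here:
nn75_long <- nnet(class ~ ., size = 75, data = train_bc, maxit = 500)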
There are a handful of errors on this first run against the training set, which of course uses the fewest hidden units; the fit improves until there are no training errors from nn12 onward. Overall the networks made relatively few errors from the beginning, which can be attributed to the fact that this is a relatively small data set and it is fairly well separated, as can be seen in the ggpairs plot. Below are the tables for the training set.
table(predict(nn5, newdata = train_bc, type = "class"), train_bc$class)
##
## 0 1
## 0 354 0
## 1 13 193
table(predict(nn12, newdata = train_bc, type = "class"), train_bc$class)
##
## 0 1
## 0 367 0
## 1 0 193
table(predict(nn25, newdata = train_bc, type = "class"), train_bc$class)
##
## 0 1
## 0 367 0
## 1 0 193
table(predict(nn50, newdata = train_bc, type = "class"), train_bc$class)
##
## 0 1
## 0 367 0
## 1 0 193
table(predict(nn75, newdata = train_bc, type = "class"), train_bc$class)
##
## 0 1
## 0 367 0
## 1 0 193
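To condense these tables, a small helper (an addition, not part of the original walkthrough) computes a model's accuracy on any split; the same call with data = test_bc reproduces the test numbers discussed next.
acc <- function(model, data) {
  # predicted labels that land on the diagonal of the tables above count as correct
  mean(predict(model, newdata = data, type = "class") == data$class)
}
sapply(list(nn5 = nn5, nn12 = nn12, nn25 = nn25, nn50 = nn50, nn75 = nn75),
       acc, data = train_bc)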
On the test set the picture is different: the smallest network (nn5) actually makes the fewest errors, while nn12 makes the most; accuracy improves again from nn12 through nn50 before dipping at nn75, a hint that the larger nets, which fit the training data perfectly, mildly overfit. Overall the models don't do as well as on the training set, which is expected, but they still do very well all things considered. Below are the tables for the test set.
table(predict(nn5, newdata = test_bc, type = "class"), test_bc$class)
##
## 0 1
## 0 90 3
## 1 1 45
table(predict(nn12, newdata = test_bc, type = "class"), test_bc$class)
##
## 0 1
## 0 89 9
## 1 2 39
table(predict(nn25, newdata = test_bc, type = "class"), test_bc$class)
##
## 0 1
## 0 90 7
## 1 1 41
table(predict(nn50, newdata = test_bc, type = "class"), test_bc$class)
##
## 0 1
## 0 90 5
## 1 1 43
table(predict(nn75, newdata = test_bc, type = "class"), test_bc$class)
##
## 0 1
## 0 91 10
## 1 0 38
Here is the accuracy for the baseline of always predicting the majority class ("0"), which is not good at all: 91 of the 139 test rows are class 0, so baseline accuracy is 91/139, about 65%, exactly the no-information rate. It makes me very curious what the baseline would look like on a much larger data set.
caret::confusionMatrix(
  factor(rep("0", nrow(test_bc)), levels = levels(test_bc$class)),
  test_bc$class
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 91 48
## 1 0 0
##
## Accuracy : 0.6547
## 95% CI : (0.5694, 0.7332)
## No Information Rate : 0.6547
## P-Value [Acc > NIR] : 0.5391
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.17e-11
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.6547
## Neg Pred Value : NaN
## Prevalence : 0.6547
## Detection Rate : 0.6547
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
Running the confusion matrix with the nn25 model, the accuracy jumps to about 94%, an excellent improvement in prediction accuracy over the baseline. I ran the measurement a few additional times with larger networks, and after nn25 the gains are minimal: in this run nn50 had the best test accuracy at about 96% (133/139 correct), while nn75 slipped back to about 93%.
caret::confusionMatrix(
  factor(predict(nn25, newdata = test_bc, type = "class"), levels = levels(test_bc$class)),
  test_bc$class
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 90 7
## 1 1 41
##
## Accuracy : 0.9424
## 95% CI : (0.8897, 0.9748)
## No Information Rate : 0.6547
## P-Value [Acc > NIR] : 5.092e-16
##
## Kappa : 0.8688
##
## Mcnemar's Test P-Value : 0.0771
##
## Sensitivity : 0.9890
## Specificity : 0.8542
## Pos Pred Value : 0.9278
## Neg Pred Value : 0.9762
## Prevalence : 0.6547
## Detection Rate : 0.6475
## Detection Prevalence : 0.6978
## Balanced Accuracy : 0.9216
##
## 'Positive' Class : 0
##
Summary: running the different models on these different datasets is fascinating, and I look forward to running the same models on the larger data set for the group project. While the dataset is small, most of the network sizes captured the majority of the structure with only a few errors: training errors disappear by nn12, and there is very little gain from going higher than that. As seen with the confusionMatrix, the difference between the baseline and the model accuracy was very large, but the gains in accuracy dissipate quickly after nn25, making the additional runs with more hidden units almost moot.
Below, figure 2 visualizes a computed neural network. This was run using the neuralnet library and was done solely so I could render a visualized neural net and see how it works through the steps, for my own edification. I could not figure out how to build on it and run a comparison between the original data and the predicted data, and that would be something I would be very interested to learn more about if the opportunity in the course presented itself.
Our model has 5 neurons in its hidden layer, and I used mitoses as the target variable.
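The exact call is not shown above; a minimal sketch of how the figure could be produced, assuming hidden = 5 and mitoses as the response per the description:
set.seed(1)
nn_vis <- neuralnet(
  mitoses ~ clump_thickness + cell_shape_uniformity + cell_size_uniformity +
    marginal_adhesion + single_epithelial_cell_size + bland_chromatin +
    normal_nucleoli,
  data = train_bc,
  hidden = 5,
  linear.output = TRUE  # regression on the scaled mitoses score
)
plot(nn_vis)  # renders the network: one input node per predictor, 5 hidden neurons, weights on the edges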
References:
Larose, Daniel T., and Chantal D. Larose. Discovering Knowledge in Data: An Introduction to Data Mining. 2nd ed., Wiley, 2014, Chapter 5.6.
"Creating & Visualizing Neural Network in R." Analytics Vidhya, Guest Blog, 17 May 2018, www.analyticsvidhya.com/blog/2017/09/creating-visualizing-neural-network-in-r/.