Breast Cancer Analysis

Introduction: This script uses the Breast Cancer Wisconsin (Original) data set to predict cancer diagnosis (benign vs. malignant) from cell features.

Breast Cancer Wisconsin data folder: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/

Data set information, quoted from https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic):

“Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. A few of the images can be found at [Web Link]

Separating plane described above was obtained using Multisurface Method-Tree (MSM-T) [K. P. Bennett, “Decision Tree Construction Via Linear Programming.” Proceedings of the 4th Midwest Artificial Intelligence and Cognitive Science Society, pp. 97-101, 1992], a classification method which uses linear programming to construct a decision tree. Relevant features were selected using an exhaustive search in the space of 1-4 features and 1-3 separating planes.

The actual linear program used to obtain the separating plane in the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: “Robust Linear Programming Discrimination of Two Linearly Inseparable Sets”, Optimization Methods and Software 1, 1992, 23-34]."

This database is also available through the UW CS ftp server:
ftp ftp.cs.wisc.edu
cd math-prog/cpo-dataset/machine-learn/WDBC/

Attribute / Domain:

1. Sample code number: id number
2. Clump Thickness: 1 - 10
3. Uniformity of Cell Size: 1 - 10
4. Uniformity of Cell Shape: 1 - 10
5. Marginal Adhesion: 1 - 10
6. Single Epithelial Cell Size: 1 - 10
7. Bare Nuclei: 1 - 10
8. Bland Chromatin: 1 - 10
9. Normal Nucleoli: 1 - 10
10. Mitoses: 1 - 10
11. Class: 2 for benign, 4 for malignant

Missing attribute values: 16

There are 16 instances in Groups 1 to 6 that contain a single missing (i.e., unavailable) attribute value, now denoted by “?”.
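A quick way to verify this (a sketch, not part of the original script): read the raw file with na.strings = "?" and count the NAs; all 16 fall in the Bare Nuclei column. The short column names match those assigned in the cleaning step below.

data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
raw <- read.csv(data_url, header = FALSE, na.strings = "?",
                col.names = c("id","CT","UCSize","UCShape","MA",
                              "SECS","BN","BC","NN","M","diagnosis"))
colSums(is.na(raw))  # expect 16 NAs, all in the BN (Bare Nuclei) column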

Load the libraries

library("ggplot2")
library("corrgram")
library("car")
## Loading required package: carData
library("lattice")
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
## 
##     panel.fill
library("ROCR")
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library("plotly")
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library("tree")
library(devtools)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.2
## -- Attaching packages -------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  1.4.2     v purrr   0.2.5
## v tidyr   0.8.1     v dplyr   0.7.6
## v readr   1.1.1     v stringr 1.3.1
## v tibble  1.4.2     v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
library(downloader)
## 
## Attaching package: 'downloader'
## The following object is masked from 'package:devtools':
## 
##     source_url
library(BiocGenerics)
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min
library(corrplot)
## corrplot 0.84 loaded
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:gplots':
## 
##     textplot
## The following object is masked from 'package:graphics':
## 
##     legend
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa

Read the Data

# url Breast Cancer data from Wisconsin

url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"


# Read the data. Note read.csv() assumes a header row by default, so the
# first record is consumed as column names (see the X1000025 ... headings below)
data <- read.csv(url)
head(data)
##   X1000025 X5 X1 X1.1 X1.2 X2 X1.3 X3 X1.4 X1.5 X2.1
## 1  1002945  5  4    4    5  7   10  3    2    1    2
## 2  1015425  3  1    1    1  2    2  3    1    1    2
## 3  1016277  6  8    8    1  3    4  3    7    1    2
## 4  1017023  4  1    1    3  2    1  3    1    1    2
## 5  1017122  8 10   10    8  7   10  9    7    1    4
## 6  1018099  1  1    1    1  2   10  3    1    1    2

Clean the Data

# Re-read with header = FALSE and assign descriptive column names
data <- read.csv(file = url, header = FALSE,
                col.names = c("id","CT", "UCSize", "UCShape", "MA", "SECS", "BN", "BC", "NN","M", "diagnosis") )

data$outcome[data$diagnosis==4] = 1
data$outcome[data$diagnosis==2] = 0
data$outcome = as.integer(data$outcome)
head(data)
##        id CT UCSize UCShape MA SECS BN BC NN M diagnosis outcome
## 1 1000025  5      1       1  1    2  1  3  1 1         2       0
## 2 1002945  5      4       4  5    7 10  3  2 1         2       0
## 3 1015425  3      1       1  1    2  2  3  1 1         2       0
## 4 1016277  6      8       8  1    3  4  3  7 1         2       0
## 5 1017023  4      1       1  3    2  1  3  1 1         2       0
## 6 1017122  8     10      10  8    7 10  9  7 1         4       1

Adjust the data set for analysis: drop the id column and the BN (Bare Nuclei) column, which holds the 16 “?” missing values.

library(dplyr)
library(tidyverse)
data2 <- data %>% select(-id, -BN)

data2$outcome[data2$diagnosis==4] = 1
data2$outcome[data2$diagnosis==2] = 0
data2$outcome = as.integer(data2$outcome)
head(data2)
##   CT UCSize UCShape MA SECS BC NN M diagnosis outcome
## 1  5      1       1  1    2  3  1 1         2       0
## 2  5      4       4  5    7  3  2 1         2       0
## 3  3      1       1  1    2  3  1 1         2       0
## 4  6      8       8  1    3  3  7 1         2       0
## 5  4      1       1  3    2  3  1 1         2       0
## 6  8     10      10  8    7  9  7 1         4       1
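An alternative to dropping BN outright, and closer to the wording of the heading above, is to keep the column and discard only the 16 incomplete rows. A minimal sketch, assuming BN was read as a factor (the pre-R-4.0 default for text columns); the rest of this analysis continues with data2 as built above:

data_alt <- data
data_alt$BN <- suppressWarnings(as.integer(as.character(data_alt$BN)))  # "?" coerces to NA
data_alt <- data_alt[!is.na(data_alt$BN), ]  # drop the 16 incomplete rows
nrow(data_alt)  # 683 of the 699 rows remain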

Check Data Types

glimpse(data2)
## Observations: 699
## Variables: 10
## $ CT        <int> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, 1, 8, 7, 4, 4...
## $ UCSize    <int> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3, 1, 7, 4, 1, ...
## $ UCShape   <int> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3, 1, 5, 6, 1, ...
## $ MA        <int> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, 1, 10, 4, 1, ...
## $ SECS      <int> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, 2, 7, 6, 2, 2...
## $ BC        <int> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, 3, 5, 4, 2, 3...
## $ NN        <int> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, 1, 5, 3, 1, 1...
## $ M         <int> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 4, 1, 1, 1...
## $ diagnosis <int> 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2...
## $ outcome   <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0...

Create a correlation chart for the features

chart.Correlation(data2[, c(1:7)], histogram=TRUE, col="grey10", pch=1, main="Cancer Feature Correlations")

Create a second correlation chart with pairs.panels

pairs.panels(data2[,c(1:6)], method="pearson",
             hist.col = "#1fbbfa", density=TRUE, ellipses=TRUE, show.points = TRUE,
             pch=1, lm=TRUE, cex.cor=1, smoother=F, stars = T, main="Feature Correlations")

Split the dataset into a training sample and a testing sample, using a 50/50 split.

sample_size = floor(0.5 * nrow(data2))

# set the seed to make the partition reproducible
set.seed(1729)
train_set = sample(seq_len(nrow(data2)), size = sample_size)

training = data2[train_set, ]

testing = data2[-train_set, ]

head(training)
##     CT UCSize UCShape MA SECS BC NN M diagnosis outcome
## 410  3      1       2  1    2  2  1 1         2       0
## 306 10      8       4  4    4  3 10 4         4       1
## 400  1      2       3  1    2  1  1 1         2       0
## 246  5      1       1  2    2  3  1 1         2       0
## 599  3      1       1  1    2  2  1 1         2       0
## 286  8     10      10 10    8 10  7 3         4       1
head(testing)
##    CT UCSize UCShape MA SECS BC NN M diagnosis outcome
## 2   5      4       4  5    7  3  2 1         2       0
## 4   6      8       8  1    3  3  7 1         2       0
## 6   8     10      10  8    7  9  7 1         4       1
## 7   1      1       1  1    2  3  1 1         2       0
## 8   2      1       2  1    2  3  1 1         2       0
## 13  5      3       3  3    2  4  4 1         4       1

Look at the Data Set, Training Data and Testing Data

ggplot(data2, aes(x = outcome)) +
  geom_bar(fill = "blue") +
  ggtitle("Distribution of diagnosis for the entire dataset")

ggplot(training, aes(x = outcome)) +
  geom_bar(fill = "blue") +
  ggtitle("Distribution of diagnosis for the training dataset")

ggplot(testing, aes(x = outcome)) +
  geom_bar(fill = "blue") +
  ggtitle("Distribution of diagnosis for the testing dataset")

# Corrgram of the entire dataset
corrgram(data2, order=NULL, lower.panel=panel.shade, upper.panel=NULL, text.panel=panel.txt,
         main="Corrgram of the data")

# Corrgram of the training dataset
corrgram(training, order=NULL, lower.panel=panel.shade, upper.panel=NULL, text.panel=panel.txt,
         main="Corrgram of the training data")

# Corrgram of the testing dataset
corrgram(testing, order=NULL, lower.panel=panel.shade, upper.panel=NULL, text.panel=panel.txt,
         main="Corrgram of the testing data")

# Calculating the correlation coefficients and p-values

panel.cor <- function(x, y, digits = 2, cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  # correlation coefficient
  r <- cor(x, y)
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste("r= ", txt, sep = "")
  text(0.5, 0.6, txt)
  
  # p-value calculation
  p <- cor.test(x, y)$p.value
  txt2 <- format(c(p, 0.123456789), digits = digits)[1]
  txt2 <- paste("p= ", txt2, sep = "")
  if(p<0.01) txt2 <- paste("p= ", "<0.01", sep = "")
  text(0.2, 0.1, txt2)
}

Graph the linear correlation coefficients and p-values

pairs(training, upper.panel = panel.cor)

Fit the model using glm (Generalized Linear Model)

# Model Fitting
# Start with the full model (significance threshold alpha = 0.05)
model_algorithm = glm(outcome ~ CT +
                        UCSize +
                        UCShape +
                        MA +
                        SECS +
                        BC +
                        NN +
                        M,
                      family = binomial(link = 'logit'), control = list(maxit = 50), data = training)

print(summary(model_algorithm))
## 
## Call:
## glm(formula = outcome ~ CT + UCSize + UCShape + MA + SECS + BC + 
##     NN + M, family = binomial(link = "logit"), data = training, 
##     control = list(maxit = 50))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.85231  -0.11326  -0.05213   0.00449   2.99989  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -12.0443     2.0547  -5.862 4.58e-09 ***
## CT            0.7113     0.2176   3.268  0.00108 ** 
## UCSize        0.3156     0.4604   0.686  0.49293    
## UCShape       0.5103     0.4737   1.077  0.28134    
## MA            0.3715     0.1783   2.083  0.03721 *  
## SECS          0.1318     0.2286   0.577  0.56416    
## BC            0.7649     0.2959   2.585  0.00974 ** 
## NN            0.2009     0.2191   0.917  0.35896    
## M             0.9075     0.5499   1.650  0.09891 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 454.165  on 348  degrees of freedom
## Residual deviance:  50.827  on 340  degrees of freedom
## AIC: 68.827
## 
## Number of Fisher Scoring iterations: 10
print(anova(model_algorithm, test="Chisq"))
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: outcome
## 
## Terms added sequentially (first to last)
## 
## 
##         Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                      348     454.17              
## CT       1  225.630       347     228.54 < 2.2e-16 ***
## UCSize   1  144.642       346      83.89 < 2.2e-16 ***
## UCShape  1   11.469       345      72.42 0.0007078 ***
## MA       1    6.520       344      65.90 0.0106666 *  
## SECS     1    2.019       343      63.89 0.1553909    
## BC       1   10.654       342      53.23 0.0010984 ** 
## NN       1    1.012       341      52.22 0.3144747    
## M        1    1.393       340      50.83 0.2378188    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
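Note that UCSize and UCShape are individually non-significant in the full model even though the anova table shows each explaining substantial deviance when added; that pattern is typical of collinear predictors. Since car is already loaded, variance inflation factors offer a quick check (a sketch; exact values depend on the training split):

print(vif(model_algorithm))  # VIFs well above ~5 for UCSize/UCShape would confirm collinearity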

Using Uniform Cell Size and Uniform Cell Shape as predictors of diagnosis

# Settle on Uniform Cell Size and Uniform Cell Shape
model_algorithm_final = glm(outcome ~ UCSize + UCShape,
                            family = binomial(link = 'logit'), control = list(maxit = 50), data = training)

print(summary(model_algorithm_final))
## 
## Call:
## glm(formula = outcome ~ UCSize + UCShape, family = binomial(link = "logit"), 
##     data = training, control = list(maxit = 50))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.72698  -0.18243  -0.18243   0.00852   2.86504  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -6.0123     0.6645  -9.047  < 2e-16 ***
## UCSize        0.9213     0.2853   3.229 0.001243 ** 
## UCShape       1.0035     0.2631   3.814 0.000137 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 454.165  on 348  degrees of freedom
## Residual deviance:  98.788  on 346  degrees of freedom
## AIC: 104.79
## 
## Number of Fisher Scoring iterations: 7
# Adding MA further improves the AIC (98.25 vs 104.79; see summary below)
model_algorithm_final = glm(outcome ~ UCSize + UCShape + MA,
                            family = binomial(link = 'logit'), control = list(maxit = 50), data = training)

print(summary(model_algorithm_final))
## 
## Call:
## glm(formula = outcome ~ UCSize + UCShape + MA, family = binomial(link = "logit"), 
##     data = training, control = list(maxit = 50))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.72927  -0.16094  -0.16094   0.00576   2.95060  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -6.5049     0.7524  -8.645  < 2e-16 ***
## UCSize        0.7823     0.3031   2.581 0.009864 ** 
## UCShape       0.9761     0.2760   3.537 0.000405 ***
## MA            0.4064     0.1508   2.695 0.007042 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 454.17  on 348  degrees of freedom
## Residual deviance:  90.25  on 345  degrees of freedom
## AIC: 98.25
## 
## Number of Fisher Scoring iterations: 8

Apply the algorithm to the training sample

prediction_training = predict(model_algorithm_final,training, type = "response")
prediction_training = ifelse(prediction_training > 0.5, 1, 0)
error = mean(prediction_training != training$outcome)
print(paste('Model Accuracy',1-error))
## [1] "Model Accuracy 0.951289398280802"

Calculate the ROC curve and the AUC

# Get the ROC curve and the AUC
p = predict(model_algorithm_final, training, type="response")
pr = prediction(p, training$outcome)
prf = performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc = performance(pr, measure = "auc")
auc = auc@y.values[[1]]
print(paste("Model Accuracy", auc))
## [1] "Model Accuracy 0.986236559139785"
# Apply the algorithm to the testing sample
prediction_testing = predict(model_algorithm_final,testing, type = "response")
prediction_testing = ifelse(prediction_testing > 0.5, 1, 0)
error = mean(prediction_testing != testing$outcome)
print(paste('Model Accuracy',1-error))
## [1] "Model Accuracy 0.942857142857143"
# Get the ROC curve and the AUC
p = predict(model_algorithm_final, testing, type="response")
pr = prediction(p, testing$outcome)
prf = performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc = performance(pr, measure = "auc")
auc = auc@y.values[[1]]
print(paste("Model Accuracy", auc))
## [1] "Model Accuracy 0.982997689006273"
# Apply the algorithm to the entire dataset
prediction_data = predict(model_algorithm_final,data, type = "response")
prediction_data = ifelse(prediction_data > 0.5, 1, 0)
error = mean(prediction_data != data$outcome)
print(paste('Model Accuracy',1-error))
## [1] "Model Accuracy 0.947067238912732"
# Get the ROC curve and the AUC
p = predict(model_algorithm_final, data, type="response")
pr = prediction(p, data$outcome)
prf = performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc = performance(pr, measure = "auc")
auc = auc@y.values[[1]]
print(paste("Model Accuracy", auc))
## [1] "Model Accuracy 0.984245048832195"

Decision Tree

# Dropping the outcome variable, which was used for the logistic model
training$outcome = NULL
testing$outcome = NULL

training$diagnosis[training$diagnosis == 4] = 1
training$diagnosis[training$diagnosis == 2] = 0
# Note: testing$diagnosis keeps its original 2/4 coding
# Running our first tree 
model_tree = tree(diagnosis ~ UCSize + 
                    UCShape +
                    MA +
                    SECS +
                    BC  + 
                    NN  +
                    M,
                    data = training)

summary(model_tree)
## 
## Regression tree:
## tree(formula = diagnosis ~ UCSize + UCShape + MA + SECS + BC + 
##     NN + M, data = training)
## Variables actually used in tree construction:
## [1] "UCSize"  "BC"      "UCShape" "MA"     
## Number of terminal nodes:  5 
## Residual mean deviance:  0.02956 = 10.17 / 344 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.96400 -0.01860 -0.01860  0.00000  0.03604  0.98140
# Now we want to plot our results
plot(model_tree, type = "uniform")

# Add some text to the plot
text(model_tree, pretty = 0, cex=0.8)

# Check the tree on the training data
# Distributional prediction

model_tree_pred_train = predict(model_tree, training) # gives the probability for each class

model_tree_pred_test = predict(model_tree, testing) # gives the probability for each class
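# Because this is a regression tree, predict() returns a numeric score rather
# than a class label. A sketch (not in the original script): threshold at 0.5,
# mirroring the logistic model, for a comparable test accuracy. Recall that
# testing$diagnosis is still coded 2/4 at this point.
tree_class_test = ifelse(model_tree_pred_test > 0.5, 1, 0)
observed_test = ifelse(testing$diagnosis == 4, 1, 0)
print(paste("Tree accuracy", mean(tree_class_test == observed_test)))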
# Try to prune the tree to avoid overfitting
cv.tree(model_tree)
## $size
## [1] 5 4 3 2 1
## 
## $dev
## [1] 19.77385 21.55135 22.03910 21.66386 80.32060
## 
## $k
## [1]      -Inf  2.844444  3.267954  4.125988 59.533981
## 
## $method
## [1] "deviance"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
plot(cv.tree(model_tree)) # Seems like a tree of size 5 might be best

# Pruned model (best = 5 keeps all five terminal nodes, so the tree is unchanged)
model_tree_prune = prune.tree(model_tree, best = 5)
summary(model_tree_prune)
## 
## Regression tree:
## tree(formula = diagnosis ~ UCSize + UCShape + MA + SECS + BC + 
##     NN + M, data = training)
## Variables actually used in tree construction:
## [1] "UCSize"  "BC"      "UCShape" "MA"     
## Number of terminal nodes:  5 
## Residual mean deviance:  0.02956 = 10.17 / 344 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.96400 -0.01860 -0.01860  0.00000  0.03604  0.98140
# Now we want to plot our results
plot(model_tree_prune, type = "uniform")

# Add some text to the plot
text(model_tree_prune, pretty = 0, cex=0.8)

head(data2)
##   CT UCSize UCShape MA SECS BC NN M diagnosis outcome
## 1  5      1       1  1    2  3  1 1         2       0
## 2  5      4       4  5    7  3  2 1         2       0
## 3  3      1       1  1    2  3  1 1         2       0
## 4  6      8       8  1    3  3  7 1         2       0
## 5  4      1       1  3    2  3  1 1         2       0
## 6  8     10      10  8    7  9  7 1         4       1
all_pca <- prcomp(data2[,1:8], scale. = TRUE)  # scale. standardizes the features ("cor" is princomp's argument, not prcomp's)
summary(all_pca)
## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.2975 0.86438 0.73405 0.63905 0.61290 0.54618
## Proportion of Variance 0.6598 0.09339 0.06735 0.05105 0.04696 0.03729
## Cumulative Proportion  0.6598 0.75322 0.82057 0.87162 0.91858 0.95586
##                            PC7    PC8
## Standard deviation     0.51251 0.3007
## Proportion of Variance 0.03283 0.0113
## Cumulative Proportion  0.98870 1.0000
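The Cumulative Proportion row shows how quickly the variance accumulates; the same figures follow directly from the component standard deviations, e.g. to find how many components cover 90% of the variance (a sketch):

pve <- all_pca$sdev^2 / sum(all_pca$sdev^2)  # proportion of variance explained
which(cumsum(pve) >= 0.90)[1]  # 5, matching the table above (PC5 reaches 0.91858)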
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.5.2
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
fviz_eig(all_pca, addlabels=TRUE, ylim=c(0,70),  # ylim raised above PC1's 66% so the first bar is not clipped
         geom = c("bar", "line"), barfill = "pink",
         barcolor="blue", linecolor = "red", ncp=10)+
         labs(title = "Cancer All Variances - PCA",
          x = "Principal Components", y = "% of variance")

all_var <- get_pca_var(all_pca)
all_var
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"
corrplot(all_var$cos2, is.corr=FALSE)

corrplot(all_var$contrib, is.corr=FALSE)    

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
p1 <- fviz_contrib(all_pca, choice="var", axes=1, fill="pink", color="grey", top=10)
p2 <- fviz_contrib(all_pca, choice="var", axes=2, fill="skyblue", color="grey", top=10)
grid.arrange(p1,p2,ncol=2)

set.seed(218)
res.all <- kmeans(all_var$coord, centers = 6, nstart = 25)
grp <- as.factor(res.all$cluster)

fviz_pca_var(all_pca, col.var = grp, 
             palette = "jco",
             legend.title = "Cluster")
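To list the cluster assignments behind the colouring (a sketch):

split(rownames(all_var$coord), grp)  # feature names grouped by k-means cluster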

head(data2)
##   CT UCSize UCShape MA SECS BC NN M diagnosis outcome
## 1  5      1       1  1    2  3  1 1         2       0
## 2  5      4       4  5    7  3  2 1         2       0
## 3  3      1       1  1    2  3  1 1         2       0
## 4  6      8       8  1    3  3  7 1         2       0
## 5  4      1       1  3    2  3  1 1         2       0
## 6  8     10      10  8    7  9  7 1         4       1
# Relabel outcome as B/M for the biplot legend
data2$outcome[data2$diagnosis == 4] = 'M'
data2$outcome[data2$diagnosis == 2] = 'B'

fviz_pca_biplot(all_pca, col.ind = data2$outcome, col="black",
                palette = "jco", geom = "point", repel=TRUE,
                legend.title="Diagnosis", addEllipses = TRUE)