Breast Cancer Wisconsin (Diagnostic) Data Set
Data folder: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/
Data set information (from https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)):
“Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. A few of the images can be found at [Web Link]
Separating plane described above was obtained using Multisurface Method-Tree (MSM-T) [K. P. Bennett, “Decision Tree Construction Via Linear Programming.” Proceedings of the 4th Midwest Artificial Intelligence and Cognitive Science Society, pp. 97-101, 1992], a classification method which uses linear programming to construct a decision tree. Relevant features were selected using an exhaustive search in the space of 1-4 features and 1-3 separating planes.
The actual linear program used to obtain the separating plane in the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: “Robust Linear Programming Discrimination of Two Linearly Inseparable Sets”, Optimization Methods and Software 1, 1992, 23-34]."
This database is also available through the UW CS ftp server:
ftp ftp.cs.wisc.edu
cd math-prog/cpo-dataset/machine-learn/WDBC/

Attribute                        Domain
1. Sample code number            id number
2. Clump Thickness               1 - 10
3. Uniformity of Cell Size       1 - 10
4. Uniformity of Cell Shape      1 - 10
5. Marginal Adhesion             1 - 10
6. Single Epithelial Cell Size   1 - 10
7. Bare Nuclei                   1 - 10
8. Bland Chromatin               1 - 10
9. Normal Nucleoli               1 - 10
10. Mitoses                      1 - 10
11. Class                        2 for benign, 4 for malignant
There are 16 instances in Groups 1 to 6 that contain a single missing (i.e., unavailable) attribute value, now denoted by “?”.
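Since the missing values are coded as "?", one way to see them (a minimal sketch, not part of the original analysis) is to map them to NA at read time and count them:

# Hedged sketch: na.strings = "?" turns the missing Bare Nuclei entries into NA
# so they can be counted (or imputed) instead of coercing the column to a factor.
raw <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
                header = FALSE, na.strings = "?")
colSums(is.na(raw))  # the 16 NAs should all fall in column V7 (Bare Nuclei)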
library("ggplot2")
library("corrgram")
library("car")
## Loading required package: carData
library("lattice")
##
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
##
## panel.fill
library("ROCR")
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library("plotly")
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library("tree")
library(devtools)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.2
## -- Attaching packages -------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
library(downloader)
##
## Attaching package: 'downloader'
## The following object is masked from 'package:devtools':
##
## source_url
library(BiocGenerics)
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
library(corrplot)
## corrplot 0.84 loaded
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:gplots':
##
## textplot
## The following object is masked from 'package:graphics':
##
## legend
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
# URL for the Breast Cancer Wisconsin data
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
# First read: read.csv() defaults to header = TRUE, so the first record is
# mistakenly consumed as column names (note the X1000025 heading below)
data <- read.csv(url)
head(data)
## X1000025 X5 X1 X1.1 X1.2 X2 X1.3 X3 X1.4 X1.5 X2.1
## 1 1002945 5 4 4 5 7 10 3 2 1 2
## 2 1015425 3 1 1 1 2 2 3 1 1 2
## 3 1016277 6 8 8 1 3 4 3 7 1 2
## 4 1017023 4 1 1 3 2 1 3 1 1 2
## 5 1017122 8 10 10 8 7 10 9 7 1 4
## 6 1018099 1 1 1 1 2 10 3 1 1 2
# Re-read with header = FALSE and assign meaningful column names
data <- read.csv(file = url, header = FALSE,
col.names = c("id","CT", "UCSize", "UCShape", "MA", "SECS", "BN", "BC", "NN","M", "diagnosis") )
# Recode the class labels (2 = benign, 4 = malignant) into a 0/1 outcome
data$outcome[data$diagnosis==4] = 1
data$outcome[data$diagnosis==2] = 0
data$outcome = as.integer(data$outcome)
head(data)
## id CT UCSize UCShape MA SECS BN BC NN M diagnosis outcome
## 1 1000025 5 1 1 1 2 1 3 1 1 2 0
## 2 1002945 5 4 4 5 7 10 3 2 1 2 0
## 3 1015425 3 1 1 1 2 2 3 1 1 2 0
## 4 1016277 6 8 8 1 3 4 3 7 1 2 0
## 5 1017023 4 1 1 3 2 1 3 1 1 2 0
## 6 1017122 8 10 10 8 7 10 9 7 1 4 1
library(dplyr)
library(tidyverse)
# Drop the id column and BN (Bare Nuclei), the column that carries the 16 "?" missing values
data2 <- data %>% select(-id, -BN)
data2$outcome[data2$diagnosis==4] = 1
data2$outcome[data2$diagnosis==2] = 0
data2$outcome = as.integer(data2$outcome)
head(data2)
## CT UCSize UCShape MA SECS BC NN M diagnosis outcome
## 1 5 1 1 1 2 3 1 1 2 0
## 2 5 4 4 5 7 3 2 1 2 0
## 3 3 1 1 1 2 3 1 1 2 0
## 4 6 8 8 1 3 3 7 1 2 0
## 5 4 1 1 3 2 3 1 1 2 0
## 6 8 10 10 8 7 9 7 1 4 1
glimpse(data2)
## Observations: 699
## Variables: 10
## $ CT <int> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, 1, 8, 7, 4, 4...
## $ UCSize <int> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3, 1, 7, 4, 1, ...
## $ UCShape <int> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3, 1, 5, 6, 1, ...
## $ MA <int> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, 1, 10, 4, 1, ...
## $ SECS <int> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, 2, 7, 6, 2, 2...
## $ BC <int> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, 3, 5, 4, 2, 3...
## $ NN <int> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, 1, 5, 3, 1, 1...
## $ M <int> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 4, 1, 1, 1...
## $ diagnosis <int> 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2...
## $ outcome <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0...
chart.Correlation(data2[, c(1:7)], histogram=TRUE, col="grey10", pch=1, main="Feature Correlations")
pairs.panels(data2[,c(1:6)], method="pearson",
hist.col = "#1fbbfa", density=TRUE, ellipses=TRUE, show.points = TRUE,
pch=1, lm=TRUE, cex.cor=1, smoother=F, stars = T, main="Scatterplot Matrix")
Split the data into training and testing sets using a 50/50 split.
sample_size = floor(0.5 * nrow(data2))
# set the seed to make the partition reproducible
set.seed(1729)
train_set = sample(seq_len(nrow(data2)), size = sample_size)
training = data2[train_set, ]
testing = data2[-train_set, ]
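A quick sanity check (not in the original code) that the random split preserved the class balance:

# Compare the benign/malignant proportions in the two halves; with a random
# 50/50 split they should be similar to the full data set.
prop.table(table(training$outcome))
prop.table(table(testing$outcome))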
head(training)
## CT UCSize UCShape MA SECS BC NN M diagnosis outcome
## 410 3 1 2 1 2 2 1 1 2 0
## 306 10 8 4 4 4 3 10 4 4 1
## 400 1 2 3 1 2 1 1 1 2 0
## 246 5 1 1 2 2 3 1 1 2 0
## 599 3 1 1 1 2 2 1 1 2 0
## 286 8 10 10 10 8 10 7 3 4 1
head(testing)
## CT UCSize UCShape MA SECS BC NN M diagnosis outcome
## 2 5 4 4 5 7 3 2 1 2 0
## 4 6 8 8 1 3 3 7 1 2 0
## 6 8 10 10 8 7 9 7 1 4 1
## 7 1 1 1 1 2 3 1 1 2 0
## 8 2 1 2 1 2 3 1 1 2 0
## 13 5 3 3 3 2 4 4 1 4 1
ggplot(data2, aes(x = outcome)) +
geom_bar(fill = "blue") +
ggtitle("Distribution of diagnosis for the entire dataset")
ggplot(training, aes(x = outcome)) +
geom_bar(fill = "blue") +
ggtitle("Distribution of diagnosis for the training dataset")
ggplot(testing, aes(x = outcome)) +
geom_bar(fill = "blue") +
ggtitle("Distribution of diagnosis for the testing dataset")
# Corrgram of the entire dataset
corrgram(data2, order=NULL, lower.panel=panel.shade, upper.panel=NULL, text.panel=panel.txt,
main="Corrgram of the data")
# Corrgram of the training dataset
corrgram(training, order=NULL, lower.panel=panel.shade, upper.panel=NULL, text.panel=panel.txt,
main="Corrgram of the training data")
# Corrgram of the testing dataset
corrgram(testing, order=NULL, lower.panel=panel.shade, upper.panel=NULL, text.panel=panel.txt,
main="Corrgram of the testing data")
# Panel function to compute and display correlation coefficients and p-values
panel.cor <- function(x, y, digits = 2, cex.cor, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
# correlation coefficient
r <- cor(x, y)
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste("r= ", txt, sep = "")
text(0.5, 0.6, txt)
# p-value calculation
p <- cor.test(x, y)$p.value
txt2 <- format(c(p, 0.123456789), digits = digits)[1]
txt2 <- paste("p= ", txt2, sep = "")
if(p<0.01) txt2 <- paste("p= ", "<0.01", sep = "")
text(0.2, 0.1, txt2)
}
pairs(training, upper.panel = panel.cor)
# Model fitting
# Start with the full logistic model (alpha = 0.05)
model_algorithm = glm(outcome ~ CT +
UCSize +
UCShape +
MA +
SECS +
BC +
NN +
M ,
family=binomial(link='logit'), control = list(maxit = 50),data=training)
print(summary(model_algorithm))
##
## Call:
## glm(formula = outcome ~ CT + UCSize + UCShape + MA + SECS + BC +
## NN + M, family = binomial(link = "logit"), data = training,
## control = list(maxit = 50))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.85231 -0.11326 -0.05213 0.00449 2.99989
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.0443 2.0547 -5.862 4.58e-09 ***
## CT 0.7113 0.2176 3.268 0.00108 **
## UCSize 0.3156 0.4604 0.686 0.49293
## UCShape 0.5103 0.4737 1.077 0.28134
## MA 0.3715 0.1783 2.083 0.03721 *
## SECS 0.1318 0.2286 0.577 0.56416
## BC 0.7649 0.2959 2.585 0.00974 **
## NN 0.2009 0.2191 0.917 0.35896
## M 0.9075 0.5499 1.650 0.09891 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 454.165 on 348 degrees of freedom
## Residual deviance: 50.827 on 340 degrees of freedom
## AIC: 68.827
##
## Number of Fisher Scoring iterations: 10
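Because the model uses a logit link, exponentiating the coefficients gives odds ratios; a small sketch (not in the original output):

# Each exponentiated coefficient is the multiplicative change in the odds of
# malignancy for a one-unit increase in that feature, holding the rest fixed.
exp(coef(model_algorithm))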
print(anova(model_algorithm, test="Chisq"))
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: outcome
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 348 454.17
## CT 1 225.630 347 228.54 < 2.2e-16 ***
## UCSize 1 144.642 346 83.89 < 2.2e-16 ***
## UCShape 1 11.469 345 72.42 0.0007078 ***
## MA 1 6.520 344 65.90 0.0106666 *
## SECS 1 2.019 343 63.89 0.1553909
## BC 1 10.654 342 53.23 0.0010984 **
## NN 1 1.012 341 52.22 0.3144747
## M 1 1.393 340 50.83 0.2378188
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Settle on Uniformity of Cell Size and Uniformity of Cell Shape
model_algorithm_final = glm(outcome ~ UCSize + UCShape ,
family=binomial(link='logit'), control = list(maxit = 50),data=training)
print(summary(model_algorithm_final))
##
## Call:
## glm(formula = outcome ~ UCSize + UCShape, family = binomial(link = "logit"),
## data = training, control = list(maxit = 50))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.72698 -0.18243 -0.18243 0.00852 2.86504
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.0123 0.6645 -9.047 < 2e-16 ***
## UCSize 0.9213 0.2853 3.229 0.001243 **
## UCShape 1.0035 0.2631 3.814 0.000137 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 454.165 on 348 degrees of freedom
## Residual deviance: 98.788 on 346 degrees of freedom
## AIC: 104.79
##
## Number of Fisher Scoring iterations: 7
# Add Marginal Adhesion (MA) to the reduced model
model_algorithm_final = glm(outcome ~ UCSize + UCShape + MA ,
family=binomial(link='logit'), control = list(maxit = 50),data=training)
print(summary(model_algorithm_final))
##
## Call:
## glm(formula = outcome ~ UCSize + UCShape + MA, family = binomial(link = "logit"),
## data = training, control = list(maxit = 50))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.72927 -0.16094 -0.16094 0.00576 2.95060
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.5049 0.7524 -8.645 < 2e-16 ***
## UCSize 0.7823 0.3031 2.581 0.009864 **
## UCShape 0.9761 0.2760 3.537 0.000405 ***
## MA 0.4064 0.1508 2.695 0.007042 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 454.17 on 348 degrees of freedom
## Residual deviance: 90.25 on 345 degrees of freedom
## AIC: 98.25
##
## Number of Fisher Scoring iterations: 8
prediction_training = predict(model_algorithm_final,training, type = "response")
prediction_training = ifelse(prediction_training > 0.5, 1, 0)
error = mean(prediction_training != training$outcome)
print(paste('Model Accuracy',1-error))
## [1] "Model Accuracy 0.951289398280802"
# Get the ROC curve and the AUC
p = predict(model_algorithm_final, training, type="response")
pr = prediction(p, training$outcome)
prf = performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc = performance(pr, measure = "auc")
auc = auc@y.values[[1]]
print(paste("Model Accuracy", auc))
## [1] "Model Accuracy 0.986236559139785"
# Apply the algorithm to the testing sample
prediction_testing = predict(model_algorithm_final,testing, type = "response")
prediction_testing = ifelse(prediction_testing > 0.5, 1, 0)
error = mean(prediction_testing != testing$outcome)
print(paste('Model Accuracy',1-error))
## [1] "Model Accuracy 0.942857142857143"
# Get the ROC curve and the AUC
p = predict(model_algorithm_final, testing, type="response")
pr = prediction(p, testing$outcome)
prf = performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc = performance(pr, measure = "auc")
auc = auc@y.values[[1]]
print(paste("Model Accuracy", auc))
## [1] "Model Accuracy 0.982997689006273"
# Apply the algorithm to the entire dataset (this includes the training rows, so the estimate is optimistic)
prediction_data = predict(model_algorithm_final,data, type = "response")
prediction_data = ifelse(prediction_data > 0.5, 1, 0)
error = mean(prediction_data != data$outcome)
print(paste('Model Accuracy',1-error))
## [1] "Model Accuracy 0.947067238912732"
# Get the ROC curve and the AUC
p = predict(model_algorithm_final, data, type="response")
pr = prediction(p, data$outcome)
prf = performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc = performance(pr, measure = "auc")
auc = auc@y.values[[1]]
print(paste("Model Accuracy", auc))
## [1] "Model Accuracy 0.984245048832195"
# Dropping the outcome variable, which was only needed for the logistic model
training$outcome = NULL
testing$outcome = NULL
training$diagnosis[training$diagnosis == 4] = 1
training$diagnosis[training$diagnosis == 2] = 0
# Running our first tree (diagnosis is still numeric, so tree() fits a regression tree)
model_tree = tree(diagnosis ~ UCSize +
UCShape +
MA +
SECS +
BC +
NN +
M,
data = training)
summary(model_tree)
##
## Regression tree:
## tree(formula = diagnosis ~ UCSize + UCShape + MA + SECS + BC +
## NN + M, data = training)
## Variables actually used in tree construction:
## [1] "UCSize" "BC" "UCShape" "MA"
## Number of terminal nodes: 5
## Residual mean deviance: 0.02956 = 10.17 / 344
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.96400 -0.01860 -0.01860 0.00000 0.03604 0.98140
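Note that summary() reports a regression tree: diagnosis is numeric 0/1, so tree() minimizes squared error. A hedged alternative sketch for a classification tree:

# Not part of the original analysis: converting diagnosis to a factor makes
# tree() fit a classification tree instead of a regression tree.
training_cl <- training
training_cl$diagnosis <- factor(training_cl$diagnosis, labels = c("benign", "malignant"))
model_tree_cl <- tree(diagnosis ~ UCSize + UCShape + MA + SECS + BC + NN + M,
                      data = training_cl)
summary(model_tree_cl)  # now reported as a classification tree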
# Now we want to plot our results
plot(model_tree, type = "uniform")
# Add some text to the plot
text(model_tree, pretty = 0, cex=0.8)
# Check the tree on the training data
# Distributional prediction
model_tree_pred_train = predict(model_tree, training) # regression-tree fitted values, approximating the probability of malignancy
model_tree_pred_test = predict(model_tree, testing) # same for the testing set
# Try to prune the tree to avoid over fitting
cv.tree(model_tree)
## $size
## [1] 5 4 3 2 1
##
## $dev
## [1] 19.77385 21.55135 22.03910 21.66386 80.32060
##
## $k
## [1] -Inf 2.844444 3.267954 4.125988 59.533981
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.tree(model_tree)) # Seems like a tree of size 5 might be best
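The best size can also be read off programmatically (a small sketch; cv.tree() re-randomizes its folds, so set a seed for a reproducible pick):

# Pick the tree size with the lowest cross-validated deviance.
set.seed(1729)  # assumption: any fixed seed works; cv.tree() folds are random
cv_res <- cv.tree(model_tree)
best_size <- cv_res$size[which.min(cv_res$dev)]
best_size  # 5 in the run shown above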
# Pruned model
model_tree_prune = prune.tree(model_tree, best = 5)
summary(model_tree_prune)
##
## Regression tree:
## tree(formula = diagnosis ~ UCSize + UCShape + MA + SECS + BC +
## NN + M, data = training)
## Variables actually used in tree construction:
## [1] "UCSize" "BC" "UCShape" "MA"
## Number of terminal nodes: 5
## Residual mean deviance: 0.02956 = 10.17 / 344
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.96400 -0.01860 -0.01860 0.00000 0.03604 0.98140
# Now we want to plot our results
plot(model_tree_prune, type = "uniform")
# Add some text to the plot
text(model_tree_prune, pretty = 0, cex=0.8)
head(data2)
## CT UCSize UCShape MA SECS BC NN M diagnosis outcome
## 1 5 1 1 1 2 3 1 1 2 0
## 2 5 4 4 5 7 3 2 1 2 0
## 3 3 1 1 1 2 3 1 1 2 0
## 4 6 8 8 1 3 3 7 1 2 0
## 5 4 1 1 3 2 3 1 1 2 0
## 6 8 10 10 8 7 9 7 1 4 1
# cor = TRUE is a princomp() argument; prcomp() ignores it, so it is dropped here
all_pca <- prcomp(data2[,1:8], scale = TRUE)
summary(all_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.2975 0.86438 0.73405 0.63905 0.61290 0.54618
## Proportion of Variance 0.6598 0.09339 0.06735 0.05105 0.04696 0.03729
## Cumulative Proportion 0.6598 0.75322 0.82057 0.87162 0.91858 0.95586
## PC7 PC8
## Standard deviation 0.51251 0.3007
## Proportion of Variance 0.03283 0.0113
## Cumulative Proportion 0.98870 1.0000
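As a sanity check (not in the original output), the variance proportions in the summary are just the squared component standard deviations normalized to sum to one:

# Reproduce the 'Proportion of Variance' row from the PCA standard deviations.
round(all_pca$sdev^2 / sum(all_pca$sdev^2), 4)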
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.5.2
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
fviz_eig(all_pca, addlabels=TRUE, ylim=c(0,60), geom = c("bar", "line"), barfill = "pink",
barcolor="blue",linecolor = "red", ncp=10)+
labs(title = "Cancer All Variances - PCA",
x = "Principal Components", y = "% of variances")
all_var <- get_pca_var(all_pca)
all_var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
corrplot(all_var$cos2, is.corr=FALSE)
corrplot(all_var$contrib, is.corr=FALSE)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:BiocGenerics':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
p1 <- fviz_contrib(all_pca, choice="var", axes=1, fill="pink", color="grey", top=10)
p2 <- fviz_contrib(all_pca, choice="var", axes=2, fill="skyblue", color="grey", top=10)
grid.arrange(p1,p2,ncol=2)
set.seed(218)
res.all <- kmeans(all_var$coord, centers = 6, nstart = 25)
grp <- as.factor(res.all$cluster)
fviz_pca_var(all_pca, col.var = grp,
palette = "jco",
legend.title = "Cluster")
head(data2)
## CT UCSize UCShape MA SECS BC NN M diagnosis outcome
## 1 5 1 1 1 2 3 1 1 2 0
## 2 5 4 4 5 7 3 2 1 2 0
## 3 3 1 1 1 2 3 1 1 2 0
## 4 6 8 8 1 3 3 7 1 2 0
## 5 4 1 1 3 2 3 1 1 2 0
## 6 8 10 10 8 7 9 7 1 4 1
# Relabel the outcome as B/M for the biplot legend
data2$outcome[data2$diagnosis==4] = 'M'
data2$outcome[data2$diagnosis==2] = 'B'
fviz_pca_biplot(all_pca, col.ind = data2$outcome, col="black",
palette = "jco", geom = "point", repel=TRUE,
legend.title="Diagnosis", addEllipses = TRUE)