library(readr)
library(caret)
library(tidyverse)
Intro
data(iris)
iris.new <- iris[,1:4]
set.seed(61)
cluster_col <- c("red","Blue", "Black")
km.iris <- kmeans(iris.new,centers = 3,nstart = 1)
plot(iris.new$Sepal.Length,iris.new$Sepal.Width,
xlab = "Sepal Length",
ylab = "Sepal Width",
col = km.iris$cluster)
points(km.iris$centers,col= 1:3 , pch=8,cex=2)

wss <- 0
for (i in 1:15) {
km.iris <- kmeans(iris.new, centers = i, nstart=20, iter.max = 50)
wss[i] <- km.iris$tot.withinss
}
plot(1:15, wss, type = "b",
xlab = "Numero de Clusters",
ylab = "Suma de cuadrados entre grupos")

par(mfrow = c(2, 3))
# Set seed
set.seed(1)
for(i in 1:6) {
# Run kmeans() on x with three clusters and one start
km.mtcars <- kmeans(mtcars, 3, nstar=1,iter.max = 5)
# Plot clusters
plot(mtcars$wt,mtcars$qsec, col = km.mtcars$cluster,
main = km.mtcars$tot.withinss,
xlab = "", ylab = "")
}

Cancer
cancer <- read_csv("~/Dropbox/Cursos/fiabilidad/Material para la clase/clusters/data.csv")
dim(cancer)
[1] 569 33
names(cancer)
[1] "id" "diagnosis" "radius_mean" "texture_mean" "perimeter_mean"
[6] "area_mean" "smoothness_mean" "compactness_mean" "concavity_mean" "concave points_mean"
[11] "symmetry_mean" "fractal_dimension_mean" "radius_se" "texture_se" "perimeter_se"
[16] "area_se" "smoothness_se" "compactness_se" "concavity_se" "concave points_se"
[21] "symmetry_se" "fractal_dimension_se" "radius_worst" "texture_worst" "perimeter_worst"
[26] "area_worst" "smoothness_worst" "compactness_worst" "concavity_worst" "concave points_worst"
[31] "symmetry_worst" "fractal_dimension_worst" NA
cancer.data <- cancer[,3:32]
diagnosis <- cancer[,2]
row.names(cancer.data) <- cancer[,1]
cancer.data
sum(is.na(cancer.data))
[1] 0
table(diagnosis)
diagnosis
B M
357 212
round(colMeans(cancer.data),3)
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean
14.127 19.290 91.969 654.889 0.096 0.104
concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se
0.089 0.049 0.181 0.063 0.405 1.217
perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se
2.866 40.337 0.007 0.025 0.032 0.012
symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
0.021 0.004 16.269 25.677 107.261 880.583
smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0.132 0.254 0.272 0.115 0.290 0.084
round(apply(cancer.data,2,sd),3)
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean
3.524 4.301 24.299 351.914 0.014 0.053
concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se
0.080 0.039 0.027 0.007 0.277 0.552
perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se
2.022 45.491 0.003 0.018 0.030 0.006
symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
0.008 0.003 4.833 6.146 33.603 569.357
smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0.023 0.157 0.209 0.066 0.062 0.018
cancer.data_impute <- preProcess(cancer.data,method = c("center","scale"))
cancer.data.cs <- predict(cancer.data_impute,cancer.data)
round(colMeans(cancer.data.cs),3)
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean
0 0 0 0 0 0
concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se
0 0 0 0 0 0
perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se
0 0 0 0 0 0
symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
0 0 0 0 0 0
smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 0 0 0 0 0
round(apply(cancer.data.cs,2,sd),3)
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean
1 1 1 1 1 1
concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se
1 1 1 1 1 1
perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se
1 1 1 1 1 1
symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
1 1 1 1 1 1
smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
1 1 1 1 1 1
km.out <- kmeans(cancer.data.cs,centers = 3, nstart =20)
summary(km.out)
Length Class Mode
cluster 569 -none- numeric
centers 90 -none- numeric
totss 1 -none- numeric
withinss 3 -none- numeric
tot.withinss 1 -none- numeric
betweenss 1 -none- numeric
size 3 -none- numeric
iter 1 -none- numeric
ifault 1 -none- numeric
str(km.out)
List of 9
$ cluster : Named int [1:569] 1 1 1 3 1 3 1 3 3 3 ...
..- attr(*, "names")= chr [1:569] "842302" "842517" "84300903" "84348301" ...
$ centers : num [1:3, 1:30] 1.618 -0.448 -0.172 0.626 -0.242 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:3] "1" "2" "3"
.. ..$ : chr [1:30] "radius_mean" "texture_mean" "perimeter_mean" "area_mean" ...
$ totss : num 17040
$ withinss : num [1:3] 2894 4358 2792
$ tot.withinss: num 10044
$ betweenss : num 6996
$ size : int [1:3] 110 359 100
$ iter : int 3
$ ifault : int 0
- attr(*, "class")= chr "kmeans"
wss <- 0
# For 1 to 15 cluster centers
for (i in 1:15) {
km.out <- kmeans(cancer.data.cs, centers = i, nstart=20, iter.max = 50)
# Save total within sum of squares to wss variable
wss[i] <- km.out$tot.withinss
}
plot(1:15, wss, type = "b",
xlab = "Numero de Clusters",
ylab = "Suma de cuadrados entre grupos",main = "Scree Plot para el dataset de cancer de mamas")
points(2,wss[2],col = "red",pch = 16)
text(2,wss[2],labels = round(wss[2],2),cex = 0.7,pos = 4)

km.out <- kmeans(cancer.data.cs,centers = 2, nstart =30,iter.max = 30)
summary(km.out)
Length Class Mode
cluster 569 -none- numeric
centers 60 -none- numeric
totss 1 -none- numeric
withinss 2 -none- numeric
tot.withinss 1 -none- numeric
betweenss 1 -none- numeric
size 2 -none- numeric
iter 1 -none- numeric
ifault 1 -none- numeric
cancer.data$cluster <- km.out$cluster
table(diagnosis,cancer.data$cluster)
diagnosis 1 2
B 14 343
M 175 37
cancer.data$p_diag <- ifelse(cancer.data$cluster == 1 ,"M","B")
confusionMatrix(cancer.data$p_diag,diagnosis,positive = "M")
Confusion Matrix and Statistics
Reference
Prediction B M
B 343 37
M 14 175
Accuracy : 0.9104
95% CI : (0.8838, 0.9325)
No Information Rate : 0.6274
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.804
Mcnemar's Test P-Value : 0.002066
Sensitivity : 0.8255
Specificity : 0.9608
Pos Pred Value : 0.9259
Neg Pred Value : 0.9026
Prevalence : 0.3726
Detection Rate : 0.3076
Detection Prevalence : 0.3322
Balanced Accuracy : 0.8931
'Positive' Class : M
Hierarchical clustering
cancer.data.hc <- cancer[,3:32]
cancer.data.hc <- as.data.frame(cancer.data.hc)
diagnosis <- cancer[,2]
diagnosis_num <- as.numeric(diagnosis == "M")
cancer.data.hc_impute <- preProcess(cancer.data.hc,method = c("center","scale"))
cancer.data.hc.cs <- predict(cancer.data_impute,cancer.data.hc)
cancer.data.hc.cs.dist <- dist(cancer.data.hc.cs)
cancer.hclust <- hclust(cancer.data.hc.cs.dist, method = "complete")
plot(cancer.hclust)
abline(h=19, col = "red")

cancer.hclust.cluster <- cutree(cancer.hclust, k=4)
table(cancer.hclust.cluster,diagnosis_num)
diagnosis_num
cancer.hclust.cluster 0 1
1 12 165
2 2 5
3 343 40
4 0 2
cancer.data.hc$pred <- ifelse(cancer.hclust.cluster %in% c(1,2,4), "M", "B")
confusionMatrix(cancer.data.hc$pred, diagnosis$diagnosis,positive = "M")
Confusion Matrix and Statistics
Reference
Prediction B M
B 343 40
M 14 172
Accuracy : 0.9051
95% CI : (0.878, 0.9279)
No Information Rate : 0.6274
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7918
Mcnemar's Test P-Value : 0.0006688
Sensitivity : 0.8113
Specificity : 0.9608
Pos Pred Value : 0.9247
Neg Pred Value : 0.8956
Prevalence : 0.3726
Detection Rate : 0.3023
Detection Prevalence : 0.3269
Balanced Accuracy : 0.8861
'Positive' Class : M
Principal Component Analysis
cancer.data.pr <- cancer[,-c(1,2,33)]
cancer.pr <- prcomp(cancer.data.pr,scale = TRUE)
summary(cancer.pr)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14
Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172 0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010 0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28
Standard deviation 0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731 0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
Cumulative Proportion 0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966 0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
PC29 PC30
Standard deviation 0.02736 0.01153
Proportion of Variance 0.00002 0.00000
Cumulative Proportion 1.00000 1.00000
plot(cancer.pr)

biplot(cancer.pr)

plot(cancer.pr$x[, c(1, 2)], col=diagnosis_num+1, xlab = "PC1", ylab = "PC2")

LS0tCnRpdGxlOiAiQ2x1c3RlcnMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQpsaWJyYXJ5KHJlYWRyKQpsaWJyYXJ5KGNhcmV0KQpsaWJyYXJ5KHRpZHl2ZXJzZSkKYGBgCgoKIyMga21lYW5zCgohW10oaW1nMTQ0MC5wbmcpCgoKIVtdKGhxZGVmYXVsdC5qcGcpCgoKCgohW10oa21lYW5zY2x1c3RlcmluZy5qcGcpCgoKCgoKCgoKIyMgSW50cm8KCgpgYGB7cn0KZGF0YShpcmlzKQppcmlzLm5ldyA8LSBpcmlzWywxOjRdCmBgYAoKYGBge3J9CnNldC5zZWVkKDYxKQpjbHVzdGVyX2NvbCA8LSBjKCJyZWQiLCJCbHVlIiwgIkJsYWNrIikKa20uaXJpcyA8LSBrbWVhbnMoaXJpcy5uZXcsY2VudGVycyA9IDMsbnN0YXJ0ID0gMSkKcGxvdChpcmlzLm5ldyRTZXBhbC5MZW5ndGgsaXJpcy5uZXckU2VwYWwuV2lkdGgsCiAgICAgeGxhYiA9ICJTZXBhbCBMZW5ndGgiLAogICAgIHlsYWIgPSAiU2VwYWwgV2lkdGgiLAogICAgIGNvbCA9IGttLmlyaXMkY2x1c3RlcikKcG9pbnRzKGttLmlyaXMkY2VudGVycyxjb2w9IDE6MyAsIHBjaD04LGNleD0yKQpgYGAKCgoKYGBge3J9CndzcyA8LSAwCgoKZm9yIChpIGluIDE6MTUpIHsKICBrbS5pcmlzIDwtIGttZWFucyhpcmlzLm5ldywgY2VudGVycyA9IGksIG5zdGFydD0yMCwgaXRlci5tYXggPSA1MCkKICB3c3NbaV0gPC0ga20uaXJpcyR0b3Qud2l0aGluc3MKfQoKcGxvdCgxOjE1LCB3c3MsIHR5cGUgPSAiYiIsIAogICAgIHhsYWIgPSAiTnVtZXJvIGRlIENsdXN0ZXJzIiwgCiAgICAgeWxhYiA9ICJTdW1hIGRlIGN1YWRyYWRvcyBlbnRyZSBncnVwb3MiKQpgYGAKCgoKCmBgYHtyfQpwYXIobWZyb3cgPSBjKDIsIDMpKQoKIyBTZXQgc2VlZApzZXQuc2VlZCgxKQoKZm9yKGkgaW4gMTo2KSB7CiAgIyBSdW4ga21lYW5zKCkgb24geCB3aXRoIHRocmVlIGNsdXN0ZXJzIGFuZCBvbmUgc3RhcnQKICBrbS5tdGNhcnMgPC0ga21lYW5zKG10Y2FycywgMywgbnN0YXI9MSkKICAKICAjIFBsb3QgY2x1c3RlcnMKICBwbG90KG10Y2FycyR3dCxtdGNhcnMkcXNlYywgY29sID0ga20ubXRjYXJzJGNsdXN0ZXIsIAogICAgICAgbWFpbiA9IGttLm10Y2FycyR0b3Qud2l0aGluc3MsIAogICAgICAgeGxhYiA9ICIiLCB5bGFiID0gIiIpCn0KYGBgCgoKCgoKIyMgQ2FuY2VyCmBgYHtyfQpjYW5jZXIgPC0gcmVhZF9jc3YoIn4vRHJvcGJveC9DdXJzb3MvZmlhYmlsaWRhZC9NYXRlcmlhbCBwYXJhIGxhIGNsYXNlL2NsdXN0ZXJzL2RhdGEuY3N2IikKYGBgCgpgYGB7cn0KZGltKGNhbmNlcikKYGBgCgpgYGB7cn0KbmFtZXMoY2FuY2VyKQpgYGAKCgpgYGB7cn0KY2FuY2VyLmRhdGEgPC0gY2FuY2VyWywzOjMyXQpkaWFnbm9zaXMgPC0gY2FuY2VyWywyXQpyb3cubmFtZXMoY2FuY2VyLmRhdGEpIDwtIGNhbmNlclssMV0KY2FuY2VyLmRhdGEKYGBgCgoKYGBge3J9CnN1bShpcy5uYShjYW5jZXIuZGF0YSkpCmBgYAoKYGBge3J9CnRhYmxlKGRpYWdub3NpcykKYGBgCgpgYGB7cn0Kcm91bmQoY29sTWVhbnMoY2FuY2VyLmRhdGEpLDMpCmBgYAoKYGBge3J9CnJvdW5kKGFwcGx5KGNhbmNlci5kYXRhLDIsc2QpLDMpCmBgYAoKYGBge3J9CmNhbmNlci5kYXRhX2ltcHV0ZSA8LSBwcmVQcm9jZXNzKGNhbmNlci5kYXRhLG1ldGhvZCA9IGMoImNlbnRlciIsInNjYWxlIikpCmNhbmNlci5kYXRhLmNzIDwtIHByZWRpY3QoY2FuY2VyLmRhdGFfaW1wdXRlLGNhbmNlci5kYXRhKQpgYGAKCgpgYGB7cn0Kcm91bmQoY29sTWVhbnMoY2FuY2VyLmRhdGEuY3MpLDMpCmBgYAoKYGBge3J9CnJvdW5kKGFwcGx5KGNhbmNlci5kYXRhLmNzLDIsc2QpLDMpCmBgYAoKCgpgYGB7cn0Ka20ub3V0IDwtIGttZWFucyhjYW5jZXIuZGF0YS5jcyxjZW50ZXJzID0gMywgbnN0YXJ0ID0yMCkKc3VtbWFyeShrbS5vdXQpCmBgYAoKCmBgYHtyfQpzdHIoa20ub3V0KQpgYGAKCgpgYGB7cn0Kd3NzIDwtIDAKCiMgRm9yIDEgdG8gMTUgY2x1c3RlciBjZW50ZXJzCmZvciAoaSBpbiAxOjE1KSB7CiAga20ub3V0IDwtIGttZWFucyhjYW5jZXIuZGF0YS5jcywgY2VudGVycyA9IGksIG5zdGFydD0yMCwgaXRlci5tYXggPSA1MCkKICAjIFNhdmUgdG90YWwgd2l0aGluIHN1bSBvZiBzcXVhcmVzIHRvIHdzcyB2YXJpYWJsZQogIHdzc1tpXSA8LSBrbS5vdXQkdG90LndpdGhpbnNzCn0KCnBsb3QoMToxNSwgd3NzLCB0eXBlID0gImIiLCAKICAgICB4bGFiID0gIk51bWVybyBkZSBDbHVzdGVycyIsIAogICAgIHlsYWIgPSAiU3VtYSBkZSBjdWFkcmFkb3MgZW50cmUgZ3J1cG9zIixtYWluID0gIlNjcmVlIFBsb3QgcGFyYSBlbCBkYXRhc2V0IGRlIGNhbmNlciBkZSBtYW1hcyIpCgpwb2ludHMoMix3c3NbMl0sY29sID0gInJlZCIscGNoID0gMTYpCnRleHQoMix3c3NbMl0sbGFiZWxzID0gcm91bmQod3NzWzJdLDIpLGNleCA9IDAuNyxwb3MgPSA0KQpgYGAKCgpgYGB7cn0Ka20ub3V0IDwtIGttZWFucyhjYW5jZXIuZGF0YS5jcyxjZW50ZXJzID0gMiwgbnN0YXJ0ID0zMCxpdGVyLm1heCA9IDMwKQpzdW1tYXJ5KGttLm91dCkKYGBgCgpgYGB7cn0KY2FuY2VyLmRhdGEkY2x1c3RlciA8LSBrbS5vdXQkY2x1c3Rlcgp0YWJsZShkaWFnbm9zaXMsY2FuY2VyLmRhdGEkY2x1c3RlcikKYGBgCgpgYGB7cn0KY2FuY2VyLmRhdGEkcF9kaWFnIDwtIGlmZWxzZShjYW5jZXIuZGF0YSRjbHVzdGVyID09IDEgLCJNIiwiQiIpCmNvbmZ1c2lvbk1hdHJpeChjYW5jZXIuZGF0YSRwX2RpYWcsZGlhZ25vc2lzLHBvc2l0aXZlID0gIk0iKQpgYGAKCgojIyBIaWVyYXJjaGljYWwgY2x1c3RlcmluZwoKCmBgYHtyfQpjYW5jZXIuZGF0YS5oYyA8LSBjYW5jZXJbLDM6MzJdCmNhbmNlci5kYXRhLmhjIDwtIGFzLmRhdGEuZnJhbWUoY2FuY2VyLmRhdGEuaGMpCmRpYWdub3NpcyA8LSBjYW5jZXJbLDJdCmRpYWdub3Npc19udW0gPC0gYXMubnVtZXJpYyhkaWFnbm9zaXMgPT0gIk0iKSAKY2FuY2VyLmRhdGEuaGNfaW1wdXRlIDwtIHByZVByb2Nlc3MoY2FuY2VyLmRhdGEuaGMsbWV0aG9kID0gYygiY2VudGVyIiwic2NhbGUiKSkKY2FuY2VyLmRhdGEuaGMuY3MgPC0gcHJlZGljdChjYW5jZXIuZGF0YV9pbXB1dGUsY2FuY2VyLmRhdGEuaGMpCmBgYAoKCmBgYHtyfQpjYW5jZXIuZGF0YS5oYy5jcy5kaXN0IDwtIGRpc3QoY2FuY2VyLmRhdGEuaGMuY3MpCmNhbmNlci5oY2x1c3QgPC0gaGNsdXN0KGNhbmNlci5kYXRhLmhjLmNzLmRpc3QsIG1ldGhvZCA9ICJjb21wbGV0ZSIpCnBsb3QoY2FuY2VyLmhjbHVzdCkKYWJsaW5lKGg9MTksIGNvbCA9ICJyZWQiKQpgYGAKCgpgYGB7cn0KY2FuY2VyLmhjbHVzdC5jbHVzdGVyIDwtIGN1dHJlZShjYW5jZXIuaGNsdXN0LCBrPTQpCnRhYmxlKGNhbmNlci5oY2x1c3QuY2x1c3RlcixkaWFnbm9zaXNfbnVtKQpgYGAKCmBgYHtyfQpjYW5jZXIuZGF0YS5oYyRwcmVkIDwtIGlmZWxzZShjYW5jZXIuaGNsdXN0LmNsdXN0ZXIgJWluJSBjKDEsMiw0KSwgIk0iLCAiQiIpCmNvbmZ1c2lvbk1hdHJpeChjYW5jZXIuZGF0YS5oYyRwcmVkLCBkaWFnbm9zaXMkZGlhZ25vc2lzLHBvc2l0aXZlID0gIk0iKQpgYGAKCgojIyBQcmluY2lwYWwgQ29tcG9uZW50IEFuYWx5c2lzCgoKCiFbXShwY2EucG5nKQoKIVtdKHBjYTIucG5nKQoKCiFbXShwY2EzLmdpZikKCgoKYGBge3J9CmNhbmNlci5kYXRhLnByIDwtIGNhbmNlclssLWMoMSwyLDMzKV0KY2FuY2VyLnByIDwtIHByY29tcChjYW5jZXIuZGF0YS5wcixzY2FsZSA9IFRSVUUpCnN1bW1hcnkoY2FuY2VyLnByKQpgYGAKCmBgYHtyfQpwbG90KGNhbmNlci5wcikKYGBgCgoKYGBge3J9CmJpcGxvdChjYW5jZXIucHIpCmBgYAoKCgpgYGB7cn0KcGxvdChjYW5jZXIucHIkeFssIGMoMSwgMildLCBjb2w9ZGlhZ25vc2lzX251bSsxLCB4bGFiID0gIlBDMSIsIHlsYWIgPSAiUEMyIikKYGBgCgoKCgoKCgoK