bupa=read.csv("https://raw.githubusercontent.com/VictorGuevaraP/Mineria-de-datos-2019-2/master/bupa.txt", sep = ",")
head(bupa)
## V1 V2 V3 V4 V5 V6 V7
## 1 85 92 45 27 31 0 1
## 2 85 64 59 32 23 0 2
## 3 86 54 33 16 54 0 2
## 4 91 78 34 24 36 0 2
## 5 87 70 12 28 10 0 2
## 6 98 55 13 17 17 0 2
cor(bupa)
## V1 V2 V3 V4 V5 V6
## V1 1.00000000 0.04410300 0.14769505 0.1877652 0.2223145 0.31267960
## V2 0.04410300 1.00000000 0.07620761 0.1460565 0.1331404 0.10079606
## V3 0.14769505 0.07620761 1.00000000 0.7396749 0.5034353 0.20684793
## V4 0.18776515 0.14605655 0.73967487 1.0000000 0.5276259 0.27958777
## V5 0.22231449 0.13314040 0.50343525 0.5276259 1.0000000 0.34122396
## V6 0.31267960 0.10079606 0.20684793 0.2795878 0.3412240 1.00000000
## V7 -0.09107012 -0.09805018 -0.03500879 0.1573558 0.1463925 -0.02204853
## V7
## V1 -0.09107012
## V2 -0.09805018
## V3 -0.03500879
## V4 0.15735580
## V5 0.14639252
## V6 -0.02204853
## V7 1.00000000
library(VIM)
library(corrplot)
library(psych)
library(PerformanceAnalytics)
corrplot(cor(bupa))
chart.Correlation(bupa)
## En la primera prueba observamos que si hay correlacion con algunas variables esto es una buena señal para seguir analizando y ver si deberia hacerse un Analisis de componentes Principales
# PRUEBA GENERAL DE CORRELACIONES
# si la probabilidad es diferente de sero rechazo h sub 0
library(psych)
cortest(cor(bupa))
## Warning in cortest(cor(bupa)): n not specified, 100 used
## Tests of correlation matrices
## Call:cortest(R1 = cor(bupa))
## Chi Square value 208.01 with df = 21 with probability < 9.6e-33
#2 Prueba de Bartlet (n = cantidad de datos)
library(rela)
cortest.bartlett(cor(bupa), n=345)
## $chisq
## [1] 544.8724
##
## $p.value
## [1] 6.004754e-102
##
## $df
## [1] 21
#3 Prueba KMO
library(psych)
KMO(bupa)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = bupa)
## Overall MSA = 0.64
## MSA for each item =
## V1 V2 V3 V4 V5 V6 V7
## 0.70 0.53 0.59 0.63 0.81 0.73 0.23
# Grafico de Sedimentacion
scree(bupa)
#Analisis paralelo
fa.parallel((bupa), fa="pc")
## Parallel analysis suggests that the number of factors = NA and the number of components = 2
componentes=prcomp(bupa, scale=TRUE, center =T)
componentes
## Standard deviations (1, .., p=7):
## [1] 1.5837765 1.0926330 1.0046350 0.9465752 0.8188865 0.7061828 0.4724823
##
## Rotation (n x k) = (7 x 7):
## PC1 PC2 PC3 PC4 PC5 PC6
## V1 0.26093155 0.4869910 0.49039467 -0.02430417 0.67017404 0.04830950
## V2 0.14769977 0.3252950 -0.66587263 -0.62523056 0.15949171 0.06888411
## V3 0.50668951 -0.1526510 -0.23040936 0.41245134 0.03558136 0.19739104
## V4 0.53762897 -0.2217274 -0.14693268 0.13167289 0.08655906 0.36430265
## V5 0.49239652 -0.1143955 0.03814116 -0.10572417 -0.07449247 -0.84970661
## V6 0.34359042 0.3470071 0.37146594 -0.27134023 -0.69137189 0.25772936
## V7 0.06173834 -0.6716080 0.31938586 -0.57986125 0.18200666 0.18115126
## PC7
## V1 0.04700907
## V2 0.08880155
## V3 0.67566973
## V4 -0.69473462
## V5 -0.06539520
## V6 0.07416000
## V7 0.20234232
summary(componentes)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.5838 1.0926 1.0046 0.9466 0.8189 0.70618 0.47248
## Proportion of Variance 0.3583 0.1706 0.1442 0.1280 0.0958 0.07124 0.03189
## Cumulative Proportion 0.3583 0.5289 0.6731 0.8011 0.8969 0.96811 1.00000
plot(componentes)
biplot(componentes, scale=2)
## Warning in biplot.prcomp(componentes, scale = 2): 'scale' is outside [0, 1]
# Graficamos los componentes y se vizualizan los 3 componentes agrupados
componentes_prin=componentes$x
componentes_prin=componentes_prin[,1:3]
head(componentes_prin)
## PC1 PC2 PC3
## [1,] -0.1390785 0.11105334 -2.3448572
## [2,] 0.2907050 -1.94038272 -0.9286607
## [3,] -0.8721346 -1.54263788 0.1152322
## [4,] -0.1580974 -0.70132800 -0.3506241
## [5,] -1.1408941 -1.12133672 -0.3251554
## [6,] -1.0901984 0.03114855 1.5875375
componentes_prin=as.data.frame(componentes_prin)
clustering=kmeans(componentes_prin, 3)
clustering
## K-means clustering with 3 clusters of sizes 42, 156, 147
##
## Cluster means:
## PC1 PC2 PC3
## 1 3.4273531 -0.2353520 -0.02487045
## 2 -0.5857313 -0.7541420 0.41775506
## 3 -0.3576513 0.8675573 -0.43622605
##
## Clustering vector:
## [1] 3 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 2 3 3 3 3 3
## [36] 1 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 3 3 2 2 3 2
## [71] 3 3 3 3 3 3 1 2 2 2 3 2 3 2 1 2 2 2 3 3 3 2 3 3 3 3 2 2 2 2 2 2 3 3 3
## [106] 2 3 3 3 3 2 2 2 2 1 2 2 2 3 2 1 2 3 2 3 2 2 2 2 2 3 3 1 1 2 2 2 2 2 2
## [141] 3 3 3 3 3 3 3 1 3 3 1 3 2 2 2 2 1 3 3 2 3 2 2 2 2 2 1 1 1 3 3 3 3 3 1
## [176] 3 2 2 1 2 1 3 1 2 1 1 1 3 1 1 3 3 2 3 3 3 2 3 3 3 3 2 3 3 3 3 3 3 3 3
## [211] 3 3 3 3 3 3 3 3 2 3 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 3
## [246] 3 3 3 3 2 2 2 2 2 3 3 2 3 3 3 3 3 3 2 2 3 2 3 2 2 2 3 3 3 2 2 2 1 3 2
## [281] 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 3 2 1 3 3 3 1 1 3 3 3
## [316] 1 1 3 2 3 3 2 1 2 3 1 3 3 3 2 1 3 3 1 3 3 3 1 3 1 3 1 1 3 1
##
## Within cluster sum of squares by cluster:
## [1] 217.6258 273.8195 306.7057
## (between_SS / total_SS = 50.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(componentes_prin$PC1,componentes_prin$PC2, col=clustering$cluster)
library(cluster)
library(factoextra)
bupa <- scale(bupa)
set.seed(111)
pam_clusters <- pam(bupa, k = 3, metric = "manhattan")
pam_clusters
## Medoids:
## ID V1 V2 V3 V4 V5
## [1,] 66 -0.26065541 0.3886289 -0.2257958 -0.46137226 -0.4912558
## [2,] 60 -0.03584012 -0.3744108 -0.3282952 -0.06393548 -0.3638822
## [3,] 169 0.63860576 0.7701487 1.4141947 2.22132601 0.6041568
## V6 V7
## [1,] -0.7355283 -1.1727371
## [2,] -0.8853260 0.8502344
## [3,] 1.0620439 0.8502344
## Clustering vector:
## [1] 1 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 3 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2
## [71] 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 1 1
## [106] 1 1 1 1 2 2 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 1 1 3 3 2 2 2 2 2 2
## [141] 1 1 1 1 1 1 1 3 1 1 3 2 2 2 2 2 3 3 3 2 3 2 2 2 2 2 3 3 3 1 1 1 1 1 3
## [176] 1 2 2 3 2 3 1 3 2 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 1 1
## [246] 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 2 2 2 3 1 2
## [281] 2 2 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 3 1 1 1 3 3 1 1 1
## [316] 3 3 1 2 2 2 2 3 2 1 1 1 1 1 2 2 2 2 3 1 1 2 3 2 3 1 3 3 1 3
## Objective function:
## build swap
## 3.823487 3.818007
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
# FINALMENTE PODEMOS OBSERVAR LOS 3 CLUESTERING CREADOS POR EL METODO PAM.
fviz_cluster(object = pam_clusters, data = bupa) +
theme_bw() +
labs(title = "Resultados Cluestering PAM") +
theme(legend.position = "none")