En esta publicacion se ha trabajado los componentes PCA, clustering y tambien hemos tomado en cuenta el Algoritmo KOHONE Y El Algoritmo PAM con la base de datos “BUPA”

#EXPLORACION DE DATOS

1.Cargamos la base de datos que nos brindó el profesor llamada “bupa”

bupa<-read.csv("https://raw.githubusercontent.com/VictorGuevaraP/Mineria-de-datos-2019-2/master/bupa.txt", sep = ",")

2.Verificamos el número de filas y el número de columnas con el comando “dim(nombreBD)” de la DB “bupa”

dim(bupa)

## [1] 345   7

3.Con el siguiente comando “names(nombreBD)” procederemos a verificar el nombre de cada variable o también se puede utilizar el comando “colnames(nombreBD9” de igual forma nos mostrara los nombres de cada variable.

names(bupa)

## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7"

4.Con el comando “head(nombreBD) nos mostrara las 6 primeras filas.

head(bupa)

##   V1 V2 V3 V4 V5 V6 V7
## 1 85 92 45 27 31  0  1
## 2 85 64 59 32 23  0  2
## 3 86 54 33 16 54  0  2
## 4 91 78 34 24 36  0  2
## 5 87 70 12 28 10  0  2
## 6 98 55 13 17 17  0  2

5.Con el comando “tail(nombreBD) nos mostrara las 6 últimas filas.

tail(bupa)

##     V1 V2 V3 V4  V5 V6 V7
## 340 87 56 52 43  55 10  2
## 341 99 75 26 24  41 12  1
## 342 96 69 53 43 203 12  2
## 343 98 77 55 35  89 15  1
## 344 91 68 27 26  14 16  1
## 345 98 99 57 45  65 20  1

6.Con el comando lapply(nombreBD,class) sabremos qué clase tiene cada variable.

lapply(bupa,class)

## $V1
## [1] "integer"
## 
## $V2
## [1] "integer"
## 
## $V3
## [1] "integer"
## 
## $V4
## [1] "integer"
## 
## $V5
## [1] "integer"
## 
## $V6
## [1] "numeric"
## 
## $V7
## [1] "integer"

• V1: Es una variable de clase INTEGER.

• V2: Es una variable de clase INTEGER.

• V3: Es una variable de clase INTEGER.

• V4: Es una variable de clase INTEGER.

• V5: Es una variable de clase INTEGER.

• V6: Es una variable de clase NUMERIC

• V7: Es una variable de clase INTEGER.

Con el comando “summary” nos mostrara una lista detallada de cada variable identificando su primer cuartil, tercer cuartil, media, mediana, minimo y maximo:

summary(bupa)

##        V1               V2               V3               V4       
##  Min.   : 65.00   Min.   : 23.00   Min.   :  4.00   Min.   : 5.00  
##  1st Qu.: 87.00   1st Qu.: 57.00   1st Qu.: 19.00   1st Qu.:19.00  
##  Median : 90.00   Median : 67.00   Median : 26.00   Median :23.00  
##  Mean   : 90.16   Mean   : 69.87   Mean   : 30.41   Mean   :24.64  
##  3rd Qu.: 93.00   3rd Qu.: 80.00   3rd Qu.: 34.00   3rd Qu.:27.00  
##  Max.   :103.00   Max.   :138.00   Max.   :155.00   Max.   :82.00  
##        V5               V6               V7      
##  Min.   :  5.00   Min.   : 0.000   Min.   :1.00  
##  1st Qu.: 15.00   1st Qu.: 0.500   1st Qu.:1.00  
##  Median : 25.00   Median : 3.000   Median :2.00  
##  Mean   : 38.28   Mean   : 3.455   Mean   :1.58  
##  3rd Qu.: 46.00   3rd Qu.: 6.000   3rd Qu.:2.00  
##  Max.   :297.00   Max.   :20.000   Max.   :2.00

COMPONENTES PCA

paso 1: Cargamos la base de Datos bupa, brindada por el profesor.

bupa<-read.csv("https://raw.githubusercontent.com/VictorGuevaraP/Mineria-de-datos-2019-2/master/bupa.txt", sep = ",")

paso 2: Mostraremos los 6 primeros datos con “head” y con el comando “tail” los 6 ultimos datos.

head(bupa)

##   V1 V2 V3 V4 V5 V6 V7
## 1 85 92 45 27 31  0  1
## 2 85 64 59 32 23  0  2
## 3 86 54 33 16 54  0  2
## 4 91 78 34 24 36  0  2
## 5 87 70 12 28 10  0  2
## 6 98 55 13 17 17  0  2

tail(bupa)

##     V1 V2 V3 V4  V5 V6 V7
## 340 87 56 52 43  55 10  2
## 341 99 75 26 24  41 12  1
## 342 96 69 53 43 203 12  2
## 343 98 77 55 35  89 15  1
## 344 91 68 27 26  14 16  1
## 345 98 99 57 45  65 20  1

paso 3: solo consideramos las variables cuantitativas, en este caso hemos elimnando la variable “V6” ya que cuenta con datos = 0.

bupa1=bupa[,c(1:5,7)]
head(bupa1)

##   V1 V2 V3 V4 V5 V7
## 1 85 92 45 27 31  1
## 2 85 64 59 32 23  2
## 3 86 54 33 16 54  2
## 4 91 78 34 24 36  2
## 5 87 70 12 28 10  2
## 6 98 55 13 17 17  2

#1 prueba de correlaciones individuales

cor(bupa1)

##             V1          V2          V3        V4        V5          V7
## V1  1.00000000  0.04410300  0.14769505 0.1877652 0.2223145 -0.09107012
## V2  0.04410300  1.00000000  0.07620761 0.1460565 0.1331404 -0.09805018
## V3  0.14769505  0.07620761  1.00000000 0.7396749 0.5034353 -0.03500879
## V4  0.18776515  0.14605655  0.73967487 1.0000000 0.5276259  0.15735580
## V5  0.22231449  0.13314040  0.50343525 0.5276259 1.0000000  0.14639252
## V7 -0.09107012 -0.09805018 -0.03500879 0.1573558 0.1463925  1.00000000

library(VIM)
library(corrplot)
corrplot(cor(bupa1))

library(PerformanceAnalytics)
chart.Correlation(bupa1)

library(psych)

#Prueba general de correlaciones

cortest(cor(bupa1))

## Tests of correlation matrices 
## Call:cortest(R1 = cor(bupa1))
##  Chi Square value 172.28  with df =  15   with probability < 8.5e-29

#2° Prueba de Bartlet

library(rela)
cortest.bartlett(cor(bupa1), n=850)

## $chisq
## [1] 1172.437
## 
## $p.value
## [1] 1.360284e-240
## 
## $df
## [1] 15

#3° Prueba KMO

library(psych)
KMO(bupa1)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = bupa1)
## Overall MSA =  0.61
## MSA for each item = 
##   V1   V2   V3   V4   V5   V7 
## 0.64 0.47 0.59 0.61 0.80 0.24

#Según los resultados se justifica la realización del PCA #Gráfico de sedimentación

scree(bupa1)

#Según el gráfico de sedimentación deberia #tomarse tres componentes #Análisis paralelo

fa.parallel(bupa1, fa="pc")

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  1

#Parallel analysis suggests that the number of factors = NA and the number of components = 2

componentes=prcomp(bupa1, scale=TRUE, center = T)
componentes

## Standard deviations (1, .., p=6):
## [1] 1.5200338 1.0690820 0.9769714 0.9217029 0.7184888 0.4757367
## 
## Rotation (n x k) = (6 x 6):
##          PC1         PC2          PC3        PC4        PC5         PC6
## V1 0.2327962  0.38854443  0.639934751 -0.5844288  0.1973974 -0.06928156
## V2 0.1511070  0.47588993 -0.754185658 -0.4028910  0.1046041 -0.09293183
## V3 0.5589465  0.01306629  0.017225076  0.4115143  0.2394649 -0.67854547
## V4 0.5826737 -0.10341115 -0.058769387  0.1434114  0.3758114  0.69609097
## V5 0.5136442 -0.08444962  0.001901549 -0.1385104 -0.8415496  0.04054015
## V7 0.0849261 -0.77753821 -0.133915451 -0.5357480  0.2081081 -0.19988410

summary(componentes)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6
## Standard deviation     1.5200 1.0691 0.9770 0.9217 0.71849 0.47574
## Proportion of Variance 0.3851 0.1905 0.1591 0.1416 0.08604 0.03772
## Cumulative Proportion  0.3851 0.5756 0.7347 0.8762 0.96228 1.00000

plot(componentes)

#Graficamos los componentes

biplot(componentes, scale=1)

#Extraemos los componentes

componentes_prin=componentes$x
componentes_prin=componentes_prin[,1:3]
head(componentes_prin)

##             PC1        PC2          PC3
## [1,]  0.2718209  1.0364040 -1.496130231
## [2,]  0.7988551 -1.2875680 -0.633315258
## [3,] -0.4966290 -1.3792951 -0.006417727
## [4,]  0.2189788 -0.3628503 -0.320311637
## [5,] -0.7950928 -0.9196490 -0.610975830
## [6,] -0.8595250 -0.2492176  1.653595231

#Exportamos los componentes

write.csv(componentes_prin, file = "Componentes_bupa.csv")

#clustering

componentes_prin=as.data.frame(componentes_prin)
corrplot(cor(componentes_prin))

clustering=kmeans(componentes_prin, 3)
clustering

## K-means clustering with 3 clusters of sizes 161, 40, 144
## 
## Cluster means:
##          PC1        PC2          PC3
## 1 -0.5226288 -0.8568780  0.042056296
## 2  3.3493589 -0.1202634 -0.008161452
## 3 -0.3460495  0.9914437 -0.044754205
## 
## Clustering vector:
##   [1] 3 1 1 1 1 1 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 1 3 3 3 3 3
##  [36] 2 1 1 1 1 2 1 3 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 3 1
##  [71] 3 3 3 3 3 3 2 1 1 1 3 1 3 1 2 1 1 1 3 3 3 1 3 3 3 3 1 1 1 1 1 1 3 3 3
## [106] 1 3 3 3 3 1 1 1 1 2 1 1 1 1 1 2 1 3 1 1 1 1 1 1 1 3 3 2 2 1 1 1 1 1 1
## [141] 3 3 3 3 3 3 3 2 3 3 2 1 1 1 1 1 2 3 3 1 3 1 1 1 1 1 2 2 2 3 3 3 3 3 2
## [176] 3 1 1 2 1 2 3 2 1 1 2 2 3 2 2 3 3 1 3 3 3 1 3 3 3 3 1 3 3 3 3 3 3 3 3
## [211] 3 1 3 3 3 3 3 3 1 3 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 3 1 1 1 1 1 1 1 3
## [246] 3 3 3 3 1 1 1 1 1 3 3 1 3 3 3 3 3 3 1 1 3 1 3 1 1 1 3 3 3 1 1 1 2 3 1
## [281] 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 3 1 2 3 3 3 2 2 3 3 3
## [316] 2 2 3 1 3 3 1 2 1 3 3 3 3 3 1 2 3 3 2 3 3 3 3 1 2 3 2 2 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 268.6883 191.9575 284.8147
##  (between_SS / total_SS =  50.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

plot(componentes_prin$PC1,componentes_prin$PC2, col=clustering$cluster)

library(rgl)
plot3d(x=componentes_prin$PC1,
       componentes_prin$PC2,
       componentes_prin$PC3, col=clustering$cluster)

1er ALGORITMO KOHONE

##CARGO LA DB “BUPA”

bupa<-read.csv("https://raw.githubusercontent.com/VictorGuevaraP/Mineria-de-datos-2019-2/master/bupa.txt", sep = ",")
head(bupa)

##   V1 V2 V3 V4 V5 V6 V7
## 1 85 92 45 27 31  0  1
## 2 85 64 59 32 23  0  2
## 3 86 54 33 16 54  0  2
## 4 91 78 34 24 36  0  2
## 5 87 70 12 28 10  0  2
## 6 98 55 13 17 17  0  2

summary(bupa)

##        V1               V2               V3               V4       
##  Min.   : 65.00   Min.   : 23.00   Min.   :  4.00   Min.   : 5.00  
##  1st Qu.: 87.00   1st Qu.: 57.00   1st Qu.: 19.00   1st Qu.:19.00  
##  Median : 90.00   Median : 67.00   Median : 26.00   Median :23.00  
##  Mean   : 90.16   Mean   : 69.87   Mean   : 30.41   Mean   :24.64  
##  3rd Qu.: 93.00   3rd Qu.: 80.00   3rd Qu.: 34.00   3rd Qu.:27.00  
##  Max.   :103.00   Max.   :138.00   Max.   :155.00   Max.   :82.00  
##        V5               V6               V7      
##  Min.   :  5.00   Min.   : 0.000   Min.   :1.00  
##  1st Qu.: 15.00   1st Qu.: 0.500   1st Qu.:1.00  
##  Median : 25.00   Median : 3.000   Median :2.00  
##  Mean   : 38.28   Mean   : 3.455   Mean   :1.58  
##  3rd Qu.: 46.00   3rd Qu.: 6.000   3rd Qu.:2.00  
##  Max.   :297.00   Max.   :20.000   Max.   :2.00

muestra <- sample(1:200,75)
ttesting <- bupa[muestra,]
taprendizaje <- bupa[-muestra,]
head(taprendizaje)

##    V1 V2 V3 V4 V5  V6 V7
## 1  85 92 45 27 31 0.0  1
## 3  86 54 33 16 54 0.0  2
## 4  91 78 34 24 36 0.0  2
## 6  98 55 13 17 17 0.0  2
## 9  92 54 22 20  7 0.5  1
## 10 90 60 25 19  5 0.5  1

##Antes de crear un SOM, debemos elegir las variables en las que queremos buscar patrones

colnames(bupa)

## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7"

##Comenzaremos con algunos ejemplos simples usando intentos de disparo:

library(kohonen)

bupa.measures1 <- c("V1", "V2", "V3")
bupa.SOM1 <- som(scale(bupa[bupa.measures1]), grid = somgrid(6, 4, "rectangular"))
plot(bupa.SOM1)

##Mapa de calor SOM Recuerde que lo anterior es solo un mapa de los datos del bupa: ##cada celda muestra su vector representativo. Podríamos identificar bupas con celdas ##en el mapa asignando a cada bupa a la celda con el vector representativo más cercano a ##la línea estadística de ese bupa El tipo SOM de “conteo” hace exactamente esto, y crea ##un mapa de calor basado en la cantidad de bupa asignados a cada celda. Solo por diversión, ##invertimos el orden de la paleta predefinida heat.colorspara que el rojo represente las celdas ##de la cuadrícula con un mayor número de bupa representados.

#reverse color ramp

colors <- function(n, alpha = 1) {
  rev(heat.colors(n, alpha))
}

plot(bupa.SOM1, type = "counts", palette.name = colors, heatkey = TRUE)

par(mfrow = c(1, 2))
plot(bupa.SOM1, type = "mapping", pchs = 20, main = "Mapping Type SOM")
plot(bupa.SOM1, main = "Default SOM Plot")

bupa.SOM2 <- som(scale(bupa[bupa.measures1]), grid = somgrid(6, 6, "hexagonal"))
par(mfrow = c(1, 2))
plot(bupa.SOM2, type = "mapping", pchs = 20, main = "Mapping Type SOM")
plot(bupa.SOM2, main = "Default SOM Plot")

plot(bupa.SOM2, type = "dist.neighbours", palette.name = terrain.colors)

SOM supervisados

##El kohonenpaquete también admite SOM supervisados, lo que nos permite hacer ##clasificaciones. Hasta ahora solo hemos trabajado con el mapeo de datos tridimensionales a dos dimensiones. ##La utilidad de los SOM se vuelve más evidente cuando trabajamos con datos dimensionales más altos, ##así que hagamos este ejemplo supervisado con una lista ampliada de estadísticas de bupas:

colnames(bupa)

## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7"

bupa.measures1 <- c("V1", "V2", "V3")


training_indices <- sample(nrow(bupa), 150)
bupa.training <- scale(bupa[training_indices, bupa.measures1])
bupa.testing <- scale(bupa[-training_indices, bupa.measures1])
summary(bupa.testing)

##        V1                V2                V3         
##  Min.   :-2.7932   Min.   :-2.4082   Min.   :-1.0189  
##  1st Qu.:-0.6282   1st Qu.:-0.6718   1st Qu.:-0.5779  
##  Median : 0.1695   Median :-0.1611   Median :-0.2762  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6253   3rd Qu.: 0.4773   3rd Qu.: 0.1184  
##  Max.   : 2.4485   Max.   : 3.4649   Max.   : 5.7123

##Tenga en cuenta que cuando cambiamos la escala de nuestros datos de prueba, debemos escalar de acuerdo ##con la escala de nuestros datos de capacitación.

bupa.SOM3 <- xyf(bupa.training, classvec2classmat(bupa$V1[training_indices]), 
                 grid = somgrid(6, 6, "hexagonal"),  rlen = 100)


summary(bupa.SOM3)

## SOM of size 6x6 with a hexagonal topology and a bubble neighbourhood function.
## The number of data layers is 2.
## Distance measure(s) used: sumofsquares, tanimoto.
## Training data included: 150 objects.
## Mean distance to the closest unit in the map: 0.021.

2do ALGORITMO PAM

bupa<-read.csv("https://raw.githubusercontent.com/VictorGuevaraP/Mineria-de-datos-2019-2/master/bupa.txt", sep = ",")

str(bupa)

## 'data.frame':    345 obs. of  7 variables:
##  $ V1: int  85 85 86 91 87 98 88 88 92 90 ...
##  $ V2: int  92 64 54 78 70 55 62 67 54 60 ...
##  $ V3: int  45 59 33 34 12 13 20 21 22 25 ...
##  $ V4: int  27 32 16 24 28 17 17 11 20 19 ...
##  $ V5: int  31 23 54 36 10 17 9 11 7 5 ...
##  $ V6: num  0 0 0 0 0 0 0.5 0.5 0.5 0.5 ...
##  $ V7: int  1 2 2 2 2 2 1 1 1 1 ...

#REALIZAR NUESTRO CLUESTERING

bupa$V1<-NULL
bupa$V2<-NULL

#LUEGO ESCALAMOS LOS DATOS PARA NORMALIZAR LA DATA

cliepam <- scale(bupa)

library(cluster)
library(factoextra)

#CON ESTE GRAFICO DE CODO OBSERVAMOS QUE LO OPTIMO SERIA CREAR DOS CLUESTERING, PERO #OPTAMOS CREAR 3 PORQUE DE ESTA MANERA GANAMOS UN POCO MAS DE EFICIENCIA.

fviz_nbclust(x = bupa, FUNcluster = pam, method = "wss", k.max =15,
             diss = dist(bupa, method = "manhattan"))

set.seed(111)
pam_clusters <- pam(bupa, k = 3, metric = "manhattan")
pam_clusters

## Medoids:
##       ID V3 V4  V5  V6 V7
## [1,]  50 29 25  38 0.5  2
## [2,]  75 20 21  16 2.0  1
## [3,] 133 47 39 107 5.0  2
## Clustering vector:
##   [1] 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 3 2 2 2 2 2 2 1 1 1 2
##  [36] 3 1 2 1 1 1 1 2 2 2 2 2 1 2 1 2 2 3 1 2 2 2 2 1 2 1 2 2 2 1 2 2 2 2 2
##  [71] 1 1 2 2 2 1 3 1 2 1 1 1 2 2 3 2 1 2 2 2 2 2 1 2 2 2 1 3 1 1 2 1 2 2 2
## [106] 2 1 2 2 1 2 2 2 2 3 2 2 2 2 2 1 2 2 2 1 2 1 1 2 2 2 2 3 3 2 2 2 2 3 2
## [141] 2 1 2 1 2 1 1 3 2 2 3 1 2 2 1 1 3 1 1 2 1 2 1 1 2 1 3 3 3 1 1 1 2 2 3
## [176] 1 1 2 3 1 3 3 1 2 1 3 3 1 3 3 2 2 1 2 2 2 2 2 2 2 2 2 1 2 3 1 2 2 2 2
## [211] 2 1 1 2 2 2 2 1 2 1 1 2 2 2 2 2 1 1 1 1 1 1 3 1 1 1 2 1 2 2 2 2 2 1 2
## [246] 2 2 2 2 3 1 3 2 1 1 2 2 2 2 1 1 2 1 2 1 1 2 1 1 2 2 2 2 2 2 1 1 1 1 1
## [281] 1 2 2 2 2 1 2 1 1 1 1 2 2 3 1 1 1 1 2 3 1 2 2 1 1 1 3 1 2 1 3 3 1 2 2
## [316] 3 3 2 1 1 2 1 3 2 2 1 1 2 1 1 3 1 1 3 1 1 2 1 2 3 1 3 3 2 3
## Objective function:
##    build     swap 
## 29.52319 28.29855 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"

#FINALMENTE PODEMOS OBSERVAR LOS 3 CLUESTERING CREADOS POR EL METODO PAM.

fviz_cluster(object = pam_clusters, data = bupa) +
  theme_bw() +
  labs(title = "Resultados Cluestering PAM") +
  theme(legend.position =  "none")

E3

Fernandez Torres Charly

14/11/2019

COMPONENTES PCA

1er ALGORITMO KOHONE

SOM supervisados

2do ALGORITMO PAM