Métodos multivariados

# Librerías necesarias

library(readr)
library(ggplot2)
library(FactoMineR)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(corrplot)

## corrplot 0.84 loaded

library("gplots")

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

Problema 1

Para el conjunto de datos de consumo de proteína,

protein <- read.csv("protein.csv")
protein.data <- protein[2:10]
rownames(protein.data) <- protein[,1]
head(protein.data, 2)

##         RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr.Veg
## Albania    10.1       1.4  0.5  8.9  0.2    42.3    0.6  5.5    1.7
## Austria     8.9      14.0  4.3 19.9  2.1    28.0    3.6  1.3    4.3

Grafique el escalado multidimensional en 2 variables sin escalar con la distancia euclidiana.

protein.dist <- dist(protein.data, method = "euclidian")
protein.cmd <- cmdscale(protein.dist)
protein.cmd <- as.data.frame(protein.cmd)
ggplot(protein.cmd, aes(x = V1, y = V2, label = rownames(protein.data))) + 
  geom_text(alpha = 0.8, size=3, col = "salmon")

Grafique el escalado multidimensional en 2 variables escalando con la distancia euclidiana.

protein.dist <- dist(scale(protein.data), method = "euclidian")
protein.cmd <- cmdscale(protein.dist)
protein.cmd <- as.data.frame(protein.cmd)
ggplot(protein.cmd, aes(x = V1, y = V2, label = rownames(protein.data))) + 
  geom_text(alpha = 0.8, size=3, col = "salmon")

Para el inciso anterior, utilice la matriz \[ \begin{bmatrix} cos\theta & -sen\theta \\ sen\theta & cos\theta \\ \end{bmatrix}, \] para rotar los puntos 45 grados, grafique el resultado.

protein.dist <- dist(scale(protein.data), method = "euclidian")
protein.cmd <- cmdscale(protein.dist)
protein.cmd <- protein.cmd%*%matrix(c(0.707,-0.707,0.707,0.707),2,2)
protein.cmd <- as.data.frame(protein.cmd)
ggplot(protein.cmd,aes(x = V1, y = V2, label = rownames(protein.data))) + 
  geom_text(alpha = 0.7, size = 3, col = "red")

Problema 2

Para la siguiente tabla de contingencia realice un análisis de correspondencia

productos <- matrix(c(66,  60,  67,  45,  35,  275,
                      12,  13,  13,  15,  12,  65,
                      95,  84,  94,  63,  49,  365,
                      30,  32,  32,  36,  28,  158,
                      205, 189, 206, 159, 124, 883), nrow = 5, byrow = T)
rownames(productos) <- c("Producto A", "Producto B", "Producto C", "Producto D", "Margen activo")
colnames(productos) <- c("Brillo", "Duración", "Olor", "Comodidad", "Limpieza", "Margen activo")
productos

##               Brillo Duración Olor Comodidad Limpieza Margen activo
## Producto A        66       60   67        45       35           275
## Producto B        12       13   13        15       12            65
## Producto C        95       84   94        63       49           365
## Producto D        30       32   32        36       28           158
## Margen activo    205      189  206       159      124           883

Represente en dos dimensiones las filas y las columnas.

# Representacion de las filas

ca_ht <- CA(productos[1:4,1:5], graph = F)
filas <- get_ca_row(ca_ht)
head(filas$coord)

##                  Dim 1         Dim 2         Dim 3
## Producto A -0.06082855 -0.0059168299 -0.0001023606
## Producto B  0.20643585 -0.0004477021  0.0072633936
## Producto C -0.06724154  0.0040817460  0.0001801080
## Producto D  0.18402441  0.0004615380 -0.0032501122

head(filas$cos2)

##                Dim 1        Dim 2        Dim 3
## Producto A 0.9906243 9.372864e-03 2.805175e-06
## Producto B 0.9987589 4.697520e-06 1.236430e-03
## Producto C 0.9963216 3.671268e-03 7.148094e-06
## Producto D 0.9996819 6.288193e-06 3.118226e-04

fviz_ca_row(ca_ht, col.row = "cos2",
            gradient.cols = c("orange", "yellow","salmon"), 
            repel = TRUE)

# Representacion de las columnas

ca_ht <- CA(productos[1:4,1:5], graph = F)
columnas <- get_ca_col(ca_ht)
head(columnas$coord)

##                 Dim 1         Dim 2         Dim 3
## Brillo    -0.10663528  0.0066182903  2.416632e-05
## Duración  -0.03455815 -0.0039632982 -1.300210e-03
## Olor      -0.07969751 -0.0043196923  9.923927e-04
## Comodidad  0.15557791  0.0011688864 -3.426479e-03
## Limpieza   0.16015525  0.0008835009  4.687188e-03

head(columnas$cos2)

##               Dim 1        Dim 2        Dim 3
## Brillo    0.9961627 3.837250e-03 5.116222e-08
## Duración  0.9856410 1.296377e-02 1.395228e-03
## Olor      0.9969167 2.928701e-03 1.545740e-04
## Comodidad 0.9994588 5.641747e-05 4.848029e-04
## Limpieza  0.9991138 3.040511e-05 8.557706e-04

fviz_ca_col(ca_ht, col.col = "cos2", 
            gradient.cols = c("yellow","salmon","steelblue"),
            repel = TRUE)

# Representacion de las filas y columnas

fviz_ca_biplot(ca_ht, repel = TRUE)

Aplique la prueba de Pearson para dependencia.

chisq <- chisq.test(productos)
chisq

## 
##  Pearson's Chi-squared test
## 
## data:  productos
## X-squared = 11.335, df = 20, p-value = 0.9371

balloonplot(t(as.table(productos)), xlab ="", ylab="",label = FALSE, show.margins = FALSE)

Explique los resultados.

Según los resultados los datos son independientes.

Problema 3

Para el conjunto de datos iris utilice análisis de clúster jerárquico.

data <-scale(iris[,1:4]) 
data_d <- dist(data,method = "euclidean")

Encuentre el dendograma con el método Single.

clust1 <-hclust(data_d,method = "single")
plot(clust1, hang = -0.01, cex = 0.6)

Dibuje el dendograma con el método complete.

clust1 <-hclust(data_d,method = "complete")
plot(clust1, hang = -0.01, cex = 0.6)

Para los siguientes incisos utilice el conjunto de datos position (posiciones iniciales de jugadores de futbol con 6 integrantes para cada equipo).

position <- read.csv("position.csv")
position <- position[,2:3]
position

##      x   y
## 1   -1   1
## 2   -2  -3
## 3    8   6
## 4    7  -8
## 5  -12   8
## 6  -15   0
## 7  -13 -10
## 8   15  16
## 9   21   2
## 10  12 -15
## 11 -25   1
## 12  26   0

Grafique las posiciones de los jugadores.

plot(position, main = 'Posición de jugadores')
labels <- 1:12
text(position$x, position$y, labels, cex = 0.7, pos = 1)

Aplique un análisis de clúster jerárquico con el método complete, corte el árbol con k= 3 y con k=4.

data_d <- dist(position, method = "euclidean")

# k = 3
clust3 <-hclust(data_d, method = "complete")
plot(clust3, hang = -0.01, cex = 0.6)
rect.hclust(clust3, k=3, border = "orange")

# k = 4
clust4 <-hclust(data_d, method = "complete")
plot(clust4, hang = -0.01, cex = 0.6)
rect.hclust(clust4, k=4, border = "orange")

Agregue al conjunto de datos una columna con el número de cluster al que pertenece.

df_analisis <- function(k, m, ajuste) {
  matriz <- matrix(data = 1:3, nrow = 1, ncol = 3)
  fila <- 1
  for (i in 1:k) {
    for (elemento in rownames(m)[ajuste==i]) {
      matriz <- rbind(matriz, c(m[fila,1], m[fila,2], i))
      fila <- fila + 1
    }
  }
  
  colnames(matriz) <- c("x", "y", "Cluster")
  matriz <- matriz[2:13, ]
  matriz
}

# k = 3
ajuste3 <- cutree(clust3, k = 3)
table(ajuste3)

## ajuste3
## 1 2 3 
## 6 4 2

print(rownames(position)[ajuste3==1])

## [1] "1"  "2"  "5"  "6"  "7"  "11"

print(rownames(position)[ajuste3==2])

## [1] "3"  "8"  "9"  "12"

print(rownames(position)[ajuste3==3])

## [1] "4"  "10"

df_analisis3 <- df_analisis(3, position, ajuste3)
df_analisis3

##         x   y Cluster
##  [1,]  -1   1       1
##  [2,]  -2  -3       1
##  [3,]   8   6       1
##  [4,]   7  -8       1
##  [5,] -12   8       1
##  [6,] -15   0       1
##  [7,] -13 -10       2
##  [8,]  15  16       2
##  [9,]  21   2       2
## [10,]  12 -15       2
## [11,] -25   1       3
## [12,]  26   0       3

# k = 4
ajuste4 <- cutree(clust3, k = 4)
table(ajuste4)

## ajuste4
## 1 2 3 4 
## 3 4 2 3

print(rownames(position)[ajuste4==1])

## [1] "1" "2" "7"

print(rownames(position)[ajuste4==2])

## [1] "3"  "8"  "9"  "12"

print(rownames(position)[ajuste4==3])

## [1] "4"  "10"

print(rownames(position)[ajuste4==4])

## [1] "5"  "6"  "11"

df_analisis4 <- df_analisis(4, position, ajuste4)
df_analisis4

##         x   y Cluster
##  [1,]  -1   1       1
##  [2,]  -2  -3       1
##  [3,]   8   6       1
##  [4,]   7  -8       2
##  [5,] -12   8       2
##  [6,] -15   0       2
##  [7,] -13 -10       2
##  [8,]  15  16       3
##  [9,]  21   2       3
## [10,]  12 -15       4
## [11,] -25   1       4
## [12,]  26   0       4

Grafique las posiciones agregando color para la variable cluster.

# k = 3
fviz_cluster(list(data = position, cluster = ajuste3), palette = c("red", "steelblue", "green"),
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_minimal())

# k = 4
fviz_cluster(list(data = position, cluster = ajuste4), palette = c("red", "steelblue", "green", "orange"),
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_minimal())

Repita con k=2 y el método War.

clust3 <-hclust(data_d, method = "ward.D2")
plot(clust3, hang = -0.01, cex=0.6)
rect.hclust(clust3, k = 2, border = "orange")

Repita con K=2 y el método kmeans.

kposition <- position

df_d <- dist(kposition)
set.seed(2)
km_p <- kmeans(df_d,nstart = 25,centers = 2)
rownames(kposition)[km_p$cluster==1]

## [1] "3"  "4"  "8"  "9"  "10" "12"

rownames(kposition)[km_p$cluster==2]

## [1] "1"  "2"  "5"  "6"  "7"  "11"

cluster = km_p$cluster
head(cbind(kposition, cluster))

##     x  y cluster
## 1  -1  1       2
## 2  -2 -3       2
## 3   8  6       1
## 4   7 -8       1
## 5 -12  8       2
## 6 -15  0       2

fviz_cluster(km_p, data = kposition,  palette=c("orange", "steelblue"),
             star.plot = TRUE,ellipse.type = "eclid",
             repel=T,
             ggtheme = theme_minimal())

Métodos multivariados - Tarea 3

Elmer Calel - 201213600

18/4/2020

Problema 1

Problema 2

Problema 3