class: center, middle, inverse

<img src="logo.png" width="250px" />

# CLUSTERING: K-MEANS AND DISCRIMINANT ANALYSIS

## MULTIVARIATE METHODS

### Students:

### Fabiola Aguilar - Javiera Ramírez - Sigrid Cespedes

### Prof. Marcelo Rodríguez

---
class: middle

## **LIBRARIES AND DATA**

***

```r
library(Metrics)
library(philentropy)  # distance() for the Czekanowski and Chebyshev metrics
library(ggplot2)
library(tidyverse)
library(readr)
library(factoextra)   # fviz_nbclust() for choosing the number of clusters

data <- read_delim("marketing_campaign.csv", delim = "\t",
                   escape_double = FALSE, trim_ws = TRUE)
```
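A quick structural check (a sketch added here, not part of the original deck) confirms that the tab-separated file was imported as expected:

```r
# Hedged check: dimensions and column overview of the imported data.
dim(data)      # number of customers and variables
glimpse(data)  # column names, types, and a preview of the values
```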
---
class: middle

## **PREPARING THE DATA**

***

- Selecting the variables to use

```r
newdata <- data %>%
  select_if(is.numeric) %>%
  select(-ID, -Kidhome, -Teenhome, -Year_Birth, -Complain, -Recency,
         -AcceptedCmp1, -AcceptedCmp2, -AcceptedCmp3, -AcceptedCmp4,
         -AcceptedCmp5, -Response, -Z_CostContact, -Z_Revenue,
         -NumCatalogPurchases, -NumStorePurchases, -NumWebPurchases,
         -NumWebVisitsMonth, -NumDealsPurchases)
```

- Finding NA values in each variable and replacing them with that variable's mean

```r
apply(is.na(newdata), 2, sum)
```

```
##          Income        MntWines       MntFruits MntMeatProducts 
##              24               0               0               0 
## MntFishProducts MntSweetProducts    MntGoldProds 
##               0                0               0
```

```r
mean_Income <- mean(newdata$Income[!is.na(newdata$Income)])  # mean of the observed incomes
newdata$Income[is.na(newdata$Income)] <- mean_Income         # impute the 24 missing values
```

---
class: middle

## **PREPARING THE DATA**

***

- Removing the outlier

.pull-left[

```r
boxplot(newdata$Income, lwd = 2,
        col = rgb(1, c(0,1), 0, alpha = 0.4),
        outpch = 25, outbg = "green",
        whiskcol = "blue", xlab = "Income")
```

```r
i <- which.max(newdata$Income)     # position of the extreme income
newdata[i,]$Income <- mean_Income  # replace it with the mean income
```
]

.pull-right[
<!-- -->
]

---
class: middle

## **NORMALIZING THE DATA**

***

### MIN-MAX:

`$$X_{*} = \dfrac{x-\min(x)}{\text{Range}(x)}$$`

```r
MIN_MAX <- function(d){
  d_ <- matrix(NA, nrow = nrow(d), ncol = ncol(d))
  name.s <- colnames(d)  # saving the variable names
  for (i in 1:ncol(d)) {
    d1 <- as.matrix((d[i] - min(d[i]))/(max(d[i]) - min(d[i])))
    d_[,i] <- d1
  }
  colnames(d_) <- name.s
  return(d_)
}
```

---
class: middle

## **NORMALIZING THE DATA**

***

### Z-score:

`$$X_{*} = \dfrac{x - \text{mean}(x)}{\text{SD}(x)}$$`

```r
P_Z <- function(d){
  d_ <- sapply(d, function (d) (d-mean(d))/sd(d))
  return(d_)
}
```

---
class: middle

## **NORMALIZING THE DATA**

***

### Logistic:

`$$X_{*} = \dfrac{1}{1+e^{-x}}$$`

```r
logi <- function(d){
  d_ <- matrix(NA, nrow = nrow(d), ncol = ncol(d))
  name.s <- colnames(d)  # saving the variable names
  for (i in 1:ncol(d)) {
    d1 <- as.matrix(1/(1 + exp(-d[i])))
    d_[,i] <- d1
  }
  colnames(d_) <- name.s
  return(d_)
}
```

---
class: middle

## **NORMALIZING THE DATA**

***

<!-- -->
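---
class: middle

## **NORMALIZING THE DATA**

***

A brief usage sketch (not in the original slides) of the three functions defined above on the cleaned data; note that the logistic map saturates near 1 for large raw values such as `Income`, so it is best suited to data that is already centered:

```r
# Hedged example: the same cleaned data under each normalization.
head(MIN_MAX(newdata), 2)  # values mapped into [0, 1]
head(P_Z(newdata), 2)      # mean 0, standard deviation 1 per column
head(logi(newdata), 2)     # values squashed into (0, 1) by 1/(1 + e^-x)
```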
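---
class: middle

## **NORMALIZING THE DATA**

***

The K-means slides that follow operate on a scaled matrix `newdata.sc`. The deck does not show which normalization produced it, so this sketch assumes the Z-score version:

```r
# Assumption: Z-score normalization builds the scaled matrix
# `newdata.sc` used by fviz_nbclust() and k_means() below.
newdata.sc <- P_Z(newdata)
```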
---
class: middle

## **K-MEANS ALGORITHM**

***

+ Setting the number of clusters to use

.pull-left[

```r
fviz_nbclust(x = newdata.sc, FUNcluster = kmeans,
             method = "silhouette", k.max = 15)
k <- 2  # number of clusters chosen from the silhouette plot
```

+ Initial centroids

```r
centroides_0 <- function(d, c){
  i <- sample(1:nrow(d), c, replace = F)  # draw c distinct rows at random
  mk0 <- d[i,]                            # use them as starting centroids
  return(mk0)
}
```
]

.pull-right[
<!-- -->
]

---
class: middle

# **K-MEANS ALGORITHM**

***

### Distances

+ Scaled Euclidean distance:

`$$\sqrt{(x_{i}-x_{q})^{T} D^{-1} (x_{i}-x_{q})}$$`

where `\(D\)` is the diagonal matrix of sample variances.

```r
eucl_dist <- function(d, m, c){
  dist <- matrix(NA, nrow = nrow(d), ncol = 1)
  var_data <- diag(var(d))  # sample variance of each variable
  S <- diag(1/var_data)     # D^-1: each squared difference is divided by its variance
  for (i in 1:nrow(d)){     # looping over the rows of the data
    dist1 <- sqrt(t((d[i,]) - m) %*% S %*% ((d[i,]) - m))  # matrix form of the scaled Euclidean distance
    dist[i,] <- dist1       # storing the distances in the matrix dist
  }
  return(dist)
}
```

---
class: middle

# **K-MEANS ALGORITHM**

***

+ Canberra distance

`$$\sum_{k=1}^{n} \dfrac{|x_{ik}-x_{jk}|}{|x_{ik}|+|x_{jk}|}$$`

```r
canberra_dist <- function(d, m, c){
  dist1 <- c()
  for (i in 1:nrow(d)) {
    dist <- sum(abs(d[i,] - m)/(abs(d[i,]) + abs(m)))  # Canberra sum over the variables
    dist1 <- append(dist1, dist)
  }
  return(dist1)
}
```

---
class: middle

# **K-MEANS ALGORITHM**

***

+ Czekanowski distance (as implemented in `philentropy`):

`$$\dfrac{\sum_{k=1}^{n}|x_{ik}-x_{jk}|}{\sum_{k=1}^{n}(x_{ik}+x_{jk})}$$`

```r
czekanowski_dist <- function(d, m, c){
  dist <- matrix(NA, nrow = nrow(d), ncol = c)
  for (i in 1:nrow(d)) {
    for (j in 1:c) {
      d_ <- rbind(d[i,], m[j,])  # pair: observation i and centroid j
      dist[i,j] <- distance(d_, method = "czekanowski", mute.message = T)
    }
  }
  return(dist)
}
```

---
class: middle

# **K-MEANS ALGORITHM**

***

+ Chebyshev distance

`$$\max_{k} |x_{ik}-x_{jk}|$$`

```r
chebyshev_dist <- function(d, m, c){
  dist <- matrix(NA, nrow = nrow(d), ncol = c)
  for (i in 1:nrow(d)) {
    for (j in 1:c) {
      d_ <- rbind(d[i,], m[j,])  # pair: observation i and centroid j
      dist[i,j] <- distance(d_, method = "chebyshev", mute.message = T)
    }
  }
  return(dist)
}
```

---
class: middle

# **K-MEANS ALGORITHM**

***

.pull-left[

### Stopping criteria:

+ Cap the number of iterations at 100.
+ Fisher's ANOVA criterion (stop once the clusters differ significantly, i.e. `\(p<0.05\)`)

```r
crit_par <- function(d, crit, count){
  if (crit == "1"){
    condition <- (count < 100)  # criterion 1: iteration cap
  } else {
    if (crit == "2"){
      # criterion 2: one-way ANOVA of each variable on the cluster labels
      f  <- matrix(NA, nrow = 1, ncol = (ncol(d)-1))    # p-values
      f_ <- matrix(0.05, nrow = 1, ncol = (ncol(d)-1))  # significance threshold
      clus <- factor(d[,ncol(d)])
```
]

.pull-right[

```r
      clus <- factor(d[,ncol(d)])
      for (i in 1:(ncol(d)-1)) {
        Y1 <- as.vector(d[,i])
        Y2 <- as.numeric(Y1)
        resultado <- aov(Y2~clus)
        result <- anova(resultado)
        f[,i] <- result[1,5]  # Pr(>F) for variable i
      }
      condition <- (f > f_)  # TRUE where a variable is still non-significant
      matriz <- matrix(T, nrow = 1, ncol = (ncol(d)-1))
      if (identical(condition, matriz)){
        condition <- T  # keep iterating: no significant differences yet
      } else {
        condition <- F  # stop: at least one variable differs significantly
      }
    }
  }
  return(condition)
}
```
]

---
class: middle

# **K-MEANS ALGORITHM**

***

.pull-left[

```r
k_means <- function(d, c, class, crit){
  d <- as.matrix(d)
  names <- colnames(d)  # column names of the data
  mk0 <- matrix(NA, nrow = c, ncol = ncol(d))       # matrix of initial centroids
  mk  <- matrix(NA, nrow = c, ncol = ncol(d)+1)     # matrix of updated centroids
  dist <- matrix(NA, nrow = nrow(d), ncol = c)      # distances between centroids and data
  clusters <- matrix(NA, nrow = nrow(d), ncol = 1)  # cluster labels (entries: 1, 2, ..., c)
  set.seed(327)  # seed for generating the initial centroids
  mk0 <- centroides_0(d, c)
  dimnames(mk0) <- list(1:c, names)  # renaming rows and columns of the centroid matrix
  iter <- 0
  condition <- TRUE
  count <- 1
  while (condition == TRUE) {  # iterate until the stopping criterion fails
    d <- as.matrix(d)
    for (i in 1:c){  # looping over the clusters, i.e. the rows of the centroid matrix
      if (class == "chebyshev"){
        dist <- chebyshev_dist(d, mk0, c)
      } else {
        if (class == "canberra"){
          dist[,i] <- canberra_dist(d, mk0[i,], c)
```
]

.pull-right[

```r
        } else {
          if (class == "czekanowski"){
            dist <- czekanowski_dist(d, mk0, c)
          } else {
            dist[,i] <- eucl_dist(d, mk0[i,], c)
          }
        }
      }
    }
    clusters <- as.matrix(apply(dist, 1, which.min))  # index of the nearest centroid
    d_new <- cbind(d, clusters)
    colnames(d_new)[ncol(d_new)] <- "clusters"  # name the label column so filter() can see it
    condition <- crit_par(d_new, crit, count)
    for (i in 1:nrow(mk0)){  # filter by cluster number and average each column
      mk[i,] <- as.data.frame(d_new) %>% filter(clusters == i) %>% colMeans()  # new centroid
    }
    mk0 <- mk[,-(ncol(mk))]  # dropping the last column (the cluster labels)
    iter <- iter + 1         # iteration counter
    count <- count + 1
  }
  dimnames(mk0) <- list(1:c, names)
  d_new1 <- cbind(d, clusters)
  colnames(d_new1)[ncol(d_new1)] <- "clusters"
  return(list(mk0, iter, d_new1))
}
```
]

---
class: middle

# **RESULTS**

***

```r
results <- k_means(d = newdata.sc, k, class = "eucl", crit = "1")
names(results) <- c("Centroides", "Iteraciones", "Datos")
results$Centroides
results$Iteraciones
```
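---
class: middle

# **RESULTS**

***

As a sanity check (a sketch, not part of the original deck), the custom assignments can be cross-tabulated against base R's `kmeans()` on the same scaled data; the two label sets are arbitrary, so agreement shows up as one dominant count per row:

```r
# Hedged comparison with stats::kmeans(); cluster labels may be permuted.
set.seed(327)
base_km <- kmeans(newdata.sc, centers = k, nstart = 25)
table(custom = results$Datos[, "clusters"], base = base_km$cluster)
```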
---
class: center, middle, inverse

# THANK YOU FOR YOUR ATTENTION!