class: center, middle, inverse

<img src="logo.png" width="250px" />

# CLUSTERING: K-MEANS AND DISCRIMINANT ANALYSIS

## MULTIVARIATE METHODS

### Students:

### Fabiola Aguilar - Javiera Ramírez - Sigrid Cespedes

### Prof. Marcelo Rodríguez

---
class: middle

## **LIBRARIES AND DATA**

***

```r
library(Metrics)
library(philentropy)  # distance() for the Czekanowski and Chebyshev metrics
library(ggplot2)
library(tidyverse)
library(readr)
library(factoextra)   # fviz_nbclust() for choosing the number of clusters

data <- read_delim("marketing_campaign.csv", delim = "\t",
                   escape_double = FALSE, trim_ws = TRUE)
```
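A quick structural check (a sketch added here, not part of the original deck) confirms that the tab-separated file was imported as expected:

```r
# Hedged check: dimensions and column overview of the imported data.
dim(data)      # number of customers and variables
glimpse(data)  # column names, types, and a preview of the values
```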
---
class: middle

## **PREPARING THE DATA**

***

- Selecting the variables to use

```r
newdata <- data %>%
  select_if(is.numeric) %>%
  select(-ID, -Kidhome, -Teenhome, -Year_Birth, -Complain, -Recency,
         -AcceptedCmp1, -AcceptedCmp2, -AcceptedCmp3, -AcceptedCmp4,
         -AcceptedCmp5, -Response, -Z_CostContact, -Z_Revenue,
         -NumCatalogPurchases, -NumStorePurchases, -NumWebPurchases,
         -NumWebVisitsMonth, -NumDealsPurchases)
```

- Finding NA values in each variable and replacing them with that variable's mean

```r
apply(is.na(newdata), 2, sum)
```

```
##          Income        MntWines       MntFruits MntMeatProducts 
##              24               0               0               0 
## MntFishProducts MntSweetProducts    MntGoldProds 
##               0                0               0
```

```r
mean_Income <- mean(newdata$Income[!is.na(newdata$Income)])  # mean of the observed incomes
newdata$Income[is.na(newdata$Income)] <- mean_Income         # impute the 24 missing values
```

---
class: middle

## **PREPARING THE DATA**

***

- Removing the outlier

.pull-left[

```r
boxplot(newdata$Income, lwd = 2,
        col = rgb(1, c(0,1), 0, alpha = 0.4),
        outpch = 25, outbg = "green",
        whiskcol = "blue", xlab = "Income")
```

```r
i <- which.max(newdata$Income)     # position of the extreme income
newdata[i,]$Income <- mean_Income  # replace it with the mean income
```
]

.pull-right[
<!-- -->
]

---
class: middle

## **NORMALIZING THE DATA**

***

### MIN-MAX:

`$$X_{*} = \dfrac{x-\min(x)}{\text{Range}(x)}$$`

```r
MIN_MAX <- function(d){
  d_ <- matrix(NA, nrow = nrow(d), ncol = ncol(d))
  name.s <- colnames(d)  # saving the variable names
  for (i in 1:ncol(d)) {
    d1 <- as.matrix((d[i] - min(d[i]))/(max(d[i]) - min(d[i])))
    d_[,i] <- d1
  }
  colnames(d_) <- name.s
  return(d_)
}
```

---
class: middle

## **NORMALIZING THE DATA**

***

### Z-score:

`$$X_{*} = \dfrac{x - \text{mean}(x)}{\text{SD}(x)}$$`

```r
P_Z <- function(d){
  d_ <- sapply(d, function (d) (d-mean(d))/sd(d))
  return(d_)
}
```

---
class: middle

## **NORMALIZING THE DATA**

***

### Logistic:

`$$X_{*} = \dfrac{1}{1+e^{-x}}$$`

```r
logi <- function(d){
  d_ <- matrix(NA, nrow = nrow(d), ncol = ncol(d))
  name.s <- colnames(d)  # saving the variable names
  for (i in 1:ncol(d)) {
    d1 <- as.matrix(1/(1 + exp(-d[i])))
    d_[,i] <- d1
  }
  colnames(d_) <- name.s
  return(d_)
}
```

---
class: middle

## **NORMALIZING THE DATA**

***

<!-- -->
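---
class: middle

## **NORMALIZING THE DATA**

***

A brief usage sketch (not in the original slides) of the three functions defined above on the cleaned data; note that the logistic map saturates near 1 for large raw values such as `Income`, so it is best suited to data that is already centered:

```r
# Hedged example: the same cleaned data under each normalization.
head(MIN_MAX(newdata), 2)  # values mapped into [0, 1]
head(P_Z(newdata), 2)      # mean 0, standard deviation 1 per column
head(logi(newdata), 2)     # values squashed into (0, 1) by 1/(1 + e^-x)
```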
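---
class: middle

## **NORMALIZING THE DATA**

***

The K-means slides that follow operate on a scaled matrix `newdata.sc`. The deck does not show which normalization produced it, so this sketch assumes the Z-score version:

```r
# Assumption: Z-score normalization builds the scaled matrix
# `newdata.sc` used by fviz_nbclust() and k_means() below.
newdata.sc <- P_Z(newdata)
```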
---
class: middle

## **K-MEANS ALGORITHM**

***

+ Setting the number of clusters to use

.pull-left[

```r
fviz_nbclust(x = newdata.sc, FUNcluster = kmeans,
             method = "silhouette", k.max = 15)
k <- 2  # number of clusters chosen from the silhouette plot
```

+ Initial centroids

```r
centroides_0 <- function(d, c){
  i <- sample(1:nrow(d), c, replace = F)  # draw c distinct rows at random
  mk0 <- d[i,]                            # use them as starting centroids
  return(mk0)
}
```
]

.pull-right[
<!-- -->
]

---
class: middle

# **K-MEANS ALGORITHM**

***

### Distances

+ Scaled Euclidean distance:

`$$\sqrt{(x_{i}-x_{q})^{T} D^{-1} (x_{i}-x_{q})}$$`

where `\(D\)` is the diagonal matrix of sample variances.

```r
eucl_dist <- function(d, m, c){
  dist <- matrix(NA, nrow = nrow(d), ncol = 1)
  var_data <- diag(var(d))  # sample variance of each variable
  S <- diag(1/var_data)     # D^-1: each squared difference is divided by its variance
  for (i in 1:nrow(d)){     # looping over the rows of the data
    dist1 <- sqrt(t((d[i,]) - m) %*% S %*% ((d[i,]) - m))  # matrix form of the scaled Euclidean distance
    dist[i,] <- dist1       # storing the distances in the matrix dist
  }
  return(dist)
}
```

---
class: middle

# **K-MEANS ALGORITHM**

***

+ Canberra distance

`$$\sum_{k=1}^{n} \dfrac{|x_{ik}-x_{jk}|}{|x_{ik}|+|x_{jk}|}$$`

```r
canberra_dist <- function(d, m, c){
  dist1 <- c()
  for (i in 1:nrow(d)) {
    dist <- sum(abs(d[i,] - m)/(abs(d[i,]) + abs(m)))  # Canberra sum over the variables
    dist1 <- append(dist1, dist)
  }
  return(dist1)
}
```

---
class: middle

# **K-MEANS ALGORITHM**

***

+ Czekanowski distance (as implemented in `philentropy`):

`$$\dfrac{\sum_{k=1}^{n}|x_{ik}-x_{jk}|}{\sum_{k=1}^{n}(x_{ik}+x_{jk})}$$`

```r
czekanowski_dist <- function(d, m, c){
  dist <- matrix(NA, nrow = nrow(d), ncol = c)
  for (i in 1:nrow(d)) {
    for (j in 1:c) {
      d_ <- rbind(d[i,], m[j,])  # pair: observation i and centroid j
      dist[i,j] <- distance(d_, method = "czekanowski", mute.message = T)
    }
  }
  return(dist)
}
```

---
class: middle

# **K-MEANS ALGORITHM**

***

+ Chebyshev distance

`$$\max_{k} |x_{ik}-x_{jk}|$$`

```r
chebyshev_dist <- function(d, m, c){
  dist <- matrix(NA, nrow = nrow(d), ncol = c)
  for (i in 1:nrow(d)) {
    for (j in 1:c) {
      d_ <- rbind(d[i,], m[j,])  # pair: observation i and centroid j
      dist[i,j] <- distance(d_, method = "chebyshev", mute.message = T)
    }
  }
  return(dist)
}
```

---
class: middle

# **K-MEANS ALGORITHM**

***

.pull-left[

### Stopping criteria:

+ Cap the number of iterations at 100.
+ Fisher's ANOVA criterion (stop once the clusters differ significantly, i.e. `\(p<0.05\)`)

```r
crit_par <- function(d, crit, count){
  if (crit == "1"){
    condition <- (count < 100)  # criterion 1: iteration cap
  } else {
    if (crit == "2"){
      # criterion 2: one-way ANOVA of each variable on the cluster labels
      f  <- matrix(NA, nrow = 1, ncol = (ncol(d)-1))    # p-values
      f_ <- matrix(0.05, nrow = 1, ncol = (ncol(d)-1))  # significance threshold
      clus <- factor(d[,ncol(d)])
```
]

.pull-right[

```r
      clus <- factor(d[,ncol(d)])
      for (i in 1:(ncol(d)-1)) {
        Y1 <- as.vector(d[,i])
        Y2 <- as.numeric(Y1)
        resultado <- aov(Y2~clus)
        result <- anova(resultado)
        f[,i] <- result[1,5]  # Pr(>F) for variable i
      }
      condition <- (f > f_)  # TRUE where a variable is still non-significant
      matriz <- matrix(T, nrow = 1, ncol = (ncol(d)-1))
      if (identical(condition, matriz)){
        condition <- T  # keep iterating: no significant differences yet
      } else {
        condition <- F  # stop: at least one variable differs significantly
      }
    }
  }
  return(condition)
}
```
]

---
class: middle

# **K-MEANS ALGORITHM**

***

.pull-left[

```r
k_means <- function(d, c, class, crit){
  d <- as.matrix(d)
  names <- colnames(d)  # column names of the data
  mk0 <- matrix(NA, nrow = c, ncol = ncol(d))       # matrix of initial centroids
  mk  <- matrix(NA, nrow = c, ncol = ncol(d)+1)     # matrix of updated centroids
  dist <- matrix(NA, nrow = nrow(d), ncol = c)      # distances between centroids and data
  clusters <- matrix(NA, nrow = nrow(d), ncol = 1)  # cluster labels (entries: 1, 2, ..., c)
  set.seed(327)  # seed for generating the initial centroids
  mk0 <- centroides_0(d, c)
  dimnames(mk0) <- list(1:c, names)  # renaming rows and columns of the centroid matrix
  iter <- 0
  condition <- TRUE
  count <- 1
  while (condition == TRUE) {  # iterate until the stopping criterion fails
    d <- as.matrix(d)
    for (i in 1:c){  # looping over the clusters, i.e. the rows of the centroid matrix
      if (class == "chebyshev"){
        dist <- chebyshev_dist(d, mk0, c)
      } else {
        if (class == "canberra"){
          dist[,i] <- canberra_dist(d, mk0[i,], c)
```
]

.pull-right[

```r
        } else {
          if (class == "czekanowski"){
            dist <- czekanowski_dist(d, mk0, c)
          } else {
            dist[,i] <- eucl_dist(d, mk0[i,], c)
          }
        }
      }
    }
    clusters <- as.matrix(apply(dist, 1, which.min))  # index of the nearest centroid
    d_new <- cbind(d, clusters)
    colnames(d_new)[ncol(d_new)] <- "clusters"  # name the label column so filter() can see it
    condition <- crit_par(d_new, crit, count)
    for (i in 1:nrow(mk0)){  # filter by cluster number and average each column
      mk[i,] <- as.data.frame(d_new) %>% filter(clusters == i) %>% colMeans()  # new centroid
    }
    mk0 <- mk[,-(ncol(mk))]  # dropping the last column (the cluster labels)
    iter <- iter + 1         # iteration counter
    count <- count + 1
  }
  dimnames(mk0) <- list(1:c, names)
  d_new1 <- cbind(d, clusters)
  colnames(d_new1)[ncol(d_new1)] <- "clusters"
  return(list(mk0, iter, d_new1))
}
```
]

---
class: middle

# **RESULTS**

***

```r
results <- k_means(d = newdata.sc, k, class = "eucl", crit = "1")
names(results) <- c("Centroides", "Iteraciones", "Datos")
results$Centroides
results$Iteraciones
```
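---
class: middle

# **RESULTS**

***

As a sanity check (a sketch, not part of the original deck), the custom assignments can be cross-tabulated against base R's `kmeans()` on the same scaled data; the two label sets are arbitrary, so agreement shows up as one dominant count per row:

```r
# Hedged comparison with stats::kmeans(); cluster labels may be permuted.
set.seed(327)
base_km <- kmeans(newdata.sc, centers = k, nstart = 25)
table(custom = results$Datos[, "clusters"], base = base_km$cluster)
```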
---
class: center, middle, inverse

# THANK YOU FOR YOUR ATTENTION!