install.packages("dplyr")
library("dplyr")

We load the data, indicating that the first row contains the field names.

df <- read.csv("Wholesale_customers_data.csv", header = TRUE)

We examine the structure of the data and convert the categorical columns to factors.

str(df)
head(df)
df$Channel <- as.factor(df$Channel)
df$Region <- as.factor(df$Region)

We rename the levels of the categorical variables for easier interpretation.

library("plyr")
df$Channel <- revalue(df$Channel, c("1" = "Horeca", "2" = "Retail"))
df$Region <- revalue(df$Region, c("1" = "Lisbon", "2" = "Oporto", "3" = "Other"))

Boxplots of spending on each product category by channel

library(ggplot2)
library(gridExtra)

grocery <- ggplot(df, aes(x = Channel, y = Grocery, fill = Channel)) + geom_boxplot() + theme(legend.position = "none") + xlab("")
delicassen <- ggplot(df, aes(x = Channel, y = Delicassen, fill = Channel)) + geom_boxplot() + theme(legend.position = "none") + xlab("")
detergents <- ggplot(df, aes(x = Channel, y = Detergents_Paper, fill = Channel)) + geom_boxplot() + theme(legend.position = "none") + xlab("")
milk <- ggplot(df, aes(x = Channel, y = Milk, fill = Channel)) + geom_boxplot() + theme(legend.position = "none") + xlab("")
fresh <- ggplot(df, aes(x = Channel, y = Fresh, fill = Channel)) + geom_boxplot() + theme(legend.position = "none") + xlab("")
frozen <- ggplot(df, aes(x = Channel, y = Frozen, fill = Channel)) + geom_boxplot() + theme(legend.position = "none") + xlab("")

We arrange all the boxplots in a single grid (FIGURE 1)

grid.arrange(fresh, milk, grocery, frozen, detergents, delicassen, ncol = 3)

Boxplot of Fresh spending for the Horeca channel (FIGURE 2)

ggplot(filter(df, Channel == "Horeca"), aes(x = Channel, y = Fresh, fill = Channel)) + geom_boxplot() + theme(legend.position = "none") + xlab("") + geom_jitter(alpha = 0.25)

###########################
# HIERARCHICAL ALGORITHMS  #
###########################

We split the data by channel and drop the first column (Channel), since it is already known within each subset.

df_horeca <- subset(df, Channel == "Horeca")
df_horeca <- df_horeca[, -1]

We load the libraries for the clustering methods.

library(tidyverse)
library(cluster)
library(factoextra)
library(dendextend)

We again split the set of customers into the three regions and drop the Region column.

df_horeca_op <- subset(df_horeca, Region == "Oporto")
df_horeca_li <- subset(df_horeca, Region == "Lisbon")
df_horeca_oth <- subset(df_horeca, Region == "Other")

df_horeca_op <- df_horeca_op[-1]
df_horeca_li <- df_horeca_li[-1]
df_horeca_oth <- df_horeca_oth[-1]

We standardize the variables.

df_horeca_op <- scale(df_horeca_op)
df_horeca_li <- scale(df_horeca_li)
df_horeca_oth <- scale(df_horeca_oth)
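
As a quick sanity check (not part of the original script), the scaled matrices should now have column means of roughly 0 and standard deviations of 1:

round(colMeans(df_horeca_op), 10)  # means should be (numerically) zero
apply(df_horeca_op, 2, sd)         # standard deviations should all be 1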

We build a dendrogram for each region to look for outliers.

Distance matrices

dist_matrix_op <- dist(df_horeca_op, method = "euclidean")
dist_matrix_li <- dist(df_horeca_li, method = "euclidean")
dist_matrix_oth <- dist(df_horeca_oth, method = "euclidean")

Agglomerative clustering with Ward's method

hc_op <- hclust(dist_matrix_op, method = "ward.D2")
hc_li <- hclust(dist_matrix_li, method = "ward.D2")
hc_oth <- hclust(dist_matrix_oth, method = "ward.D2")
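
One optional diagnostic, not in the original analysis, is the cophenetic correlation, which measures how faithfully a dendrogram preserves the original pairwise distances (values close to 1 indicate a good fit):

# correlation between the tree's cophenetic distances and the original distance matrix
cor(dist_matrix_oth, cophenetic(hc_oth))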

Plots of the dendrograms with the cut marked (FIGURES 3, 4 AND 5)

plot(hc_op, hang = -1, cex = 0.5); rect.hclust(hc_op, k = 4, border = 2:6)
plot(hc_li, hang = -1, cex = 0.6); rect.hclust(hc_li, k = 4, border = 2:6)
plot(hc_oth, hang = -1, cex = 0.6); rect.hclust(hc_oth, k = 4, border = 2:6)

We cut the dendrogram into 4 clusters

groups <- cutree(hc_oth, k = 4)
# Define a function that shows which cluster each observation belongs to
print_clusters <- function(labels, k) {
  for (i in 1:k) {
    print(paste("cluster", i))
    print(df_horeca_oth[labels == i, c("Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicassen")])
  }
}
# Call the function
print_clusters(groups, 4)
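
Before inspecting each group in detail, a simple frequency table (a small check added here, not in the original code) shows how many observations fall into each cluster:

table(groups)  # number of observations assigned to each of the 4 clusters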

PRINCIPAL COMPONENT ANALYSIS

install.packages("devtools")
library(devtools)
install_github('sinhrks/ggfortify')
library(ggfortify)
library(ggplot2)

We define a function that removes the outliers (values further than 1.5 * IQR from the quartiles are set to NA).

remove_outliers <- function(x, na.rm = TRUE) {
  qnt <- quantile(x, probs = c(.25, .75), na.rm = na.rm)
  H <- 1.5 * IQR(x, na.rm = na.rm)
  y <- x
  y[x < (qnt[1] - H)] <- NA
  y[x > (qnt[2] + H)] <- NA
  y
}
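
A minimal illustration of the function on a hypothetical toy vector: values beyond 1.5 * IQR from the quartiles are replaced by NA.

remove_outliers(c(3, 5, 4, 6, 100))  # returns 3 5 4 6 NA; 100 lies above Q3 + 1.5 * IQR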

We remove them by calling the function above on each spending variable.

df2 <- df
df2$Delicassen <- remove_outliers(df2$Delicassen)
df2$Detergents_Paper <- remove_outliers(df2$Detergents_Paper)
df2$Frozen <- remove_outliers(df2$Frozen)
df2$Grocery <- remove_outliers(df2$Grocery)
df2$Milk <- remove_outliers(df2$Milk)
df2$Fresh <- remove_outliers(df2$Fresh)
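
Before dropping rows it can be useful (an extra check, not in the original script) to see how many NA values the outlier removal introduced in each column:

colSums(is.na(df2))  # count of NA values per column after outlier removal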

We drop every row that contains an NA value.

df2<-na.omit(df2)

We run a principal component analysis on the data without outliers.

pca <- prcomp(df2[-(1:2)], scale. = TRUE)
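
A quick look at the variance explained by each component (an added check, not in the original script) helps justify working mainly with the first two components in the plots below:

summary(pca)  # standard deviation and proportion of variance per principal component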

autoplot(pca, loadings = TRUE, loadings.label = TRUE, data = df2[-2], colour = 'Channel') # basic plot

We plot the PCA result as a biplot (FIGURE 6)

fviz_pca_biplot(pca, geom.ind = "point", fill.ind = df2$Channel, col.ind = "black",
                pointshape = 21, pointsize = 2, palette = "jco", addEllipses = TRUE,
                alpha.var = "contrib", col.var = "contrib", gradient.cols = "RdYlBu",
                legend.title = list(fill = "Channel", color = "Contrib", alpha = "Contrib"))
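
As a complementary view (an assumed extra, not in the original script), factoextra can also draw a scree plot of the variance explained by each component:

fviz_eig(pca, addlabels = TRUE)  # scree plot with percentage labels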

K-MEANS CLUSTERING

Silhouette method

fviz_nbclust(scale(df2[-(1:2)]), kmeans, method = "silhouette")
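
As a cross-check on the number of clusters (not part of the original script), the elbow method based on the total within-cluster sum of squares can be plotted with the same helper:

fviz_nbclust(scale(df2[-(1:2)]), kmeans, method = "wss")  # elbow (within sum of squares) method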

K-Means

k2 <- kmeans(scale(df2[-(1:2)]), centers = 2, nstart = 25)
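
A quick inspection of the fitted object (an added step, not in the original script) shows the size of each cluster and its centroid in the standardized variables:

k2$size     # number of observations per cluster
k2$centers  # cluster centroids (standardized units)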

Cluster plot (FIGURE 7)

fviz_cluster(k2, geom = "point", data = scale(df2[-(1:2)]), main = "Cluster plot",
             xlab = 'Primera Componente Principal', ylab = 'Segunda Componente Principal',
             ellipse.type = "norm", ggtheme = theme_classic(), palette = "jco")

Comparison of how the clusters fit the Horeca and Retail channels

df3 <- cbind(k2$cluster, df2)
df3$`k2$cluster` <- as.factor(df3$`k2$cluster`)
df3$`k2$cluster` <- revalue(df3$`k2$cluster`, c("1" = "Horeca", "2" = "Retail"))

Comparative plots (FIGURES 8 AND 9)

plot(df3[-(1:3)], col = df3$Channel, main = "Representación de los Datos", pch = 20, cex = 0.5)
plot(df3[-(1:3)], col = k2$cluster, main = "Representación del Cluster", pch = 20, cex = 0.5)

Confusion Matrix

install.packages("caret")
library('caret')
matriz_de_confusion <- confusionMatrix(df3$`k2$cluster`, df3$Channel)
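
Printing the object (a small addition to the original script) shows the cross-tabulation of clusters against channels together with the overall accuracy:

matriz_de_confusion                       # full confusion matrix and associated statistics
matriz_de_confusion$overall["Accuracy"]   # overall agreement between clusters and channels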

install.packages("GGally")
library(GGally)
# Correlation (FIGURE 10)
ggcorr(df[-(1:2)], label = TRUE, legend.position = "left", hjust = 0.8)
ggcorr(df2[-(1:2)], label = TRUE, legend.position = "left", hjust = 0.8)
# Alternative version
# ggcorr(df[-(1:2)], label = TRUE, hjust = 0.75, geom = 'blank') +
#   geom_point(size = 15, aes(color = coefficient < 0, alpha = abs(coefficient) > 0.4)) +
#   scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
#   guides(color = FALSE, alpha = FALSE)

OTHER STATISTICAL CONSIDERATIONS

Libraries for plotting the correlations

install.packages("GGally")
library(GGally)

Correlation (FIGURE 11)

ggcorr(df[-(1:2)], label = TRUE, legend.position = "left", hjust = 0.8)
ggcorr(df2[-(1:2)], label = TRUE, legend.position = "left", hjust = 0.8)

We split the data by region and channel

df_li_ho <- filter(df, Channel == "Horeca", Region == "Lisbon")
df_op_ho <- filter(df, Channel == "Horeca", Region == "Oporto")
df_ot_ho <- filter(df, Channel == "Horeca", Region == "Other")
df_ot_re <- filter(df, Channel == "Retail", Region == "Other")
df_op_re <- filter(df, Channel == "Retail", Region == "Oporto")
df_li_re <- filter(df, Channel == "Retail", Region == "Lisbon")
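
A quick count of customers per channel and region combination (an added check, not in the original code) shows how balanced the six subsets are:

table(df$Channel, df$Region)  # number of customers in each channel/region combination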

We assign the correlation plots to 6 variables (FIGURE 12)

p1 <- ggcorr(df_li_ho[-(1:2)], label = TRUE, hjust = 0.8, geom = 'blank') +
  geom_point(size = 15, aes(color = coefficient < 0, alpha = abs(coefficient) > 0.7)) +
  scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
  guides(color = FALSE, alpha = FALSE)
p2 <- ggcorr(df_op_ho[-(1:2)], label = TRUE, hjust = 0.8, geom = 'blank') +
  geom_point(size = 15, aes(color = coefficient < 0, alpha = abs(coefficient) > 0.7)) +
  scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
  guides(color = FALSE, alpha = FALSE)
p3 <- ggcorr(df_ot_ho[-(1:2)], label = TRUE, hjust = 0.8, geom = 'blank') +
  geom_point(size = 15, aes(color = coefficient < 0, alpha = abs(coefficient) > 0.7)) +
  scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
  guides(color = FALSE, alpha = FALSE)
p4 <- ggcorr(df_li_re[-(1:2)], label = TRUE, hjust = 0.8, geom = 'blank') +
  geom_point(size = 15, aes(color = coefficient < 0, alpha = abs(coefficient) > 0.7)) +
  scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
  guides(color = FALSE, alpha = FALSE)
p5 <- ggcorr(df_op_re[-(1:2)], label = TRUE, hjust = 0.8, geom = 'blank') +
  geom_point(size = 15, aes(color = coefficient < 0, alpha = abs(coefficient) > 0.7)) +
  scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
  guides(color = FALSE, alpha = FALSE)
p6 <- ggcorr(df_ot_re[-(1:2)], label = TRUE, hjust = 0.8, geom = 'blank') +
  geom_point(size = 15, aes(color = coefficient < 0, alpha = abs(coefficient) > 0.7)) +
  scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
  guides(color = FALSE, alpha = FALSE)

Multiplot

install.packages("devtools")
library(devtools)
install_github("kassambara/easyGgplot2")
library(easyGgplot2)
ggplot2.multiplot(p1, p2, p3, p4, p5, p6, cols = 3)

3D representation (FIGURE 13)

install.packages("rgl")
library("rgl")
require(plotly)
plot_ly(df2, x = ~Detergents_Paper, y = ~Milk, z = ~Grocery, color = ~Channel) %>% add_markers()

Appendices

df_est <- scale(df[-c(1, 2)])
df_est <- cbind(df[1:2], df_est)
ggpairs(df_est[-2], aes(colour = Channel, alpha = 0.4), title = 'Resumen estadistico por canales')
ggpairs(df_est[-1], aes(colour = Region, alpha = 0.4), title = 'Resumen estadistico por regiones')