Análisis de Clusters

TEORÍA

# 1. Crear base de datos
df <- data.frame(x=c(2,2,8,5,7,6,1,4),
                 y=c(10,5,4,8,5,4,2,9))

# 2. Determinar el número de grupos
grupos <- 3

# 3. Realizar la Clasificación
segmentos <- kmeans(df,grupos)
segmentos

## K-means clustering with 3 clusters of sizes 3, 2, 3
## 
## Cluster means:
##          x        y
## 1 3.666667 9.000000
## 2 1.500000 3.500000
## 3 7.000000 4.333333
## 
## Clustering vector:
## [1] 1 2 3 1 3 3 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 6.666667 5.000000 2.666667
##  (between_SS / total_SS =  85.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# 4. Revisar la Asignación de los Grupos
asignacion <- cbind(df, cluster=segmentos$cluster)
asignacion

##   x  y cluster
## 1 2 10       1
## 2 2  5       2
## 3 8  4       3
## 4 5  8       1
## 5 7  5       3
## 6 6  4       3
## 7 1  2       2
## 8 4  9       1

# 5. Graficar Resultados
library(ggplot2)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

fviz_cluster(segmentos, data=df,
             palette=c("red","blue","black"),
             ellipse.type = "euclid",
             star.plot=T,
             repel= T,
             ggtheme = theme())

## Too few points to calculate an ellipse

## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

# 6. Optimizar Cantidad de Grupos
library(cluster)
library(data.table)

set.seed(123)
optimizacion <- clusGap(df, FUN=kmeans, nstart=1, K.max = 7)

plot(optimizacion, xlab="Número de Clusters K")

# El punto más alto de la gráfica indica la cantidad de grupos óptimo.

EJERCICIO

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()     masks data.table::between()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::first()       masks data.table::first()
## ✖ lubridate::hour()    masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ dplyr::last()        masks data.table::last()
## ✖ lubridate::mday()    masks data.table::mday()
## ✖ lubridate::minute()  masks data.table::minute()
## ✖ lubridate::month()   masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second()  masks data.table::second()
## ✖ purrr::transpose()   masks data.table::transpose()
## ✖ lubridate::wday()    masks data.table::wday()
## ✖ lubridate::week()    masks data.table::week()
## ✖ lubridate::yday()    masks data.table::yday()
## ✖ lubridate::year()    masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)

1. Importar Base de Datos

bd <- read.csv("/Users/benjaminreyessanchez/Downloads/ventas.csv")

2. Entender la Base de Datos

summary(bd)

##     BillNo            Itemname            Quantity            Date          
##  Length:522064      Length:522064      Min.   :-9600.00   Length:522064     
##  Class :character   Class :character   1st Qu.:    1.00   Class :character  
##  Mode  :character   Mode  :character   Median :    3.00   Mode  :character  
##                                        Mean   :   10.09                     
##                                        3rd Qu.:   10.00                     
##                                        Max.   :80995.00                     
##                                                                             
##      Hour               Price              CustomerID       Country         
##  Length:522064      Min.   :-11062.060   Min.   :12346    Length:522064     
##  Class :character   1st Qu.:     1.250   1st Qu.:13950    Class :character  
##  Mode  :character   Median :     2.080   Median :15265    Mode  :character  
##                     Mean   :     3.827   Mean   :15317                      
##                     3rd Qu.:     4.130   3rd Qu.:16837                      
##                     Max.   : 13541.330   Max.   :18287                      
##                                          NA's   :134041                     
##      Total          
##  Min.   :-11062.06  
##  1st Qu.:     3.75  
##  Median :     9.78  
##  Mean   :    19.69  
##  3rd Qu.:    17.40  
##  Max.   :168469.60  
##

#count(bd, BillNo, sort=TRUE)
#count(bd, Itemname, sort=TRUE)
#count(bd, Date, sort=TRUE)
#count(bd, Hour, sort=TRUE)
#count(bd, Country, sort=TRUE)

Observaciones: 1. Tenemos cantidades, precios y totales negativos 2. Fecha y Hora no tienen formato adecuado 3. Tenemos N/A’s en Uustomer ID

# Cuántos N/A tengo en la BD?
sum(is.na(bd))

## [1] 134041

# ¿Cuántos NA teengo por variable?
sapply(bd, function(x) sum(is.na(bd)))

##     BillNo   Itemname   Quantity       Date       Hour      Price CustomerID 
##     134041     134041     134041     134041     134041     134041     134041 
##    Country      Total 
##     134041     134041

# Eliminar NA
bd <- na.omit(bd)

#Eliminar Totales Negativos
bd <- bd[bd$Total>0,]

#Identificar Outliers
boxplot(bd$Total, horizontal=TRUE)

Observaciones: 4. Tenemos presencia de datos fuera de lo normal (outliers).

# Obtener el total por ticket
ticket_promedio <- aggregate(Total ~ CustomerID + BillNo, data = bd, sum)

# Obtener el ticket promedio
ticket_promedio <- aggregate(Total ~ CustomerID, data = ticket_promedio, mean)
colnames(ticket_promedio) <- c("CustomerID", "TicketPromedio")

# Obtener cantidad de visitas por cliente
visitas <- group_by(bd, CustomerID) %>% summarise(Visitas = n_distinct(BillNo))

# Juntar las tablas Ticket Promedio y Visitas
objetos <- merge(ticket_promedio, visitas, by = "CustomerID")

# Llamar a los renglones como CustomerID
rownames(objetos) <- objetos$CustomerID
objetos <- subset(objetos, select = -c(CustomerID))

# Eliminar datos fuera de lo normal

# Los datos fuera de lo normal están fuera de los siguientes límites:
# Lím. inferior = Q1 - 1.5*IQR
# Lím. superior = Q3 + 1.5*IQR
# Q1: Cuartil 1, Q3: Cuartil 3, IQR = Rango intercuartil

# Columna de ticket promedio
IQR_TP <- IQR(objetos$TicketPromedio)
IQR_TP

## [1] 248.3318

summary(objetos)

##  TicketPromedio        Visitas       
##  Min.   :    3.45   Min.   :  1.000  
##  1st Qu.:  178.30   1st Qu.:  1.000  
##  Median :  292.00   Median :  2.000  
##  Mean   :  415.62   Mean   :  4.227  
##  3rd Qu.:  426.63   3rd Qu.:  5.000  
##  Max.   :84236.25   Max.   :209.000

LI_ITP <- 178.30 - 1.5*IQR_TP
LI_ITP

## [1] -194.1977

LS_ITP <- 426.63 + 1.5*IQR_TP
LS_ITP

## [1] 799.1277

objetos <- objetos[objetos$TicketPromedio <= 799.13, ]

#Columna de visitas
IQR_V <- IQR(objetos$Visitas)
IQR_V

## [1] 4

LI_V <- 1 - 1.5*IQR_V
LI_V

## [1] -5

LS_V <- 5 + 1.5*IQR_V
LS_V

## [1] 11

objetos <- objetos[objetos$Visitas <= 11, ]
summary(objetos)

##  TicketPromedio      Visitas      
##  Min.   :  3.45   Min.   : 1.000  
##  1st Qu.:168.66   1st Qu.: 1.000  
##  Median :267.12   Median : 2.000  
##  Mean   :293.86   Mean   : 2.971  
##  3rd Qu.:384.49   3rd Qu.: 4.000  
##  Max.   :797.45   Max.   :11.000

4. Asignación de Grupos

# O. normalizar variables
objetos <- as.data.frame(scale(objetos))

# 1. Crear base de datos
df <- objetos

# 2. Determinar el número de grupos
grupos <- 4

# 3. Realizar la clasificación
segmentos <- kmeans(df,grupos)


# 4. Revisar la asignación de grupos
asignacion <- cbind(df, cluster=segmentos$cluster)


# 5. Graficar asignaciones
# install.packages("ggplot2")
library(ggplot2)
# install.packages("factoextra")
library(factoextra)

fviz_cluster(segmentos, data = df,
             palette = c("red", "blue", "yellow", "green"),
             ellipse.type = "euclid",
             star.plot = T,
             repel = T,
             ggtheme = theme())

# 6. Optimizar la cantidad de grupos
library(cluster)
library(data.table)

set.seed(123)
optimizacion <- clusGap(df, FUN = kmeans, nstart = 1, K.max = 7)
plot(optimizacion, xlab = "Número de clusters K")

# El punto más alto de la gráfica indica la cantidad de grupos óptimo