# 1. Create the data set: eight toy observations with two numeric features.
df <- data.frame(
  x = c(2, 2, 8, 5, 7, 6, 1, 4),
  y = c(10, 5, 4, 8, 5, 4, 2, 9)
)
# 2. Choose the number of clusters.
grupos <- 3
# 3. Fit k-means.
# kmeans() picks its initial centres at random, so the seed is fixed to make
# the partition (and the cluster numbering) reproducible, and several random
# starts are tried so one poor initialisation cannot decide the result.
set.seed(123)
segmentos <- kmeans(df, centers = grupos, nstart = 25)
# Print the fitted model. The knitted output below was recorded from a
# separate run, so the cluster numbering (1..3) may differ from a fresh run.
segmentos
## K-means clustering with 3 clusters of sizes 2, 3, 3
##
## Cluster means:
## x y
## 1 1.500000 3.500000
## 2 7.000000 4.333333
## 3 3.666667 9.000000
##
## Clustering vector:
## [1] 3 1 2 3 2 2 1 3
##
## Within cluster sum of squares by cluster:
## [1] 5.000000 2.666667 6.666667
## (between_SS / total_SS = 85.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
## x y cluster
## 1 2 10 3
## 2 2 5 1
## 3 8 4 2
## 4 5 8 3
## 5 7 5 2
## 6 6 4 2
## 7 1 2 1
## 8 4 9 3
# 5. Plot the clusters.
# install.packages("ggplot2")
library(ggplot2)
# install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Draw the observations in `df` coloured by the cluster stored in `segmentos`.
# Logical flags are written as TRUE rather than T: T is an ordinary variable
# that can be reassigned, TRUE is a reserved constant.
fviz_cluster(
  segmentos,
  data = df,
  palette = c("red", "blue", "green"),  # one colour per cluster (grupos = 3)
  ellipse.type = "euclid",
  star.plot = TRUE,
  repel = TRUE,
  ggtheme = theme()
)
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
# 6. Choose the number of clusters with the gap statistic.
library(cluster)
# data.table is attached here although no data.table function is called in
# this section; kept because other parts of the script may rely on it.
library(data.table)
# Fix the seed (123 is an arbitrary choice) so the result is reproducible.
set.seed(123)
# Evaluate k-means for k = 1..7. The clustering function is passed under its
# full argument name, FUNcluster, instead of relying on partial matching of
# `FUN`; nstart = 1 is forwarded to kmeans().
optimizacion <- clusGap(df, FUNcluster = kmeans, nstart = 1, K.max = 7)
plot(optimizacion, xlab = "Número de clusters k")
## BillNo Itemname Quantity Date
## Length:522064 Length:522064 Min. :-9600.00 Length:522064
## Class :character Class :character 1st Qu.: 1.00 Class :character
## Mode :character Mode :character Median : 3.00 Mode :character
## Mean : 10.09
## 3rd Qu.: 10.00
## Max. :80995.00
##
## Hour Price CustomerID Country
## Length:522064 Min. :-11062.060 Min. :12346 Length:522064
## Class :character 1st Qu.: 1.250 1st Qu.:13950 Class :character
## Mode :character Median : 2.080 Median :15265 Mode :character
## Mean : 3.827 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. : 13541.330 Max. :18287
## NA's :134041
## Total
## Min. :-11062.06
## 1st Qu.: 3.75
## Median : 9.78
## Mean : 19.69
## 3rd Qu.: 17.40
## Max. :168469.60
##
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# count(bd, BillNo, sort=TRUE)
# count(bd,Itemname, sort = TRUE)
# count(bd, Date, sort= TRUE)
# count(bd, Hour, sort=TRUE)
# count(bd, Country, sort=TRUE)
Observaciones:
1. Tenemos cantidades, precios y totales negativos.
2. Fecha y hora no tienen el formato adecuado.
3. Tenemos NA’s en CustomerID.
## [1] 134041
## BillNo Itemname Quantity Date Hour Price CustomerID
## 0 0 0 0 0 0 134041
## Country Total
## 0 0
# Drop every row that has a missing value in any column.
bd <- na.omit(bd)
# Keep only the rows whose Total is strictly positive (this removes the
# negative totals and any zero totals).
bd <- bd[which(bd$Total > 0), ]
# Horizontal box plot of Total to look for outliers.
boxplot(bd[["Total"]], horizontal = TRUE)
Observaciones:
4. Tenemos outliers en Total.
# Visits per customer: number of distinct invoices (BillNo) per CustomerID.
visitas <- bd %>%
  group_by(CustomerID) %>%
  summarise(Visitas = n_distinct(BillNo))
# Average ticket per customer: first add up Total within each invoice
# (CustomerID + BillNo), then average those invoice totals per customer.
ticket_promedio <- aggregate(
  Total ~ CustomerID,
  data = aggregate(Total ~ CustomerID + BillNo, data = bd, FUN = sum),
  FUN = mean
)
# Join the visits table and the average-ticket table on CustomerID.
objetos <- merge(visitas, ticket_promedio, by = "CustomerID")
# Use CustomerID as the row names, then drop it as a column so that only
# Visitas and Total remain.
rownames(objetos) <- objetos$CustomerID
objetos$CustomerID <- NULL
# Remove outliers.
# An observation is treated as an outlier when it falls outside these limits:
#   lower limit = Q1 - 1.5 * IQR
#   upper limit = Q3 + 1.5 * IQR
# Q1: first quartile, Q3: third quartile, IQR: interquartile range,
# TP: average ticket (ticket promedio).
# Visitas column: interquartile range of the visit counts.
IQR_V <- IQR(objetos[["Visitas"]])
print(IQR_V)
## [1] 4
## Visitas Total
## Min. : 1.000 Min. : 3.45
## 1st Qu.: 1.000 1st Qu.: 178.30
## Median : 2.000 Median : 292.00
## Mean : 4.227 Mean : 415.62
## 3rd Qu.: 5.000 3rd Qu.: 426.63
## Max. :209.000 Max. :84236.25
## [1] -5
## [1] 11
# Keep the customers whose visit count is at or below the upper outlier
# limit, Q3 + 1.5 * IQR. The limit is computed from the data instead of
# hard-coding the printed value (11), so the filter stays correct if the
# data change; on the data summarised above it evaluates to 5 + 1.5 * 4 = 11.
LS_V <- unname(quantile(objetos$Visitas, 0.75)) + 1.5 * IQR(objetos$Visitas)
objetos <- objetos[objetos$Visitas <= LS_V, ]
# Average-ticket column: give the two columns their final names, then
# compute the interquartile range of the average ticket.
colnames(objetos) <- c("Visitas", "TicketPromedio")
IQR_TP <- IQR(objetos$TicketPromedio)
IQR_TP
## [1] 243.3733
## [1] -186.76
## [1] 791.69
## Visitas TicketPromedio
## Min. : 1.00 Min. : 3.45
## 1st Qu.: 1.00 1st Qu.:168.62
## Median : 2.00 Median :267.00
## Mean : 2.97 Mean :293.32
## 3rd Qu.: 4.00 3rd Qu.:383.98
## Max. :11.00 Max. :789.56
# 1. Create the data set: one row per customer (Visitas, TicketPromedio).
df <- objetos
# 2. Choose the number of clusters.
grupos <- 4
# 3. Fit k-means.
# kmeans() picks its initial centres at random, so the seed is fixed to make
# the partition (and the cluster numbering) reproducible, and several random
# starts are tried so one poor initialisation cannot decide the result.
# NOTE(review): in the summary above Visitas tops out at 11 while
# TicketPromedio reaches ~790, so Euclidean distances are presumably
# dominated by TicketPromedio; consider clustering on scale(df) - confirm
# with the analysis owner before changing.
set.seed(123)
segmentos <- kmeans(df, centers = grupos, nstart = 25)
# 4. Inspect the assignment: attach each customer's cluster label to its row.
asignacion <- cbind(df, cluster = segmentos$cluster)
# 5. Plot the clusters.
# install.packages("ggplot2")
library(ggplot2)
# install.packages("factoextra")
library(factoextra)
# Draw the observations in `df` coloured by the cluster stored in `segmentos`.
# Logical flags are written as TRUE rather than T: T is an ordinary variable
# that can be reassigned, TRUE is a reserved constant.
fviz_cluster(
  segmentos,
  data = df,
  palette = c("red", "blue", "green", "yellow"),  # one colour per cluster
  ellipse.type = "euclid",
  star.plot = TRUE,
  repel = TRUE,
  ggtheme = theme()
)
# 6. Choose the number of clusters
library(cluster)
# data.table is attached here although no data.table function is called in
# this section; kept because other parts of the script may rely on it.
library(data.table)
# Fix the seed (123 is an arbitrary choice) so the result is reproducible.
set.seed(123)
# Gap statistic for k = 1..7. The clustering function is passed under its
# full argument name, FUNcluster, instead of relying on partial matching of
# `FUN`; nstart = 1 is forwarded to kmeans().
# NOTE(review): the recorded warning below is raised by kmeans(); its help
# page says it can occur when some points are extremely close and suggests
# slightly rounding the data - confirm whether that applies here.
optimizacion <- clusGap(df, FUNcluster = kmeans, nstart = 1, K.max = 7)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 188200)