Paso 1. Determinar el número de grupo o clusters =3 Paso 2. Seleccionar aleatoriamente los centroides (seeds) c1=A1 C2=A4 C3=A7
Paso 3. Asignar cada objeto al centoride más cercano
ITERACIÓN Objeto: distancia (Objeto,Centroide) A1:d(A1,A1): 0 ya que es centroide | d(A1,A4):3.61 | d(A1,A7):8.06 | A1 ∈ (pertenece a Cluster 1
A2:d(A2,A1): 5 | d(A2,A4): 4.24 | d(A2,A7): 3.16 | A2 ∈ Cluster 2
A3:d(A3,A1): √36 | d(A3,A4): √25 | d(A3,A7): √53 | A3 ∈ cluster 3
A4:d(A4,A1): | d(A4,A4):0 ya que es centroide | d(A4,A7): | A4 ∈ cluster 3
A5:d(A5,A1): √50 | d(A5,A4): √13 | d(A5,A7): √45 | A5 ∈ cluster 3
A6:d(A6,A1): √52 | d(A6,A4): √17 | d(A6,A7): √29| A6 ∈ cluster 3
A7:d(A7,A1): | d(A7,A4): | d(A7,A7): 0 ya que es centroide | A7 ∈ cluster 3
A8:d(A8,A1): √5 | d(A8,A4): √2 | d(A8,A7): √58 | A8 ∈ cluster 3
Resumen de la Iteración: Cluster 1=(A1), 2=(A2,A7), 3=(A3,A4,A5,A6,A8)
Paso 4. Actualizar posición de centroides con la posición promedio de los objetos pertenecientes a dicho grupo o cluster
CLUSTERS: C1=(2,10) | C2=((2+1)/2 , (5+2)/2)) -> C2= (1.5,3.5) | C3= ((8+5+7+6+4)/5 , (4+8+5+4+9)/5) -> C3=(6,6)
Paso 5. Repetir pasos 3,4 y hasta que los centorides no se muevan, o se muevan por debajo de una distancia umbral en cada paso
2 Iteración
Resumen de la 2da Iteración Cluster 1=(A1,A8), 2=(A2,A7), 3=(A3,A4,A5,A6) CLusters: C1=(3,9.5) | C2=(1.5,3.5) | C3=(7,4.33)
3era Iteración Resultado de la 3era Iteración Cluster: 1=(A1,A4,A8), 2=(A2,A7), 3(A3,A5,A6) Clusters: C1=(3.6,6.9), C2= (1.5,13.5), C3=(7,4.33)
4ta Iteración Resultado de la 4ta Iteración Cluster: 1=(A1,A4,A8), 2=(A2,A7), 3(A3,A5,A6) Clusters: C1=(3.6,6.9), C2= (1.5,13.5), C3=(7,4.33)
ITERACIONES EN R
x <- c(2, 2, 8, 5, 7, 6, 1, 4)
y <- c(10, 5, 4, 8, 5, 4, 2, 9)
data18S <- data.frame(x,y)
## Determinar el número de grupos
grupos<-3
## Realizar segmentación
segmentos <-kmeans(data18S,grupos)
segmentos
## K-means clustering with 3 clusters of sizes 3, 2, 3
##
## Cluster means:
## x y
## 1 3.666667 9.000000
## 2 1.500000 3.500000
## 3 7.000000 4.333333
##
## Clustering vector:
## [1] 1 2 3 1 3 3 2 1
##
## Within cluster sum of squares by cluster:
## [1] 6.666667 5.000000 2.666667
## (between_SS / total_SS = 85.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
asignacion<-cbind(data18S,cluster=segmentos$cluster)
asignacion
## x y cluster
## 1 2 10 1
## 2 2 5 2
## 3 8 4 3
## 4 5 8 1
## 5 7 5 3
## 6 6 4 3
## 7 1 2 2
## 8 4 9 1
library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(segmentos,data=data18S,palette=c("cadetblue3","cornsilk3","darkolivegreen3"),
ellipse.type="euclid",
star.plot=T,
repel=T,
ggtheme=theme()
)
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
Optimizar cantidad de grupos
library(cluster)
library(data.table)
set.seed(123)
optimizacion<-clusGap(data18S,FUN=kmeans,nstart=1,K.max=7)
plot(optimizacion,xlab="Número de clusters k")
El punto más alto de la gráfica indica la cantidad de grupos óptimos en
los que se pueden clasificar los datos.
library (tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (foreign)
library (ggplot2)
library(dplyr)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(ggrepel)
DTVC<-read.csv("C://Users/IanAb/Documents/7to Semestre/DATA BASE/ventasSEP18.csv")
CIDG<-aggregate(DTVC$Total,by=list(DTVC$CustomerID), FUN=sum)
colnames(CIDG)<-c("CustomerID","Total")
CIDB <- aggregate(DTVC$BillNo~ DTVC$CustomerID, data = DTVC, FUN = length)
colnames(CIDB) <- c("CustomerID", "Cantidad_Compras")
CIDB_G<-CIDG%>%left_join(CIDB,by="CustomerID")
summary (DTVC)
## BillNo Itemname Quantity Date
## Length:522064 Length:522064 Min. : -9600 Length:522064
## Class :character Class :character 1st Qu.: 1 Class :character
## Mode :character Mode :character Median : 3 Mode :character
## Mean : 3385
## 3rd Qu.: 10
## Max. :560368
##
## Hour Price CustomerID Country
## Length:522064 Min. :-11062.060 Min. :12346 Length:522064
## Class :character 1st Qu.: 1.250 1st Qu.:13950 Class :character
## Mode :character Median : 2.080 Median :15265 Mode :character
## Mean : 3.827 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. : 13541.330 Max. :18287
## NA's :134041
## Total
## Min. :-11062.06
## 1st Qu.: 3.75
## Median : 9.78
## Mean : 19.69
## 3rd Qu.: 17.40
## Max. :168469.60
##
# count(DTVC,BillNo,Sort=TRUE)
# count(DTVC,Itemname,Sort=TRUE)
# count(DTVC,Date,Sort=TRUE)
#count(DTVC,Hour,Sort=TRUE)
# count(DTVC,Country,Sort=TRUE)
Observaciones 1 Tenemos cantidad, precios y totales negativos 2 Fechas y hora no tienen formato adecuado 3 Tenemos NA’s en Customer ID
#¿cuántos NA tengo en la base de datos?
sum(is.na(DTVC))
## [1] 134041
#¿Cuántos NA tengo por variable?
sapply(DTVC,function(x) sum(is.na(x)))
## BillNo Itemname Quantity Date Hour Price CustomerID
## 0 0 0 0 0 0 134041
## Country Total
## 0 0
#Eliminar NA
DTVC<-na.omit(DTVC)
#Eliminar Totales Negativos
DTVC<-DTVC[DTVC$Total>0,]
# Identificar Outliers
boxplot(DTVC$Total,horizontal=TRUE)
Obsevraciones: 4. Tenemos outliers en Total.
## Obtener cantidad de visitas por cliente
Visitas<- group_by(DTVC,CustomerID)%>%summarise(Visitas= n_distinct(BillNo))
## Obtener el total por ticket
ticketprom<-aggregate(Total ~ CustomerID + BillNo,data=DTVC,sum)
## Obtener el Ticket Promedio
ticketprom1<-aggregate(Total ~ CustomerID, data=ticketprom,mean)
objetos<-merge(Visitas,ticketprom1,by="CustomerID")
rownames(objetos)<-objetos$CustomerID
##Eliminar la columna de CustomerID
objetos<-subset(objetos,select=-c(CustomerID))
IQR_V<-IQR(objetos$Visitas)
IQR_V
## [1] 4
summary(objetos)
## Visitas Total
## Min. : 1.000 Min. : 3.45
## 1st Qu.: 1.000 1st Qu.: 178.30
## Median : 2.000 Median : 292.00
## Mean : 4.227 Mean : 415.62
## 3rd Qu.: 5.000 3rd Qu.: 426.63
## Max. :209.000 Max. :84236.25
LI_V<-1-1.5*IQR_V
LS_V<-5+1.5*IQR_V
objetos<-objetos[objetos$Visitas<=11,]
#Columna Ticket Promedio
colnames(objetos)<-c("Visitas","TicketPromedio")
IQR_TP<-IQR(objetos$TicketPromedio,)
summary
## function (object, ...)
## UseMethod("summary")
## <bytecode: 0x000001ba2c94be00>
## <environment: namespace:base>
LI_TP<-178.30-1.5*IQR_TP
LI_TP
## [1] -186.76
LS_TP<-426.62+1.5*IQR_TP
LS_TP
## [1] 791.68
objetos<-objetos[objetos$TicketPromedio<=791.69,]
## Determinar el número de grupos
grupos1<-4
## Realizar segmentación
segmentos1 <-kmeans(objetos,grupos1)
# segmentos1
asignacion1<-cbind(objetos,cluster=segmentos1$cluster)
# asignacion1
library(ggplot2)
library(factoextra)
fviz_cluster(segmentos1,data=objetos,palette=c("cadetblue3","cornsilk3","darkolivegreen3","azure3"),
ellipse.type="euclid",
star.plot=T,
repel=T,
ggtheme=theme()
)
library(cluster)
library(data.table)
set.seed(123)
optimizacion1<-clusGap(objetos,FUN=kmeans,nstart=1,K.max=7)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 188200)
plot(optimizacion1,xlab="Número de clusters k")