# 1. Crear base de datos
df <- data.frame(x=c(2,2,8,5,7,6,1,4),
                 y=c(10,5,4,8,5,4,2,9))

# 2. Determinar el número de grupos
grupos <- 3

# 3. Realizar la clasificación
segmentos <- kmeans(df,grupos)
segmentos
## K-means clustering with 3 clusters of sizes 2, 3, 3
## 
## Cluster means:
##          x        y
## 1 1.500000 3.500000
## 2 7.000000 4.333333
## 3 3.666667 9.000000
## 
## Clustering vector:
## [1] 3 1 2 3 2 2 1 3
## 
## Within cluster sum of squares by cluster:
## [1] 5.000000 2.666667 6.666667
##  (between_SS / total_SS =  85.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# 4. Revisar la asignación de grupos
asignacion <- cbind(df, cluster=segmentos$cluster)
asignacion
##   x  y cluster
## 1 2 10       3
## 2 2  5       1
## 3 8  4       2
## 4 5  8       3
## 5 7  5       2
## 6 6  4       2
## 7 1  2       1
## 8 4  9       3
# 5. Graficar resultados
# install.packages("ggplot2")
library(ggplot2)
# install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(segmentos, data=df,
             palette=c("red","blue","green"),
             ellipse.type = "euclid",
             star.plot= T,
             repel= T,
             ggtheme = theme()
             )
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

# 6. Optimizar cantidad de grupos

library(cluster)
library(data.table)
set.seed(123)
# aleatorio el 123
optimizacion <- clusGap(df, FUN= kmeans, nstart=1, K.max=7)
plot(optimizacion, xlab="Número de clusters k")

# El punto mas alto de la grafica indica la cantidad de grupos óptimo

1. Importar la base de datos

bd <- read.csv("/Users/kristencarlos/Documents/Concentración Análisis Datos/ventas.csv")

2. Importar la base de datos

summary(bd)
##     BillNo            Itemname            Quantity            Date          
##  Length:522064      Length:522064      Min.   :-9600.00   Length:522064     
##  Class :character   Class :character   1st Qu.:    1.00   Class :character  
##  Mode  :character   Mode  :character   Median :    3.00   Mode  :character  
##                                        Mean   :   10.09                     
##                                        3rd Qu.:   10.00                     
##                                        Max.   :80995.00                     
##                                                                             
##      Hour               Price              CustomerID       Country         
##  Length:522064      Min.   :-11062.060   Min.   :12346    Length:522064     
##  Class :character   1st Qu.:     1.250   1st Qu.:13950    Class :character  
##  Mode  :character   Median :     2.080   Median :15265    Mode  :character  
##                     Mean   :     3.827   Mean   :15317                      
##                     3rd Qu.:     4.130   3rd Qu.:16837                      
##                     Max.   : 13541.330   Max.   :18287                      
##                                          NA's   :134041                     
##      Total          
##  Min.   :-11062.06  
##  1st Qu.:     3.75  
##  Median :     9.78  
##  Mean   :    19.69  
##  3rd Qu.:    17.40  
##  Max.   :168469.60  
## 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# count(bd, BillNo, sort=TRUE)
# count(bd,Itemname, sort = TRUE)
# count(bd, Date, sort= TRUE)
# count(bd, Hour, sort=TRUE)
# count(bd, Country, sort=TRUE)

Observaciones:
1. Tenemos cantidades, precios, totales negativos. 2. Fecha y hora no tienen formato adecuado. 3. Tenemos NA’s en CustomerID

3. Importar la base de datos

# ¿Cuántos NA tengo en la base de datos?
sum(is.na(bd))
## [1] 134041
# ¿Cuántos NA tengo por variable?
sapply(bd, function(x) sum(is.na(x)))
##     BillNo   Itemname   Quantity       Date       Hour      Price CustomerID 
##          0          0          0          0          0          0     134041 
##    Country      Total 
##          0          0
# Eliminar NA
bd <- na.omit(bd)

# Eliminar totales negativos
bd <- bd[bd$Total > 0, ]

# Identificar outliers
boxplot(bd$Total, horizontal = TRUE)

Observaciones:
4. Tenemos outliers en Total.

# Obtener cantidad de visitas por cliente
visitas <- group_by(bd, CustomerID) %>% summarise(Visitas= n_distinct(BillNo))

# Obtener el total por ticket
ticket_promedio <- aggregate(Total ~ CustomerID + BillNo, data= bd, sum)

# Obtener el ticket promedio
ticket_promedio <- aggregate(Total ~ CustomerID, data= ticket_promedio, mean)

# Juntar las tablas visitas y ticket promedio
objetos <- merge(visitas, ticket_promedio, by="CustomerID")

# Llamar a los renglones como CustomerID
rownames(objetos) <- objetos$CustomerID

# Eliminar columna de CustomerID
objetos <- subset(objetos, select=-c(CustomerID))

# Eliminar datos fuera de lo normal

# Los datos fuera de lo normal están fuera de los siguientes límites: 
# Límite inferior = Q1- 1.5*IQR
# Límite superior = Q3 + 1.5*IQR
# Q1: Cuartil 1, Q3: Cuartil 3, IQR= Rango intercuartil, TP- Ticket Promedio

#Columna Visitas
IQR_V <- IQR(objetos$Visitas)
IQR_V
## [1] 4
summary(objetos)
##     Visitas            Total         
##  Min.   :  1.000   Min.   :    3.45  
##  1st Qu.:  1.000   1st Qu.:  178.30  
##  Median :  2.000   Median :  292.00  
##  Mean   :  4.227   Mean   :  415.62  
##  3rd Qu.:  5.000   3rd Qu.:  426.63  
##  Max.   :209.000   Max.   :84236.25
LI_V <- 1-1.5*IQR_V
LI_V
## [1] -5
LS_V <- 5+1.5*IQR_V
LS_V
## [1] 11
objetos <- objetos[objetos$Visitas <=11,]

# Columna Ticker promedio
colnames(objetos) <- c("Visitas", "TicketPromedio")
IQR_TP <- IQR(objetos$TicketPromedio)
IQR_TP
## [1] 243.3733
LI_TP <- 178.30- 1.5*IQR_TP
LI_TP
## [1] -186.76
LS_TP <- 426.63+ 1.5*IQR_TP
LS_TP
## [1] 791.69
objetos <- objetos[objetos$TicketPromedio <= 791.69,]
summary(objetos)
##     Visitas      TicketPromedio  
##  Min.   : 1.00   Min.   :  3.45  
##  1st Qu.: 1.00   1st Qu.:168.62  
##  Median : 2.00   Median :267.00  
##  Mean   : 2.97   Mean   :293.32  
##  3rd Qu.: 4.00   3rd Qu.:383.98  
##  Max.   :11.00   Max.   :789.56
# 1. Crear base de datos
df <-objetos

# 2. Determinar el número de grupos
grupos <- 4

# 3. Realizar la clasificación
segmentos <- kmeans(df,grupos)

# 4. Revisar la asignación de grupos
asignacion <- cbind(df, cluster=segmentos$cluster)

# 5. Graficar resultados
# install.packages("ggplot2")
library(ggplot2)
# install.packages("factoextra")
library(factoextra)

fviz_cluster(segmentos, data=df,
             palette=c("red","blue","green","yellow"),
             ellipse.type = "euclid",
             star.plot= T,
             repel= T,
             ggtheme = theme()
             )

# 6. Optimizar cantidad de grupos

library(cluster)
library(data.table)
set.seed(123)
# aleatorio el 123
optimizacion <- clusGap(df, FUN= kmeans, nstart=1, K.max=7)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 188200)
plot(optimizacion, xlab="Número de clusters k")

# El punto mas alto de la grafica indica la cantidad de grupos óptimo
LS0tCnRpdGxlOiAiQWN0aXZpZGFkIDQuMSIKYXV0aG9yOiAiS3Jpc3RlbiBDYXJsb3MgQTAxNTcwNzUzIgpkYXRlOiAiMjAyMy0wOS0xOCIKb3V0cHV0OgogIGh0bWxfZG9jdW1lbnQ6CiAgICB0b2M6IFRSVUUKICAgIHRvY19mbG9hdDogVFJVRQogICAgY29kZV9kb3dubG9hZDogVFJVRQogICAgdGhlbWU6ICJ5ZXRpIgogICAgaGlnaGxpZ2h0OiAidGFuZ28iCgotLS0KCiFbXSgvVXNlcnMva3Jpc3RlbmNhcmxvcy9EZXNrdG9wL2lrZWFnaWYuZ2lmKQoKYGBge3J9CiMgMS4gQ3JlYXIgYmFzZSBkZSBkYXRvcwpkZiA8LSBkYXRhLmZyYW1lKHg9YygyLDIsOCw1LDcsNiwxLDQpLAogICAgICAgICAgICAgICAgIHk9YygxMCw1LDQsOCw1LDQsMiw5KSkKCiMgMi4gRGV0ZXJtaW5hciBlbCBuw7ptZXJvIGRlIGdydXBvcwpncnVwb3MgPC0gMwoKIyAzLiBSZWFsaXphciBsYSBjbGFzaWZpY2FjacOzbgpzZWdtZW50b3MgPC0ga21lYW5zKGRmLGdydXBvcykKc2VnbWVudG9zCgojIDQuIFJldmlzYXIgbGEgYXNpZ25hY2nDs24gZGUgZ3J1cG9zCmFzaWduYWNpb24gPC0gY2JpbmQoZGYsIGNsdXN0ZXI9c2VnbWVudG9zJGNsdXN0ZXIpCmFzaWduYWNpb24KCiMgNS4gR3JhZmljYXIgcmVzdWx0YWRvcwojIGluc3RhbGwucGFja2FnZXMoImdncGxvdDIiKQpsaWJyYXJ5KGdncGxvdDIpCiMgaW5zdGFsbC5wYWNrYWdlcygiZmFjdG9leHRyYSIpCmxpYnJhcnkoZmFjdG9leHRyYSkKCmZ2aXpfY2x1c3RlcihzZWdtZW50b3MsIGRhdGE9ZGYsCiAgICAgICAgICAgICBwYWxldHRlPWMoInJlZCIsImJsdWUiLCJncmVlbiIpLAogICAgICAgICAgICAgZWxsaXBzZS50eXBlID0gImV1Y2xpZCIsCiAgICAgICAgICAgICBzdGFyLnBsb3Q9IFQsCiAgICAgICAgICAgICByZXBlbD0gVCwKICAgICAgICAgICAgIGdndGhlbWUgPSB0aGVtZSgpCiAgICAgICAgICAgICApCiMgNi4gT3B0aW1pemFyIGNhbnRpZGFkIGRlIGdydXBvcwoKbGlicmFyeShjbHVzdGVyKQpsaWJyYXJ5KGRhdGEudGFibGUpCnNldC5zZWVkKDEyMykKIyBhbGVhdG9yaW8gZWwgMTIzCm9wdGltaXphY2lvbiA8LSBjbHVzR2FwKGRmLCBGVU49IGttZWFucywgbnN0YXJ0PTEsIEsubWF4PTcpCnBsb3Qob3B0aW1pemFjaW9uLCB4bGFiPSJOw7ptZXJvIGRlIGNsdXN0ZXJzIGsiKQoKIyBFbCBwdW50byBtYXMgYWx0byBkZSBsYSBncmFmaWNhIGluZGljYSBsYSBjYW50aWRhZCBkZSBncnVwb3Mgw7NwdGltbwpgYGAKIyMjIDxzcGFuIHN0eWxlPSAiY29sb3I6IGJsdWU7Ij4xLiBJbXBvcnRhciBsYSBiYXNlIGRlIGRhdG9zPC9zcGFuPgoKYGBge3J9CmJkIDwtIHJlYWQuY3N2KCIvVXNlcnMva3Jpc3RlbmNhcmxvcy9Eb2N1bWVudHMvQ29uY2VudHJhY2lvzIFuIEFuYcyBbGlzaXMgRGF0b3MvdmVudGFzLmNzdiIpCgpgYGAKCiMjIyA8c3BhbiBzdHlsZT0gImNvbG9yOiBibHVlOyI+Mi4gSW1wb3J0YXIgbGEgYmFzZSBkZSBkYXRvczwvc3Bhbj4KYGBge3J9CnN1bW1hcnkoYmQpCgpsaWJyYXJ5KGRwbHlyKQojIGNvdW50KGJkLCBCaWxsTm8sIHNvcnQ9VFJVRSkKIyBjb3VudChiZCxJdGVtbmFtZSwgc29ydCA9IFRSVUUpCiMgY291bnQoYmQsIERhdGUsIHNvcnQ9IFRSVUUpCiMgY291bnQoYmQsIEhvdXIsIHNvcnQ9VFJVRSkKIyBjb3VudChiZCwgQ291bnRyeSwgc29ydD1UUlVFKQpgYGAKCk9ic2VydmFjaW9uZXM6ICAKMS4gVGVuZW1vcyBjYW50aWRhZGVzLCBwcmVjaW9zLCB0b3RhbGVzIG5lZ2F0aXZvcy4KMi4gRmVjaGEgeSBob3JhIG5vIHRpZW5lbiBmb3JtYXRvIGFkZWN1YWRvLgozLiBUZW5lbW9zIE5BJ3MgZW4gQ3VzdG9tZXJJRAoKIyMjIDxzcGFuIHN0eWxlPSAiY29sb3I6IGJsdWU7Ij4zLiBJbXBvcnRhciBsYSBiYXNlIGRlIGRhdG9zPC9zcGFuPgpgYGB7cn0KIyDCv0N1w6FudG9zIE5BIHRlbmdvIGVuIGxhIGJhc2UgZGUgZGF0b3M/CnN1bShpcy5uYShiZCkpCgojIMK/Q3XDoW50b3MgTkEgdGVuZ28gcG9yIHZhcmlhYmxlPwpzYXBwbHkoYmQsIGZ1bmN0aW9uKHgpIHN1bShpcy5uYSh4KSkpCgojIEVsaW1pbmFyIE5BCmJkIDwtIG5hLm9taXQoYmQpCgojIEVsaW1pbmFyIHRvdGFsZXMgbmVnYXRpdm9zCmJkIDwtIGJkW2JkJFRvdGFsID4gMCwgXQoKIyBJZGVudGlmaWNhciBvdXRsaWVycwpib3hwbG90KGJkJFRvdGFsLCBob3Jpem9udGFsID0gVFJVRSkKYGBgCk9ic2VydmFjaW9uZXM6ICAKNC4gVGVuZW1vcyBvdXRsaWVycyBlbiBUb3RhbC4KCmBgYHtyfQojIE9idGVuZXIgY2FudGlkYWQgZGUgdmlzaXRhcyBwb3IgY2xpZW50ZQp2aXNpdGFzIDwtIGdyb3VwX2J5KGJkLCBDdXN0b21lcklEKSAlPiUgc3VtbWFyaXNlKFZpc2l0YXM9IG5fZGlzdGluY3QoQmlsbE5vKSkKCiMgT2J0ZW5lciBlbCB0b3RhbCBwb3IgdGlja2V0CnRpY2tldF9wcm9tZWRpbyA8LSBhZ2dyZWdhdGUoVG90YWwgfiBDdXN0b21lcklEICsgQmlsbE5vLCBkYXRhPSBiZCwgc3VtKQoKIyBPYnRlbmVyIGVsIHRpY2tldCBwcm9tZWRpbwp0aWNrZXRfcHJvbWVkaW8gPC0gYWdncmVnYXRlKFRvdGFsIH4gQ3VzdG9tZXJJRCwgZGF0YT0gdGlja2V0X3Byb21lZGlvLCBtZWFuKQoKIyBKdW50YXIgbGFzIHRhYmxhcyB2aXNpdGFzIHkgdGlja2V0IHByb21lZGlvCm9iamV0b3MgPC0gbWVyZ2UodmlzaXRhcywgdGlja2V0X3Byb21lZGlvLCBieT0iQ3VzdG9tZXJJRCIpCgojIExsYW1hciBhIGxvcyByZW5nbG9uZXMgY29tbyBDdXN0b21lcklECnJvd25hbWVzKG9iamV0b3MpIDwtIG9iamV0b3MkQ3VzdG9tZXJJRAoKIyBFbGltaW5hciBjb2x1bW5hIGRlIEN1c3RvbWVySUQKb2JqZXRvcyA8LSBzdWJzZXQob2JqZXRvcywgc2VsZWN0PS1jKEN1c3RvbWVySUQpKQoKIyBFbGltaW5hciBkYXRvcyBmdWVyYSBkZSBsbyBub3JtYWwKCiMgTG9zIGRhdG9zIGZ1ZXJhIGRlIGxvIG5vcm1hbCBlc3TDoW4gZnVlcmEgZGUgbG9zIHNpZ3VpZW50ZXMgbMOtbWl0ZXM6IAojIEzDrW1pdGUgaW5mZXJpb3IgPSBRMS0gMS41KklRUgojIEzDrW1pdGUgc3VwZXJpb3IgPSBRMyArIDEuNSpJUVIKIyBRMTogQ3VhcnRpbCAxLCBRMzogQ3VhcnRpbCAzLCBJUVI9IFJhbmdvIGludGVyY3VhcnRpbCwgVFAtIFRpY2tldCBQcm9tZWRpbwoKI0NvbHVtbmEgVmlzaXRhcwpJUVJfViA8LSBJUVIob2JqZXRvcyRWaXNpdGFzKQpJUVJfVgpzdW1tYXJ5KG9iamV0b3MpCkxJX1YgPC0gMS0xLjUqSVFSX1YKTElfVgpMU19WIDwtIDUrMS41KklRUl9WCkxTX1YKb2JqZXRvcyA8LSBvYmpldG9zW29iamV0b3MkVmlzaXRhcyA8PTExLF0KCiMgQ29sdW1uYSBUaWNrZXIgcHJvbWVkaW8KY29sbmFtZXMob2JqZXRvcykgPC0gYygiVmlzaXRhcyIsICJUaWNrZXRQcm9tZWRpbyIpCklRUl9UUCA8LSBJUVIob2JqZXRvcyRUaWNrZXRQcm9tZWRpbykKSVFSX1RQCkxJX1RQIDwtIDE3OC4zMC0gMS41KklRUl9UUApMSV9UUApMU19UUCA8LSA0MjYuNjMrIDEuNSpJUVJfVFAKTFNfVFAKb2JqZXRvcyA8LSBvYmpldG9zW29iamV0b3MkVGlja2V0UHJvbWVkaW8gPD0gNzkxLjY5LF0Kc3VtbWFyeShvYmpldG9zKQoKYGBgCgpgYGB7cn0KIyAxLiBDcmVhciBiYXNlIGRlIGRhdG9zCmRmIDwtb2JqZXRvcwoKIyAyLiBEZXRlcm1pbmFyIGVsIG7Dum1lcm8gZGUgZ3J1cG9zCmdydXBvcyA8LSA0CgojIDMuIFJlYWxpemFyIGxhIGNsYXNpZmljYWNpw7NuCnNlZ21lbnRvcyA8LSBrbWVhbnMoZGYsZ3J1cG9zKQoKIyA0LiBSZXZpc2FyIGxhIGFzaWduYWNpw7NuIGRlIGdydXBvcwphc2lnbmFjaW9uIDwtIGNiaW5kKGRmLCBjbHVzdGVyPXNlZ21lbnRvcyRjbHVzdGVyKQoKIyA1LiBHcmFmaWNhciByZXN1bHRhZG9zCiMgaW5zdGFsbC5wYWNrYWdlcygiZ2dwbG90MiIpCmxpYnJhcnkoZ2dwbG90MikKIyBpbnN0YWxsLnBhY2thZ2VzKCJmYWN0b2V4dHJhIikKbGlicmFyeShmYWN0b2V4dHJhKQoKZnZpel9jbHVzdGVyKHNlZ21lbnRvcywgZGF0YT1kZiwKICAgICAgICAgICAgIHBhbGV0dGU9YygicmVkIiwiYmx1ZSIsImdyZWVuIiwieWVsbG93IiksCiAgICAgICAgICAgICBlbGxpcHNlLnR5cGUgPSAiZXVjbGlkIiwKICAgICAgICAgICAgIHN0YXIucGxvdD0gVCwKICAgICAgICAgICAgIHJlcGVsPSBULAogICAgICAgICAgICAgZ2d0aGVtZSA9IHRoZW1lKCkKICAgICAgICAgICAgICkKIyA2LiBPcHRpbWl6YXIgY2FudGlkYWQgZGUgZ3J1cG9zCgpsaWJyYXJ5KGNsdXN0ZXIpCmxpYnJhcnkoZGF0YS50YWJsZSkKc2V0LnNlZWQoMTIzKQojIGFsZWF0b3JpbyBlbCAxMjMKb3B0aW1pemFjaW9uIDwtIGNsdXNHYXAoZGYsIEZVTj0ga21lYW5zLCBuc3RhcnQ9MSwgSy5tYXg9NykKcGxvdChvcHRpbWl6YWNpb24sIHhsYWI9Ik7Dum1lcm8gZGUgY2x1c3RlcnMgayIpCgojIEVsIHB1bnRvIG1hcyBhbHRvIGRlIGxhIGdyYWZpY2EgaW5kaWNhIGxhIGNhbnRpZGFkIGRlIGdydXBvcyDDs3B0aW1vCmBgYAoKCgoKCgoK