"~/Desktop/ikea.gif"
## [1] "~/Desktop/ikea.gif"

<span style=“color:blue;>1. Teoria

1. Crear la base de datos

Teoria

# 1. Crear base de datos
df <- data.frame(x=c(2,2,8,5,7,6,1,4),
                 y=c (10,5,4,8,5,4,2,9))

# 2. Determinar el número de grupos
grupos <- 3

# 3. Realizar la clasificación
segmentos <- kmeans(df,grupos)
segmentos
## K-means clustering with 3 clusters of sizes 3, 3, 2
## 
## Cluster means:
##          x        y
## 1 3.666667 9.000000
## 2 7.000000 4.333333
## 3 1.500000 3.500000
## 
## Clustering vector:
## [1] 1 3 2 1 2 2 3 1
## 
## Within cluster sum of squares by cluster:
## [1] 6.666667 2.666667 5.000000
##  (between_SS / total_SS =  85.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# 4. Revisar la asignación de grupos
asignacion <- cbind(df, cluster=segmentos$cluster)
asignacion
##   x  y cluster
## 1 2 10       1
## 2 2  5       3
## 3 8  4       2
## 4 5  8       1
## 5 7  5       2
## 6 6  4       2
## 7 1  2       3
## 8 4  9       1
# 5. Graficar resultados 
#install.packages("ggplot2)
library(ggplot2)
#install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(segmentos, data=df,
             palette=c("red", "blue", "black", "green", "yellow"),
             ellpise.type="euclid",
             star.plot= T,
             repel= T,
             ggtheme= theme())

# 6. Optimizar cantidad de grupos
library(cluster)
library(data.table)

set.seed(123)
optimizacion <- clusGap(df, FUN=kmeans, nstart=1, K.max = 7)
plot(optimizacion, xlab= "Número de clusters K")

# El punto más alto de la gráfica indica la cantidad de grupos óptimo.
optimizacion <- clusGap

Ejercicio

1. Importarla base de datos

#file.choose()
bd <- read.csv("/Users/blancagonzalez/Desktop/ventas.csv")

2. Entender la base de datos

summary(bd)
##     BillNo            Itemname            Quantity            Date          
##  Length:522064      Length:522064      Min.   :-9600.00   Length:522064     
##  Class :character   Class :character   1st Qu.:    1.00   Class :character  
##  Mode  :character   Mode  :character   Median :    3.00   Mode  :character  
##                                        Mean   :   10.09                     
##                                        3rd Qu.:   10.00                     
##                                        Max.   :80995.00                     
##                                                                             
##      Hour               Price              CustomerID       Country         
##  Length:522064      Min.   :-11062.060   Min.   :12346    Length:522064     
##  Class :character   1st Qu.:     1.250   1st Qu.:13950    Class :character  
##  Mode  :character   Median :     2.080   Median :15265    Mode  :character  
##                     Mean   :     3.827   Mean   :15317                      
##                     3rd Qu.:     4.130   3rd Qu.:16837                      
##                     Max.   : 13541.330   Max.   :18287                      
##                                          NA's   :134041                     
##      Total          
##  Min.   :-11062.06  
##  1st Qu.:     3.75  
##  Median :     9.78  
##  Mean   :    19.69  
##  3rd Qu.:    17.40  
##  Max.   :168469.60  
## 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#count(bd,BillNo, sort=TRUE)
#count(bd,Itemname, sort=TRUE)
#count(bd,Date, sort=TRUE)
#count(bd,Hour, sort=TRUE)
#count(bd,CustomerID, sort=TRUE)
#count(bd, Country, sort=TRUE)

Observaciones: 1. Tenemos cantidades, precios y totales negativos. 2. Fecha y hora no tienen formato adecuado. 3. Tenemos NA’s en CustomerID

<span style=“color:blue;>3. Limpiar la base de datos

# ¿Cuántos NA tengo en la base de datos?
sum(is.na(bd))
## [1] 134041
# ¿Cuántos NA tengo por variable?
sapply(bd, function(x) sum(is.na(x)))
##     BillNo   Itemname   Quantity       Date       Hour      Price CustomerID 
##          0          0          0          0          0          0     134041 
##    Country      Total 
##          0          0
#Eliminar Totales negativos 
bd <- bd[bd$Total > 0,]

#Identificar outliers
boxplot(bd$Total, horizontal = TRUE)

Observaciones: 4. Tenemos presencia de datos fuera de lo normal (outliers).

# Obtener el total por ticket
ticket_promedio <- aggregate(Total ~ CustomerID + BillNo, data = bd, sum)

#Obtener el ticket promedio
ticket_promedio <- aggregate(Total ~ CustomerID, data=ticket_promedio, mean)
colnames(ticket_promedio) <- c("CustomerID", "TicketPromedio")

library(dplyr)
#Obtener cantidad de visitas por cliente
visitas <- group_by(bd, CustomerID) %>% summarize(Visitas= n_distinct(BillNo))

#Juntar las tables Ticket Promedio y Visitas
objetos <- merge(ticket_promedio, visitas,by="CustomerID")

#Llamar a los renglones como CustomerID
rownames(objetos) <- objetos$CustomerID
objetos <- subset(objetos, select = -c(CustomerID))

#Eliminar datos fuera de lo normal

#Los datos fuera de lo normal están fuera de los siguientes limites:
#Limite inferior = Q1 - 1.5*IQR
#lIMITE Superior = Q3 + 1.5*IQR
#Q1: Cuartil 1, Q3: Cuartil 3, IQR Rango Intercuartil

#Columna de Ticket Promedio
IQR_TP <- IQR(objetos$TicketPromedio)
IQR_TP
## [1] 248.3318
summary(objetos)
##  TicketPromedio        Visitas       
##  Min.   :    3.45   Min.   :  1.000  
##  1st Qu.:  178.30   1st Qu.:  1.000  
##  Median :  292.00   Median :  2.000  
##  Mean   :  415.62   Mean   :  4.227  
##  3rd Qu.:  426.63   3rd Qu.:  5.000  
##  Max.   :84236.25   Max.   :209.000
LI_TP <- 178.30 - 1.5*IQR_TP
LI_TP
## [1] -194.1977
LS_TP <- 426.63 + 1.5*IQR_TP
LS_TP
## [1] 799.1277
objetos <- objetos[objetos$TicketPromedio <= 799.13,]

IQR_V <- IQR(objetos$Visitas)
IQR_V
## [1] 4
LI_V <- 1 - 1-5*IQR_V
LI_V
## [1] -20
LS_V <- 5+1.5*IQR_V
LS_V
## [1] 11
objetos <- objetos[objetos$Visitas <= 11,]
# 0. Normalizar variables
objetos <- as.data.frame(scale(objetos))

# 1. Crear base de datos 

df <- objetos

# 2. Determinar el número de grupos
grupos <- 4

# 3. Realizar la clasificación
segmentos <- kmeans(df, grupos)

# 4. Revisar la asignción de grupos
asignacion <- cbind(df, cluster=segmentos$cluster)

# 5. Graficar resultados
# instal ggplot2 y factoextra
library(ggplot2)
library(factoextra)

fviz_cluster(segmentos, data = df,
          palette = c("red", "blue", "yellow","green"),
          ellpise.type = "euclid",
          star.plot = TRUE,
          repel = TRUE,
          ggtheme = theme())

# 6. Optimizar cantidad de grupos
library(cluster)
library(data.table)
            
set.seed(123)
optimizacion <- clusGap(df, FUN=kmeans, nstart=1,K.max = 7)
plot(optimizacion, xlab="Número de clusters K")

#El punto más alto de la gráfica indica la cantidad de grupos óptimo.
LS0tCnRpdGxlOiAiQWN0aXZpZGFkIDQuMSIKYXV0aG9yOiAiUGF1bGluYSBCYWRpbGxvIEdvbnrDoWxleiIKZGF0ZTogIjIwMjMtMDktMTgiCm91dHB1dDogCiAgaHRtbF9kb2N1bWVudDoKICAgIHRvYzogVFJVRQogICAgdG9jX2Zsb2F0OiBUUlVFCiAgICBjb2RlX2Rvd25sb2FkOiBUUlVFCiAgICB0aGVtZTogInlldGkiCiAgICBoaWdobGlnaHQ6ICJ0YW5nbyIKLS0tCgohW10ofi9EZXNrdG9wL2lrZWEuZ2lmKQoKYGBge3J9CiJ+L0Rlc2t0b3AvaWtlYS5naWYiCmBgYAoKIyMgPHNwYW4gc3R5bGU9ImNvbG9yOmJsdWU7PjEuIFRlb3JpYTwvc3Bhbj4KCiMjIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZTsiPjEuIENyZWFyIGxhIGJhc2UgZGUgZGF0b3M8L3NwYW4+CgojIyMgVGVvcmlhCmBgYHtyfQojIDEuIENyZWFyIGJhc2UgZGUgZGF0b3MKZGYgPC0gZGF0YS5mcmFtZSh4PWMoMiwyLDgsNSw3LDYsMSw0KSwKICAgICAgICAgICAgICAgICB5PWMgKDEwLDUsNCw4LDUsNCwyLDkpKQoKIyAyLiBEZXRlcm1pbmFyIGVsIG7Dum1lcm8gZGUgZ3J1cG9zCmdydXBvcyA8LSAzCgojIDMuIFJlYWxpemFyIGxhIGNsYXNpZmljYWNpw7NuCnNlZ21lbnRvcyA8LSBrbWVhbnMoZGYsZ3J1cG9zKQpzZWdtZW50b3MKCiMgNC4gUmV2aXNhciBsYSBhc2lnbmFjacOzbiBkZSBncnVwb3MKYXNpZ25hY2lvbiA8LSBjYmluZChkZiwgY2x1c3Rlcj1zZWdtZW50b3MkY2x1c3RlcikKYXNpZ25hY2lvbgoKIyA1LiBHcmFmaWNhciByZXN1bHRhZG9zIAojaW5zdGFsbC5wYWNrYWdlcygiZ2dwbG90MikKbGlicmFyeShnZ3Bsb3QyKQojaW5zdGFsbC5wYWNrYWdlcygiZmFjdG9leHRyYSIpCmxpYnJhcnkoZmFjdG9leHRyYSkKCmZ2aXpfY2x1c3RlcihzZWdtZW50b3MsIGRhdGE9ZGYsCiAgICAgICAgICAgICBwYWxldHRlPWMoInJlZCIsICJibHVlIiwgImJsYWNrIiwgImdyZWVuIiwgInllbGxvdyIpLAogICAgICAgICAgICAgZWxscGlzZS50eXBlPSJldWNsaWQiLAogICAgICAgICAgICAgc3Rhci5wbG90PSBULAogICAgICAgICAgICAgcmVwZWw9IFQsCiAgICAgICAgICAgICBnZ3RoZW1lPSB0aGVtZSgpKQoKIyA2LiBPcHRpbWl6YXIgY2FudGlkYWQgZGUgZ3J1cG9zCmxpYnJhcnkoY2x1c3RlcikKbGlicmFyeShkYXRhLnRhYmxlKQoKc2V0LnNlZWQoMTIzKQpvcHRpbWl6YWNpb24gPC0gY2x1c0dhcChkZiwgRlVOPWttZWFucywgbnN0YXJ0PTEsIEsubWF4ID0gNykKcGxvdChvcHRpbWl6YWNpb24sIHhsYWI9ICJOw7ptZXJvIGRlIGNsdXN0ZXJzIEsiKQoKIyBFbCBwdW50byBtw6FzIGFsdG8gZGUgbGEgZ3LDoWZpY2EgaW5kaWNhIGxhIGNhbnRpZGFkIGRlIGdydXBvcyDDs3B0aW1vLgpvcHRpbWl6YWNpb24gPC0gY2x1c0dhcApgYGAKCiMjIDxzcGFuIHN5dGxlPSJjb2xvcjogYmx1ZTsiPkVqZXJjaWNpbzwvc3Bhbj4KCiMjIDxzcGFuIHN0bHllPSJjb2xvcjogYmx1ZTsiPjEuIEltcG9ydGFybGEgYmFzZSBkZSBkYXRvczwvc3Bhbj4KYGBge3J9CiNmaWxlLmNob29zZSgpCmJkIDwtIHJlYWQuY3N2KCIvVXNlcnMvYmxhbmNhZ29uemFsZXovRGVza3RvcC92ZW50YXMuY3N2IikKYGBgCgojIyMgPHNwYW4gc3R5bGU9ImNvbG9yOiBibHVlOyI+Mi4gRW50ZW5kZXIgbGEgYmFzZSBkZSBkYXRvczwvc3Bhbj4KYGBge3J9CnN1bW1hcnkoYmQpCmxpYnJhcnkoZHBseXIpCiNjb3VudChiZCxCaWxsTm8sIHNvcnQ9VFJVRSkKI2NvdW50KGJkLEl0ZW1uYW1lLCBzb3J0PVRSVUUpCiNjb3VudChiZCxEYXRlLCBzb3J0PVRSVUUpCiNjb3VudChiZCxIb3VyLCBzb3J0PVRSVUUpCiNjb3VudChiZCxDdXN0b21lcklELCBzb3J0PVRSVUUpCiNjb3VudChiZCwgQ291bnRyeSwgc29ydD1UUlVFKQpgYGAKCk9ic2VydmFjaW9uZXM6CjEuIFRlbmVtb3MgY2FudGlkYWRlcywgcHJlY2lvcyB5IHRvdGFsZXMgbmVnYXRpdm9zLgoyLiBGZWNoYSB5IGhvcmEgbm8gdGllbmVuIGZvcm1hdG8gYWRlY3VhZG8uCjMuIFRlbmVtb3MgTkEncyBlbiBDdXN0b21lcklECgojIyMgPHNwYW4gc3R5bGU9ImNvbG9yOmJsdWU7PjMuIExpbXBpYXIgbGEgYmFzZSBkZSBkYXRvczwvc3Bhbj4KYGBge3J9CiMgwr9DdcOhbnRvcyBOQSB0ZW5nbyBlbiBsYSBiYXNlIGRlIGRhdG9zPwpzdW0oaXMubmEoYmQpKQoKIyDCv0N1w6FudG9zIE5BIHRlbmdvIHBvciB2YXJpYWJsZT8Kc2FwcGx5KGJkLCBmdW5jdGlvbih4KSBzdW0oaXMubmEoeCkpKQoKI0VsaW1pbmFyIFRvdGFsZXMgbmVnYXRpdm9zIApiZCA8LSBiZFtiZCRUb3RhbCA+IDAsXQoKI0lkZW50aWZpY2FyIG91dGxpZXJzCmJveHBsb3QoYmQkVG90YWwsIGhvcml6b250YWwgPSBUUlVFKQpgYGAKT2JzZXJ2YWNpb25lczoKNC4gVGVuZW1vcyBwcmVzZW5jaWEgZGUgZGF0b3MgZnVlcmEgZGUgbG8gbm9ybWFsIChvdXRsaWVycykuCmBgYHtyfQojIE9idGVuZXIgZWwgdG90YWwgcG9yIHRpY2tldAp0aWNrZXRfcHJvbWVkaW8gPC0gYWdncmVnYXRlKFRvdGFsIH4gQ3VzdG9tZXJJRCArIEJpbGxObywgZGF0YSA9IGJkLCBzdW0pCgojT2J0ZW5lciBlbCB0aWNrZXQgcHJvbWVkaW8KdGlja2V0X3Byb21lZGlvIDwtIGFnZ3JlZ2F0ZShUb3RhbCB+IEN1c3RvbWVySUQsIGRhdGE9dGlja2V0X3Byb21lZGlvLCBtZWFuKQpjb2xuYW1lcyh0aWNrZXRfcHJvbWVkaW8pIDwtIGMoIkN1c3RvbWVySUQiLCAiVGlja2V0UHJvbWVkaW8iKQoKbGlicmFyeShkcGx5cikKI09idGVuZXIgY2FudGlkYWQgZGUgdmlzaXRhcyBwb3IgY2xpZW50ZQp2aXNpdGFzIDwtIGdyb3VwX2J5KGJkLCBDdXN0b21lcklEKSAlPiUgc3VtbWFyaXplKFZpc2l0YXM9IG5fZGlzdGluY3QoQmlsbE5vKSkKCiNKdW50YXIgbGFzIHRhYmxlcyBUaWNrZXQgUHJvbWVkaW8geSBWaXNpdGFzCm9iamV0b3MgPC0gbWVyZ2UodGlja2V0X3Byb21lZGlvLCB2aXNpdGFzLGJ5PSJDdXN0b21lcklEIikKCiNMbGFtYXIgYSBsb3MgcmVuZ2xvbmVzIGNvbW8gQ3VzdG9tZXJJRApyb3duYW1lcyhvYmpldG9zKSA8LSBvYmpldG9zJEN1c3RvbWVySUQKb2JqZXRvcyA8LSBzdWJzZXQob2JqZXRvcywgc2VsZWN0ID0gLWMoQ3VzdG9tZXJJRCkpCgojRWxpbWluYXIgZGF0b3MgZnVlcmEgZGUgbG8gbm9ybWFsCgojTG9zIGRhdG9zIGZ1ZXJhIGRlIGxvIG5vcm1hbCBlc3TDoW4gZnVlcmEgZGUgbG9zIHNpZ3VpZW50ZXMgbGltaXRlczoKI0xpbWl0ZSBpbmZlcmlvciA9IFExIC0gMS41KklRUgojbElNSVRFIFN1cGVyaW9yID0gUTMgKyAxLjUqSVFSCiNRMTogQ3VhcnRpbCAxLCBRMzogQ3VhcnRpbCAzLCBJUVIgUmFuZ28gSW50ZXJjdWFydGlsCgojQ29sdW1uYSBkZSBUaWNrZXQgUHJvbWVkaW8KSVFSX1RQIDwtIElRUihvYmpldG9zJFRpY2tldFByb21lZGlvKQpJUVJfVFAKc3VtbWFyeShvYmpldG9zKQpMSV9UUCA8LSAxNzguMzAgLSAxLjUqSVFSX1RQCkxJX1RQCkxTX1RQIDwtIDQyNi42MyArIDEuNSpJUVJfVFAKTFNfVFAKb2JqZXRvcyA8LSBvYmpldG9zW29iamV0b3MkVGlja2V0UHJvbWVkaW8gPD0gNzk5LjEzLF0KCklRUl9WIDwtIElRUihvYmpldG9zJFZpc2l0YXMpCklRUl9WCkxJX1YgPC0gMSAtIDEtNSpJUVJfVgpMSV9WCkxTX1YgPC0gNSsxLjUqSVFSX1YKTFNfVgpvYmpldG9zIDwtIG9iamV0b3Nbb2JqZXRvcyRWaXNpdGFzIDw9IDExLF0KYGBgCmBgYHtyfQojIDAuIE5vcm1hbGl6YXIgdmFyaWFibGVzCm9iamV0b3MgPC0gYXMuZGF0YS5mcmFtZShzY2FsZShvYmpldG9zKSkKCiMgMS4gQ3JlYXIgYmFzZSBkZSBkYXRvcyAKCmRmIDwtIG9iamV0b3MKCiMgMi4gRGV0ZXJtaW5hciBlbCBuw7ptZXJvIGRlIGdydXBvcwpncnVwb3MgPC0gNAoKIyAzLiBSZWFsaXphciBsYSBjbGFzaWZpY2FjacOzbgpzZWdtZW50b3MgPC0ga21lYW5zKGRmLCBncnVwb3MpCgojIDQuIFJldmlzYXIgbGEgYXNpZ25jacOzbiBkZSBncnVwb3MKYXNpZ25hY2lvbiA8LSBjYmluZChkZiwgY2x1c3Rlcj1zZWdtZW50b3MkY2x1c3RlcikKCiMgNS4gR3JhZmljYXIgcmVzdWx0YWRvcwojIGluc3RhbCBnZ3Bsb3QyIHkgZmFjdG9leHRyYQpsaWJyYXJ5KGdncGxvdDIpCmxpYnJhcnkoZmFjdG9leHRyYSkKCmZ2aXpfY2x1c3RlcihzZWdtZW50b3MsIGRhdGEgPSBkZiwKICAgICAgICAgIHBhbGV0dGUgPSBjKCJyZWQiLCAiYmx1ZSIsICJ5ZWxsb3ciLCJncmVlbiIpLAogICAgICAgICAgZWxscGlzZS50eXBlID0gImV1Y2xpZCIsCiAgICAgICAgICBzdGFyLnBsb3QgPSBUUlVFLAogICAgICAgICAgcmVwZWwgPSBUUlVFLAogICAgICAgICAgZ2d0aGVtZSA9IHRoZW1lKCkpCiAgICAgICAgICAgIAojIDYuIE9wdGltaXphciBjYW50aWRhZCBkZSBncnVwb3MKbGlicmFyeShjbHVzdGVyKQpsaWJyYXJ5KGRhdGEudGFibGUpCiAgICAgICAgICAgIApzZXQuc2VlZCgxMjMpCm9wdGltaXphY2lvbiA8LSBjbHVzR2FwKGRmLCBGVU49a21lYW5zLCBuc3RhcnQ9MSxLLm1heCA9IDcpCnBsb3Qob3B0aW1pemFjaW9uLCB4bGFiPSJOw7ptZXJvIGRlIGNsdXN0ZXJzIEsiKQoKI0VsIHB1bnRvIG3DoXMgYWx0byBkZSBsYSBncsOhZmljYSBpbmRpY2EgbGEgY2FudGlkYWQgZGUgZ3J1cG9zIMOzcHRpbW8uCmBgYAoK