Actividad 4.1 K Vecinos

IKEA

Teoria

Paso 1. Determinar el número de grupo o clusters =3 Paso 2. Seleccionar aleatoriamente los centroides (seeds) c1=A1 C2=A4 C3=A7

Paso 3. Asignar cada objeto al centoride más cercano FÓMRULA

ITERACIÓN Objeto: distancia (Objeto,Centroide) A1:d(A1,A1): 0 ya que es centroide | d(A1,A4):3.61 | d(A1,A7):8.06 | A1 ∈ (pertenece a Cluster 1

A2:d(A2,A1): 5 | d(A2,A4): 4.24 | d(A2,A7): 3.16 | A2 ∈ Cluster 2

A3:d(A3,A1): √36 | d(A3,A4): √25 | d(A3,A7): √53 | A3 ∈ cluster 3

A4:d(A4,A1): | d(A4,A4):0 ya que es centroide | d(A4,A7): | A4 ∈ cluster 3

A5:d(A5,A1): √50 | d(A5,A4): √13 | d(A5,A7): √45 | A5 ∈ cluster 3

A6:d(A6,A1): √52 | d(A6,A4): √17 | d(A6,A7): √29| A6 ∈ cluster 3

A7:d(A7,A1): | d(A7,A4): | d(A7,A7): 0 ya que es centroide | A7 ∈ cluster 3

A8:d(A8,A1): √5 | d(A8,A4): √2 | d(A8,A7): √58 | A8 ∈ cluster 3

Resumen de la Iteración: Cluster 1=(A1), 2=(A2,A7), 3=(A3,A4,A5,A6,A8)

Paso 4. Actualizar posición de centroides con la posición promedio de los objetos pertenecientes a dicho grupo o cluster

CLUSTERS: C1=(2,10) | C2=((2+1)/2 , (5+2)/2)) -> C2= (1.5,3.5) | C3= ((8+5+7+6+4)/5 , (4+8+5+4+9)/5) -> C3=(6,6)

Paso 5. Repetir pasos 3,4 y hasta que los centorides no se muevan, o se muevan por debajo de una distancia umbral en cada paso

2 Iteración

Resumen de la 2da Iteración Cluster 1=(A1,A8), 2=(A2,A7), 3=(A3,A4,A5,A6) CLusters: C1=(3,9.5) | C2=(1.5,3.5) | C3=(7,4.33)

3era Iteración Resultado de la 3era Iteración Cluster: 1=(A1,A4,A8), 2=(A2,A7), 3(A3,A5,A6) Clusters: C1=(3.6,6.9), C2= (1.5,13.5), C3=(7,4.33)

4ta Iteración Resultado de la 4ta Iteración Cluster: 1=(A1,A4,A8), 2=(A2,A7), 3(A3,A5,A6) Clusters: C1=(3.6,6.9), C2= (1.5,13.5), C3=(7,4.33)

ITERACIONES EN R

x <- c(2, 2, 8, 5, 7, 6, 1, 4)
y <- c(10, 5, 4, 8, 5, 4, 2, 9)

data18S <- data.frame(x,y)

## Determinar el número de grupos
grupos<-3

## Realizar segmentación
segmentos <-kmeans(data18S,grupos)
segmentos

## K-means clustering with 3 clusters of sizes 3, 2, 3
## 
## Cluster means:
##          x        y
## 1 3.666667 9.000000
## 2 1.500000 3.500000
## 3 7.000000 4.333333
## 
## Clustering vector:
## [1] 1 2 3 1 3 3 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 6.666667 5.000000 2.666667
##  (between_SS / total_SS =  85.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Revisar la asignación de grupos

asignacion<-cbind(data18S,cluster=segmentos$cluster)
asignacion

##   x  y cluster
## 1 2 10       1
## 2 2  5       2
## 3 8  4       3
## 4 5  8       1
## 5 7  5       3
## 6 6  4       3
## 7 1  2       2
## 8 4  9       1

Graficar los Resultados

library(ggplot2)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

fviz_cluster(segmentos,data=data18S,palette=c("cadetblue3","cornsilk3","darkolivegreen3"),
             ellipse.type="euclid",
             star.plot=T,
             repel=T,
             ggtheme=theme()
             )

## Too few points to calculate an ellipse

## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

Optimizar cantidad de grupos

library(cluster)
library(data.table)

set.seed(123)
optimizacion<-clusGap(data18S,FUN=kmeans,nstart=1,K.max=7)
plot(optimizacion,xlab="Número de clusters k")

El punto más alto de la gráfica indica la cantidad de grupos óptimos en los que se pueden clasificar los datos.

Intento 1

library (tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()     masks data.table::between()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::first()       masks data.table::first()
## ✖ lubridate::hour()    masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ dplyr::last()        masks data.table::last()
## ✖ lubridate::mday()    masks data.table::mday()
## ✖ lubridate::minute()  masks data.table::minute()
## ✖ lubridate::month()   masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second()  masks data.table::second()
## ✖ purrr::transpose()   masks data.table::transpose()
## ✖ lubridate::wday()    masks data.table::wday()
## ✖ lubridate::week()    masks data.table::week()
## ✖ lubridate::yday()    masks data.table::yday()
## ✖ lubridate::year()    masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library (foreign)
library (ggplot2)
library(dplyr)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(ggrepel)
DTVC<-read.csv("C://Users/IanAb/Documents/7to Semestre/DATA BASE/ventasSEP18.csv")

CIDG<-aggregate(DTVC$Total,by=list(DTVC$CustomerID), FUN=sum)
colnames(CIDG)<-c("CustomerID","Total")

CIDB <- aggregate(DTVC$BillNo~ DTVC$CustomerID, data = DTVC, FUN = length)
colnames(CIDB) <- c("CustomerID", "Cantidad_Compras")

CIDB_G<-CIDG%>%left_join(CIDB,by="CustomerID")

Solución

summary (DTVC)

##     BillNo            Itemname            Quantity          Date          
##  Length:522064      Length:522064      Min.   : -9600   Length:522064     
##  Class :character   Class :character   1st Qu.:     1   Class :character  
##  Mode  :character   Mode  :character   Median :     3   Mode  :character  
##                                        Mean   :  3385                     
##                                        3rd Qu.:    10                     
##                                        Max.   :560368                     
##                                                                           
##      Hour               Price              CustomerID       Country         
##  Length:522064      Min.   :-11062.060   Min.   :12346    Length:522064     
##  Class :character   1st Qu.:     1.250   1st Qu.:13950    Class :character  
##  Mode  :character   Median :     2.080   Median :15265    Mode  :character  
##                     Mean   :     3.827   Mean   :15317                      
##                     3rd Qu.:     4.130   3rd Qu.:16837                      
##                     Max.   : 13541.330   Max.   :18287                      
##                                          NA's   :134041                     
##      Total          
##  Min.   :-11062.06  
##  1st Qu.:     3.75  
##  Median :     9.78  
##  Mean   :    19.69  
##  3rd Qu.:    17.40  
##  Max.   :168469.60  
##

# count(DTVC,BillNo,Sort=TRUE)
# count(DTVC,Itemname,Sort=TRUE)
# count(DTVC,Date,Sort=TRUE)
#count(DTVC,Hour,Sort=TRUE)
# count(DTVC,Country,Sort=TRUE)

Observaciones 1 Tenemos cantidad, precios y totales negativos 2 Fechas y hora no tienen formato adecuado 3 Tenemos NA’s en Customer ID

#¿cuántos NA tengo en la base de datos?

sum(is.na(DTVC))

## [1] 134041

#¿Cuántos NA tengo por variable?
sapply(DTVC,function(x) sum(is.na(x)))

##     BillNo   Itemname   Quantity       Date       Hour      Price CustomerID 
##          0          0          0          0          0          0     134041 
##    Country      Total 
##          0          0

#Eliminar NA

DTVC<-na.omit(DTVC)

#Eliminar Totales Negativos
DTVC<-DTVC[DTVC$Total>0,]

# Identificar Outliers 
boxplot(DTVC$Total,horizontal=TRUE)

Obsevraciones: 4. Tenemos outliers en Total.

## Obtener cantidad de visitas por cliente 
Visitas<- group_by(DTVC,CustomerID)%>%summarise(Visitas= n_distinct(BillNo))

## Obtener el total por ticket 
ticketprom<-aggregate(Total ~ CustomerID + BillNo,data=DTVC,sum)

## Obtener el Ticket Promedio
ticketprom1<-aggregate(Total ~ CustomerID, data=ticketprom,mean)

objetos<-merge(Visitas,ticketprom1,by="CustomerID")

rownames(objetos)<-objetos$CustomerID


##Eliminar la columna de CustomerID
objetos<-subset(objetos,select=-c(CustomerID))

IQR_V<-IQR(objetos$Visitas)
IQR_V

## [1] 4

summary(objetos)

##     Visitas            Total         
##  Min.   :  1.000   Min.   :    3.45  
##  1st Qu.:  1.000   1st Qu.:  178.30  
##  Median :  2.000   Median :  292.00  
##  Mean   :  4.227   Mean   :  415.62  
##  3rd Qu.:  5.000   3rd Qu.:  426.63  
##  Max.   :209.000   Max.   :84236.25

LI_V<-1-1.5*IQR_V

LS_V<-5+1.5*IQR_V

objetos<-objetos[objetos$Visitas<=11,]

#Columna Ticket Promedio
colnames(objetos)<-c("Visitas","TicketPromedio")
IQR_TP<-IQR(objetos$TicketPromedio,)

summary

## function (object, ...) 
## UseMethod("summary")
## <bytecode: 0x000001ba2c94be00>
## <environment: namespace:base>

LI_TP<-178.30-1.5*IQR_TP
LI_TP

## [1] -186.76

LS_TP<-426.62+1.5*IQR_TP
LS_TP

## [1] 791.68

objetos<-objetos[objetos$TicketPromedio<=791.69,]

## Determinar el número de grupos
grupos1<-4

## Realizar segmentación
segmentos1 <-kmeans(objetos,grupos1)
# segmentos1
asignacion1<-cbind(objetos,cluster=segmentos1$cluster)
# asignacion1

library(ggplot2)
library(factoextra)

fviz_cluster(segmentos1,data=objetos,palette=c("cadetblue3","cornsilk3","darkolivegreen3","azure3"),
             ellipse.type="euclid",
             star.plot=T,
             repel=T,
             ggtheme=theme()
             )

library(cluster)
library(data.table)

set.seed(123)
optimizacion1<-clusGap(objetos,FUN=kmeans,nstart=1,K.max=7)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 188200)

plot(optimizacion1,xlab="Número de clusters k")

Actividad 4.1 K Vecinos

Ian Abraham Quiroz Cuapio

2023-09-18

Teoria

Intento 1

Solución