Act 4.1 K vecinos

1.Importar la base de datos

file.choose()

## [1] "C:\\Users\\sguerra\\Downloads\\Ac4.1--K-Vecinos.Rmd"

##read.csv("C:\\Users\\sguerra\\Downloads\\ventas.csv")
##install.packages("dplyr")
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

##install.packages("ggplot2")
library(ggplot2)
##install.packages("COUNT")
library(COUNT)

## Loading required package: msme

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: lattice

## Loading required package: sandwich

##install.packages("magrittr")
library(magrittr)
##install.packages("tidyverse")
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ MASS::select()     masks dplyr::select()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

2. Ejercicio

bd <- read.csv("C:\\Users\\sguerra\\Downloads\\ventas.csv")
summary(bd)

##     BillNo            Itemname            Quantity            Date          
##  Length:522064      Length:522064      Min.   :-9600.00   Length:522064     
##  Class :character   Class :character   1st Qu.:    1.00   Class :character  
##  Mode  :character   Mode  :character   Median :    3.00   Mode  :character  
##                                        Mean   :   10.09                     
##                                        3rd Qu.:   10.00                     
##                                        Max.   :80995.00                     
##                                                                             
##      Hour               Price              CustomerID       Country         
##  Length:522064      Min.   :-11062.060   Min.   :12346    Length:522064     
##  Class :character   1st Qu.:     1.250   1st Qu.:13950    Class :character  
##  Mode  :character   Median :     2.080   Median :15265    Mode  :character  
##                     Mean   :     3.827   Mean   :15317                      
##                     3rd Qu.:     4.130   3rd Qu.:16837                      
##                     Max.   : 13541.330   Max.   :18287                      
##                                          NA's   :134041                     
##      Total          
##  Min.   :-11062.06  
##  1st Qu.:     3.75  
##  Median :     9.78  
##  Mean   :    19.69  
##  3rd Qu.:    17.40  
##  Max.   :168469.60  
##

str(bd)

## 'data.frame':    522064 obs. of  9 variables:
##  $ BillNo    : chr  "536365" "536365" "536365" "536365" ...
##  $ Itemname  : chr  "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity  : int  6 6 8 6 6 2 6 6 6 32 ...
##  $ Date      : chr  "01/12/2010" "01/12/2010" "01/12/2010" "01/12/2010" ...
##  $ Hour      : chr  "08:26:00" "08:26:00" "08:26:00" "08:26:00" ...
##  $ Price     : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID: int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
##  $ Country   : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
##  $ Total     : num  15.3 20.3 22 20.3 20.3 ...

##count(bd, BillNo, sort=TRUE)
##count(bd, Country, sort=TRUE)
##count(bd, Itemname, sort=TRUE)

#¿Cuántos NA tengo en la base de datos?

sum(is.na(bd))

## [1] 134041

#¿Cuántos NA tengo por variable?+
sapply(bd, function(x) sum(is.na(x)))

##     BillNo   Itemname   Quantity       Date       Hour      Price CustomerID 
##          0          0          0          0          0          0     134041 
##    Country      Total 
##          0          0

# Eliminar NA

bd <- na.omit(bd)

# Eliminar Totales negativos

bd <- bd[bd$Total > 0, ]

# Identificar Outliers

boxplot(bd$Total, horizontal = TRUE)

# Obtener el total por ticket
ticket_promedio <- aggregate(Total ~ CustomerID + BillNo, data = bd, sum)

ticket_promedio <- aggregate(Total ~ CustomerID, data= ticket_promedio, mean)
colnames(ticket_promedio) <- c("CustomerID", "TicketPromedio")

# Cantidad de visitas por cliente
visitas <- group_by(bd, CustomerID) %>% summarise(Visitas= n_distinct(BillNo))

# Juntar las tablas ticket promedio y visitas
objetos <- merge(ticket_promedio, visitas, by= "CustomerID")

#Llamar a los renglones como Customer ID
rownames(objetos) <- objetos$CustomerID
objetos <- subset(objetos, select = -c(CustomerID))

# Eliminar datos fuera de lo normal


# Columna de ticket promedio
IQR_TP <- IQR(objetos$TicketPromedio)
IQR_TP

## [1] 248.3318

summary(objetos)

##  TicketPromedio        Visitas       
##  Min.   :    3.45   Min.   :  1.000  
##  1st Qu.:  178.30   1st Qu.:  1.000  
##  Median :  292.00   Median :  2.000  
##  Mean   :  415.62   Mean   :  4.227  
##  3rd Qu.:  426.63   3rd Qu.:  5.000  
##  Max.   :84236.25   Max.   :209.000

LI_ITP <- 178.30 - 1.5*IQR_TP
LI_ITP

## [1] -194.1977

LS_ITP <- 426.63 + 1.5*IQR_TP
LS_ITP

## [1] 799.1277

objetos <- objetos[objetos$TicketPromedio<=799.13, ]

# Columna visitas
IQR_V <- IQR(objetos$Visitas)
IQR_V

## [1] 4

LI_V <- 1 - 1.5*IQR_V
LI_V

## [1] -5

LS_V <- 5 + 1.5*IQR_V
LS_V

## [1] 11

objetos <- objetos[objetos$Visitas <= 11, ]
summary(objetos)

##  TicketPromedio      Visitas      
##  Min.   :  3.45   Min.   : 1.000  
##  1st Qu.:168.66   1st Qu.: 1.000  
##  Median :267.12   Median : 2.000  
##  Mean   :293.86   Mean   : 2.971  
##  3rd Qu.:384.49   3rd Qu.: 4.000  
##  Max.   :797.45   Max.   :11.000

Asignación de grupos

# 0. Normalizar variables
objetos <- as.data.frame(scale(objetos))

# 1. Crear base de datos
df <- objetos 

# 2. Determinar el número de grupos

grupos <- 4

# 3. Realizar ka clasificación
segmentos <- kmeans(df, grupos)

# 4. Revisar la asignación de grupos
asignación <- cbind(df, cluster= segmentos$cluster)

# 5. Gráficar resultados
##install.packages("ggplot2")
library(ggplot2)
##install.packages("factoextra")
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

fviz_cluster(segmentos, data=df,
             palette=c("red","blue","yellow","green" ),
             ellipse.type="euclid",
             star.plot= T,
             repel= T,
             ggtheme = theme())

# 6. Optimizar cantidad de grupos
##install.packages("cluster")
library(cluster)
##install.packages("data.table")
library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year

## The following object is masked from 'package:purrr':
## 
##     transpose

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

set.seed(123)
optimizacion <- clusGap(df, FUN=kmeans, nstart=1, K.max=7)
plot(optimizacion, xlab= "Número de clusters K")

Act 4.1 K vecinos

Santiago Guerra

2023-09-19

R Markdown

Ejercicio

1.Importar la base de datos

2. Ejercicio

Asignación de grupos