file.choose()
file.choose()
## [1] "C:\\Users\\sguerra\\Downloads\\Ac4.1--K-Vecinos.Rmd"
##read.csv("C:\\Users\\sguerra\\Downloads\\ventas.csv")
##install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##install.packages("ggplot2")
library(ggplot2)
##install.packages("COUNT")
library(COUNT)
## Loading required package: msme
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: lattice
## Loading required package: sandwich
##install.packages("magrittr")
library(magrittr)
##install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ MASS::select() masks dplyr::select()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
bd <- read.csv("C:\\Users\\sguerra\\Downloads\\ventas.csv")
summary(bd)
## BillNo Itemname Quantity Date
## Length:522064 Length:522064 Min. :-9600.00 Length:522064
## Class :character Class :character 1st Qu.: 1.00 Class :character
## Mode :character Mode :character Median : 3.00 Mode :character
## Mean : 10.09
## 3rd Qu.: 10.00
## Max. :80995.00
##
## Hour Price CustomerID Country
## Length:522064 Min. :-11062.060 Min. :12346 Length:522064
## Class :character 1st Qu.: 1.250 1st Qu.:13950 Class :character
## Mode :character Median : 2.080 Median :15265 Mode :character
## Mean : 3.827 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. : 13541.330 Max. :18287
## NA's :134041
## Total
## Min. :-11062.06
## 1st Qu.: 3.75
## Median : 9.78
## Mean : 19.69
## 3rd Qu.: 17.40
## Max. :168469.60
##
str(bd)
## 'data.frame': 522064 obs. of 9 variables:
## $ BillNo : chr "536365" "536365" "536365" "536365" ...
## $ Itemname : chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ Date : chr "01/12/2010" "01/12/2010" "01/12/2010" "01/12/2010" ...
## $ Hour : chr "08:26:00" "08:26:00" "08:26:00" "08:26:00" ...
## $ Price : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID: int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## $ Total : num 15.3 20.3 22 20.3 20.3 ...
##count(bd, BillNo, sort=TRUE)
##count(bd, Country, sort=TRUE)
##count(bd, Itemname, sort=TRUE)
#¿CuĂ¡ntos NA tengo en la base de datos?
sum(is.na(bd))
## [1] 134041
#¿CuĂ¡ntos NA tengo por variable?+
sapply(bd, function(x) sum(is.na(x)))
## BillNo Itemname Quantity Date Hour Price CustomerID
## 0 0 0 0 0 0 134041
## Country Total
## 0 0
# Eliminar NA
bd <- na.omit(bd)
# Eliminar Totales negativos
bd <- bd[bd$Total > 0, ]
# Identificar Outliers
boxplot(bd$Total, horizontal = TRUE)
# Obtener el total por ticket
ticket_promedio <- aggregate(Total ~ CustomerID + BillNo, data = bd, sum)
ticket_promedio <- aggregate(Total ~ CustomerID, data= ticket_promedio, mean)
colnames(ticket_promedio) <- c("CustomerID", "TicketPromedio")
# Cantidad de visitas por cliente
visitas <- group_by(bd, CustomerID) %>% summarise(Visitas= n_distinct(BillNo))
# Juntar las tablas ticket promedio y visitas
objetos <- merge(ticket_promedio, visitas, by= "CustomerID")
#Llamar a los renglones como Customer ID
rownames(objetos) <- objetos$CustomerID
objetos <- subset(objetos, select = -c(CustomerID))
# Eliminar datos fuera de lo normal
# Columna de ticket promedio
IQR_TP <- IQR(objetos$TicketPromedio)
IQR_TP
## [1] 248.3318
summary(objetos)
## TicketPromedio Visitas
## Min. : 3.45 Min. : 1.000
## 1st Qu.: 178.30 1st Qu.: 1.000
## Median : 292.00 Median : 2.000
## Mean : 415.62 Mean : 4.227
## 3rd Qu.: 426.63 3rd Qu.: 5.000
## Max. :84236.25 Max. :209.000
LI_ITP <- 178.30 - 1.5*IQR_TP
LI_ITP
## [1] -194.1977
LS_ITP <- 426.63 + 1.5*IQR_TP
LS_ITP
## [1] 799.1277
objetos <- objetos[objetos$TicketPromedio<=799.13, ]
# Columna visitas
IQR_V <- IQR(objetos$Visitas)
IQR_V
## [1] 4
LI_V <- 1 - 1.5*IQR_V
LI_V
## [1] -5
LS_V <- 5 + 1.5*IQR_V
LS_V
## [1] 11
objetos <- objetos[objetos$Visitas <= 11, ]
summary(objetos)
## TicketPromedio Visitas
## Min. : 3.45 Min. : 1.000
## 1st Qu.:168.66 1st Qu.: 1.000
## Median :267.12 Median : 2.000
## Mean :293.86 Mean : 2.971
## 3rd Qu.:384.49 3rd Qu.: 4.000
## Max. :797.45 Max. :11.000
# 0. Normalizar variables
objetos <- as.data.frame(scale(objetos))
# 1. Crear base de datos
df <- objetos
# 2. Determinar el nĂºmero de grupos
grupos <- 4
# 3. Realizar ka clasificaciĂ³n
segmentos <- kmeans(df, grupos)
# 4. Revisar la asignaciĂ³n de grupos
asignaciĂ³n <- cbind(df, cluster= segmentos$cluster)
# 5. GrĂ¡ficar resultados
##install.packages("ggplot2")
library(ggplot2)
##install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(segmentos, data=df,
palette=c("red","blue","yellow","green" ),
ellipse.type="euclid",
star.plot= T,
repel= T,
ggtheme = theme())
# 6. Optimizar cantidad de grupos
##install.packages("cluster")
library(cluster)
##install.packages("data.table")
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
set.seed(123)
optimizacion <- clusGap(df, FUN=kmeans, nstart=1, K.max=7)
plot(optimizacion, xlab= "NĂºmero de clusters K")