#Se limpio la base previamente
#file.choose()
performance <- read.csv("/Users/Karen/Documents/FORM.Performance.limpia")
#install.packages("foreign")
library(foreign)
#install.packages("dplyr")
library(dplyr) # data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("forcats")
library(forcats) # to work with categorical variables
#install.packages ("ggplot2")
library(ggplot2) # data visualization
#install.packages ("janitor")
library(janitor) # data exploration and cleaning
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
#install.packages("Hmisc")
library(Hmisc) # several useful functions for data analysis
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
#install.packages ("psych")
library(psych) # functions for multivariate analysis
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
#install.packages("naniar")
library(naniar) # summaries and visualization of missing values NAs
#install.packages("dlookr")
library(dlookr) # summaries and visualization of missing values NAs
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:psych':
##
## describe
## The following object is masked from 'package:Hmisc':
##
## describe
## The following object is masked from 'package:base':
##
## transform
#install.packages ("kableExtra")
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
#install.packages ("readr")
library(readr)
#install.packages ("corrplot")
library(corrplot)
## corrplot 0.92 loaded
#install.packages ("jtools")
library(jtools)
##
## Attaching package: 'jtools'
## The following object is masked from 'package:Hmisc':
##
## %nin%
#install.packages ("lmtest")
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
#install.packages ("car")
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
#install.packages ("olsrr")
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
#install.packages ("gmodels")
library(gmodels)
describe (performance)
## # A tibble: 6 × 26
## described_…¹ n na mean sd se_mean IQR skewn…² kurto…³ p00 p01
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Target 1440 0 1 0 0 0 NaN NaN 1 1
## 2 Vueltas 1440 0 1.75 0.829 0.0219 1.25 0.494 -1.37 1 1
## 3 Plan.arrival 1440 0 6.62 7.50 0.198 10.8 0.614 -1.12 0 0
## 4 Real.arrival 1440 0 3.82 6.51 0.171 8 1.51 0.965 0 0
## 5 Real.depart… 1440 0 4.14 6.95 0.183 9 1.44 0.706 0 0
## 6 Diference 1440 0 0.316 0.922 0.0243 0.8 2.73 211. -14.4 0
## # … with 15 more variables: p05 <dbl>, p10 <dbl>, p20 <dbl>, p25 <dbl>,
## # p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>, p75 <dbl>,
## # p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>, and abbreviated
## # variable names ¹​described_variables, ²​skewness, ³​kurtosis
Se realizó una tabla de frecuencia de la variable plan arrival con el objetivo de validar las frecuencias en el estimado de llegada para cada uno de los clientes.
proportion <- prop.table(table(performance$Cliente,performance$Plan.arrival))
proportion %>%
kbl() %>%
kable_styling()
| 0 | 8 | 9 | 16 | 20 | |
|---|---|---|---|---|---|
| MAGNA | 0.125 | 0.000 | 0.000 | 0.000 | 0.000 |
| MAHLE | 0.000 | 0.125 | 0.125 | 0.000 | 0.125 |
| PRINTEL | 0.000 | 0.000 | 0.000 | 0.125 | 0.000 |
| VARROC | 0.375 | 0.000 | 0.000 | 0.000 | 0.000 |
Posterior a la tabla de frecuencia, se realizo una tabla cruzada donde se visualizo como es la proporcion respecto a la variable de las estimaciones de llegada de pedidos por cada cliente.
proportion %>%
kbl() %>%
kable_material (c("striped","hover"))
| 0 | 8 | 9 | 16 | 20 | |
|---|---|---|---|---|---|
| MAGNA | 0.125 | 0.000 | 0.000 | 0.000 | 0.000 |
| MAHLE | 0.000 | 0.125 | 0.125 | 0.000 | 0.125 |
| PRINTEL | 0.000 | 0.000 | 0.000 | 0.125 | 0.000 |
| VARROC | 0.375 | 0.000 | 0.000 | 0.000 | 0.000 |
location <- CrossTable(performance$Cliente, performance$Plan.arrival, prop.t=TRUE, prop.r=TRUE, prop.c=TRUE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1440
##
##
## | performance$Plan.arrival
## performance$Cliente | 0 | 8 | 9 | 16 | 20 | Row Total |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## MAGNA | 180 | 0 | 0 | 0 | 0 | 180 |
## | 90.000 | 22.500 | 22.500 | 22.500 | 22.500 | |
## | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.125 |
## | 0.250 | 0.000 | 0.000 | 0.000 | 0.000 | |
## | 0.125 | 0.000 | 0.000 | 0.000 | 0.000 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## MAHLE | 0 | 180 | 180 | 0 | 180 | 540 |
## | 270.000 | 187.500 | 187.500 | 67.500 | 187.500 | |
## | 0.000 | 0.333 | 0.333 | 0.000 | 0.333 | 0.375 |
## | 0.000 | 1.000 | 1.000 | 0.000 | 1.000 | |
## | 0.000 | 0.125 | 0.125 | 0.000 | 0.125 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## PRINTEL | 0 | 0 | 0 | 180 | 0 | 180 |
## | 90.000 | 22.500 | 22.500 | 1102.500 | 22.500 | |
## | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.125 |
## | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | |
## | 0.000 | 0.000 | 0.000 | 0.125 | 0.000 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## VARROC | 540 | 0 | 0 | 0 | 0 | 540 |
## | 270.000 | 67.500 | 67.500 | 67.500 | 67.500 | |
## | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.375 |
## | 0.750 | 0.000 | 0.000 | 0.000 | 0.000 | |
## | 0.375 | 0.000 | 0.000 | 0.000 | 0.000 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 720 | 180 | 180 | 180 | 180 | 1440 |
## | 0.500 | 0.125 | 0.125 | 0.125 | 0.125 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
kbl(location) %>%
kable_classic()
|
|
|
|
#Boxplot
#En dicha grafica de boxplot se presenta como se encuentran posicionados los datos de la base, tanto el estimado (Plan arrival), como la llegado real (real arrival) y la salida real (real departure) de los pedidos realizados por todos los clientes
Delivery <- data.frame(performance$Cliente, performance$Plan.arrival, performance$Real.arrival, performance$Real.departure, performance$Date)
colnames(Delivery)<-c('Cliente', 'Plan.arrival', 'Real.arrival', 'Real.departure', 'Date')
Realarrival_boxplot = subset(Delivery, select =-c (Cliente, Date))
boxplot(Realarrival_boxplot,data=data,main="Deliverys",
col=rainbow(ncol(trees)))
#Bar plots 1
#En la siguiente grafica se presenta la llegada real de los pedidos a lo largo del primer semestre del 2022
Realarrival <- data.frame(performance$Date,performance$Real.arrival)
colnames(Realarrival)<-c('Date','Real.arrival')
ggplot(data = Realarrival, aes (x=Date, y=Real.arrival)) +
geom_bar(stat = "identity", fill="#FFAEB9") + scale_fill_grey() + # Add bars to the plot
labs(title = "Real arrival", # Add a title
x = "2022")
#Bar plots 2
#A modo de comparacion, en la siguiente grafica se presenta la salida real de los pedidos a lo largo del primer semestre del 2022
Realdeparture <- data.frame(performance$Date,performance$Real.departure)
colnames(Realdeparture)<-c('Date','Real.departure')
ggplot(data = Realdeparture, aes (x=Date, y=Real.departure)) +
geom_bar(stat = "identity", fill="#7AC5CD") + scale_fill_grey() + # Add bars to the plot
labs(title = "Real departure", # Add a title
x = "2022")
Validando las diversas variables en la base de datos y observando su distribucion, se decidio utilizar la variable de diferencia (entre la llegada real y la salida real) para observar su comportamiento y ver la comparación.
#Histograma
hist(performance$Diference,main="Histograma",xlab="Delivery diference",col='#FF7F24')
#Diagrama de dispersion
qqnorm(performance$Diference, main="Grafica de dispersion", ylab="Delivery diference",col='#FF7F24')
qqline(performance$Diference, col='#FF7F24')
#Graficos de normalidad
plot_normality(performance,Diference, col='#FF7F24')
-MAHLE es el unico cliente con la mayor frecuencia de 20 como estimado de llegada para los pedidos
-El estimado real de los pedidos tiene una mediana mucho mayor que la llegada real.
-La salida real vs la llegada real tiene una diferencia observable en los graficos.
-Entre Febrero-Marzo 2022 fue el pico con mayores pedidos para los clientes.
-La diferencia entre la llegada y salida de pedidos sigue una distribucion normal