library(readxl)
Data_tesis <- read_excel("C:/Users/User/Desktop/Data_tesis.xlsx")
View(Data_tesis)
attach(Data_tesis)
Tabla de Indicadores Importantes
total_registros=nrow(Data_tesis)
atributos_por_registro = ncol(Data_tesis)
precio_promedio=mean(Data_tesis$total,na.rm = TRUE)
mediana_precio=median(Data_tesis$total,na.rm = TRUE)
promedio_pizza=mean(Data_tesis$totalPizza,na.rm = TRUE)
promedio_plancha=mean(Data_tesis$totalPlancha,na.rm = TRUE)
promedio_bebidas=mean(Data_tesis$totalBebida,na.rm = TRUE)
#cantidad_pedidos=length(Data_tesis$total,na.rm = TRUE))
Resultado = data.frame(total_registros, atributos_por_registro, precio_promedio, mediana_precio,promedio_pizza, promedio_plancha, promedio_bebidas)
Resultado
## total_registros atributos_por_registro precio_promedio mediana_precio
## 1 1869 19 25388.87 22000
## promedio_pizza promedio_plancha promedio_bebidas
## 1 NA NA NA
Indentificar de qué tipo son los atributos:
tipos_atributos=sapply(Data_tesis, class)
tipos_atributos
## $fecha
## [1] "POSIXct" "POSIXt"
##
## $hora
## [1] "character"
##
## $nombre
## [1] "character"
##
## $celular
## [1] "numeric"
##
## $direccion
## [1] "character"
##
## $idPedido
## [1] "numeric"
##
## $pedido
## [1] "character"
##
## $totalPizza
## [1] "character"
##
## $totalPlancha
## [1] "character"
##
## $totalBebida
## [1] "character"
##
## $pagaCon
## [1] "character"
##
## $cambio
## [1] "character"
##
## $total
## [1] "numeric"
##
## $comanda
## [1] "character"
##
## $imprimido
## [1] "character"
##
## $metodoPago
## [1] "character"
##
## $X1
## [1] "numeric"
##
## $X2
## [1] "numeric"
##
## $Comentario
## [1] "character"
Tabla de estadística descriptiva
library(summarytools)
tabla_descrip = descr(Data_tesis)
tabla_descrip
## Descriptive Statistics
## Data_tesis
## N: 1869
##
## celular idPedido total X1 X2
## ----------------- --------------- ---------- ----------- --------- ---------
## Mean 3117159851.60 9.95 25388.87 -76.48 3.52
## Std.Dev 328522842.77 7.69 14614.46 0.15 0.56
## Min 44634110.00 1.00 0.00 -76.61 3.33
## Q1 3113659869.00 4.00 15500.00 -76.50 3.46
## Median 3154682799.00 8.00 22000.00 -76.50 3.47
## Q3 3177840588.00 14.00 31500.00 -76.49 3.47
## Max 5731734689.00 44.00 110000.00 -74.14 9.36
## MAD 41676997.95 7.41 11119.50 0.01 0.01
## IQR 64180719.00 10.00 16000.00 0.01 0.01
## CV 0.11 0.77 0.58 0.00 0.16
## Skewness -5.48 1.18 1.72 10.56 10.13
## SE.Skewness 0.06 0.06 0.06 0.06 0.06
## Kurtosis 66.73 1.46 4.65 124.57 101.82
## N.Valid 1869.00 1866.00 1868.00 1869.00 1869.00
## Pct.Valid 100.00 99.84 99.95 100.00 100.00
Diagrama de cajas y bigotes para atributos numéricos Permite identificar la existencia de datos atípicos
# Seleccionar solo una columna específica
columna_total = Data_tesis$total
# Crear el diagrama de cajas y bigotes de la columna seleccionada
boxplot(columna_total)
Detectar si hay datos faltantes (determinar en qué columnas y cuantos datos faltan en un mismo registro)
# Determinar datos faltantes por registro
datos_faltantes = apply(Data_tesis, 1, function(row) sum(is.na(row)))
# Determinar columnas con datos faltantes
columnas_con_faltantes = colnames(Data_tesis)[apply(Data_tesis, 2, function(col) any(is.na(col)))]
datos_faltantes
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 2 0 2 1 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0
## [260] 0 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [408] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0
## [445] 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1
## [482] 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [519] 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [556] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0
## [593] 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [630] 0 2 1 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
## [667] 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 1 1 1 1 1 0 0 0 0 0
## [704] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [741] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [778] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [815] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [852] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [889] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [926] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [963] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1000] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1037] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1074] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1111] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1148] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1185] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1222] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1259] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1296] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1333] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1370] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1407] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1444] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1481] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1518] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1555] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1592] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1629] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1666] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1703] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1740] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1777] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1814] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1851] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 6 1 0 6
columnas_con_faltantes
## [1] "idPedido" "pedido" "totalPizza" "totalPlancha" "totalBebida"
## [6] "pagaCon" "cambio" "total" "comanda"
Análisis variables categóricas
library(summarytools)
# Seleccionar la variable categórica
variable_categorica = Data_tesis$total
# Realizar análisis descriptivo de la variable categórica
table_desc = freq(variable_categorica)
# Imprimir la tabla descriptiva
print(table_desc)
## Frequencies
## variable_categorica
## Type: Numeric
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ------------ ------ --------- -------------- --------- --------------
## 0 2 0.107 0.107 0.107 0.107
## 3500 1 0.054 0.161 0.054 0.161
## 5000 3 0.161 0.321 0.161 0.321
## 5700 2 0.107 0.428 0.107 0.428
## 6000 1 0.054 0.482 0.054 0.482
## 6500 19 1.017 1.499 1.017 1.498
## 7000 1 0.054 1.552 0.054 1.552
## 7300 4 0.214 1.767 0.214 1.766
## 7500 1 0.054 1.820 0.054 1.819
## 8000 19 1.017 2.837 1.017 2.836
## 8500 19 1.017 3.854 1.017 3.852
## 9000 14 0.749 4.604 0.749 4.601
## 9500 46 2.463 7.066 2.461 7.063
## 10000 15 0.803 7.869 0.803 7.865
## 10500 16 0.857 8.726 0.856 8.721
## 10700 2 0.107 8.833 0.107 8.828
## 11000 26 1.392 10.225 1.391 10.219
## 11500 9 0.482 10.707 0.482 10.701
## 12000 36 1.927 12.634 1.926 12.627
## 12400 1 0.054 12.687 0.054 12.681
## 12500 8 0.428 13.116 0.428 13.109
## 12800 9 0.482 13.597 0.482 13.590
## 13000 30 1.606 15.203 1.605 15.195
## 13500 16 0.857 16.060 0.856 16.051
## 13600 3 0.161 16.221 0.161 16.212
## 13700 1 0.054 16.274 0.054 16.265
## 14000 66 3.533 19.807 3.531 19.797
## 14300 1 0.054 19.861 0.054 19.850
## 14500 23 1.231 21.092 1.231 21.081
## 14700 1 0.054 21.146 0.054 21.134
## 15000 61 3.266 24.411 3.264 24.398
## 15200 1 0.054 24.465 0.054 24.452
## 15500 18 0.964 25.428 0.963 25.415
## 15700 1 0.054 25.482 0.054 25.468
## 16000 112 5.996 31.478 5.993 31.461
## 16500 19 1.017 32.495 1.017 32.477
## 16700 2 0.107 32.602 0.107 32.584
## 16800 2 0.107 32.709 0.107 32.691
## 17000 80 4.283 36.991 4.280 36.972
## 17100 1 0.054 37.045 0.054 37.025
## 17400 1 0.054 37.099 0.054 37.079
## 17500 12 0.642 37.741 0.642 37.721
## 18000 37 1.981 39.722 1.980 39.700
## 18300 2 0.107 39.829 0.107 39.807
## 18500 10 0.535 40.364 0.535 40.342
## 19000 46 2.463 42.827 2.461 42.804
## 19500 22 1.178 44.004 1.177 43.981
## 19900 2 0.107 44.111 0.107 44.088
## 20000 40 2.141 46.253 2.140 46.228
## 20100 1 0.054 46.306 0.054 46.281
## 20300 1 0.054 46.360 0.054 46.335
## 20500 19 1.017 47.377 1.017 47.352
## 20700 1 0.054 47.430 0.054 47.405
## 20800 3 0.161 47.591 0.161 47.566
## 21000 22 1.178 48.769 1.177 48.743
## 21300 1 0.054 48.822 0.054 48.796
## 21500 20 1.071 49.893 1.070 49.866
## 22000 55 2.944 52.837 2.943 52.809
## 22200 1 0.054 52.891 0.054 52.862
## 22500 26 1.392 54.283 1.391 54.254
## 22800 2 0.107 54.390 0.107 54.361
## 23000 19 1.017 55.407 1.017 55.377
## 23300 1 0.054 55.460 0.054 55.431
## 23500 12 0.642 56.103 0.642 56.073
## 23800 1 0.054 56.156 0.054 56.126
## 24000 37 1.981 58.137 1.980 58.106
## 24500 17 0.910 59.047 0.910 59.016
## 25000 27 1.445 60.493 1.445 60.460
## 25500 19 1.017 61.510 1.017 61.477
## 26000 39 2.088 63.597 2.087 63.563
## 26500 12 0.642 64.240 0.642 64.205
## 27000 24 1.285 65.525 1.284 65.490
## 27500 10 0.535 66.060 0.535 66.025
## 27800 2 0.107 66.167 0.107 66.132
## 28000 30 1.606 67.773 1.605 67.737
## 28500 16 0.857 68.630 0.856 68.593
## 29000 34 1.820 70.450 1.819 70.412
## 29300 4 0.214 70.664 0.214 70.626
## 29500 13 0.696 71.360 0.696 71.322
## 29600 1 0.054 71.413 0.054 71.375
## 30000 22 1.178 72.591 1.177 72.552
## 30500 15 0.803 73.394 0.803 73.355
## 31000 29 1.552 74.946 1.552 74.906
## 31500 4 0.214 75.161 0.214 75.120
## 32000 32 1.713 76.874 1.712 76.833
## 32500 13 0.696 77.570 0.696 77.528
## 33000 24 1.285 78.854 1.284 78.812
## 33500 6 0.321 79.176 0.321 79.133
## 34000 5 0.268 79.443 0.268 79.401
## 34500 6 0.321 79.764 0.321 79.722
## 35000 14 0.749 80.514 0.749 80.471
## 35500 12 0.642 81.156 0.642 81.113
## 36000 14 0.749 81.906 0.749 81.862
## 36500 5 0.268 82.173 0.268 82.129
## 37000 14 0.749 82.923 0.749 82.879
## 37500 3 0.161 83.084 0.161 83.039
## 38000 20 1.071 84.154 1.070 84.109
## 38500 5 0.268 84.422 0.268 84.377
## 39000 5 0.268 84.690 0.268 84.644
## 39500 5 0.268 84.957 0.268 84.912
## 40000 11 0.589 85.546 0.589 85.500
## 40500 1 0.054 85.600 0.054 85.554
## 41000 16 0.857 86.456 0.856 86.410
## 41500 5 0.268 86.724 0.268 86.677
## 42000 13 0.696 87.420 0.696 87.373
## 42500 4 0.214 87.634 0.214 87.587
## 43000 13 0.696 88.330 0.696 88.283
## 43500 5 0.268 88.597 0.268 88.550
## 44000 6 0.321 88.919 0.321 88.871
## 44500 7 0.375 89.293 0.375 89.246
## 45000 10 0.535 89.829 0.535 89.781
## 45500 4 0.214 90.043 0.214 89.995
## 46000 2 0.107 90.150 0.107 90.102
## 46500 2 0.107 90.257 0.107 90.209
## 47000 8 0.428 90.685 0.428 90.637
## 47500 3 0.161 90.846 0.161 90.797
## 48000 6 0.321 91.167 0.321 91.118
## 48200 1 0.054 91.221 0.054 91.172
## 48500 8 0.428 91.649 0.428 91.600
## 49000 4 0.214 91.863 0.214 91.814
## 49500 2 0.107 91.970 0.107 91.921
## 49700 1 0.054 92.024 0.054 91.974
## 50000 17 0.910 92.934 0.910 92.884
## 50500 2 0.107 93.041 0.107 92.991
## 51000 26 1.392 94.433 1.391 94.382
## 51500 3 0.161 94.593 0.161 94.543
## 52000 5 0.268 94.861 0.268 94.810
## 52500 1 0.054 94.914 0.054 94.864
## 53000 1 0.054 94.968 0.054 94.917
## 53500 2 0.107 95.075 0.107 95.024
## 54000 3 0.161 95.236 0.161 95.185
## 54500 3 0.161 95.396 0.161 95.345
## 55000 23 1.231 96.627 1.231 96.576
## 55500 2 0.107 96.734 0.107 96.683
## 56000 6 0.321 97.056 0.321 97.004
## 56500 1 0.054 97.109 0.054 97.057
## 57000 8 0.428 97.537 0.428 97.485
## 58000 5 0.268 97.805 0.268 97.753
## 60000 1 0.054 97.859 0.054 97.806
## 60500 1 0.054 97.912 0.054 97.860
## 61000 6 0.321 98.233 0.321 98.181
## 63000 5 0.268 98.501 0.268 98.448
## 66000 1 0.054 98.555 0.054 98.502
## 67000 3 0.161 98.715 0.161 98.662
## 69000 1 0.054 98.769 0.054 98.716
## 77000 1 0.054 98.822 0.054 98.769
## 78000 2 0.107 98.929 0.107 98.876
## 79000 1 0.054 98.983 0.054 98.930
## 80000 1 0.054 99.036 0.054 98.983
## 81000 2 0.107 99.143 0.107 99.090
## 83000 1 0.054 99.197 0.054 99.144
## 85000 1 0.054 99.251 0.054 99.197
## 86500 1 0.054 99.304 0.054 99.251
## 89000 1 0.054 99.358 0.054 99.304
## 89500 1 0.054 99.411 0.054 99.358
## 94000 1 0.054 99.465 0.054 99.411
## 98000 1 0.054 99.518 0.054 99.465
## 99000 2 0.107 99.625 0.107 99.572
## 99500 1 0.054 99.679 0.054 99.625
## 100000 2 0.107 99.786 0.107 99.732
## 103000 1 0.054 99.839 0.054 99.786
## 105500 1 0.054 99.893 0.054 99.839
## 107000 1 0.054 99.946 0.054 99.893
## 110000 1 0.054 100.000 0.054 99.946
## <NA> 1 0.054 100.000
## Total 1869 100.000 100.000 100.000 100.000
Graficos de variables categóricas
library(ggplot2)
# Seleccionar la variable categórica
variable_categorica = Data_tesis$metodoPago
# Calcular las frecuencias de la variable categórica
frecuencias = table(variable_categorica)
# Asignar colores a cada nivel único
colores <- scales::hue_pal()(length(unique(variable_categorica)))
# Crear el gráfico de torta
grafico_torta <- ggplot(data.frame(frecuencias), aes(x = "", y = Freq, fill = as.factor(variable_categorica))) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
theme_void() +
geom_text(aes(label = Freq), position = position_stack(vjust = 0.5)) +
scale_fill_manual(values = colores)
# Mostrar el gráfico
print(grafico_torta)
Análisis de sentimiento
## Paso 1 - ahora creo el CORPUS
require(quanteda)
mycorpus <- corpus(Data_tesis$Comentario)
mycorpus
## Corpus consisting of 1,869 documents.
## text1 :
## "Muy buen servicio. Gracias"
##
## text2 :
## "Bueno"
##
## text3 :
## "Excelente"
##
## text4 :
## "Las respuestas muy lentas"
##
## text5 :
## "Súper bueno"
##
## text6 :
## "Excelente ❤"
##
## [ reached max_ndoc ... 1,863 more documents ]
head(summary(mycorpus))
## Text Types Tokens Sentences
## 1 text1 5 5 2
## 2 text2 1 1 1
## 3 text3 1 1 1
## 4 text4 4 4 1
## 5 text5 2 2 1
## 6 text6 2 2 1
## Paso 2 - TOKENIZATION (separar las palabras)
mycorpus.wd <- tokens(mycorpus, what = "word")
mycorpus.wd
## Tokens consisting of 1,869 documents.
## text1 :
## [1] "Muy" "buen" "servicio" "." "Gracias"
##
## text2 :
## [1] "Bueno"
##
## text3 :
## [1] "Excelente"
##
## text4 :
## [1] "Las" "respuestas" "muy" "lentas"
##
## text5 :
## [1] "Súper" "bueno"
##
## text6 :
## [1] "Excelente" "❤"
##
## [ reached max_ndoc ... 1,863 more documents ]
## Paso 3 NORMALIZACION DE TEXTOS
mycorpus_url_numb_punct_symb <- tokens(mycorpus, what = "word", remove_url = T, remove_numbers = T,
remove_punct = T, remove_symbols = T, remove_separators = T,)
## Paso 4 - STOPWORDS
# a="EL PROBLEMA"
# tolower(a)
#stopwords.es <- stopwords(language = "ES")
# stopwords.en <- stopwords(language = "en")
# stopwords.es=c(stopwords.es,"etc","q","k","problema",stopwords.en)
mycorpus_sw <- tokens_remove(mycorpus_url_numb_punct_symb)
mycorpus_sw
## Tokens consisting of 1,869 documents.
## text1 :
## [1] "Muy" "buen" "servicio" "Gracias"
##
## text2 :
## [1] "Bueno"
##
## text3 :
## [1] "Excelente"
##
## text4 :
## [1] "Las" "respuestas" "muy" "lentas"
##
## text5 :
## [1] "Súper" "bueno"
##
## text6 :
## [1] "Excelente"
##
## [ reached max_ndoc ... 1,863 more documents ]
## Paso 5 - STEMMING (opcional)
# encuentro las raices de las palabras
mycorpus_stem<-tokens_wordstem(mycorpus_sw, language = "es")
mycorpus_stem
## Tokens consisting of 1,869 documents.
## text1 :
## [1] "Muy" "buen" "servici" "Graci"
##
## text2 :
## [1] "Buen"
##
## text3 :
## [1] "Excelent"
##
## text4 :
## [1] "Las" "respuest" "muy" "lent"
##
## text5 :
## [1] "Sup" "buen"
##
## text6 :
## [1] "Excelent"
##
## [ reached max_ndoc ... 1,863 more documents ]
## Paso 6 - CREACION DE LA DOCUMENT TERM MATRIX
#dtm_sw <- dfm(mycorpus_stem) ##con steamming
dtm_sw <- dfm(mycorpus_sw) ##sin steamming
#las palabras mas frecuentes
dtm_sw[1:5,1:20]
## Document-feature matrix of: 5 documents, 20 features (88.00% sparse) and 0 docvars.
## features
## docs muy buen servicio gracias bueno excelente las respuestas lentas súper
## text1 1 1 1 1 0 0 0 0 0 0
## text2 0 0 0 0 1 0 0 0 0 0
## text3 0 0 0 0 0 1 0 0 0 0
## text4 1 0 0 0 0 0 1 1 1 0
## text5 0 0 0 0 1 0 0 0 0 1
## [ reached max_nfeat ... 10 more features ]
matriz_textos=data.frame(dtm_sw)
#
# require(table1)
# table1(~`Problemas Colombianos 1`,data=Encuesta)
topfeatures(dtm_sw, 10) #conteo
## bueno excelente gracias muy bien ok super súper
## 484 406 192 176 108 98 84 63
## genial el
## 45 44
round(topfeatures(dtm_sw, 10)/56,2)*100 #porcentaje
## bueno excelente gracias muy bien ok super súper
## 864 725 343 314 193 175 150 112
## genial el
## 80 79
## Paso 7 - Remover documentos con menos de 2 (palabras) y terminos con menos de 2 (documentos)
dtm_f10<-dfm_trim(dtm_sw, min_termfreq = 2, min_docfreq = 2)
dtm_f10
## Document-feature matrix of: 1,869 documents, 160 features (99.12% sparse) and 0 docvars.
## features
## docs muy buen servicio gracias bueno excelente las súper el pedido
## text1 1 1 1 1 0 0 0 0 0 0
## text2 0 0 0 0 1 0 0 0 0 0
## text3 0 0 0 0 0 1 0 0 0 0
## text4 1 0 0 0 0 0 1 0 0 0
## text5 0 0 0 0 1 0 0 1 0 0
## text6 0 0 0 0 0 1 0 0 0 0
## [ reached max_ndoc ... 1,863 more documents, reached max_nfeat ... 150 more features ]
round(topfeatures(dtm_f10, 10)/56,2)*100
## bueno excelente gracias muy bien ok super súper
## 864 725 343 314 193 175 150 112
## genial el
## 80 79
##Visualizacion WordCloud
library(RColorBrewer)
library(quanteda.textplots)
textplot_wordcloud(dtm_sw, min_count = 0, random_order = FALSE,
rotation = .25, min_size = 1, max_size = 5,
color = RColorBrewer::brewer.pal(8,"Dark2"))
Formato Hora
hora_dia=substring(Data_tesis$hora,1,2)
jornada=substring(Data_tesis$hora,7,8)
hora_dia=as.numeric(hora_dia)
validos=which(jornada=="AM"|jornada=="PM")
hora_dia=hora_dia[validos]
jornada=jornada[validos]
hora_dia[jornada=="PM"]=hora_dia[jornada=="PM"]+12
barplot(table(hora_dia))
res=data.frame(hora_dia)
require(ggplot2)
g1=ggplot(res,aes(x=hora_dia))+geom_bar()+theme_bw()+xlab("Hora del DÃa de Pedido")+ylab("Cantidad de Pedidos")+ggtitle("Frecuencia de Pedidos")
require(plotly)
ggplotly(g1)
Dirección - Proceso de Geocodificación
require(tmaptools)
locs=matrix(NA,nrow =dim(Data_tesis)[1],ncol = 2 )
for(i in 1:dim(Data_tesis)[1]){
loc=geocode_OSM(q = paste(gsub("#","",Data_tesis$direccion[i]), ",cali,colombia"))
if(length(loc)>0){
locs[i,1]=loc$coords[1]
locs[i,2]=loc$coords[2]
}
#print(i)
}
locs=data.frame(locs)
locs2=na.omit(locs)
require(leaflet)
leaflet() %>% addTiles() %>% addCircleMarkers(lng = locs2$X1,lat = locs2$X2)
datos_2 = data.frame(Data_tesis,locs)
View(datos_2)
Modelo ARIMA (modelos autorregresivos integrados de media móvil) y Pronóstico de ventas de los próximos 6 meses.
library(fpp2)
library(readxl)
library(dplyr)
library(lubridate)
library(tidyr)
Data_tesis <- read_excel("C:/Users/User/Desktop/Data_tesis.xlsx")
View(Data_tesis)
nrow(Data_tesis)
## [1] 1869
dim(Data_tesis)
## [1] 1869 19
attach(Data_tesis)
Data_tesis$fecha = date(Data_tesis$fecha)
columnas_seleccionadas = Data_tesis[, c("fecha", "total")]
fecha_venta = data.frame(columnas_seleccionadas)
View(fecha_venta)
Ventas_mensuales = fecha_venta %>%
mutate(month = format(fecha, "%m"), year = format(fecha, "%Y"))%>%
group_by(month, year) %>%
summarise(total = sum(total))
Ventas_mensuales = Ventas_mensuales[with(Ventas_mensuales, order(Ventas_mensuales$year)),]
Ventas_mensuales
## # A tibble: 32 × 3
## # Groups: month [12]
## month year total
## <chr> <chr> <dbl>
## 1 08 2020 78000
## 2 09 2020 293600
## 3 10 2020 403100
## 4 11 2020 658100
## 5 12 2020 1068900
## 6 01 2021 997100
## 7 02 2021 466800
## 8 03 2021 695900
## 9 04 2021 1039600
## 10 05 2021 1895600
## # ℹ 22 more rows
Ventas_mensuales = na.omit(Ventas_mensuales)
Y = ts(Ventas_mensuales[,3],start = c(2020,8), end = c(2023,3), frequency=12)
Y
## Jan Feb Mar Apr May Jun Jul Aug Sep
## 2020 78000 293600
## 2021 997100 466800 695900 1039600 1895600 1067000 1000000 1013500 665500
## 2022 2353000 1450500 1835500 2016500 2077500 2377500 2378500 1614500 2070500
## 2023 1879000 1963000 78000
## Oct Nov Dec
## 2020 403100 658100 1068900
## 2021 1293500 1449000 2615700
## 2022 2533000 1853000 2895000
## 2023
autoplot(Y)+
ggtitle("Ventas anuales del restaurante Ricuras de Sebastian")
descom = decompose(Y)
autoplot(descom)
acf(Y)
pacf(Y)
DY = diff(Y)
autoplot(DY)+
ggtitle("Cambios anuales en ventas del restaurante Ricuras de Sebastian")
modelo_arima = auto.arima(Y, d=1, D=1, stepwise = FALSE, approximation = FALSE, trace = TRUE)
##
## ARIMA(0,1,0)(0,1,0)[12] : 572.3964
## ARIMA(0,1,1)(0,1,0)[12] : 570.4888
## ARIMA(0,1,2)(0,1,0)[12] : Inf
## ARIMA(0,1,3)(0,1,0)[12] : 571.7853
## ARIMA(0,1,4)(0,1,0)[12] : Inf
## ARIMA(0,1,5)(0,1,0)[12] : Inf
## ARIMA(1,1,0)(0,1,0)[12] : 569.1525
## ARIMA(1,1,1)(0,1,0)[12] : 571.7483
## ARIMA(1,1,2)(0,1,0)[12] : 572.0016
## ARIMA(1,1,3)(0,1,0)[12] : Inf
## ARIMA(1,1,4)(0,1,0)[12] : Inf
## ARIMA(2,1,0)(0,1,0)[12] : 571.6
## ARIMA(2,1,1)(0,1,0)[12] : 574.8008
## ARIMA(2,1,2)(0,1,0)[12] : Inf
## ARIMA(2,1,3)(0,1,0)[12] : Inf
## ARIMA(3,1,0)(0,1,0)[12] : 574.6283
## ARIMA(3,1,1)(0,1,0)[12] : 577.9797
## ARIMA(3,1,2)(0,1,0)[12] : Inf
## ARIMA(4,1,0)(0,1,0)[12] : 577.2452
## ARIMA(4,1,1)(0,1,0)[12] : 581.6215
## ARIMA(5,1,0)(0,1,0)[12] : 581.5999
##
##
##
## Best model: ARIMA(1,1,0)(0,1,0)[12]
print(modelo_arima)
## Series: Y
## ARIMA(1,1,0)(0,1,0)[12]
##
## Coefficients:
## ar1
## -0.6475
## s.e. 0.2329
##
## sigma^2 = 4.78e+11: log likelihood = -282.2
## AIC=568.4 AICc=569.15 BIC=570.29
checkresiduals(modelo_arima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0)(0,1,0)[12]
## Q* = 4.1449, df = 5, p-value = 0.5287
##
## Model df: 1. Total lags used: 6
fcst = forecast(modelo_arima, h=6, level = c(95))
autoplot(fcst) +
ggtitle("Pronóstico de ventas próximos 6 meses del restaurante Ricuras de Sebastian")
print(summary(fcst))
##
## Forecast method: ARIMA(1,1,0)(0,1,0)[12]
##
## Model Information:
## Series: Y
## ARIMA(1,1,0)(0,1,0)[12]
##
## Coefficients:
## ar1
## -0.6475
## s.e. 0.2329
##
## sigma^2 = 4.78e+11: log likelihood = -282.2
## AIC=568.4 AICc=569.15 BIC=570.29
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -88374.8 518558.3 335365.3 -69.48481 80.81189 0.3618666 -0.1151522
##
## Forecasts:
## Point Forecast Lo 95 Hi 95
## Apr 2023 1728930.0 373789.09 3084071
## May 2023 838082.3 -598765.60 2274930
## Jun 2023 1754447.7 -22726.66 3531622
## Jul 2023 1356322.6 -545754.49 3258400
## Aug 2023 850774.6 -1260448.61 2961998
## Sep 2023 1139415.0 -1105072.24 3383902
pronostico = as.data.frame(fcst)