a
features <- read.csv("C:\\Users\\chema\\Documents\\Clase\\features.csv")
stores <- read.csv("C:\\Users\\chema\\Documents\\Clase\\train.csv")
test <- read.csv("C:\\Users\\chema\\Documents\\Clase\\test.csv")
train <- read.csv("C:\\Users\\chema\\Documents\\Clase\\stores.csv")
library(yaml)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
summary(stores)
## Store Dept Date Weekly_Sales
## Min. : 1.0 Min. : 1.00 Length:421570 Min. : -4989
## 1st Qu.:11.0 1st Qu.:18.00 Class :character 1st Qu.: 2080
## Median :22.0 Median :37.00 Mode :character Median : 7612
## Mean :22.2 Mean :44.26 Mean : 15981
## 3rd Qu.:33.0 3rd Qu.:74.00 3rd Qu.: 20206
## Max. :45.0 Max. :99.00 Max. :693099
## IsHoliday
## Mode :logical
## FALSE:391909
## TRUE :29661
##
##
##
str(stores)
## 'data.frame': 421570 obs. of 5 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr "05/02/2010" "12/02/2010" "19/02/2010" "26/02/2010" ...
## $ Weekly_Sales: num 24925 46039 41596 19404 21828 ...
## $ IsHoliday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
summary(features)
## Store Date Temperature Fuel_Price
## Min. : 1 Length:8190 Min. : -7.29 Min. :2.472
## 1st Qu.:12 Class :character 1st Qu.: 45.90 1st Qu.:3.041
## Median :23 Mode :character Median : 60.71 Median :3.513
## Mean :23 Mean : 59.36 Mean :3.406
## 3rd Qu.:34 3rd Qu.: 73.88 3rd Qu.:3.743
## Max. :45 Max. :101.95 Max. :4.468
##
## MarkDown1 MarkDown2 MarkDown3 MarkDown4
## Min. : -2781 Min. : -265.76 Min. : -179.26 Min. : 0.22
## 1st Qu.: 1578 1st Qu.: 68.88 1st Qu.: 6.60 1st Qu.: 304.69
## Median : 4744 Median : 364.57 Median : 36.26 Median : 1176.42
## Mean : 7032 Mean : 3384.18 Mean : 1760.10 Mean : 3292.94
## 3rd Qu.: 8923 3rd Qu.: 2153.35 3rd Qu.: 163.15 3rd Qu.: 3310.01
## Max. :103185 Max. :104519.54 Max. :149483.31 Max. :67474.85
## NA's :4158 NA's :5269 NA's :4577 NA's :4726
## MarkDown5 CPI Unemployment IsHoliday
## Min. : -185.2 Min. :126.1 Min. : 3.684 Mode :logical
## 1st Qu.: 1440.8 1st Qu.:132.4 1st Qu.: 6.634 FALSE:7605
## Median : 2727.1 Median :182.8 Median : 7.806 TRUE :585
## Mean : 4132.2 Mean :172.5 Mean : 7.827
## 3rd Qu.: 4832.6 3rd Qu.:213.9 3rd Qu.: 8.567
## Max. :771448.1 Max. :229.0 Max. :14.313
## NA's :4140 NA's :585 NA's :585
str(features)
## 'data.frame': 8190 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr "05/02/2010" "12/02/2010" "19/02/2010" "26/02/2010" ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ MarkDown1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ IsHoliday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
summary(test)
## Store Dept Date IsHoliday
## Min. : 1.00 Min. : 1.00 Length:115064 Mode :logical
## 1st Qu.:11.00 1st Qu.:18.00 Class :character FALSE:106136
## Median :22.00 Median :37.00 Mode :character TRUE :8928
## Mean :22.24 Mean :44.34
## 3rd Qu.:33.00 3rd Qu.:74.00
## Max. :45.00 Max. :99.00
str(test)
## 'data.frame': 115064 obs. of 4 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr "02/11/2012" "09/11/2012" "16/11/2012" "23/11/2012" ...
## $ IsHoliday: logi FALSE FALSE FALSE TRUE FALSE FALSE ...
summary(train)
## Store Type Size
## Min. : 1 Length:45 Min. : 34875
## 1st Qu.:12 Class :character 1st Qu.: 70713
## Median :23 Mode :character Median :126512
## Mean :23 Mean :130288
## 3rd Qu.:34 3rd Qu.:202307
## Max. :45 Max. :219622
str(train)
## 'data.frame': 45 obs. of 3 variables:
## $ Store: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Type : chr "A" "A" "B" "A" ...
## $ Size : int 151315 202307 37392 205863 34875 202505 70713 155078 125833 126512 ...
Paso 1: Definir el área del negocio que buscamos impactar o mejorar y su KPI. El departamento de mercadotecnia de EUA (con muestra de 45 tiendas) en el indicador de Ventas Semanalmente.
Paso 2: Seleccionar plantilla (-s) para crear valor a partir de los datos de los clientes. Visión / Segmentación / Personalización / Contextualización
Paso 3: Generar ideas o conceptos específicos. Elaborar un modelo predictivo de ventas semanales.
Paso 4: Reunir los datos requeridos. Elaborar una base de datos con la variable dependiente (Ventas Semanales) y las variables independientes.
Paso 5: Plan de ejecución. Mercadotecnia elaborará plan para desplegar modelo predicitivo en fases.
Fase 1. Piloto (San Antonio, Tx). Fase 2. Texas Fase 3. EUA Sistemas asegurará la captura del Markdown en las bases de datos.
“Agregar”STORES” a “TRAIN”
bd <- merge(train, stores, by= "Store")
Agregar “FEATURES” a “BD”
bd1 <- bd
bd1 <- merge(bd,features)
Eliminar columnas
bd2 <- bd1
str(bd2)
## 'data.frame': 421570 obs. of 16 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr "01/04/2011" "01/04/2011" "01/04/2011" "01/04/2011" ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Type : chr "A" "A" "A" "A" ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Dept : int 49 26 81 34 59 30 7 85 8 28 ...
## $ Weekly_Sales: num 13168 5947 28545 9950 317 ...
## $ Temperature : num 59.2 59.2 59.2 59.2 59.2 ...
## $ Fuel_Price : num 3.52 3.52 3.52 3.52 3.52 ...
## $ MarkDown1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : num 215 215 215 215 215 ...
## $ Unemployment: num 7.68 7.68 7.68 7.68 7.68 ...
bd2 <- subset(bd2, select = -c(MarkDown1, MarkDown2, MarkDown3, MarkDown4, MarkDown5))
summary(bd2)
## Store Date IsHoliday Type
## Min. : 1.0 Length:421570 Mode :logical Length:421570
## 1st Qu.:11.0 Class :character FALSE:391909 Class :character
## Median :22.0 Mode :character TRUE :29661 Mode :character
## Mean :22.2
## 3rd Qu.:33.0
## Max. :45.0
## Size Dept Weekly_Sales Temperature
## Min. : 34875 Min. : 1.00 Min. : -4989 Min. : -2.06
## 1st Qu.: 93638 1st Qu.:18.00 1st Qu.: 2080 1st Qu.: 46.68
## Median :140167 Median :37.00 Median : 7612 Median : 62.09
## Mean :136728 Mean :44.26 Mean : 15981 Mean : 60.09
## 3rd Qu.:202505 3rd Qu.:74.00 3rd Qu.: 20206 3rd Qu.: 74.28
## Max. :219622 Max. :99.00 Max. :693099 Max. :100.14
## Fuel_Price CPI Unemployment
## Min. :2.472 Min. :126.1 Min. : 3.879
## 1st Qu.:2.933 1st Qu.:132.0 1st Qu.: 6.891
## Median :3.452 Median :182.3 Median : 7.866
## Mean :3.361 Mean :171.2 Mean : 7.960
## 3rd Qu.:3.738 3rd Qu.:212.4 3rd Qu.: 8.572
## Max. :4.468 Max. :227.2 Max. :14.313
Cambiar formato de fecha
bd2$Date <- as.Date(bd2$Date, format = "%d/%m/%Y")
str(bd2)
## 'data.frame': 421570 obs. of 11 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2011-04-01" "2011-04-01" ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Type : chr "A" "A" "A" "A" ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Dept : int 49 26 81 34 59 30 7 85 8 28 ...
## $ Weekly_Sales: num 13168 5947 28545 9950 317 ...
## $ Temperature : num 59.2 59.2 59.2 59.2 59.2 ...
## $ Fuel_Price : num 3.52 3.52 3.52 3.52 3.52 ...
## $ CPI : num 215 215 215 215 215 ...
## $ Unemployment: num 7.68 7.68 7.68 7.68 7.68 ...
# install.packages("wordspace")
library(wordspace)
## Loading required package: Matrix
signcount(bd2$Weekly_Sales)
## pos zero neg
## 420212 73 1285
Eliminar ventas menores a $0
bd3 <- bd2
bd3 <- bd3[bd3$Weekly_Sales > 0, ]
summary (bd3)
## Store Date IsHoliday Type
## Min. : 1.0 Min. :2010-02-05 Mode :logical Length:420212
## 1st Qu.:11.0 1st Qu.:2010-10-08 FALSE:390652 Class :character
## Median :22.0 Median :2011-06-17 TRUE :29560 Mode :character
## Mean :22.2 Mean :2011-06-18
## 3rd Qu.:33.0 3rd Qu.:2012-02-24
## Max. :45.0 Max. :2012-10-26
## Size Dept Weekly_Sales Temperature
## Min. : 34875 Min. : 1.00 Min. : 0 Min. : -2.06
## 1st Qu.: 93638 1st Qu.:18.00 1st Qu.: 2120 1st Qu.: 46.68
## Median :140167 Median :37.00 Median : 7662 Median : 62.09
## Mean :136750 Mean :44.24 Mean : 16033 Mean : 60.09
## 3rd Qu.:202505 3rd Qu.:74.00 3rd Qu.: 20271 3rd Qu.: 74.28
## Max. :219622 Max. :99.00 Max. :693099 Max. :100.14
## Fuel_Price CPI Unemployment
## Min. :2.472 Min. :126.1 Min. : 3.879
## 1st Qu.:2.933 1st Qu.:132.0 1st Qu.: 6.891
## Median :3.452 Median :182.4 Median : 7.866
## Mean :3.361 Mean :171.2 Mean : 7.960
## 3rd Qu.:3.738 3rd Qu.:212.4 3rd Qu.: 8.567
## Max. :4.468 Max. :227.2 Max. :14.313
Agregar número de la semana
bd4 <- bd3
bd4$week_number <- strftime(bd4$Date, format = "%V")
str(bd4)
## 'data.frame': 420212 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2011-04-01" "2011-04-01" ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Type : chr "A" "A" "A" "A" ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Dept : int 49 26 81 34 59 30 7 85 8 28 ...
## $ Weekly_Sales: num 13168 5947 28545 9950 317 ...
## $ Temperature : num 59.2 59.2 59.2 59.2 59.2 ...
## $ Fuel_Price : num 3.52 3.52 3.52 3.52 3.52 ...
## $ CPI : num 215 215 215 215 215 ...
## $ Unemployment: num 7.68 7.68 7.68 7.68 7.68 ...
## $ week_number : chr "13" "13" "13" "13" ...
bd4$week_number <- as.integer(bd4$week_number)
str(bd4)
## 'data.frame': 420212 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2011-04-01" "2011-04-01" ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Type : chr "A" "A" "A" "A" ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Dept : int 49 26 81 34 59 30 7 85 8 28 ...
## $ Weekly_Sales: num 13168 5947 28545 9950 317 ...
## $ Temperature : num 59.2 59.2 59.2 59.2 59.2 ...
## $ Fuel_Price : num 3.52 3.52 3.52 3.52 3.52 ...
## $ CPI : num 215 215 215 215 215 ...
## $ Unemployment: num 7.68 7.68 7.68 7.68 7.68 ...
## $ week_number : int 13 13 13 13 13 13 13 13 13 13 ...
summary(bd4)
## Store Date IsHoliday Type
## Min. : 1.0 Min. :2010-02-05 Mode :logical Length:420212
## 1st Qu.:11.0 1st Qu.:2010-10-08 FALSE:390652 Class :character
## Median :22.0 Median :2011-06-17 TRUE :29560 Mode :character
## Mean :22.2 Mean :2011-06-18
## 3rd Qu.:33.0 3rd Qu.:2012-02-24
## Max. :45.0 Max. :2012-10-26
## Size Dept Weekly_Sales Temperature
## Min. : 34875 Min. : 1.00 Min. : 0 Min. : -2.06
## 1st Qu.: 93638 1st Qu.:18.00 1st Qu.: 2120 1st Qu.: 46.68
## Median :140167 Median :37.00 Median : 7662 Median : 62.09
## Mean :136750 Mean :44.24 Mean : 16033 Mean : 60.09
## 3rd Qu.:202505 3rd Qu.:74.00 3rd Qu.: 20271 3rd Qu.: 74.28
## Max. :219622 Max. :99.00 Max. :693099 Max. :100.14
## Fuel_Price CPI Unemployment week_number
## Min. :2.472 Min. :126.1 Min. : 3.879 Min. : 1.00
## 1st Qu.:2.933 1st Qu.:132.0 1st Qu.: 6.891 1st Qu.:14.00
## Median :3.452 Median :182.4 Median : 7.866 Median :26.00
## Mean :3.361 Mean :171.2 Mean : 7.960 Mean :25.83
## 3rd Qu.:3.738 3rd Qu.:212.4 3rd Qu.: 8.567 3rd Qu.:38.00
## Max. :4.468 Max. :227.2 Max. :14.313 Max. :52.00
Separar Año, mes y día
bd5 <- bd4
bd5 <- bd5 %>%
dplyr::mutate(year = lubridate::year(Date),
month = lubridate::month(Date),
day = lubridate::day(Date))
summary(bd5)
## Store Date IsHoliday Type
## Min. : 1.0 Min. :2010-02-05 Mode :logical Length:420212
## 1st Qu.:11.0 1st Qu.:2010-10-08 FALSE:390652 Class :character
## Median :22.0 Median :2011-06-17 TRUE :29560 Mode :character
## Mean :22.2 Mean :2011-06-18
## 3rd Qu.:33.0 3rd Qu.:2012-02-24
## Max. :45.0 Max. :2012-10-26
## Size Dept Weekly_Sales Temperature
## Min. : 34875 Min. : 1.00 Min. : 0 Min. : -2.06
## 1st Qu.: 93638 1st Qu.:18.00 1st Qu.: 2120 1st Qu.: 46.68
## Median :140167 Median :37.00 Median : 7662 Median : 62.09
## Mean :136750 Mean :44.24 Mean : 16033 Mean : 60.09
## 3rd Qu.:202505 3rd Qu.:74.00 3rd Qu.: 20271 3rd Qu.: 74.28
## Max. :219622 Max. :99.00 Max. :693099 Max. :100.14
## Fuel_Price CPI Unemployment week_number
## Min. :2.472 Min. :126.1 Min. : 3.879 Min. : 1.00
## 1st Qu.:2.933 1st Qu.:132.0 1st Qu.: 6.891 1st Qu.:14.00
## Median :3.452 Median :182.4 Median : 7.866 Median :26.00
## Mean :3.361 Mean :171.2 Mean : 7.960 Mean :25.83
## 3rd Qu.:3.738 3rd Qu.:212.4 3rd Qu.: 8.567 3rd Qu.:38.00
## Max. :4.468 Max. :227.2 Max. :14.313 Max. :52.00
## year month day
## Min. :2010 Min. : 1.00 Min. : 1.00
## 1st Qu.:2010 1st Qu.: 4.00 1st Qu.: 8.00
## Median :2011 Median : 6.00 Median :16.00
## Mean :2011 Mean : 6.45 Mean :15.67
## 3rd Qu.:2012 3rd Qu.: 9.00 3rd Qu.:23.00
## Max. :2012 Max. :12.00 Max. :31.00
Analizar las ventas por tienda, para ver que tanta diferencia hay entre ellas:
plot(bd5$Weekly_Sales, bd5$Store, xlab = "Ventas semanales", ylab = "Tienda")
regresion <- lm(Weekly_Sales ~ Store + IsHoliday + Dept + Type + Size + Temperature + Fuel_Price + CPI + Unemployment + week_number + year + month + day, data = bd5)
summary(regresion)
##
## Call:
## lm(formula = Weekly_Sales ~ Store + IsHoliday + Dept + Type +
## Size + Temperature + Fuel_Price + CPI + Unemployment + week_number +
## year + month + day, data = bd5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34331 -12895 -5852 5626 671540
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.110e+06 2.999e+05 3.701 0.000214 ***
## Store -1.426e+02 3.087e+00 -46.198 < 2e-16 ***
## IsHolidayTRUE 8.511e+02 1.391e+02 6.119 9.45e-10 ***
## Dept 1.108e+02 1.097e+00 101.013 < 2e-16 ***
## TypeB -3.133e+02 1.078e+02 -2.908 0.003642 **
## TypeC 5.836e+03 1.840e+02 31.709 < 2e-16 ***
## Size 9.920e-02 9.584e-04 103.511 < 2e-16 ***
## Temperature 3.701e+00 2.133e+00 1.735 0.082688 .
## Fuel_Price 4.791e+02 1.480e+02 3.237 0.001207 **
## CPI -2.340e+01 9.996e-01 -23.409 < 2e-16 ***
## Unemployment -2.538e+02 2.062e+01 -12.308 < 2e-16 ***
## week_number 7.678e+02 4.566e+02 1.682 0.092648 .
## year -5.485e+02 1.485e+02 -3.695 0.000220 ***
## month -3.167e+03 1.988e+03 -1.594 0.111036
## day -1.281e+02 6.539e+01 -1.959 0.050115 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21690 on 420197 degrees of freedom
## Multiple R-squared: 0.08982, Adjusted R-squared: 0.08979
## F-statistic: 2962 on 14 and 420197 DF, p-value: < 2.2e-16
Modelo de Predicción 1
datos_nuevos <- data.frame(Store=1, Dept=1, IsHoliday=TRUE, Type="A", Size=151315, week_number=1, Temperature= 59.17, Fuel_Price=3.524, CPI=214.8372, Unemployment=7.682, year=2012, month=1, day=1:7)
prediccion1 <- predict(regresion, datos_nuevos)
predict(regresion, datos_nuevos)
## 1 2 3 4 5 6 7
## 14667.94 14539.85 14411.76 14283.67 14155.58 14027.49 13899.40
Modelo de Predicción 2: “Aumentar la temperatura al doble”
datos_nuevos1 <- data.frame(Store=1, Dept=1, IsHoliday=TRUE, Type="A", Size=151315, week_number=1, Temperature= 100, Fuel_Price=3.524, CPI=214.8372, Unemployment=7.682, year=2012, month=1, day=1:7)
prediccion_temp <- predict(regresion, datos_nuevos1)
predict(regresion, datos_nuevos1)
## 1 2 3 4 5 6 7
## 14819.04 14690.95 14562.86 14434.77 14306.68 14178.59 14050.50
Modelo de predicción 3: “Cambiar los valores a todos los promedios de cada variable” (de acuerdo al summary de la bd5)
datos_nuevos2 <- data.frame(Store=22, Dept=44, IsHoliday=FALSE, Type="A", Size=136750, week_number=25, Temperature= 60.09, Fuel_Price=3.361, CPI=171.2, Unemployment=7.960, year=2012, month=6, day=1:7)
prediccion_promedio <- predict(regresion, datos_nuevos2)
predict(regresion, datos_nuevos2)
## 1 2 3 4 5 6 7
## 17609.05 17480.96 17352.87 17224.78 17096.69 16968.60 16840.51
#Conclusión: Algunos de los datos mas importantes que hemos recopilado son:
Una posible estrategia a seguir para las tiendas es realizar estrategias de marketing durante las temporadas de calor, para poder atraer aun más clientes a sus tiendas. Esto por medio de promociones o productos especiales.