# Importar bases de datos
# file.choose()

stores <- read.csv("C:\\Users\\raulc\\OneDrive\\Escritorio\\stores.csv")
features <- read.csv("C:\\Users\\raulc\\OneDrive\\Escritorio\\features.csv")
train <- read.csv("C:\\Users\\raulc\\OneDrive\\Escritorio\\train.csv")
test <- read.csv("C:\\Users\\raulc\\OneDrive\\Escritorio\\test.csv")

# Instalar paquetes y llamar librerias
# install.packages("dplyr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Entender las bases de datos
summary(stores)
##      Store        Type                Size       
##  Min.   : 1   Length:45          Min.   : 34875  
##  1st Qu.:12   Class :character   1st Qu.: 70713  
##  Median :23   Mode  :character   Median :126512  
##  Mean   :23                      Mean   :130288  
##  3rd Qu.:34                      3rd Qu.:202307  
##  Max.   :45                      Max.   :219622
count(stores, Type, sort = TRUE)
##   Type  n
## 1    A 22
## 2    B 17
## 3    C  6
str(stores)
## 'data.frame':    45 obs. of  3 variables:
##  $ Store: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Type : chr  "A" "A" "B" "A" ...
##  $ Size : int  151315 202307 37392 205863 34875 202505 70713 155078 125833 126512 ...
summary(features)
##      Store        Date            Temperature       Fuel_Price   
##  Min.   : 1   Length:8190        Min.   : -7.29   Min.   :2.472  
##  1st Qu.:12   Class :character   1st Qu.: 45.90   1st Qu.:3.041  
##  Median :23   Mode  :character   Median : 60.71   Median :3.513  
##  Mean   :23                      Mean   : 59.36   Mean   :3.406  
##  3rd Qu.:34                      3rd Qu.: 73.88   3rd Qu.:3.743  
##  Max.   :45                      Max.   :101.95   Max.   :4.468  
##                                                                  
##    MarkDown1        MarkDown2           MarkDown3           MarkDown4       
##  Min.   : -2781   Min.   :  -265.76   Min.   :  -179.26   Min.   :    0.22  
##  1st Qu.:  1578   1st Qu.:    68.88   1st Qu.:     6.60   1st Qu.:  304.69  
##  Median :  4744   Median :   364.57   Median :    36.26   Median : 1176.42  
##  Mean   :  7032   Mean   :  3384.18   Mean   :  1760.10   Mean   : 3292.94  
##  3rd Qu.:  8923   3rd Qu.:  2153.35   3rd Qu.:   163.15   3rd Qu.: 3310.01  
##  Max.   :103185   Max.   :104519.54   Max.   :149483.31   Max.   :67474.85  
##  NA's   :4158     NA's   :5269        NA's   :4577        NA's   :4726      
##    MarkDown5             CPI         Unemployment    IsHoliday      
##  Min.   :  -185.2   Min.   :126.1   Min.   : 3.684   Mode :logical  
##  1st Qu.:  1440.8   1st Qu.:132.4   1st Qu.: 6.634   FALSE:7605     
##  Median :  2727.1   Median :182.8   Median : 7.806   TRUE :585      
##  Mean   :  4132.2   Mean   :172.5   Mean   : 7.827                  
##  3rd Qu.:  4832.6   3rd Qu.:213.9   3rd Qu.: 8.567                  
##  Max.   :771448.1   Max.   :229.0   Max.   :14.313                  
##  NA's   :4140       NA's   :585     NA's   :585
str(features)
## 'data.frame':    8190 obs. of  12 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : chr  "05/02/2010" "12/02/2010" "19/02/2010" "26/02/2010" ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ MarkDown1   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown2   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown3   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown4   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown5   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...
##  $ IsHoliday   : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...
summary(test)
##      Store            Dept           Date           IsHoliday      
##  Min.   : 1.00   Min.   : 1.00   Length:115064      Mode :logical  
##  1st Qu.:11.00   1st Qu.:18.00   Class :character   FALSE:106136   
##  Median :22.00   Median :37.00   Mode  :character   TRUE :8928     
##  Mean   :22.24   Mean   :44.34                                     
##  3rd Qu.:33.00   3rd Qu.:74.00                                     
##  Max.   :45.00   Max.   :99.00
str(test)
## 'data.frame':    115064 obs. of  4 variables:
##  $ Store    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Dept     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date     : chr  "02/11/2012" "09/11/2012" "16/11/2012" "23/11/2012" ...
##  $ IsHoliday: logi  FALSE FALSE FALSE TRUE FALSE FALSE ...
summary(train)
##      Store           Dept           Date            Weekly_Sales   
##  Min.   : 1.0   Min.   : 1.00   Length:421570      Min.   : -4989  
##  1st Qu.:11.0   1st Qu.:18.00   Class :character   1st Qu.:  2080  
##  Median :22.0   Median :37.00   Mode  :character   Median :  7612  
##  Mean   :22.2   Mean   :44.26                      Mean   : 15981  
##  3rd Qu.:33.0   3rd Qu.:74.00                      3rd Qu.: 20206  
##  Max.   :45.0   Max.   :99.00                      Max.   :693099  
##  IsHoliday      
##  Mode :logical  
##  FALSE:391909   
##  TRUE :29661    
##                 
##                 
## 
str(train)
## 'data.frame':    421570 obs. of  5 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Dept        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : chr  "05/02/2010" "12/02/2010" "19/02/2010" "26/02/2010" ...
##  $ Weekly_Sales: num  24925 46039 41596 19404 21828 ...
##  $ IsHoliday   : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...
# Observaciones 
# 1. "features", "test" y "train" tienen la fecha como caracter. R: Corregido.
# 2. "features", hay NA's en más de la mitad de los registros de MarkDown (del 1 al 5). R: Corregido.
# 3. En "features", hay 585 NA's en CPI y Unemployment, y hay 585 registros de IsHoliday = TRUE ¿Tiene relación? R: Sin Relación, y Corregido.
# 4. "train" tiene ventas semanales negativas. R: Se eliminaron.

# Herramienta "El Generador de Valor de Datos"
# Paso 1. Definir el área del negocio que buscamos impactar o mejorar y su KPI.
# El departamento de mercadotecnia de EUA (con muestra de 45 tiendas) en el indicador de Ventas Semanales.
# Paso 2. Seleccionar plantilla (-s) para crear valor a partir de los datos de los clientes.
# **Visión** | Segmentación | Personalización | Contextualización
# Paso 3. Generar ideas o conceptos específicos.
# Elaborar un modelo predictivo de ventas semanales.
# Paso 4. Reunir los datos requeridos.
# Elaborar una base de datos con la variable dependiente (Ventas Semanales) y las variables independientes (TBD)
# Paso 5. Plan de ejecución.
# Mercadotecnia elaborará plan para desplegar modelo predictivo en fases:
# Fase 1 . Piloto (San Antonio, Tx).
# Fase 2. Texas
# Fase 3. EUA
# Sistemas asegurará la captura del Markdown en las bases de datos.

# Creación de la base de datos consolidada

# Agregar "STORES" a "TRAIN"
bd <- merge(train, stores, by= "Store")

# Agregar "FEATURES" a "BD"
bd1 <- bd
bd1 <- merge(bd1, features)

# Eliminar columnas 
bd2 <- bd1
str(bd2)
## 'data.frame':    421570 obs. of  16 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : chr  "01/04/2011" "01/04/2011" "01/04/2011" "01/04/2011" ...
##  $ IsHoliday   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Dept        : int  49 26 81 34 59 30 7 85 8 28 ...
##  $ Weekly_Sales: num  13168 5947 28545 9950 317 ...
##  $ Type        : chr  "A" "A" "A" "A" ...
##  $ Size        : int  151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
##  $ Temperature : num  59.2 59.2 59.2 59.2 59.2 ...
##  $ Fuel_Price  : num  3.52 3.52 3.52 3.52 3.52 ...
##  $ MarkDown1   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown2   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown3   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown4   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown5   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CPI         : num  215 215 215 215 215 ...
##  $ Unemployment: num  7.68 7.68 7.68 7.68 7.68 ...
bd2 <- subset(bd2, select = -c (MarkDown1:MarkDown5))
summary(bd2)
##      Store          Date           IsHoliday            Dept      
##  Min.   : 1.0   Length:421570      Mode :logical   Min.   : 1.00  
##  1st Qu.:11.0   Class :character   FALSE:391909    1st Qu.:18.00  
##  Median :22.0   Mode  :character   TRUE :29661     Median :37.00  
##  Mean   :22.2                                      Mean   :44.26  
##  3rd Qu.:33.0                                      3rd Qu.:74.00  
##  Max.   :45.0                                      Max.   :99.00  
##   Weekly_Sales        Type                Size         Temperature    
##  Min.   : -4989   Length:421570      Min.   : 34875   Min.   : -2.06  
##  1st Qu.:  2080   Class :character   1st Qu.: 93638   1st Qu.: 46.68  
##  Median :  7612   Mode  :character   Median :140167   Median : 62.09  
##  Mean   : 15981                      Mean   :136728   Mean   : 60.09  
##  3rd Qu.: 20206                      3rd Qu.:202505   3rd Qu.: 74.28  
##  Max.   :693099                      Max.   :219622   Max.   :100.14  
##    Fuel_Price         CPI         Unemployment   
##  Min.   :2.472   Min.   :126.1   Min.   : 3.879  
##  1st Qu.:2.933   1st Qu.:132.0   1st Qu.: 6.891  
##  Median :3.452   Median :182.3   Median : 7.866  
##  Mean   :3.361   Mean   :171.2   Mean   : 7.960  
##  3rd Qu.:3.738   3rd Qu.:212.4   3rd Qu.: 8.572  
##  Max.   :4.468   Max.   :227.2   Max.   :14.313
# Cambiar formato de fecha
bd2$Date <- as.Date(bd2$Date, format = "%d/%m/%Y")
str(bd2)
## 'data.frame':    421570 obs. of  11 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2011-04-01" "2011-04-01" ...
##  $ IsHoliday   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Dept        : int  49 26 81 34 59 30 7 85 8 28 ...
##  $ Weekly_Sales: num  13168 5947 28545 9950 317 ...
##  $ Type        : chr  "A" "A" "A" "A" ...
##  $ Size        : int  151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
##  $ Temperature : num  59.2 59.2 59.2 59.2 59.2 ...
##  $ Fuel_Price  : num  3.52 3.52 3.52 3.52 3.52 ...
##  $ CPI         : num  215 215 215 215 215 ...
##  $ Unemployment: num  7.68 7.68 7.68 7.68 7.68 ...
# install.packages("wordspace")
library(wordspace)
## Loading required package: Matrix
signcount(bd2$Weekly_Sales)
##    pos   zero    neg 
## 420212     73   1285
# Eliminar Ventas menores que 0
bd3 <- bd2
bd3 <- bd3[bd3$Weekly_Sales > 0, ]
summary (bd3)
##      Store           Date            IsHoliday            Dept      
##  Min.   : 1.0   Min.   :2010-02-05   Mode :logical   Min.   : 1.00  
##  1st Qu.:11.0   1st Qu.:2010-10-08   FALSE:390652    1st Qu.:18.00  
##  Median :22.0   Median :2011-06-17   TRUE :29560     Median :37.00  
##  Mean   :22.2   Mean   :2011-06-18                   Mean   :44.24  
##  3rd Qu.:33.0   3rd Qu.:2012-02-24                   3rd Qu.:74.00  
##  Max.   :45.0   Max.   :2012-10-26                   Max.   :99.00  
##   Weekly_Sales        Type                Size         Temperature    
##  Min.   :     0   Length:420212      Min.   : 34875   Min.   : -2.06  
##  1st Qu.:  2120   Class :character   1st Qu.: 93638   1st Qu.: 46.68  
##  Median :  7662   Mode  :character   Median :140167   Median : 62.09  
##  Mean   : 16033                      Mean   :136750   Mean   : 60.09  
##  3rd Qu.: 20271                      3rd Qu.:202505   3rd Qu.: 74.28  
##  Max.   :693099                      Max.   :219622   Max.   :100.14  
##    Fuel_Price         CPI         Unemployment   
##  Min.   :2.472   Min.   :126.1   Min.   : 3.879  
##  1st Qu.:2.933   1st Qu.:132.0   1st Qu.: 6.891  
##  Median :3.452   Median :182.4   Median : 7.866  
##  Mean   :3.361   Mean   :171.2   Mean   : 7.960  
##  3rd Qu.:3.738   3rd Qu.:212.4   3rd Qu.: 8.567  
##  Max.   :4.468   Max.   :227.2   Max.   :14.313
# Agregar número de la semana
bd4 <- bd3
bd4$week_number <- strftime(bd4$Date, format = "%V")
str(bd4)
## 'data.frame':    420212 obs. of  12 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2011-04-01" "2011-04-01" ...
##  $ IsHoliday   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Dept        : int  49 26 81 34 59 30 7 85 8 28 ...
##  $ Weekly_Sales: num  13168 5947 28545 9950 317 ...
##  $ Type        : chr  "A" "A" "A" "A" ...
##  $ Size        : int  151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
##  $ Temperature : num  59.2 59.2 59.2 59.2 59.2 ...
##  $ Fuel_Price  : num  3.52 3.52 3.52 3.52 3.52 ...
##  $ CPI         : num  215 215 215 215 215 ...
##  $ Unemployment: num  7.68 7.68 7.68 7.68 7.68 ...
##  $ week_number : chr  "13" "13" "13" "13" ...
bd4$week_number <- as.integer(bd4$week_number)
str(bd4)
## 'data.frame':    420212 obs. of  12 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2011-04-01" "2011-04-01" ...
##  $ IsHoliday   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Dept        : int  49 26 81 34 59 30 7 85 8 28 ...
##  $ Weekly_Sales: num  13168 5947 28545 9950 317 ...
##  $ Type        : chr  "A" "A" "A" "A" ...
##  $ Size        : int  151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
##  $ Temperature : num  59.2 59.2 59.2 59.2 59.2 ...
##  $ Fuel_Price  : num  3.52 3.52 3.52 3.52 3.52 ...
##  $ CPI         : num  215 215 215 215 215 ...
##  $ Unemployment: num  7.68 7.68 7.68 7.68 7.68 ...
##  $ week_number : int  13 13 13 13 13 13 13 13 13 13 ...
summary(bd4)
##      Store           Date            IsHoliday            Dept      
##  Min.   : 1.0   Min.   :2010-02-05   Mode :logical   Min.   : 1.00  
##  1st Qu.:11.0   1st Qu.:2010-10-08   FALSE:390652    1st Qu.:18.00  
##  Median :22.0   Median :2011-06-17   TRUE :29560     Median :37.00  
##  Mean   :22.2   Mean   :2011-06-18                   Mean   :44.24  
##  3rd Qu.:33.0   3rd Qu.:2012-02-24                   3rd Qu.:74.00  
##  Max.   :45.0   Max.   :2012-10-26                   Max.   :99.00  
##   Weekly_Sales        Type                Size         Temperature    
##  Min.   :     0   Length:420212      Min.   : 34875   Min.   : -2.06  
##  1st Qu.:  2120   Class :character   1st Qu.: 93638   1st Qu.: 46.68  
##  Median :  7662   Mode  :character   Median :140167   Median : 62.09  
##  Mean   : 16033                      Mean   :136750   Mean   : 60.09  
##  3rd Qu.: 20271                      3rd Qu.:202505   3rd Qu.: 74.28  
##  Max.   :693099                      Max.   :219622   Max.   :100.14  
##    Fuel_Price         CPI         Unemployment     week_number   
##  Min.   :2.472   Min.   :126.1   Min.   : 3.879   Min.   : 1.00  
##  1st Qu.:2.933   1st Qu.:132.0   1st Qu.: 6.891   1st Qu.:14.00  
##  Median :3.452   Median :182.4   Median : 7.866   Median :26.00  
##  Mean   :3.361   Mean   :171.2   Mean   : 7.960   Mean   :25.83  
##  3rd Qu.:3.738   3rd Qu.:212.4   3rd Qu.: 8.567   3rd Qu.:38.00  
##  Max.   :4.468   Max.   :227.2   Max.   :14.313   Max.   :52.00
# Separar Año, Mes y Día
bd5 <- bd4
bd5 <- bd5 %>%
  dplyr::mutate(year = lubridate::year(Date), 
                month = lubridate::month(Date), 
                day = lubridate::day(Date))

# Generar regresión
regresion <- lm(Weekly_Sales ~ Store + Dept + IsHoliday + Type + Size + Temperature + Fuel_Price + CPI + Unemployment + week_number + year + month + day, data=bd5)
summary(regresion)
## 
## Call:
## lm(formula = Weekly_Sales ~ Store + Dept + IsHoliday + Type + 
##     Size + Temperature + Fuel_Price + CPI + Unemployment + week_number + 
##     year + month + day, data = bd5)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -34331 -12895  -5852   5626 671540 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.110e+06  2.999e+05   3.701 0.000214 ***
## Store         -1.426e+02  3.087e+00 -46.198  < 2e-16 ***
## Dept           1.108e+02  1.097e+00 101.013  < 2e-16 ***
## IsHolidayTRUE  8.511e+02  1.391e+02   6.119 9.45e-10 ***
## TypeB         -3.133e+02  1.078e+02  -2.908 0.003642 ** 
## TypeC          5.836e+03  1.840e+02  31.709  < 2e-16 ***
## Size           9.920e-02  9.584e-04 103.511  < 2e-16 ***
## Temperature    3.701e+00  2.133e+00   1.735 0.082688 .  
## Fuel_Price     4.791e+02  1.480e+02   3.237 0.001207 ** 
## CPI           -2.340e+01  9.996e-01 -23.409  < 2e-16 ***
## Unemployment  -2.538e+02  2.062e+01 -12.308  < 2e-16 ***
## week_number    7.678e+02  4.566e+02   1.682 0.092648 .  
## year          -5.485e+02  1.485e+02  -3.695 0.000220 ***
## month         -3.167e+03  1.988e+03  -1.594 0.111036    
## day           -1.281e+02  6.539e+01  -1.959 0.050115 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21690 on 420197 degrees of freedom
## Multiple R-squared:  0.08982,    Adjusted R-squared:  0.08979 
## F-statistic:  2962 on 14 and 420197 DF,  p-value: < 2.2e-16
# Construir un modelo de predicción
datos_nuevos <- data.frame(Store=1, Dept=1, IsHoliday= TRUE, Type= "A", Size= 151315, Temperature = 59.17, Fuel_Price = 3.524, CPI= 214.8372, Unemployment = 7.682, week_number =1, year =2012, month = 1, day=1)
predict(regresion,datos_nuevos)
##        1 
## 14667.94
# Gráfica Iteractiva
barplot(tapply(bd5$Weekly_Sales, format(bd5$Date, "%m"), FUN=mean))

barplot(tapply(bd5$Weekly_Sales, bd5$Type, FUN=mean))