# Packages ----
# dplyr provides `%>%`, `mutate()` and `count()`, which this script relies on.
# install.packages("dplyr")
library(dplyr)
# NOTE(review): dbplyr is dplyr's database backend and nothing visible in this
# script talks to a database; it is kept only because the original attached
# it. Remove it if nothing else needs it.
library(dbplyr)

# Raw inputs ----
# All four CSV files live in the same folder; change `data_dir` to run the
# script on another machine.
data_dir <- "/Users/joseramonvazquezguzman/Documents/BDWAL"
stores <- read.csv(file.path(data_dir, "stores (2).csv"))
features <- read.csv(file.path(data_dir, "features (6).csv"))
test <- read.csv(file.path(data_dir, "test (2).csv"))
train <- read.csv(file.path(data_dir, "train (2).csv"))
# Print per-column summary statistics for the raw `stores` table.
summary(stores)
## Store Type Size
## Min. : 1 Length:45 Min. : 34875
## 1st Qu.:12 Class :character 1st Qu.: 70713
## Median :23 Mode :character Median :126512
## Mean :23 Mean :130288
## 3rd Qu.:34 3rd Qu.:202307
## Max. :45 Max. :219622
# count(stores, Type, sort = TRUE)
# Print the structure of `stores`: dimensions, column types and first values.
str(stores)
## 'data.frame': 45 obs. of 3 variables:
## $ Store: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Type : chr "A" "A" "B" "A" ...
## $ Size : int 151315 202307 37392 205863 34875 202505 70713 155078 125833 126512 ...
# Print per-column summary statistics for the raw `features` table; the
# output also reports how many NA values each numeric column contains.
summary(features)
## Store Date Temperature Fuel_Price
## Min. : 1 Length:8190 Min. : -7.29 Min. :2.472
## 1st Qu.:12 Class :character 1st Qu.: 45.90 1st Qu.:3.041
## Median :23 Mode :character Median : 60.71 Median :3.513
## Mean :23 Mean : 59.36 Mean :3.406
## 3rd Qu.:34 3rd Qu.: 73.88 3rd Qu.:3.743
## Max. :45 Max. :101.95 Max. :4.468
##
## MarkDown1 MarkDown2 MarkDown3 MarkDown4
## Min. : -2781 Min. : -265.76 Min. : -179.26 Min. : 0.22
## 1st Qu.: 1578 1st Qu.: 68.88 1st Qu.: 6.60 1st Qu.: 304.69
## Median : 4744 Median : 364.57 Median : 36.26 Median : 1176.42
## Mean : 7032 Mean : 3384.18 Mean : 1760.10 Mean : 3292.94
## 3rd Qu.: 8923 3rd Qu.: 2153.35 3rd Qu.: 163.15 3rd Qu.: 3310.01
## Max. :103185 Max. :104519.54 Max. :149483.31 Max. :67474.85
## NA's :4158 NA's :5269 NA's :4577 NA's :4726
## MarkDown5 CPI Unemployment IsHoliday
## Min. : -185.2 Min. :126.1 Min. : 3.684 Mode :logical
## 1st Qu.: 1440.8 1st Qu.:132.4 1st Qu.: 6.634 FALSE:7605
## Median : 2727.1 Median :182.8 Median : 7.806 TRUE :585
## Mean : 4132.2 Mean :172.5 Mean : 7.827
## 3rd Qu.: 4832.6 3rd Qu.:213.9 3rd Qu.: 8.567
## Max. :771448.1 Max. :229.0 Max. :14.313
## NA's :4140 NA's :585 NA's :585
# Print the structure of `features`: dimensions, column types and first values.
str(features)
## 'data.frame': 8190 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr "05/02/2010" "12/02/2010" "19/02/2010" "26/02/2010" ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ MarkDown1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ IsHoliday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
# Print per-column summary statistics for the raw `test` table.
summary(test)
## Store Dept Date IsHoliday
## Min. : 1.00 Min. : 1.00 Length:115064 Mode :logical
## 1st Qu.:11.00 1st Qu.:18.00 Class :character FALSE:106136
## Median :22.00 Median :37.00 Mode :character TRUE :8928
## Mean :22.24 Mean :44.34
## 3rd Qu.:33.00 3rd Qu.:74.00
## Max. :45.00 Max. :99.00
# Print the structure of `test`: dimensions, column types and first values.
str(test)
## 'data.frame': 115064 obs. of 4 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr "02/11/2012" "09/11/2012" "16/11/2012" "23/11/2012" ...
## $ IsHoliday: logi FALSE FALSE FALSE TRUE FALSE FALSE ...
# Print per-column summary statistics for the raw `train` table, including
# the range of the response variable Weekly_Sales.
summary(train)
## Store Dept Date Weekly_Sales
## Min. : 1.0 Min. : 1.00 Length:421570 Min. : -4989
## 1st Qu.:11.0 1st Qu.:18.00 Class :character 1st Qu.: 2080
## Median :22.0 Median :37.00 Mode :character Median : 7612
## Mean :22.2 Mean :44.26 Mean : 15981
## 3rd Qu.:33.0 3rd Qu.:74.00 3rd Qu.: 20206
## Max. :45.0 Max. :99.00 Max. :693099
## IsHoliday
## Mode :logical
## FALSE:391909
## TRUE :29661
##
##
##
# Print the structure of `train`: dimensions, column types and first values.
str(train)
## 'data.frame': 421570 obs. of 5 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr "05/02/2010" "12/02/2010" "19/02/2010" "26/02/2010" ...
## $ Weekly_Sales: num 24924 46039 41596 19404 21828 ...
## $ IsHoliday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
# Build the modelling table ----
# 1) Attach the store attributes (Type, Size) to every training row.
bd <- merge(train, stores, by = "Store")

# 2) Attach the weekly store-level features. The join keys are spelled out
#    (they are exactly the columns the two tables share) so that a column
#    added to either table later cannot silently become part of the key.
bd1 <- merge(bd, features, by = c("Store", "Date", "IsHoliday"))

# merge() performs an inner join by default: fail loudly if it dropped or
# duplicated training rows instead of fitting the model on a different sample.
stopifnot(nrow(bd1) == nrow(train))

# 3) Drop the five MarkDown columns (MarkDown1 ... MarkDown5).
bd2 <- bd1[, !(names(bd1) %in% paste0("MarkDown", 1:5)), drop = FALSE]
summary(bd2)
## Store Date IsHoliday Dept
## Min. : 1.0 Length:421570 Mode :logical Min. : 1.00
## 1st Qu.:11.0 Class :character FALSE:391909 1st Qu.:18.00
## Median :22.0 Mode :character TRUE :29661 Median :37.00
## Mean :22.2 Mean :44.26
## 3rd Qu.:33.0 3rd Qu.:74.00
## Max. :45.0 Max. :99.00
## Weekly_Sales Type Size Temperature
## Min. : -4989 Length:421570 Min. : 34875 Min. : -2.06
## 1st Qu.: 2080 Class :character 1st Qu.: 93638 1st Qu.: 46.68
## Median : 7612 Mode :character Median :140167 Median : 62.09
## Mean : 15981 Mean :136728 Mean : 60.09
## 3rd Qu.: 20206 3rd Qu.:202505 3rd Qu.: 74.28
## Max. :693099 Max. :219622 Max. :100.14
## Fuel_Price CPI Unemployment
## Min. :2.472 Min. :126.1 Min. : 3.879
## 1st Qu.:2.933 1st Qu.:132.0 1st Qu.: 6.891
## Median :3.452 Median :182.3 Median : 7.866
## Mean :3.361 Mean :171.2 Mean : 7.960
## 3rd Qu.:3.738 3rd Qu.:212.4 3rd Qu.: 8.572
## Max. :4.468 Max. :227.2 Max. :14.313
# Parse Date from "dd/mm/yyyy" text into a Date column ----
# as.Date() returns NA, without a warning, for strings that do not match the
# format, so verify that parsing introduced no new missing values.
bd2$Date <- local({
  parsed <- as.Date(bd2$Date, format = "%d/%m/%Y")
  stopifnot(sum(is.na(parsed)) == sum(is.na(bd2$Date)))
  parsed
})
str(bd2)
## 'data.frame': 421570 obs. of 11 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2011-04-01" "2011-04-01" ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Dept : int 49 26 81 34 59 30 7 85 8 28 ...
## $ Weekly_Sales: num 13168 5947 28545 9950 317 ...
## $ Type : chr "A" "A" "A" "A" ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Temperature : num 59.2 59.2 59.2 59.2 59.2 ...
## $ Fuel_Price : num 3.52 3.52 3.52 3.52 3.52 ...
## $ CPI : num 215 215 215 215 215 ...
## $ Unemployment: num 7.68 7.68 7.68 7.68 7.68 ...
# install.packages("wordspace")
library(wordspace)
## Loading required package: Matrix
# Tally positive, zero and negative weekly sales with base R. This yields the
# same three named counts (pos, zero, neg) without relying on the wordspace
# package for a one-line tally.
c(
  pos = sum(bd2$Weekly_Sales > 0, na.rm = TRUE),
  zero = sum(bd2$Weekly_Sales == 0, na.rm = TRUE),
  neg = sum(bd2$Weekly_Sales < 0, na.rm = TRUE)
)
## pos zero neg
## 420212 73 1285
# Keep only rows with strictly positive weekly sales ----
# This removes the zero-sales rows as well as the negative ones. which() is
# used so that a missing Weekly_Sales drops the row; a logical index that
# contains NA would instead insert an all-NA row.
bd3 <- bd2[which(bd2$Weekly_Sales > 0), ]
summary(bd3)
## Store Date IsHoliday Dept
## Min. : 1.0 Min. :2010-02-05 Mode :logical Min. : 1.00
## 1st Qu.:11.0 1st Qu.:2010-10-08 FALSE:390652 1st Qu.:18.00
## Median :22.0 Median :2011-06-17 TRUE :29560 Median :37.00
## Mean :22.2 Mean :2011-06-18 Mean :44.24
## 3rd Qu.:33.0 3rd Qu.:2012-02-24 3rd Qu.:74.00
## Max. :45.0 Max. :2012-10-26 Max. :99.00
## Weekly_Sales Type Size Temperature
## Min. : 0 Length:420212 Min. : 34875 Min. : -2.06
## 1st Qu.: 2120 Class :character 1st Qu.: 93638 1st Qu.: 46.68
## Median : 7662 Mode :character Median :140167 Median : 62.09
## Mean : 16033 Mean :136750 Mean : 60.09
## 3rd Qu.: 20271 3rd Qu.:202505 3rd Qu.: 74.28
## Max. :693099 Max. :219622 Max. :100.14
## Fuel_Price CPI Unemployment
## Min. :2.472 Min. :126.1 Min. : 3.879
## 1st Qu.:2.933 1st Qu.:132.0 1st Qu.: 6.891
## Median :3.452 Median :182.4 Median : 7.866
## Mean :3.361 Mean :171.2 Mean : 7.960
## 3rd Qu.:3.738 3rd Qu.:212.4 3rd Qu.: 8.567
## Max. :4.468 Max. :227.2 Max. :14.313
# Add the week of the year ----
# "%V" is the ISO 8601 week number; strftime() returns it as text, so
# week_number is a character column at this point.
bd4 <- bd3
bd4[["week_number"]] <- strftime(bd4[["Date"]], format = "%V")
str(bd4)
## 'data.frame': 420212 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2011-04-01" "2011-04-01" ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Dept : int 49 26 81 34 59 30 7 85 8 28 ...
## $ Weekly_Sales: num 13168 5947 28545 9950 317 ...
## $ Type : chr "A" "A" "A" "A" ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Temperature : num 59.2 59.2 59.2 59.2 59.2 ...
## $ Fuel_Price : num 3.52 3.52 3.52 3.52 3.52 ...
## $ CPI : num 215 215 215 215 215 ...
## $ Unemployment: num 7.68 7.68 7.68 7.68 7.68 ...
## $ week_number : chr "13" "13" "13" "13" ...
# Convert the week number from text (e.g. "13") to an integer so it can be
# used as a numeric predictor.
bd4[["week_number"]] <- as.integer(bd4[["week_number"]])
str(bd4)
## 'data.frame': 420212 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2011-04-01" "2011-04-01" ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Dept : int 49 26 81 34 59 30 7 85 8 28 ...
## $ Weekly_Sales: num 13168 5947 28545 9950 317 ...
## $ Type : chr "A" "A" "A" "A" ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Temperature : num 59.2 59.2 59.2 59.2 59.2 ...
## $ Fuel_Price : num 3.52 3.52 3.52 3.52 3.52 ...
## $ CPI : num 215 215 215 215 215 ...
## $ Unemployment: num 7.68 7.68 7.68 7.68 7.68 ...
## $ week_number : int 13 13 13 13 13 13 13 13 13 13 ...
# Print per-column summary statistics of the cleaned table, now including the
# integer week_number column.
summary(bd4)
## Store Date IsHoliday Dept
## Min. : 1.0 Min. :2010-02-05 Mode :logical Min. : 1.00
## 1st Qu.:11.0 1st Qu.:2010-10-08 FALSE:390652 1st Qu.:18.00
## Median :22.0 Median :2011-06-17 TRUE :29560 Median :37.00
## Mean :22.2 Mean :2011-06-18 Mean :44.24
## 3rd Qu.:33.0 3rd Qu.:2012-02-24 3rd Qu.:74.00
## Max. :45.0 Max. :2012-10-26 Max. :99.00
## Weekly_Sales Type Size Temperature
## Min. : 0 Length:420212 Min. : 34875 Min. : -2.06
## 1st Qu.: 2120 Class :character 1st Qu.: 93638 1st Qu.: 46.68
## Median : 7662 Mode :character Median :140167 Median : 62.09
## Mean : 16033 Mean :136750 Mean : 60.09
## 3rd Qu.: 20271 3rd Qu.:202505 3rd Qu.: 74.28
## Max. :693099 Max. :219622 Max. :100.14
## Fuel_Price CPI Unemployment week_number
## Min. :2.472 Min. :126.1 Min. : 3.879 Min. : 1.00
## 1st Qu.:2.933 1st Qu.:132.0 1st Qu.: 6.891 1st Qu.:14.00
## Median :3.452 Median :182.4 Median : 7.866 Median :26.00
## Mean :3.361 Mean :171.2 Mean : 7.960 Mean :25.83
## 3rd Qu.:3.738 3rd Qu.:212.4 3rd Qu.: 8.567 3rd Qu.:38.00
## Max. :4.468 Max. :227.2 Max. :14.313 Max. :52.00
# Split Date into year, month and day-of-month columns ----
# mutate() is called directly rather than through `%>%`: the pipe is not part
# of base R and only exists while a package that exports it is attached.
bd5 <- dplyr::mutate(
  bd4,
  year = lubridate::year(Date),
  month = lubridate::month(Date),
  day = lubridate::day(Date)
)
# Fit a linear regression of weekly sales on store, department, calendar and
# economic variables, then print the coefficient table and fit statistics.
# NOTE(review): Store and Dept are integer IDs, so they enter the model as
# linear numeric effects; if they are meant as categories, wrap them in
# factor() — confirm the intent.
# NOTE(review): week_number, month and day are all derived from Date and are
# likely strongly correlated — confirm that all three are needed.
regresion <- lm(
  Weekly_Sales ~ Store + Dept + IsHoliday + Type + Size + Temperature +
    Fuel_Price + CPI + Unemployment + week_number + year + month + day,
  data = bd5
)
summary(regresion)
##
## Call:
## lm(formula = Weekly_Sales ~ Store + Dept + IsHoliday + Type +
## Size + Temperature + Fuel_Price + CPI + Unemployment + week_number +
## year + month + day, data = bd5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34331 -12895 -5852 5626 671540
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.110e+06 2.999e+05 3.701 0.000214 ***
## Store -1.426e+02 3.087e+00 -46.198 < 2e-16 ***
## Dept 1.108e+02 1.097e+00 101.013 < 2e-16 ***
## IsHolidayTRUE 8.511e+02 1.391e+02 6.119 9.45e-10 ***
## TypeB -3.133e+02 1.078e+02 -2.908 0.003642 **
## TypeC 5.836e+03 1.840e+02 31.709 < 2e-16 ***
## Size 9.920e-02 9.584e-04 103.511 < 2e-16 ***
## Temperature 3.701e+00 2.133e+00 1.735 0.082688 .
## Fuel_Price 4.791e+02 1.480e+02 3.237 0.001207 **
## CPI -2.340e+01 9.996e-01 -23.409 < 2e-16 ***
## Unemployment -2.538e+02 2.062e+01 -12.308 < 2e-16 ***
## week_number 7.678e+02 4.566e+02 1.682 0.092648 .
## year -5.485e+02 1.485e+02 -3.695 0.000220 ***
## month -3.167e+03 1.988e+03 -1.594 0.111036
## day -1.281e+02 6.539e+01 -1.959 0.050115 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21690 on 420197 degrees of freedom
## Multiple R-squared: 0.08982, Adjusted R-squared: 0.08979
## F-statistic: 2962 on 14 and 420197 DF, p-value: < 2.2e-16
# Predict weekly sales for a single hypothetical store/department/date ----
# NOTE(review): week_number = 1 is paired with the date 2012-01-01, but the
# training column is built with strftime(Date, "%V"), which gives 52 for that
# date — confirm which value is intended.
datos_nuevos <- data.frame(
  Store = 1,
  IsHoliday = TRUE,
  Dept = 1,
  Type = "A",
  Size = 151315,
  week_number = 1,
  Temperature = 59.17,
  Fuel_Price = 3.524,
  CPI = 214.8372,
  Unemployment = 7.682,
  year = 2012,
  month = 1,
  day = 1
)
predict(regresion, newdata = datos_nuevos)
## 1
## 14667.94
# Crear estrategias de ventas con base en las festividades para aumentar las ventas semanales, e involucrar al área de marketing con promociones que atraigan tanto a los consumidores frecuentes como a los no frecuentes, considerando también la cantidad de dinero que gastan en la tienda en sus visitas.
# Se realizó una limpieza de la base de datos para poder desarrollar una regresión lineal; esta nos permite conocer el comportamiento de la variable dependiente que buscamos analizar a partir de otras variables independientes. Se trabajaron 4 bases de datos. Finalmente, concluimos que, después de realizar el análisis, se pudieron pronosticar las ventas semanales con la magnitud que una empresa como Walmart puede tener y, con base en estas, plantear mejores estrategias que la lleven a mantener al cliente consumiendo constantemente, quedarse en el subconsciente de las personas y mantener o mejorar sus ventas.