#1 Cargar paquetes necesarios
#2 Cargar base de datos
df <- read.csv("Datta/Sample - Superstore.csv")
head(df)
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 1 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520
## 2 2 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520
## 3 3 CA-2016-138688 6/12/2016 6/16/2016 Second Class DV-13045
## 4 4 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335
## 5 5 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335
## 6 6 CA-2014-115812 6/9/2014 6/14/2014 Standard Class BH-11710
## Customer.Name Segment Country City State
## 1 Claire Gute Consumer United States Henderson Kentucky
## 2 Claire Gute Consumer United States Henderson Kentucky
## 3 Darrin Van Huff Corporate United States Los Angeles California
## 4 Sean O'Donnell Consumer United States Fort Lauderdale Florida
## 5 Sean O'Donnell Consumer United States Fort Lauderdale Florida
## 6 Brosina Hoffman Consumer United States Los Angeles California
## Postal.Code Region Product.ID Category Sub.Category
## 1 42420 South FUR-BO-10001798 Furniture Bookcases
## 2 42420 South FUR-CH-10000454 Furniture Chairs
## 3 90036 West OFF-LA-10000240 Office Supplies Labels
## 4 33311 South FUR-TA-10000577 Furniture Tables
## 5 33311 South OFF-ST-10000760 Office Supplies Storage
## 6 90032 West FUR-FU-10001487 Furniture Furnishings
## Product.Name Sales
## 1 Bush Somerset Collection Bookcase 261.9600
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.9400
## 3 Self-Adhesive Address Labels for Typewriters by Universal 14.6200
## 4 Bretford CR4500 Series Slim Rectangular Table 957.5775
## 5 Eldon Fold 'N Roll Cart System 22.3680
## 6 Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood 48.8600
## Quantity Discount Profit
## 1 2 0.00 41.9136
## 2 3 0.00 219.5820
## 3 2 0.00 6.8714
## 4 5 0.45 -383.0310
## 5 2 0.20 2.5164
## 6 7 0.00 14.1694
#Nota :la prediccion que va,os hacer concidera valores corrientes
df_mod <- df %>% # Llama a la base de datos
filter(State == "Indiana") %>%
select(Order.Date, Sales) %>% #Selecciona dos columnas Order.Data y Sales
mutate(Order.Date = mdy(Order.Date)) %>% # Cambia formato a a fechas
mutate(Fecha = floor_date(Order.Date, unit = "month")) %>% # Asigna las fechas a primero de mes
group_by(Fecha) %>%
summarise(Ventas = sum(Sales)) %>%
mutate(Mes = seq_along(Fecha)) # Crear variable x para facilitar la regresion
head(df_mod)
## # A tibble: 6 × 3
## Fecha Ventas Mes
## <date> <dbl> <int>
## 1 2014-01-01 5.94 1
## 2 2014-02-01 82.6 2
## 3 2014-03-01 16.3 3
## 4 2014-04-01 32.4 4
## 5 2014-06-01 648. 5
## 6 2014-10-01 464. 6
df_mod %>%
ggplot(., aes(x=Mes, y=Ventas)) +
geom_point() +
geom_line() +
geom_smooth(method = "lm", formula = "y ~ x")
#5 Grafica de caja y bigotes
# Esta grafica me sirve para dtectar valores atipicos valores extremos
df_mod %>%
ggplot(., aes(x=Ventas)) +
geom_boxplot()
#6 Analisis de normalidad
#Revisar que la variable y tenga una distribusion mas o menos normal
df_mod %>%
ggplot(., aes(x = Ventas)) +
geom_histogram(bins = 30, aes(y=..density..)) +
geom_density()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#7 Analisis de correlacion
#Calculamos el coeficiente de correlacion de Pearson
with(df_mod, cor.test(Mes, Ventas))
##
## Pearson's product-moment correlation
##
## data: Mes and Ventas
## t = 1.7929, df = 34, p-value = 0.08188
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.0383295 0.5676329
## sample estimates:
## cor
## 0.2939073
# La correlacion va de -1 a 1
# Valores extremos indican correlacion mas fuerte
# Puede ser positiva (suben o bajan juntas) o
# negativa (si una sube la otra baja y viseversa)
#8 Modelo de regresion
modelo_lineal <- lm(Ventas ~ Mes, data = df_mod)
stargazer(modelo_lineal, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## Ventas
## -----------------------------------------------
## Mes 88.177*
## (49.180)
##
## Constant -143.623
## (1,043.456)
##
## -----------------------------------------------
## Observations 36
## R2 0.086
## Adjusted R2 0.060
## Residual Std. Error 3,065.369 (df = 34)
## F Statistic 3.215* (df = 1; 34)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# El modelo que buscamos estimar es: y= a+bx
# En este caso
# Consta =a
# Pendiente = b
# Mes = x
# Obtener los coeficientes del modelo es decir a y b
coeficientes <- coef(modelo_lineal)
intercepto <- coeficientes["(Intercept)"]
pendiente <- coeficientes["Mes"]
# Crear el texto de la ecuacion
ecuaucion <- paste0("y = ",
round(intercepto, 2),
" + ",
round(pendiente, 2),
"x")
ecuaucion
## [1] "y = -143.62 + 88.18x"
# Vamos a extender la serie temporal a 6 mese
df_futuro <- data.frame(Fecha = seq.Date(from = max(df_mod$Fecha) + months(1),
by = "month",
length.out = 6),
Mes = seq(from = 49, to = 54, by = 1))
# Generar las predicciones para 6 mese con intervalos de confianza
predicciones <- predict(modelo_lineal,
newdata = df_futuro,
interval = "confidence")
predicciones
## fit lwr upr
## 1 4177.044 956.7400 7397.347
## 2 4265.220 950.1519 7580.289
## 3 4353.397 943.2680 7763.527
## 4 4441.574 936.1123 7947.036
## 5 4529.751 928.7065 8130.796
## 6 4617.928 921.0699 8314.786
# Convertir a data frame
df_predicciones <- as.data.frame(predicciones)
colnames(df_predicciones) <- c("Ventas", "Bajo", "Alto")
# Unir las predicciones con las fechas
df_futuro <- cbind(df_futuro, df_predicciones) #cbind unir por columnas
# Unir con la base de datos original
df_total <- bind_rows(df_mod, df_futuro)
tail(df_total,7)
## # A tibble: 7 × 5
## Fecha Ventas Mes Bajo Alto
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2017-12-01 856. 36 NA NA
## 2 2018-01-01 4177. 49 957. 7397.
## 3 2018-02-01 4265. 50 950. 7580.
## 4 2018-03-01 4353. 51 943. 7764.
## 5 2018-04-01 4442. 52 936. 7947.
## 6 2018-05-01 4530. 53 929. 8131.
## 7 2018-06-01 4618. 54 921. 8315.
df_total %>%
ggplot(., aes(x = Fecha, y = Ventas)) +
geom_point(data = df_mod) +
geom_line(data = df_mod) +
geom_smooth(method = "lm", formula = "y ~ x", color ="blue", data = df_mod) +
geom_ribbon(data = df_futuro, aes(ymin = Bajo, ymax = Alto),
fill = "lightblue") +
geom_point(data = df_futuro, aes(y = Ventas), color = "red") +
geom_line(data = df_futuro, aes(y = Ventas), color = "red",linetype ="dashed") +
labs(title = "Predicciones de ventas enero-junio 2018",
x = "Mes",
y = "Ventas")