)
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the Walmart weekly-sales CSV into `df` and print per-column summary
# statistics as a first look at the data.
# NOTE(review): the path is absolute and machine-specific, so this only runs
# where that exact file exists.
df <- read.csv(file = "/Users/brisnaordaz/Downloads/walmart.csv")
summary(object = df)
## Store Date Weekly_Sales Holiday_Flag
## Min. : 1 Length:6435 Min. : 209986 Min. :0.00000
## 1st Qu.:12 Class :character 1st Qu.: 553350 1st Qu.:0.00000
## Median :23 Mode :character Median : 960746 Median :0.00000
## Mean :23 Mean :1046965 Mean :0.06993
## 3rd Qu.:34 3rd Qu.:1420159 3rd Qu.:0.00000
## Max. :45 Max. :3818686 Max. :1.00000
## Temperature Fuel_Price CPI Unemployment
## Min. : -2.06 Min. :2.472 Min. :126.1 Min. : 3.879
## 1st Qu.: 47.46 1st Qu.:2.933 1st Qu.:131.7 1st Qu.: 6.891
## Median : 62.67 Median :3.445 Median :182.6 Median : 7.874
## Mean : 60.66 Mean :3.359 Mean :171.6 Mean : 7.999
## 3rd Qu.: 74.94 3rd Qu.:3.735 3rd Qu.:212.7 3rd Qu.: 8.622
## Max. :100.14 Max. :4.468 Max. :227.2 Max. :14.313
# Show column types of the freshly loaded data (note that Date is read as text).
str(object = df)
## 'data.frame': 6435 obs. of 8 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr "05-02-2010" "12-02-2010" "19-02-2010" "26-02-2010" ...
## $ Weekly_Sales: num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag: int 0 1 0 0 0 0 0 0 0 0 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
# Parse the day-month-year text in `Date` into a proper Date column (`fecha`)
# and derive integer calendar features from it, then re-check the summary.
df <- df %>%
  mutate(
    fecha = as.Date(Date, format = "%d-%m-%Y"),
    Year = as.integer(format(fecha, "%Y")),
    Month = as.integer(format(fecha, "%m")),
    WeekYear = as.integer(format(fecha, "%W")), # week of the year
    WeekDay = as.integer(format(fecha, "%u")), # day of the week (1 = Monday)
    Day = as.integer(format(fecha, "%d"))
  )
summary(object = df)
## Store Date Weekly_Sales Holiday_Flag
## Min. : 1 Length:6435 Min. : 209986 Min. :0.00000
## 1st Qu.:12 Class :character 1st Qu.: 553350 1st Qu.:0.00000
## Median :23 Mode :character Median : 960746 Median :0.00000
## Mean :23 Mean :1046965 Mean :0.06993
## 3rd Qu.:34 3rd Qu.:1420159 3rd Qu.:0.00000
## Max. :45 Max. :3818686 Max. :1.00000
## Temperature Fuel_Price CPI Unemployment
## Min. : -2.06 Min. :2.472 Min. :126.1 Min. : 3.879
## 1st Qu.: 47.46 1st Qu.:2.933 1st Qu.:131.7 1st Qu.: 6.891
## Median : 62.67 Median :3.445 Median :182.6 Median : 7.874
## Mean : 60.66 Mean :3.359 Mean :171.6 Mean : 7.999
## 3rd Qu.: 74.94 3rd Qu.:3.735 3rd Qu.:212.7 3rd Qu.: 8.622
## Max. :100.14 Max. :4.468 Max. :227.2 Max. :14.313
## fecha Year Month WeekYear
## Min. :2010-02-05 Min. :2010 Min. : 1.000 Min. : 1.00
## 1st Qu.:2010-10-08 1st Qu.:2010 1st Qu.: 4.000 1st Qu.:14.00
## Median :2011-06-17 Median :2011 Median : 6.000 Median :26.00
## Mean :2011-06-17 Mean :2011 Mean : 6.448 Mean :25.82
## 3rd Qu.:2012-02-24 3rd Qu.:2012 3rd Qu.: 9.000 3rd Qu.:38.00
## Max. :2012-10-26 Max. :2012 Max. :12.000 Max. :52.00
## WeekDay Day
## Min. :5 Min. : 1.00
## 1st Qu.:5 1st Qu.: 8.00
## Median :5 Median :16.00
## Mean :5 Mean :15.68
## 3rd Qu.:5 3rd Qu.:23.00
## Max. :5 Max. :31.00
# Confirm the types of the newly added date-derived columns.
str(object = df)
## 'data.frame': 6435 obs. of 14 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr "05-02-2010" "12-02-2010" "19-02-2010" "26-02-2010" ...
## $ Weekly_Sales: num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag: int 0 1 0 0 0 0 0 0 0 0 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ fecha : Date, format: "2010-02-05" "2010-02-12" ...
## $ Year : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ Month : int 2 2 2 2 3 3 3 3 4 4 ...
## $ WeekYear : int 5 6 7 8 9 10 11 12 13 14 ...
## $ WeekDay : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 5 12 19 26 5 12 19 26 2 9 ...
# Baseline model: regress Weekly_Sales on every other column of `df`.
# The text column `Date` enters as one dummy variable per distinct date, which
# leaves Holiday_Flag and all date-derived columns without estimable
# coefficients (reported as NA / singularities in the summary).
regression <- lm(formula = Weekly_Sales ~ ., data = df)
summary(object = regression)
##
## Call:
## lm(formula = Weekly_Sales ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1535955 -374908 -32975 360767 1915428
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 975487.1 222042.4 4.393 1.14e-05 ***
## Store -15328.8 514.2 -29.812 < 2e-16 ***
## Date01-06-2012 75296.6 110146.7 0.684 0.494251
## Date01-07-2011 94893.2 110402.9 0.860 0.390088
## Date01-10-2010 213582.2 119303.7 1.790 0.073463 .
## Date02-03-2012 38530.4 108673.5 0.355 0.722938
## Date02-04-2010 359086.7 116350.0 3.086 0.002036 **
## Date02-07-2010 350395.5 118923.6 2.946 0.003227 **
## Date02-09-2011 53703.6 110440.5 0.486 0.626795
## Date02-12-2011 177489.5 108941.7 1.629 0.103317
## Date03-02-2012 81889.5 108692.9 0.753 0.451236
## Date03-06-2011 77967.0 110318.8 0.707 0.479753
## Date03-08-2012 105049.4 110833.0 0.948 0.343259
## Date03-09-2010 324889.4 119861.6 2.711 0.006736 **
## Date03-12-2010 324000.1 115070.6 2.816 0.004883 **
## Date04-02-2011 166214.6 111834.7 1.486 0.137263
## Date04-03-2011 130594.2 109046.5 1.198 0.231117
## Date04-05-2012 3915.2 110389.4 0.035 0.971708
## Date04-06-2010 373857.5 118191.2 3.163 0.001568 **
## Date04-11-2011 130515.2 108517.7 1.203 0.229134
## Date05-02-2010 361044.1 118702.0 3.042 0.002363 **
## Date05-03-2010 300660.3 118196.2 2.544 0.010991 *
## Date05-08-2011 86650.4 110922.4 0.781 0.434726
## Date05-10-2012 18606.8 110235.2 0.169 0.865966
## Date05-11-2010 252458.5 115991.8 2.177 0.029554 *
## Date06-01-2012 91989.2 109534.8 0.840 0.401043
## Date06-04-2012 124820.9 110541.1 1.129 0.258865
## Date06-05-2011 -12079.0 110404.6 -0.109 0.912883
## Date06-07-2012 214939.0 111022.0 1.936 0.052912 .
## Date06-08-2010 341131.3 119621.8 2.852 0.004362 **
## Date07-01-2011 105014.2 112137.9 0.936 0.349065
## Date07-05-2010 297704.3 115066.5 2.587 0.009697 **
## Date07-09-2012 52448.2 111271.9 0.471 0.637406
## Date07-10-2011 108580.0 109017.0 0.996 0.319293
## Date08-04-2011 35785.1 108649.2 0.329 0.741892
## Date08-06-2012 120564.3 109855.4 1.097 0.272473
## Date08-07-2011 114437.7 110524.9 1.035 0.300521
## Date08-10-2010 265032.4 118105.8 2.244 0.024865 *
## Date09-03-2012 39825.9 108870.8 0.366 0.714520
## Date09-04-2010 288410.6 116096.6 2.484 0.013009 *
## Date09-07-2010 331508.5 119355.2 2.777 0.005494 **
## Date09-09-2011 72545.5 109797.8 0.661 0.508817
## Date09-12-2011 317290.3 109279.1 2.903 0.003703 **
## Date10-02-2012 153098.1 108672.8 1.409 0.158944
## Date10-06-2011 63356.7 110289.5 0.574 0.565679
## Date10-08-2012 87017.7 111007.3 0.784 0.433133
## Date10-09-2010 292667.1 119845.0 2.442 0.014632 *
## Date10-12-2010 427929.5 113679.7 3.764 0.000169 ***
## Date11-02-2011 187141.3 111416.2 1.680 0.093073 .
## Date11-03-2011 44380.6 108517.4 0.409 0.682574
## Date11-05-2012 9076.2 110118.3 0.082 0.934314
## Date11-06-2010 327505.6 118713.2 2.759 0.005818 **
## Date11-11-2011 134999.4 108589.1 1.243 0.213836
## Date12-02-2010 335905.9 119198.9 2.818 0.004847 **
## Date12-03-2010 272112.0 117312.5 2.320 0.020397 *
## Date12-08-2011 55380.6 110705.4 0.500 0.616915
## Date12-10-2012 -31839.3 110180.8 -0.289 0.772611
## Date12-11-2010 250869.5 115378.0 2.174 0.029718 *
## Date13-01-2012 7926.9 109135.4 0.073 0.942100
## Date13-04-2012 -35054.8 110914.4 -0.316 0.751973
## Date13-05-2011 -40830.1 110811.8 -0.368 0.712540
## Date13-07-2012 94837.2 110806.0 0.856 0.392094
## Date13-08-2010 294551.4 118943.9 2.476 0.013298 *
## Date14-01-2011 51037.1 112188.8 0.455 0.649182
## Date14-05-2010 220721.6 114570.2 1.927 0.054085 .
## Date14-09-2012 -46784.8 110500.8 -0.423 0.672026
## Date14-10-2011 53713.4 109092.9 0.492 0.622479
## Date15-04-2011 -8328.6 109035.8 -0.076 0.939116
## Date15-06-2012 106569.6 110105.3 0.968 0.333138
## Date15-07-2011 50691.2 110553.6 0.459 0.646594
## Date15-10-2010 201634.6 116525.2 1.730 0.083609 .
## Date16-03-2012 13453.0 109345.1 0.123 0.902086
## Date16-04-2010 230439.8 115421.3 1.997 0.045921 *
## Date16-07-2010 301232.6 119993.8 2.510 0.012084 *
## Date16-09-2011 5313.3 109559.0 0.048 0.961321
## Date16-12-2011 427288.6 109355.3 3.907 9.43e-05 ***
## Date17-02-2012 142092.0 108598.0 1.308 0.190778
## Date17-06-2011 66338.0 110170.6 0.602 0.547103
## Date17-08-2012 62239.0 110882.5 0.561 0.574609
## Date17-09-2010 231529.5 119415.5 1.939 0.052563 .
## Date17-12-2010 558210.6 113207.0 4.931 8.40e-07 ***
## Date18-02-2011 223238.8 110758.1 2.016 0.043889 *
## Date18-03-2011 43985.9 108496.9 0.405 0.685188
## Date18-05-2012 17826.8 110086.3 0.162 0.871363
## Date18-06-2010 328722.4 119120.2 2.760 0.005804 **
## Date18-11-2011 92568.8 108630.8 0.852 0.394168
## Date19-02-2010 342874.5 119555.7 2.868 0.004146 **
## Date19-03-2010 242273.1 116509.2 2.079 0.037618 *
## Date19-08-2011 83785.4 110436.9 0.759 0.448077
## Date19-10-2012 -48174.1 110156.5 -0.437 0.661891
## Date19-11-2010 217175.7 114666.7 1.894 0.058274 .
## Date20-01-2012 4030.5 109029.3 0.037 0.970512
## Date20-04-2012 -63902.2 110904.3 -0.576 0.564505
## Date20-05-2011 -72410.6 110611.2 -0.655 0.512723
## Date20-07-2012 88384.4 110725.4 0.798 0.424767
## Date20-08-2010 313664.4 119403.2 2.627 0.008637 **
## Date21-01-2011 50522.8 111575.2 0.453 0.650698
## Date21-05-2010 226271.9 115365.4 1.961 0.049882 *
## Date21-09-2012 -59567.5 110719.7 -0.538 0.590594
## Date21-10-2011 70009.2 108839.9 0.643 0.520098
## Date22-04-2011 60062.2 109459.6 0.549 0.583220
## Date22-06-2012 105844.3 110409.5 0.959 0.337771
## Date22-07-2011 34516.8 110939.2 0.311 0.755710
## Date22-10-2010 194691.0 116127.5 1.677 0.093685 .
## Date23-03-2012 -37392.0 109691.3 -0.341 0.733202
## Date23-04-2010 221544.1 115507.1 1.918 0.055155 .
## Date23-07-2010 271506.5 120069.7 2.261 0.023778 *
## Date23-09-2011 -8791.6 109299.7 -0.080 0.935893
## Date23-12-2011 812809.2 109678.9 7.411 1.42e-13 ***
## Date24-02-2012 32950.0 108515.8 0.304 0.761410
## Date24-06-2011 43464.6 110187.8 0.394 0.693255
## Date24-08-2012 50952.6 110436.1 0.461 0.644545
## Date24-09-2010 192374.1 119324.9 1.612 0.106971
## Date24-12-2010 979574.2 112953.2 8.672 < 2e-16 ***
## Date25-02-2011 112027.2 110473.7 1.014 0.310592
## Date25-03-2011 -450.2 108478.3 -0.004 0.996689
## Date25-05-2012 56303.4 110197.4 0.511 0.609417
## Date25-06-2010 302928.0 119197.0 2.541 0.011065 *
## Date25-11-2011 552182.2 108770.8 5.077 3.95e-07 ***
## Date26-02-2010 245325.7 119258.8 2.057 0.039719 *
## Date26-03-2010 220269.0 116353.1 1.893 0.058389 .
## Date26-08-2011 103349.1 110547.0 0.935 0.349882
## Date26-10-2012 -16132.6 109582.2 -0.147 0.882964
## Date26-11-2010 675089.3 114553.1 5.893 3.98e-09 ***
## Date27-01-2012 -48529.9 108870.5 -0.446 0.655788
## Date27-04-2012 -84460.2 110656.8 -0.763 0.445335
## Date27-05-2011 -18527.2 110383.3 -0.168 0.866711
## Date27-07-2012 26042.1 110755.9 0.235 0.814115
## Date27-08-2010 291374.4 119472.8 2.439 0.014762 *
## Date28-01-2011 22910.6 111630.2 0.205 0.837394
## Date28-05-2010 302829.4 116780.6 2.593 0.009532 **
## Date28-09-2012 -58980.5 110390.3 -0.534 0.593159
## Date28-10-2011 68759.9 108641.6 0.633 0.526818
## Date29-04-2011 -63232.2 109895.5 -0.575 0.565052
## Date29-06-2012 101225.7 110831.7 0.913 0.361105
## Date29-07-2011 -6502.8 110860.4 -0.059 0.953227
## Date29-10-2010 205316.7 116023.8 1.770 0.076841 .
## Date30-03-2012 -47259.8 110230.3 -0.429 0.668129
## Date30-04-2010 196695.5 115370.8 1.705 0.088263 .
## Date30-07-2010 263622.5 119726.1 2.202 0.027710 *
## Date30-09-2011 -2719.4 109695.0 -0.025 0.980223
## Date30-12-2011 129165.5 109892.5 1.175 0.239887
## Date31-08-2012 44711.4 110852.9 0.403 0.686712
## Date31-12-2010 69615.4 112698.7 0.618 0.536787
## Holiday_Flag NA NA NA NA
## Temperature -743.4 683.8 -1.087 0.277041
## Fuel_Price 253287.8 52973.5 4.781 1.78e-06 ***
## CPI -1761.4 229.7 -7.670 1.99e-14 ***
## Unemployment -28545.7 4190.0 -6.813 1.05e-11 ***
## fecha NA NA NA NA
## Year NA NA NA NA
## Month NA NA NA NA
## WeekYear NA NA NA NA
## WeekDay NA NA NA NA
## Day NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 514400 on 6287 degrees of freedom
## Multiple R-squared: 0.1881, Adjusted R-squared: 0.1692
## F-statistic: 9.912 on 147 and 6287 DF, p-value: < 2.2e-16
# Adjusted model: drop the raw date text, Fuel_Price, and the derived calendar
# columns Year through Day, then refit Weekly_Sales on the remaining
# predictors (Store, Holiday_Flag, Temperature, CPI, Unemployment, fecha).
df_ajustada <- df %>%
  select(-Date, -Fuel_Price, -(Year:Day))
regresion_ajustada <- lm(formula = Weekly_Sales ~ ., data = df_ajustada)
summary(object = regresion_ajustada)
##
## Call:
## lm(formula = Weekly_Sales ~ ., data = df_ajustada)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1033045 -392197 -38179 371833 2712887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1829724.13 367412.35 4.980 6.52e-07 ***
## Store -15391.78 522.42 -29.463 < 2e-16 ***
## Holiday_Flag 71943.20 25917.09 2.776 0.00552 **
## Temperature -967.11 375.41 -2.576 0.01001 *
## CPI -2343.03 180.28 -12.997 < 2e-16 ***
## Unemployment -21609.39 3903.11 -5.536 3.21e-08 ***
## fecha 13.19 23.74 0.556 0.57844
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 523100 on 6428 degrees of freedom
## Multiple R-squared: 0.1415, Adjusted R-squared: 0.1407
## F-statistic: 176.6 on 6 and 6428 DF, p-value: < 2.2e-16