1. Introduction

Dataset Info

This is the historical data that covers sales from 2010-02-05 to 2012-11-01, in the file Walmart_Store_sales. Within this file you will find the following fields:

Source

This dataset was taking from Kaggle. You can explore it HERE.

Objectives

  1. Data exploration

Libraries

#Importing libraries

libraries <- c("ggplot2", "dplyr", "lubridate", 
               "readr", "plotly", "corrplot", 
               "forcats", "scatterplot3d", "tidyverse")
sapply(libraries, require, character.only = TRUE)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.1.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 4.1.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: readr
## Warning: package 'readr' was built under R version 4.1.3
## Loading required package: plotly
## Warning: package 'plotly' was built under R version 4.1.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Loading required package: corrplot
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
## Loading required package: forcats
## Warning: package 'forcats' was built under R version 4.1.3
## Loading required package: scatterplot3d
## Warning: package 'scatterplot3d' was built under R version 4.1.3
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.1.3
## Error: package or namespace load failed for 'tidyverse' in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]):
##  namespace 'purrr' 0.3.4 is already loaded, but >= 1.0.1 is required
##       ggplot2         dplyr     lubridate         readr        plotly 
##          TRUE          TRUE          TRUE          TRUE          TRUE 
##      corrplot       forcats scatterplot3d     tidyverse 
##          TRUE          TRUE          TRUE         FALSE
setwd('E:/Waltmart_dataset')

Data loading and visualization

## Data upload 

walmart_data <- read.csv('Walmart.csv')
head(walmart_data)
##   Store       Date Weekly_Sales Holiday_Flag Temperature Fuel_Price      CPI
## 1     1 05-02-2010      1643691            0       42.31      2.572 211.0964
## 2     1 12-02-2010      1641957            1       38.51      2.548 211.2422
## 3     1 19-02-2010      1611968            0       39.93      2.514 211.2891
## 4     1 26-02-2010      1409728            0       46.63      2.561 211.3196
## 5     1 05-03-2010      1554807            0       46.50      2.625 211.3501
## 6     1 12-03-2010      1439542            0       57.79      2.667 211.3806
##   Unemployment
## 1        8.106
## 2        8.106
## 3        8.106
## 4        8.106
## 5        8.106
## 6        8.106

Database structure

num_row <- nrow(walmart_data)
num_col <- ncol(walmart_data)
sprintf("The dataset has %s samples and %s features", num_row, num_col)
## [1] "The dataset has 6435 samples and 8 features"

Count null values

## Count null values 

apply(X = is.na(walmart_data), MARGIN = 2, FUN = sum)
##        Store         Date Weekly_Sales Holiday_Flag  Temperature   Fuel_Price 
##            0            0            0            0            0            0 
##          CPI Unemployment 
##            0            0

Inference: There aren’t null values.

Count unique values

## Count unique values

sort(sapply(walmart_data, function(x) length(unique(x))))
## Holiday_Flag        Store         Date Unemployment   Fuel_Price          CPI 
##            2           45          143          349          892         2145 
##  Temperature Weekly_Sales 
##         3528         6435

Data general info

## Data info 

str(walmart_data)
## 'data.frame':    6435 obs. of  8 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : chr  "05-02-2010" "12-02-2010" "19-02-2010" "26-02-2010" ...
##  $ Weekly_Sales: num  1643691 1641957 1611968 1409728 1554807 ...
##  $ Holiday_Flag: int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...

Inference: Date data type is incorrect.

Statistical summary

## Statistical summary 

summary(walmart_data)
##      Store        Date            Weekly_Sales      Holiday_Flag    
##  Min.   : 1   Length:6435        Min.   : 209986   Min.   :0.00000  
##  1st Qu.:12   Class :character   1st Qu.: 553350   1st Qu.:0.00000  
##  Median :23   Mode  :character   Median : 960746   Median :0.00000  
##  Mean   :23                      Mean   :1046965   Mean   :0.06993  
##  3rd Qu.:34                      3rd Qu.:1420159   3rd Qu.:0.00000  
##  Max.   :45                      Max.   :3818686   Max.   :1.00000  
##   Temperature       Fuel_Price         CPI         Unemployment   
##  Min.   : -2.06   Min.   :2.472   Min.   :126.1   Min.   : 3.879  
##  1st Qu.: 47.46   1st Qu.:2.933   1st Qu.:131.7   1st Qu.: 6.891  
##  Median : 62.67   Median :3.445   Median :182.6   Median : 7.874  
##  Mean   : 60.66   Mean   :3.359   Mean   :171.6   Mean   : 7.999  
##  3rd Qu.: 74.94   3rd Qu.:3.735   3rd Qu.:212.7   3rd Qu.: 8.622  
##  Max.   :100.14   Max.   :4.468   Max.   :227.2   Max.   :14.313

  1. Manipulating data

## Date data type

class(walmart_data$Date)
## [1] "character"

Inference: It’s necesary change to type data.

## Change data type

walmart_data<-walmart_data %>%
  mutate(Date=dmy(Date)) 

## Verification of change 

sprintf("The data type of the Date variable is: %s", class(walmart_data$Date) )
## [1] "The data type of the Date variable is: Date"
## Classifying fuel prices

walmart_data <-
  walmart_data %>% mutate(Sts_Fuel_Price = ifelse(Fuel_Price < mean(walmart_data$Fuel_Price), "Low", "High"))
## Classifying temperature

walmart_data <-
  walmart_data %>% mutate(Sts_Temperature = case_when( 
    Temperature <= quantile(walmart_data$Temperature, c(0.25)) ~ "Cool",
    Temperature > quantile(walmart_data$Temperature, c(0.50)) & 
     Temperature <= quantile(walmart_data$Temperature, c(0.75)) ~ "Cold", 
    T ~ "Hot"))
## Classifying CPI

walmart_data <- 
  walmart_data %>% mutate(Sts_CPI = ifelse(
    CPI > mean(walmart_data$CPI), "High", "Low"
  ))
## Filtering day, month and year from Date

Weekday <- day(walmart_data$Date)
Month <- month(walmart_data$Date)
Year <- year(walmart_data$Date)

## Add Day column 

walmart_data <- walmart_data %>%
  cbind(Weekday)

## Add Month column

walmart_data <- walmart_data %>%
  cbind(Month)

## Add Year column 
walmart_data <- walmart_data %>%
  cbind(Year)

  1. Exploration Data Analysis (EDA)

Response variable: Weekly_Sales.

# Target variable distribution

ggplot(walmart_data, aes(x = Weekly_Sales)) +
  
geom_histogram(aes(y=..density..),
               color = 'black', 
               fill = "#24F1B0", 
               bins = 30) +
  
  geom_density(alpha = 0.2, fill="white") +
  
ggtitle('Target Variable Distribution') +
labs(y = 'Frequency')
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## i Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Inference: The Target Variable seems to be be normally distributed, averaging around 20 units.

## Select numeric features

walmart_numeric_features <- walmart_data %>% 
  select(Store, Weekly_Sales, Holiday_Flag, Temperature,
         Fuel_Price, CPI, Unemployment, Weekday, Month, Year)
## Correlation between numeric variables 

round(abs(cor(walmart_numeric_features)), 3)
##              Store Weekly_Sales Holiday_Flag Temperature Fuel_Price   CPI
## Store        1.000        0.335        0.000       0.023      0.060 0.209
## Weekly_Sales 0.335        1.000        0.037       0.064      0.009 0.073
## Holiday_Flag 0.000        0.037        1.000       0.155      0.078 0.002
## Temperature  0.023        0.064        0.155       1.000      0.145 0.177
## Fuel_Price   0.060        0.009        0.078       0.145      1.000 0.171
## CPI          0.209        0.073        0.002       0.177      0.171 1.000
## Unemployment 0.224        0.106        0.011       0.101      0.035 0.302
## Weekday      0.000        0.017        0.045       0.027      0.028 0.003
## Month        0.000        0.076        0.123       0.236      0.042 0.005
## Year         0.000        0.018        0.057       0.064      0.779 0.075
##              Unemployment Weekday Month  Year
## Store               0.224   0.000 0.000 0.000
## Weekly_Sales        0.106   0.017 0.076 0.018
## Holiday_Flag        0.011   0.045 0.123 0.057
## Temperature         0.101   0.027 0.236 0.064
## Fuel_Price          0.035   0.028 0.042 0.779
## CPI                 0.302   0.003 0.005 0.075
## Unemployment        1.000   0.004 0.013 0.242
## Weekday             0.004   1.000 0.015 0.006
## Month               0.013   0.015 1.000 0.194
## Year                0.242   0.006 0.194 1.000
## Heatmap 

#heatmap(abs(cor(walmart_numeric_features)), Rowv = NA, Colv = NA, revC = T)
## Corrplot

corrplot(cor(walmart_numeric_features), method='shade', shade.col= NA,
         tl.col = "black", tl.srt = 50, addCoef.col = 'black')

Inference: The correlation between the dependent variable ( Weekly_Sales) and the dependent variables is relatively low.

# Weekly sales for year

walmart_data%>%
  group_by(Date)%>%
  
  summarise(Weekly_Sales = mean(Weekly_Sales, na.rm=T)) %>%
  
  ggplot(aes(Date, Weekly_Sales)) +
  
  geom_point(aes(color=Weekly_Sales>1200000),
             show.legend = F) +
  geom_line(color='grey') +
  
  labs(title = 'Weekly Sales for year',
       y='Weekly sales',
       x='Date') +

    theme_bw()

Inference: some dates have extreme high weekly sales.

# Holiday and Non-Holiday sales

holidays <-function(x){
  cut(x,
      breaks = ymd(20100101, 20101126, 20101127, 20101224,20101225,
                   20111125, 20111126, 20111223, 20111224,20121123,
                   20121124, 20121221, 20121222, 20130101),
      
      labels = c('Normal Day','Thanksgiving', 
                 'Normal Day', 'A week before Christmas',
                 'Normal Day', 'Thanksgiving', 
                 'Normal Day', 'A week before Christmas',
                 'Normal Day', 'Thanksgiving', 
                 'Normal Day', 'A week before Christmas',
                 'Normal Day'))
}

walmart_data <- walmart_data %>%
  mutate(festival = holidays(Date))

walmart_data %>%
  group_by(Date)%>%
  
  mutate(Weekly_Sales = mean(Weekly_Sales, na.rm=T)) %>%
  
  ggplot(aes(Date, Weekly_Sales)) +
  
  geom_point(aes(color = festival, 
                 shape = festival),
             size = 2)+
  geom_line(group=1, color='grey')

Inference : festival will lead to high weekly sales, especially a week before the Chritmas and the Thanksgiving.

## Annual Sales 
Annual_Sales <- 
  aggregate(Weekly_Sales ~ Year, walmart_data, sum)

Annual_Sales <- Annual_Sales[order(-Annual_Sales$Weekly_Sales), ]

print(Annual_Sales)
##   Year Weekly_Sales
## 2 2011   2448200007
## 1 2010   2288886120
## 3 2012   2000132859
## Annual sales pie plot

sales <- Annual_Sales$Weekly_Sales
labels <- c('2010', '2011', '2012' )
porcent <- round(sales/sum(sales)*100)
labels <- paste(labels, porcent, "%", sep = " ")

pie(sales,
    labels = labels,
    col = rainbow(length(labels)),
    main = "Annual Sales")

legend("topright", c("2010", "2011", "2012"),
       cex = 0.8,
       fill = rainbow(length(labels)))

# Scatterplot: Year-weekly sales

walmart_data %>%
  
  ggplot(aes(x = Year,
             y = Weekly_Sales)) +
  geom_point(alpha = 0.1, 
             colour = 'blue') +
  
  labs(title = 
         'Year / Weekly_Sales Scatterplot',
       x='Year',
       y='Weekly_Sales')

Inferece:

Annual Stores Sales

## Top 10 stores

Store_Sales <- 
  aggregate(Weekly_Sales ~ Store, walmart_data, sum)

Top_10_Stores <- head(Store_Sales[order(-Store_Sales$Weekly_Sales), ], 10)

print(Top_10_Stores)
##    Store Weekly_Sales
## 20    20    301397792
## 4      4    299543953
## 14    14    288999911
## 13    13    286517704
## 2      2    275382441
## 10    10    271617714
## 27    27    253855917
## 6      6    223756131
## 1      1    222402809
## 39    39    207445542
## Barplot (Relative Frequency)

barplot(prop.table(Top_10_Stores$Weekly_Sales) * 100, 
        names.arg = c(Top_10_Stores$Store),
        col = 'blue', 
        main = "Top 10 best-selling stores ", 
        xlab = "Stores",
        ylab = "Weekly Sales")

Inference: The 20th store reported more annual sales.

Holiday Flag

# Holiday flags classes

walmart_data %>% 
  count(Holiday_Flag) %>%
  plot_ly( x = ~Holiday_Flag, 
           y= ~n, 
           type = 'bar',
           text = ~n,
           marker = list(color = 'rgb(158,202,225)',
                         line = list(color = 'rgb(8,48,107)',
                                     width = 0.5)))%>%
  layout(title = "Holiday Flag Classes",
         xaxis = list(title = "Holiday_Flag"),
         yaxis = list(title = "Frequency"))
# Boxplot holiday flags-Weekly sales

walmart_data %>% 
  ggplot(aes(as_factor(Holiday_Flag), Weekly_Sales)) +
  
  geom_boxplot() + 
  labs(title = "Boxplot Holiday Flag",
       x = "Holiday Flag", 
       y = "Weekly Sales") + 
 
   theme_bw()

## Scatterplot Weekly Sales / Holiday Flag

walmart_data %>% 
  ggplot(aes(x = Holiday_Flag, 
             y = Weekly_Sales)) +
  geom_point(alpha = 0.1, 
             colour = 'blue') +
  
  labs(title = 
         'Weekly sales / Holiday Flag Scatterplot',
       y='Weekly sales',
       x='Holiday Flag')

# Holiday flags sales

Sales_Holidays <-
  aggregate(Weekly_Sales ~ Holiday_Flag, walmart_data, sum)

Sales_Holidays <-
  Sales_Holidays[order(-Sales_Holidays$Holiday_Flag),]

print(Sales_Holidays)
##   Holiday_Flag Weekly_Sales
## 2            1    505299552
## 1            0   6231919436
Inference:

Fuel Price

# Weekly Sales / Fuel_Price

Sales_vs_Fuel_Price <- 
  aggregate(Weekly_Sales ~ Sts_Fuel_Price, walmart_data, sum)

print(Sales_vs_Fuel_Price)
##   Sts_Fuel_Price Weekly_Sales
## 1           High   3628932238
## 2            Low   3108286749
## Fuel_Price distribution

hist_fp <- ggplot(walmart_data, aes(x=Fuel_Price)) +
  
  geom_histogram(aes(y=..density..),
                 position = "identity",
                 binwidth = 0.10,
                 colour = 'black', 
                 fill = "lightblue") +
  
  geom_density(alpha = 0.2, fill="white") 

box_fp <- ggplot(walmart_data, aes(x="", y=Fuel_Price)) + 
  
  geom_boxplot(fill='lightblue', color ='black') + 
  
  coord_flip() + 
  theme_bw() + 
  xlab("") + 
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

subplot(hist_fp, box_fp, nrows = 2) %>%
  layout( title = 'Fuel Price distribution',
         lot_bgcolor='#e5ecf6', 
         xaxis = list( 
           zerolinecolor = '#ffff', 
           zerolinewidth = 2, 
           gridcolor = 'ffff'), 
         yaxis = list( 
           zerolinecolor = '#ffff', 
           zerolinewidth = 2, 
           gridcolor = 'ffff'))
## Warning: 'layout' objects don't have these attributes: 'lot_bgcolor'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
# Scatterplot: weekly sales / fuel price 

walmart_data %>% 
  ggplot(aes(x = Fuel_Price, 
             y = Weekly_Sales)) +
  geom_point(alpha = 0.1, 
             colour = 'blue') +
  
  labs(title = 
         'Weekly sales / Fuel Price Scatterplot',
       y='Weekly sales',
       x='Fuel Price')

Inference: There are more people to shop in specific fuel price.

  1. CPI

# Total Sales and CPI

Sales_vs_CPI <- 
  aggregate(Weekly_Sales ~ Sts_CPI, walmart_data, sum)

print(Sales_vs_CPI)
##   Sts_CPI Weekly_Sales
## 1    High   3330247700
## 2     Low   3406971287
## Histogram: CPI

hist_cpi <- ggplot(walmart_data, aes(x=CPI)) +
  
  geom_histogram(aes(y=..density..),
                 position = "identity",
                 binwidth = 2,
                 colour = 'blue', 
                 fill = "lightblue") +
  
  geom_density(alpha = 0.2, fill="white") 

box_cpi <- ggplot(walmart_data, aes(x="", y=CPI)) + 
  
  geom_boxplot(fill='lightblue', color ='black') + 
  
  coord_flip() + 
  theme_bw() + 
  xlab("") + 
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

subplot(hist_cpi, box_cpi, nrows = 2) %>%
  layout(title = 'CPI Distribution', 
         lot_bgcolor='#e5ecf6', 
         xaxis = list( 
           zerolinecolor = '#ffff', 
           zerolinewidth = 2, 
           gridcolor = 'ffff'), 
         yaxis = list( 
           zerolinecolor = '#ffff', 
           zerolinewidth = 2, 
           gridcolor = 'ffff'))
## Warning: 'layout' objects don't have these attributes: 'lot_bgcolor'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
# Scatterplot: weekly sales / CPI

walmart_data %>% 
  ggplot(aes(x = CPI,
             y = Weekly_Sales)) +
  geom_point(alpha = 0.1, 
             colour = 'blue') +
  
  labs(title = 
         'Weekly sales / CPI Scatterplot',
       y='Weekly sales',
       x='CPI')

Inference: Seems that different range of CPI have same sales distributions. Inference: Walmart reported higher profits when fuel prices and CPI were high and when the temperature was cold.

Temperature

## Weekly_Sales / Temperature

Sales_vs_Temperature <- 
  aggregate(Weekly_Sales ~ Sts_Temperature, walmart_data, sum)

print(Sales_vs_Temperature)
##   Sts_Temperature Weekly_Sales
## 1            Cold   1696418667
## 2            Cool   1758497608
## 3             Hot   3282302713
## Temperature distribution

#library(htmltools)
hist_temp <- ggplot(walmart_data, 
                    aes(x=Temperature)) +
  
  geom_histogram(aes(y=..density..),
                 position = "identity",
                 bins = 30,
                 colour = 'blue', 
                 fill = "lightblue") +
  
  geom_density(alpha = 0.2, fill="white") 

box_temp <- ggplot(walmart_data, aes(x="", y=Temperature)) + 
  
  geom_boxplot(fill='lightblue', color ='black') + 
  
  coord_flip() + 
  theme_bw() + 
  xlab("") + 
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

subplot(hist_temp, box_temp, nrows = 2) %>%
  plotly::layout(title = 'Temperature Distribution',
         lot_bgcolor='#e5ecf6', 
         xaxis = list( 
           zerolinecolor = '#ffff', 
           zerolinewidth = 2, 
           gridcolor = 'ffff'), 
         yaxis = list( 
           zerolinecolor = '#ffff', 
           zerolinewidth = 2, 
           gridcolor = 'ffff'))
## Warning: 'layout' objects don't have these attributes: 'lot_bgcolor'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
## Distribution Weekly_sale/Temperature

walmart_data %>% 
  ggplot(aes(x = Temperature, 
             y = Weekly_Sales)) +
  geom_point(alpha = 0.1, 
             colour = 'blue') +
  
  labs(title = 
         'Weekly sales / Temperature Correlation',
       y='Weekly sales',
       x='Temperature')

Inference: Temperature: Some people go shopping when the temperature is between 40 and 75 degrees Farenheint.

  1. Unemployment

## Unemployment distribution 

hist_u <- ggplot(walmart_data, aes(x=Unemployment)) +
  
  geom_histogram(aes(y=..density..),
                 position = "identity",
                 binwidth = 0.10,
                 colour = 'blue', 
                 fill = "lightblue") +
  
  geom_density(alpha = 0.2, fill="white") 

box_u <- ggplot(walmart_data, aes(x="", y=Unemployment)) + 
  
  geom_boxplot(fill='lightblue', color ='black') + 
  
  coord_flip() + 
  theme_bw() + 
  xlab("") + 
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

subplot(hist_u, box_u, nrows = 2) %>%
  layout(title = 'Unemployment distribution ',
         lot_bgcolor='#e5ecf6', 
         xaxis = list( 
           zerolinecolor = '#ffff', 
           zerolinewidth = 2, 
           gridcolor = 'ffff'), 
         yaxis = list( 
           zerolinecolor = '#ffff', 
           zerolinewidth = 2, 
           gridcolor = 'ffff'))
## Warning: 'layout' objects don't have these attributes: 'lot_bgcolor'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
# Scatterplot Weekly Sales / Unemployment

walmart_data %>% 
  ggplot(aes(x = Unemployment,
             y = Weekly_Sales)) +
  geom_point(alpha = 0.1, 
             colour = 'blue') +
  
  labs(title = 
         'Weekly sales / Unemployment Scatterplot',
       y='Weekly sales',
       x='Unemployment')

Inference: Lower unemployment rate has higher weekly sales.

Sales Analysis by Year.

## Filtering sales by years

Sales_2010 <- subset(walmart_data, Year == 2010)
Sales_2011 <- subset(walmart_data, Year == 2011)
Sales_2012 <- subset(walmart_data, Year == 2012)

Sales analysis 2010

## Total sales 2010

Total_Sales_2010 <- 
  aggregate(Weekly_Sales ~ Store, Sales_2010, sum)

Sales_Top5_2010 <- 
  head(Total_Sales_2010[order(-Total_Sales_2010$Weekly_Sales), ], 5)

print(Sales_Top5_2010)
##    Store Weekly_Sales
## 14    14    105462242
## 20    20    101733081
## 4      4     95680471
## 2      2     95277864
## 13    13     95272735
## Total sales per month (2010)

Total_Sales_per_Month_2010 <- 
  aggregate(Weekly_Sales ~ Month, Sales_2010, sum)
Total_Sales_per_Month_2010 <- Total_Sales_per_Month_2010[order(-Total_Sales_per_Month_2010$Weekly_Sales),]

print(Total_Sales_per_Month_2010)
##    Month Weekly_Sales
## 11    12    288760533
## 6      7    232580126
## 3      4    231412368
## 9     10    217161824
## 10    11    202853370
## 5      6    192246172
## 1      2    190332983
## 7      8    187640111
## 4      5    186710934
## 2      3    181919803
## 8      9    177267896
## Total sales per holidays (2010)

Sales_Holidays_2010 <-
  aggregate(Weekly_Sales ~ Holiday_Flag, Sales_2010, sum)

Sales_Holidays_2010 <-
  Sales_Holidays_2010[order(-Sales_Holidays_2010$Holiday_Flag),]

print(Sales_Holidays_2010)
##   Holiday_Flag Weekly_Sales
## 2            1    200224598
## 1            0   2088661523
## Total sales and fuel prices (2010)

Sales_vs_Fuel_Price_2010 <- 
  aggregate(Weekly_Sales ~ Sts_Fuel_Price, Sales_2010, sum)

print(Sales_vs_Fuel_Price_2010)
##   Sts_Fuel_Price Weekly_Sales
## 1            Low   2288886120
## Total sales and CPI (2010)

Sales_vs_CPI_2010 <- 
  aggregate(Weekly_Sales ~ Sts_CPI, Sales_2010, sum)

Sales_vs_CPI_2010 <- 
  Sales_vs_CPI_2010[order(-Sales_vs_CPI_2010$Weekly_Sales),]

print(Sales_vs_CPI_2010)
##   Sts_CPI Weekly_Sales
## 2     Low   1164241673
## 1    High   1124644447
## Total sales and temperature (2010)

Sales_vs_Temperature_2010 <- 
  aggregate(Weekly_Sales ~ Sts_Temperature, Sales_2010, sum)

Sales_vs_Temperature_2010 <-
  Sales_vs_Temperature_2010[order(-Sales_vs_Temperature_2010$Weekly_Sales),]

print(Sales_vs_Temperature_2010)
##   Sts_Temperature Weekly_Sales
## 3             Hot   1117750478
## 2            Cool    624263738
## 1            Cold    546871904
Inference:

Analysis Sales 2011

## Total sales (2011)

Total_Sales_2011 <- 
  aggregate(Weekly_Sales ~ Store, Sales_2011, sum)

Sales_Top_5_2011 <- 
  head(Total_Sales_2011[order(-Total_Sales_2011$Weekly_Sales), ], 5)

print(Sales_Top_5_2011)
##    Store Weekly_Sales
## 4      4    111092293
## 20    20    109837002
## 14    14    106096271
## 13    13    104537513
## 10    10     98916895
## Total sales per month (2011)

Total_Sales_per_Month_2011 <- 
  aggregate(Weekly_Sales ~ Month, Sales_2011, sum)
Total_Sales_per_Month_2011 <- Total_Sales_per_Month_2011[order(-Total_Sales_per_Month_2011$Weekly_Sales),]

print(Total_Sales_per_Month_2011)
##    Month Weekly_Sales
## 12    12    288078102
## 7      7    229911399
## 4      4    226526511
## 9      9    220847738
## 11    11    210162355
## 6      6    189773385
## 8      8    188599332
## 2      2    186331328
## 10    10    183261283
## 5      5    181648158
## 3      3    179356448
## 1      1    163703967
## Total sales per holidays (2011)

Sales_Holidays_2011 <-
  aggregate(Weekly_Sales ~ Holiday_Flag, Sales_2011, sum)

Sales_Holidays_2011 <-
  Sales_Holidays_2011[order(-Sales_Holidays_2011$Holiday_Flag),]

print(Sales_Holidays_2011)
##   Holiday_Flag Weekly_Sales
## 2            1    206735487
## 1            0   2241464521
## Total sales and Fuel prices (2011)

Sales_vs_Fuel_Price_2011 <- 
  aggregate(Weekly_Sales ~ Sts_Fuel_Price, Sales_2011, sum)

Sales_vs_Fuel_Price_2011 <- 
  Sales_vs_Fuel_Price_2011[order(-Sales_vs_Fuel_Price_2011$Weekly_Sales), ]

print(Sales_vs_Fuel_Price_2011)
##   Sts_Fuel_Price Weekly_Sales
## 1           High   1847344472
## 2            Low    600855535
## Total sales and CPI (2011)

Sales_vs_CPI_2011 <- 
  aggregate(Weekly_Sales ~ Sts_CPI, Sales_2011, sum)

Sales_vs_CPI_2011 <-
  Sales_vs_CPI_2011[order(-Sales_vs_CPI_2011$Weekly_Sales),]

print(Sales_vs_CPI_2011)
##   Sts_CPI Weekly_Sales
## 2     Low   1234010115
## 1    High   1214189892
## Total sales and temperature (2011)

Sales_vs_Temperature_2011 <- 
  aggregate(Weekly_Sales ~ Sts_Temperature, Sales_2011, sum)

Sales_vs_Temperature_2011 <-
  Sales_vs_Temperature_2011[order(-Sales_vs_Temperature_2011$Weekly_Sales),]

print(Sales_vs_Temperature_2011)
##   Sts_Temperature Weekly_Sales
## 3             Hot   1087110090
## 2            Cool    772472295
## 1            Cold    588617623
Inference:

Analysis Sales 2012

## Total sales (2012)

Total_Sales_per_Store_2012 <- 
  aggregate(Weekly_Sales ~ Store, Sales_2012, sum)

Sales_Top5_2012 <- 
  head(Total_Sales_per_Store_2012[order(-Total_Sales_per_Store_2012$Weekly_Sales),], 5)

print(Sales_Top5_2012)
##    Store Weekly_Sales
## 4      4     92771189
## 20    20     89827709
## 13    13     86707455
## 2      2     81496695
## 10    10     78228617
## Total sales per month (2012)

Total_Sales_per_Month_2012 <- 
  aggregate(Weekly_Sales ~ Month, Sales_2012, sum)

Total_Sales_per_Month_2012 <- Total_Sales_per_Month_2012[order(-Total_Sales_per_Month_2012$Weekly_Sales),]

print(Total_Sales_per_Month_2012)
##    Month Weekly_Sales
## 6      6    240610329
## 8      8    236850766
## 3      3    231509650
## 2      2    192063580
## 4      4    188920906
## 5      5    188766479
## 7      7    187509452
## 10    10    184361680
## 9      9    180645544
## 1      1    168894472
## Total sales per holidays (2012)

Sales_Holidays_2012 <-
  aggregate(Weekly_Sales ~ Holiday_Flag, Sales_2012, sum)

Sales_Holidays_2012 <-
  Sales_Holidays_2012[order(-Sales_Holidays_2012$Holiday_Flag),]

print(Sales_Holidays_2012)
##   Holiday_Flag Weekly_Sales
## 2            1     98339467
## 1            0   1901793392
## Total sales and Fuel prices (2012)

Sales_vs_Fuel_Price_2012 <- 
  aggregate(Weekly_Sales ~ Sts_Fuel_Price, Sales_2012, sum)

Sales_vs_Fuel_Price_2010 <-  
  Sales_vs_Fuel_Price_2012[order(-Sales_vs_Fuel_Price_2012$Weekly_Sales),]

print(Sales_vs_Fuel_Price_2012)
##   Sts_Fuel_Price Weekly_Sales
## 1           High   1781587766
## 2            Low    218545093
## Total sales and CPI

Sales_vs_CPI_2012 <- 
  aggregate(Weekly_Sales ~ Sts_CPI, Sales_2012, sum)

Sales_vs_CPI_2012 <- 
  Sales_vs_CPI_2012[order(-Sales_vs_CPI_2012$Weekly_Sales),]

print(Sales_vs_CPI_2012)
##   Sts_CPI Weekly_Sales
## 2     Low   1008719499
## 1    High    991413361
## Total sales and temperature (2012)

Sales_vs_Temperature_2012 <- 
  aggregate(Weekly_Sales ~ Sts_Temperature, Sales_2012, sum)

Sales_vs_Temperature_2012 <-
  Sales_vs_Temperature_2012[order(~Sales_vs_Temperature_2012),]
## Warning in is.na(x): is.na() aplicado a un objeto que no es (lista o vector) de
## tipo 'language
print(Sales_vs_Temperature_2012)
##   Sts_Temperature Weekly_Sales
## 1            Cold    560929140
## 2            Cool    361761575
unique(Sales_2012$Month)
##  [1]  1  2  3  4  5  6  7  8  9 10
Inference:

  1. Conclusions