Read the file into R

# Load the features/sales data set into `sales.df`.
# The directory is kept in a variable and joined with file.path() so the
# script no longer changes the session's working directory via setwd().
data_dir <- "~/SIP/SIP Phase 2/R Programming/Udemy Class Material/Week 4/Project"
sales.df <- read.csv(file.path(data_dir, "Features data set.csv"))

# View() opens a spreadsheet-style viewer; only call it when someone is
# actually sitting at an interactive session.
if (interactive()) {
  View(sales.df)
}

Subsetting the data set

# Rows of `sales.df` in which all five markdown columns are present.
# Missing values are detected with complete.cases() rather than by comparing
# against the string "NA": a comparison with NA yields NA, never TRUE/FALSE,
# so the string test only worked because subset() drops NA conditions.
markdown <- subset(
  sales.df,
  complete.cases(MarkDown1, MarkDown2, MarkDown3, MarkDown4, MarkDown5)
)

# Rows of `sales.df` in which CPI is present.
cpi <- subset(sales.df, !is.na(CPI))

Overview of the sales

library(lattice)
## Warning: package 'lattice' was built under R version 3.4.3

# Distribution of the weekly sales figures, drawn as counts per bin.
histogram(
  sales.df$Weekly_Sales,
  type = "count",
  xlab = "Weekly Sales ($)",
  col = "gray50"
)

# Overall average of the weekly sales column.
mean(sales.df$Weekly_Sales)
## [1] 14512.6

# Weekly sales split by store, one horizontal box per store.
boxplot(
  Weekly_Sales ~ Store,
  data = sales.df,
  horizontal = TRUE,
  xlab = "Weekly Sales ($)",
  ylab = "Store"
)

Overview of the discounts from the weeks in which discounts were allowed

library(car)
## Warning: package 'car' was built under R version 3.4.3

# Pairwise scatterplots of weekly sales and the five markdown columns, using
# only the rows kept in `markdown`; histograms are requested on the diagonal.
# NOTE(review): newer car releases appear to document `diagonal` as a list
# (e.g. list(method = "histogram")) -- confirm against the installed version.
scatterplotMatrix(
  formula = ~ Weekly_Sales + MarkDown1 + MarkDown2 + MarkDown3 + MarkDown4 +
    MarkDown5,
  data = markdown,
  diagonal = "histogram"
)

Effect of Temperature on Weekly Sales of store

# Weekly sales plotted against temperature.
scatterplot(
  sales.df$Temperature,
  sales.df$Weekly_Sales,
  xlab = "Temperature",
  ylab = "Weekly Sales of Stores"
)

# Correlation coefficient between temperature and weekly sales.
with(sales.df, cor(Temperature, Weekly_Sales))
## [1] 0.07103998

# The columns are passed as sales.df$... so the "data:" line of the printed
# test result keeps naming them that way.
cor.test(sales.df$Temperature, sales.df$Weekly_Sales)
## 
##  Pearson's product-moment correlation
## 
## data:  sales.df$Temperature and sales.df$Weekly_Sales
## t = 6.4445, df = 8188, p-value = 1.225e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.04945811 0.09255555
## sample estimates:
##        cor 
## 0.07103998

The plot and the correlation test show a very weak, though statistically significant, positive correlation (r ≈ 0.07) between weekly sales and temperature.

Effect of fuel price on weekly sales of the stores

# Weekly sales plotted against fuel price.
scatterplot(
  sales.df$Fuel_Price,
  sales.df$Weekly_Sales,
  xlab = "Fuel Price",
  ylab = "Weekly Sales of Stores"
)

# Correlation coefficient between fuel price and weekly sales.
with(sales.df, cor(Fuel_Price, Weekly_Sales))
## [1] -0.0728054

# The columns are passed as sales.df$... so the "data:" line of the printed
# test result keeps naming them that way.
cor.test(sales.df$Fuel_Price, sales.df$Weekly_Sales)
## 
##  Pearson's product-moment correlation
## 
## data:  sales.df$Fuel_Price and sales.df$Weekly_Sales
## t = -6.6055, df = 8188, p-value = 4.209e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09431466 -0.05122821
## sample estimates:
##        cor 
## -0.0728054

Effect of CPI on the sales of the stores

# Weekly sales plotted against CPI (full data set).
scatterplot(
  sales.df$CPI,
  sales.df$Weekly_Sales,
  xlab = "CPI",
  ylab = "Weekly Sales of Stores"
)

# The correlation is computed on `cpi`, the subset without missing CPI values.
with(cpi, cor(CPI, Weekly_Sales))
## [1] 0.13416

# The columns are passed as cpi$... so the "data:" line of the printed test
# result keeps naming them that way.
cor.test(cpi$CPI, cpi$Weekly_Sales)
## 
##  Pearson's product-moment correlation
## 
## data:  cpi$CPI and cpi$Weekly_Sales
## t = 11.805, df = 7603, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1120222 0.1561647
## sample estimates:
##     cor 
## 0.13416

There is a weak but statistically significant positive correlation (r ≈ 0.13) between weekly sales and CPI.

Effect of Unemployment and its distribution

# Weekly sales plotted against unemployment (full data set).
scatterplot(
  sales.df$Unemployment,
  sales.df$Weekly_Sales,
  xlab = "Unemployment",
  ylab = "Weekly Sales of the Store"
)

# The correlation is computed on `cpi`, i.e. the rows with a non-missing CPI.
# NOTE(review): this presumably relies on Unemployment being missing in the
# same rows as CPI -- confirm against the data.
with(cpi, cor(Unemployment, Weekly_Sales))
## [1] 0.04730927

# The columns are passed as cpi$... so the "data:" line of the printed test
# result keeps naming them that way.
cor.test(cpi$Unemployment, cpi$Weekly_Sales)
## 
##  Pearson's product-moment correlation
## 
## data:  cpi$Unemployment and cpi$Weekly_Sales
## t = 4.1298, df = 7603, p-value = 3.67e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02486012 0.06971074
## sample estimates:
##        cor 
## 0.04730927

Effect of Holiday on the sales

# Weekly sales for holiday vs. non-holiday weeks, one horizontal box each.
boxplot(
  Weekly_Sales ~ IsHoliday,
  data = sales.df,
  horizontal = TRUE,
  xlab = "Weekly Sales of the Stores ($)",
  ylab = "Is Holiday?"
)

# Mean weekly sales per value of IsHoliday.
holiday <- aggregate(
  Weekly_Sales ~ IsHoliday,
  data = sales.df,
  FUN = mean
)
holiday
##   IsHoliday Weekly_Sales
## 1     FALSE     14521.51
## 2      TRUE     14396.81

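The mean weekly sales are nearly identical for non-holiday weeks (about $14,522) and holiday weeks (about $14,397), so holidays do not appear to make a meaningful difference to sales; note that no formal significance test was run here.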

Regression analysis when there are discounts/markdowns

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3

# Columns used for the correlogram: the response plus temperature, fuel price
# and the five markdown columns, taken from the `markdown` subset.
cormark <- markdown[c(
  "Weekly_Sales", "Temperature", "Fuel_Price",
  "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"
)]

# Correlogram: shaded cells below the diagonal, pie glyphs above it, variable
# names on the diagonal.
corrgram(
  cormark,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)

# Linear model of weekly sales on temperature, fuel price and all five
# markdown columns, fitted on the `markdown` subset.
fit1 <- lm(
  Weekly_Sales ~ Temperature + Fuel_Price + MarkDown1 + MarkDown2 +
    MarkDown3 + MarkDown4 + MarkDown5,
  data = markdown
)
summary(fit1)
## 
## Call:
## lm(formula = Weekly_Sales ~ Temperature + Fuel_Price + MarkDown1 + 
##     MarkDown2 + MarkDown3 + MarkDown4 + MarkDown5, data = markdown)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -19731 -10204  -5321   4584 148193 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.835e+04  4.586e+03   8.363  < 2e-16 ***
## Temperature  1.041e+02  1.805e+01   5.769 8.96e-09 ***
## Fuel_Price  -8.159e+03  1.264e+03  -6.456 1.29e-10 ***
## MarkDown1   -2.352e-03  5.192e-02  -0.045   0.9639    
## MarkDown2   -4.593e-02  3.681e-02  -1.248   0.2122    
## MarkDown3   -2.682e-02  2.648e-02  -1.013   0.3114    
## MarkDown4   -1.467e-01  7.108e-02  -2.064   0.0391 *  
## MarkDown5   -4.853e-02  1.949e-02  -2.490   0.0129 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15850 on 2512 degrees of freedom
## Multiple R-squared:  0.03633,    Adjusted R-squared:  0.03365 
## F-statistic: 13.53 on 7 and 2512 DF,  p-value: < 2.2e-16
# Reduced model on the `markdown` subset: of the markdown columns only
# MarkDown4 and MarkDown5 are kept alongside temperature and fuel price.
fit2 <- lm(
  Weekly_Sales ~ Temperature + Fuel_Price + MarkDown4 + MarkDown5,
  data = markdown
)
summary(fit2)
## 
## Call:
## lm(formula = Weekly_Sales ~ Temperature + Fuel_Price + MarkDown4 + 
##     MarkDown5, data = markdown)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -19622 -10217  -5311   4605 148263 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.652e+04  4.385e+03   8.329  < 2e-16 ***
## Temperature  1.122e+02  1.711e+01   6.558 6.61e-11 ***
## Fuel_Price  -7.849e+03  1.232e+03  -6.373 2.20e-10 ***
## MarkDown4   -1.421e-01  4.093e-02  -3.471 0.000526 ***
## MarkDown5   -4.798e-02  1.946e-02  -2.465 0.013773 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15850 on 2515 degrees of freedom
## Multiple R-squared:  0.03542,    Adjusted R-squared:  0.03389 
## F-statistic: 23.09 on 4 and 2515 DF,  p-value: < 2.2e-16

The regression models explain very little of the variation in weekly sales (adjusted R² ≈ 0.03), which shows that the markdowns have only a very weak effect on the sales of the stores.

Regression over CPI and unemployment

# Correlogram over the `cpi` subset: shaded cells below the diagonal, pie
# glyphs above it, variable names on the diagonal.
corrgram(
  cpi,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)

# Full model on the `cpi` subset, adding CPI and Unemployment to the earlier
# predictors. `cpi` is filtered on CPI only, so rows with missing markdown
# values are still present and are dropped by lm() (the recorded summary
# reports 5536 observations deleted due to missingness).
fit3 <- lm(
  Weekly_Sales ~ Temperature + Fuel_Price + MarkDown1 + MarkDown2 +
    MarkDown3 + MarkDown4 + MarkDown5 + CPI + Unemployment,
  data = cpi
)
summary(fit3)
## 
## Call:
## lm(formula = Weekly_Sales ~ Temperature + Fuel_Price + MarkDown1 + 
##     MarkDown2 + MarkDown3 + MarkDown4 + MarkDown5 + CPI + Unemployment, 
##     data = cpi)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20700 -10023  -5052   4287  83355 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.806e+04  6.115e+03   6.223 5.89e-10 ***
## Temperature   1.185e+02  2.084e+01   5.685 1.50e-08 ***
## Fuel_Price   -8.362e+03  1.508e+03  -5.545 3.33e-08 ***
## MarkDown1    -5.693e-04  5.390e-02  -0.011   0.9916    
## MarkDown2    -4.854e-02  3.682e-02  -1.318   0.1876    
## MarkDown3    -2.946e-02  2.623e-02  -1.123   0.2614    
## MarkDown4    -1.296e-01  7.320e-02  -1.770   0.0769 .  
## MarkDown5    -4.369e-02  1.938e-02  -2.255   0.0243 *  
## CPI          -4.155e+00  1.014e+01  -0.410   0.6820    
## Unemployment  1.756e+02  2.223e+02   0.790   0.4296    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15640 on 2059 degrees of freedom
##   (5536 observations deleted due to missingness)
## Multiple R-squared:  0.03861,    Adjusted R-squared:  0.03441 
## F-statistic: 9.189 on 9 and 2059 DF,  p-value: 9.251e-14
# Reduced model on the `cpi` subset: MarkDown5 is the only markdown column
# kept. Because fewer columns with missing values are involved, lm() drops
# fewer rows here than for fit3 (the recorded summaries show 3459 vs. 2059
# residual degrees of freedom), so the two fits use different sets of rows.
fit4 <- lm(
  Weekly_Sales ~ Temperature + Fuel_Price + MarkDown5 + CPI + Unemployment,
  data = cpi
)
summary(fit4)
## 
## Call:
## lm(formula = Weekly_Sales ~ Temperature + Fuel_Price + MarkDown5 + 
##     CPI + Unemployment, data = cpi)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20778 -10783  -5724   5358 184977 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.338e+04  4.839e+03   6.898 6.23e-12 ***
## Temperature   1.103e+02  1.707e+01   6.460 1.20e-10 ***
## Fuel_Price   -7.978e+03  1.185e+03  -6.735 1.91e-11 ***
## MarkDown5    -6.194e-02  2.022e-02  -3.063  0.00221 ** 
## CPI           4.701e+00  8.223e+00   0.572  0.56761    
## Unemployment  4.691e+02  1.795e+02   2.614  0.00900 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16700 on 3459 degrees of freedom
##   (4140 observations deleted due to missingness)
## Multiple R-squared:  0.02764,    Adjusted R-squared:  0.02623 
## F-statistic: 19.66 on 5 and 3459 DF,  p-value: < 2.2e-16

This regression model is also a poor predictor of the sales of the stores (adjusted R² ≈ 0.03). Other factors must be considered when estimating the sales of the stores, e.g. employees and their efficiency, the number of people at the stores, and the average foot traffic at the stores.