Read the file into R
# Build the path to the data file explicitly instead of calling setwd(), so
# loading the data does not change the session's working directory.
data_dir <- file.path(
  "~", "SIP", "SIP Phase 2", "R Programming",
  "Udemy Class Material", "Week 4", "Project"
)
sales.df <- read.csv(file.path(data_dir, "Features data set.csv"))
# View() opens a data viewer, which only makes sense in an interactive
# session; skip it when the script is run non-interactively.
if (interactive()) {
  View(sales.df)
}
Subsetting the data set
# Rows in which all five markdown columns are present. Missing values are
# detected with complete.cases()/is.na(): comparing against the string "NA"
# yields NA for missing entries and only filtered them out because subset()
# treats an NA condition as FALSE.
markdown <- subset(
  sales.df,
  complete.cases(MarkDown1, MarkDown2, MarkDown3, MarkDown4, MarkDown5)
)
# Rows in which CPI is not missing.
cpi <- subset(sales.df, !is.na(CPI))
Overview of the sales
# lattice supplies histogram().
library(lattice)
## Warning: package 'lattice' was built under R version 3.4.3
# Distribution of weekly sales, shown as counts per bin.
histogram(
  ~ Weekly_Sales,
  data = sales.df,
  type = "count",
  xlab = "Weekly Sales ($)",
  col = "gray50"
)

# Overall mean of weekly sales.
with(sales.df, mean(Weekly_Sales))
## [1] 14512.6
# One horizontal box per store.
boxplot(
  Weekly_Sales ~ Store,
  data = sales.df,
  horizontal = TRUE,
  xlab = "Weekly Sales ($)",
  ylab = "Store"
)

Overview of the discounts from the weeks in which discounts were allowed
# car supplies scatterplotMatrix() and scatterplot().
library(car)
## Warning: package 'car' was built under R version 3.4.3
# Pairwise scatterplots of weekly sales and the five markdown columns, using
# only the rows of `markdown`; histograms are drawn on the diagonal.
scatterplotMatrix(
  ~ Weekly_Sales + MarkDown1 + MarkDown2 + MarkDown3 + MarkDown4 + MarkDown5,
  data = markdown,
  diagonal = "histogram"
)

Effect of Temperature on Weekly Sales of store
# Weekly sales plotted against temperature.
with(
  sales.df,
  scatterplot(
    Temperature, Weekly_Sales,
    xlab = "Temperature",
    ylab = "Weekly Sales of Stores"
  )
)

# Pearson correlation coefficient on its own.
with(sales.df, cor(Temperature, Weekly_Sales))
## [1] 0.07103998
# Same correlation with its test statistic, p-value and confidence interval.
cor.test(
  sales.df$Temperature,
  sales.df$Weekly_Sales
)
##
## Pearson's product-moment correlation
##
## data: sales.df$Temperature and sales.df$Weekly_Sales
## t = 6.4445, df = 8188, p-value = 1.225e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04945811 0.09255555
## sample estimates:
## cor
## 0.07103998
The plot and the correlation test show a weak positive correlation (r ≈ 0.07) between weekly sales and temperature.
Effect of fuel price on weekly sales of the stores
# Weekly sales plotted against fuel price.
with(
  sales.df,
  scatterplot(
    Fuel_Price, Weekly_Sales,
    xlab = "Fuel Price",
    ylab = "Weekly Sales of Stores"
  )
)

# Pearson correlation coefficient on its own.
with(sales.df, cor(Fuel_Price, Weekly_Sales))
## [1] -0.0728054
# Same correlation with its test statistic, p-value and confidence interval.
cor.test(
  sales.df$Fuel_Price,
  sales.df$Weekly_Sales
)
##
## Pearson's product-moment correlation
##
## data: sales.df$Fuel_Price and sales.df$Weekly_Sales
## t = -6.6055, df = 8188, p-value = 4.209e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09431466 -0.05122821
## sample estimates:
## cor
## -0.0728054
Effect of CPI on the sales of the stores
# Weekly sales plotted against CPI, using the full data set.
with(
  sales.df,
  scatterplot(
    CPI, Weekly_Sales,
    xlab = "CPI",
    ylab = "Weekly Sales of Stores"
  )
)

# The correlation is computed on `cpi`, the subset in which CPI is present.
with(cpi, cor(CPI, Weekly_Sales))
## [1] 0.13416
# Same correlation with its test statistic, p-value and confidence interval.
cor.test(
  cpi$CPI,
  cpi$Weekly_Sales
)
##
## Pearson's product-moment correlation
##
## data: cpi$CPI and cpi$Weekly_Sales
## t = 11.805, df = 7603, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1120222 0.1561647
## sample estimates:
## cor
## 0.13416
There is a weak positive correlation (r ≈ 0.13) between weekly sales and CPI.
Effect of unemployment and its distribution
# Weekly sales plotted against unemployment, using the full data set.
scatterplot(
  sales.df$Unemployment, sales.df$Weekly_Sales,
  xlab = "Unemployment",
  ylab = "Weekly Sales of the Store"
)

# `cpi` was filtered on CPI only, so it does not guarantee that Unemployment
# is complete. With its default settings cor() returns NA as soon as one value
# is missing, whereas cor.test() drops incomplete pairs; use = "complete.obs"
# makes cor() use the same pairs, so both always report the same estimate.
cor(cpi$Unemployment, cpi$Weekly_Sales, use = "complete.obs")
## [1] 0.04730927
cor.test(cpi$Unemployment, cpi$Weekly_Sales)
##
## Pearson's product-moment correlation
##
## data: cpi$Unemployment and cpi$Weekly_Sales
## t = 4.1298, df = 7603, p-value = 3.67e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02486012 0.06971074
## sample estimates:
## cor
## 0.04730927
Effect of Holiday on the sales
# Weekly sales split by the IsHoliday flag, drawn as horizontal boxes.
boxplot(
  Weekly_Sales ~ IsHoliday,
  data = sales.df,
  horizontal = TRUE,
  xlab = "Weekly Sales of the Stores ($)",
  ylab = "Is Holiday?"
)

# Mean weekly sales for each value of IsHoliday.
holiday <- aggregate(
  Weekly_Sales ~ IsHoliday,
  data = sales.df,
  FUN = mean
)
holiday
## IsHoliday Weekly_Sales
## 1 FALSE 14521.51
## 2 TRUE 14396.81
Mean weekly sales are almost the same in holiday weeks and in normal weeks (about $14,397 vs. $14,522). No formal significance test was run, so this is a descriptive observation only.
Regression analysis for the weeks in which discounts (markdowns) were offered
# corrgram supplies corrgram() and its panel functions.
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Keep only the columns that enter the correlogram.
cormark <- subset(
  markdown,
  select = c(
    Weekly_Sales, Temperature, Fuel_Price,
    MarkDown1, MarkDown2, MarkDown3, MarkDown4, MarkDown5
  )
)
corrgram(
  cormark,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)

# Linear model of weekly sales on temperature, fuel price and all five
# markdown columns, fitted on the rows of `markdown`.
fit1 <- lm(
  Weekly_Sales ~ Temperature + Fuel_Price + MarkDown1 + MarkDown2 +
    MarkDown3 + MarkDown4 + MarkDown5,
  data = markdown
)
summary(fit1)
##
## Call:
## lm(formula = Weekly_Sales ~ Temperature + Fuel_Price + MarkDown1 +
## MarkDown2 + MarkDown3 + MarkDown4 + MarkDown5, data = markdown)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19731 -10204 -5321 4584 148193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.835e+04 4.586e+03 8.363 < 2e-16 ***
## Temperature 1.041e+02 1.805e+01 5.769 8.96e-09 ***
## Fuel_Price -8.159e+03 1.264e+03 -6.456 1.29e-10 ***
## MarkDown1 -2.352e-03 5.192e-02 -0.045 0.9639
## MarkDown2 -4.593e-02 3.681e-02 -1.248 0.2122
## MarkDown3 -2.682e-02 2.648e-02 -1.013 0.3114
## MarkDown4 -1.467e-01 7.108e-02 -2.064 0.0391 *
## MarkDown5 -4.853e-02 1.949e-02 -2.490 0.0129 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15850 on 2512 degrees of freedom
## Multiple R-squared: 0.03633, Adjusted R-squared: 0.03365
## F-statistic: 13.53 on 7 and 2512 DF, p-value: < 2.2e-16
# Reduced model: MarkDown1, MarkDown2 and MarkDown3 are left out, keeping
# temperature, fuel price, MarkDown4 and MarkDown5.
fit2 <- lm(
  Weekly_Sales ~ Temperature + Fuel_Price + MarkDown4 + MarkDown5,
  data = markdown
)
summary(fit2)
##
## Call:
## lm(formula = Weekly_Sales ~ Temperature + Fuel_Price + MarkDown4 +
## MarkDown5, data = markdown)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19622 -10217 -5311 4605 148263
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.652e+04 4.385e+03 8.329 < 2e-16 ***
## Temperature 1.122e+02 1.711e+01 6.558 6.61e-11 ***
## Fuel_Price -7.849e+03 1.232e+03 -6.373 2.20e-10 ***
## MarkDown4 -1.421e-01 4.093e-02 -3.471 0.000526 ***
## MarkDown5 -4.798e-02 1.946e-02 -2.465 0.013773 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15850 on 2515 degrees of freedom
## Multiple R-squared: 0.03542, Adjusted R-squared: 0.03389
## F-statistic: 23.09 on 4 and 2515 DF, p-value: < 2.2e-16
The regression models show that the markdowns have very little effect on store sales: both models explain only about 3.5% of the variance in weekly sales (R² ≈ 0.035).
Regression over CPI and unemployment
# Correlogram over the `cpi` subset (rows in which CPI is present).
corrgram(
  cpi,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)

# Full model on `cpi`: temperature, fuel price, all five markdown columns,
# CPI and unemployment. Rows with a missing predictor are dropped by lm(),
# as reported in the summary below.
fit3 <- lm(
  Weekly_Sales ~ Temperature + Fuel_Price + MarkDown1 + MarkDown2 +
    MarkDown3 + MarkDown4 + MarkDown5 + CPI + Unemployment,
  data = cpi
)
summary(fit3)
##
## Call:
## lm(formula = Weekly_Sales ~ Temperature + Fuel_Price + MarkDown1 +
## MarkDown2 + MarkDown3 + MarkDown4 + MarkDown5 + CPI + Unemployment,
## data = cpi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20700 -10023 -5052 4287 83355
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.806e+04 6.115e+03 6.223 5.89e-10 ***
## Temperature 1.185e+02 2.084e+01 5.685 1.50e-08 ***
## Fuel_Price -8.362e+03 1.508e+03 -5.545 3.33e-08 ***
## MarkDown1 -5.693e-04 5.390e-02 -0.011 0.9916
## MarkDown2 -4.854e-02 3.682e-02 -1.318 0.1876
## MarkDown3 -2.946e-02 2.623e-02 -1.123 0.2614
## MarkDown4 -1.296e-01 7.320e-02 -1.770 0.0769 .
## MarkDown5 -4.369e-02 1.938e-02 -2.255 0.0243 *
## CPI -4.155e+00 1.014e+01 -0.410 0.6820
## Unemployment 1.756e+02 2.223e+02 0.790 0.4296
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15640 on 2059 degrees of freedom
## (5536 observations deleted due to missingness)
## Multiple R-squared: 0.03861, Adjusted R-squared: 0.03441
## F-statistic: 9.189 on 9 and 2059 DF, p-value: 9.251e-14
# Reduced model on `cpi`: MarkDown5 is the only markdown column kept,
# alongside temperature, fuel price, CPI and unemployment.
fit4 <- lm(
  Weekly_Sales ~ Temperature + Fuel_Price + MarkDown5 + CPI + Unemployment,
  data = cpi
)
summary(fit4)
##
## Call:
## lm(formula = Weekly_Sales ~ Temperature + Fuel_Price + MarkDown5 +
## CPI + Unemployment, data = cpi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20778 -10783 -5724 5358 184977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.338e+04 4.839e+03 6.898 6.23e-12 ***
## Temperature 1.103e+02 1.707e+01 6.460 1.20e-10 ***
## Fuel_Price -7.978e+03 1.185e+03 -6.735 1.91e-11 ***
## MarkDown5 -6.194e-02 2.022e-02 -3.063 0.00221 **
## CPI 4.701e+00 8.223e+00 0.572 0.56761
## Unemployment 4.691e+02 1.795e+02 2.614 0.00900 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16700 on 3459 degrees of freedom
## (4140 observations deleted due to missingness)
## Multiple R-squared: 0.02764, Adjusted R-squared: 0.02623
## F-statistic: 19.66 on 5 and 3459 DF, p-value: < 2.2e-16
These regression models are also poor estimators of the factors that influence store sales: neither explains more than about 4% of the variance in weekly sales. Other factors must be considered when estimating the sales of the stores, e.g. the employees and their efficiency, the population of people at the stores, and the average foot traffic at the stores.