The objective of this project is to develop a statistical model based on the available dataset.
We use historical sales data for 45 Walmart stores located in different regions to predict the weekly sales for Store #1 only.
Walmart runs several promotional markdown events throughout the year. These markdowns precede prominent holidays, the four largest of which are the Super Bowl, Labour Day, Thanksgiving, and Christmas.
The historical data covers sales from 2010-02-05 to 2012-11-01 and is provided in the file Walmart_Store_sales.csv. Within this file you will find the following fields:
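Store - the store number (1 to 45)
Date - the week of sales
Weekly_Sales - sales for the given store in that week
Holiday_Flag - 1 if the week contains a special holiday, 0 otherwise
Temperature - temperature in the region for that week
Fuel_Price - cost of fuel in the region
CPI - consumer price index
Unemployment - prevailing unemployment rate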
library(tidyverse)
library(ggplot2)   # data visualization
library(tibble)    # handle tibbles
library(dplyr)     # data manipulation
library(skimr)     # better data summaries
library(caret)     # data splitting, pre-processing, feature selection, etc.
library(lubridate) # work with dates with ease
library(lmtest)    # tests for the linear regression model
# For multicollinearity (VIF)
library(sp)
library(raster)
library(usdm)
library(car)
Reading the data from the CSV file:
Walmart <- read.csv("C:/data/Walmart_Store_sales.csv")
Look at the first 6 rows of the Walmart dataset.
head(Walmart)
## Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI
## 1 1 05-02-2010 1643691 0 42.31 2.572 211.0964
## 2 1 12-02-2010 1641957 1 38.51 2.548 211.2422
## 3 1 19-02-2010 1611968 0 39.93 2.514 211.2891
## 4 1 26-02-2010 1409728 0 46.63 2.561 211.3196
## 5 1 05-03-2010 1554807 0 46.50 2.625 211.3501
## 6 1 12-03-2010 1439542 0 57.79 2.667 211.3806
## Unemployment
## 1 8.106
## 2 8.106
## 3 8.106
## 4 8.106
## 5 8.106
## 6 8.106
Which store has the maximum weekly sales?
df <- Walmart
df$Date <- as.Date(df$Date, format = "%d-%m-%Y")
df_max_sale <- df %>%
subset(., select = 1:3) %>%
arrange(desc(Weekly_Sales)) # sort weekly sales in decreasing order
head(df_max_sale, 1) # the maximum weekly sale, with its store and date
## Store Date Weekly_Sales
## 1 14 2010-12-24 3818686
Which store has the maximum standard deviation?
df_sd_max <- df %>%
subset(., select = c(1, 3)) %>% # keep only Store and Weekly_Sales
group_by(Store) %>% # group by store, as we need the standard deviation per store
mutate(Std_dev = sd(Weekly_Sales)) %>% # standard deviation of sales, store-wise
filter(Std_dev == max(Std_dev)) %>% # within each store the sd is constant, so all rows are kept
arrange(desc(Std_dev)) # arrange the standard deviations in decreasing order
df_sd_max[1, ] # the store with the maximum standard deviation
## # A tibble: 1 x 3
## # Groups: Store [1]
## Store Weekly_Sales Std_dev
## <int> <dbl> <dbl>
## 1 14 2623470. 317570.
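The same answer can be computed more directly with summarise() and slice_max() (a sketch; slice_max() needs dplyr 1.0 or later):
# One row per store, then keep the store with the largest standard deviation
df %>%
group_by(Store) %>%
summarise(Std_dev = sd(Weekly_Sales)) %>%
slice_max(Std_dev, n = 1)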
Which store(s) had a good quarterly growth rate in Q3 2012?
#Step 1: create a new Quarter_Year column
df$Quarter_Year <- quarter(df$Date, with_year = TRUE)
#------------------------
#Step 2: sum of Weekly_Sales per store for Q2 2012
q2_2012 <- df %>%
group_by(Store) %>%
filter(Quarter_Year == 2012.2) %>%
summarize(Weekly_Sales_q2 = sum(Weekly_Sales))
#-----------------------
#Step 3: sum of Weekly_Sales per store for Q3 2012
q3_2012 <- df %>%
group_by(Store) %>%
filter(Quarter_Year == 2012.3) %>%
summarize(Weekly_Sales_q3 = sum(Weekly_Sales))
#---------------------------
#Step 4: merge the Quarter 2 and Quarter 3 data frames (by Store)
q2_q3_2012 <- merge(q2_2012, q3_2012)
#---------------------------
#Step 5: compute the quarter-over-quarter growth rate as a new column
growth_rate_final <- q2_q3_2012%>%
mutate(growth_rate = (Weekly_Sales_q3-Weekly_Sales_q2) / Weekly_Sales_q2 * 100) %>%
arrange(desc(growth_rate))
#----------------------------
#Step 6: top five stores by growth rate
head(growth_rate_final,5)
## Store Weekly_Sales_q2 Weekly_Sales_q3 growth_rate
## 1 7 7290859 8262787 13.330776
## 2 16 6564336 7121542 8.488378
## 3 35 10838313 11322421 4.466637
## 4 26 13155336 13675692 3.955478
## 5 39 20214128 20715116 2.478404
Some holidays have a negative impact on sales. Find the holidays whose sales are higher than the mean sales in the non-holiday season, for all stores together.
# Step 1: find the mean of Weekly_Sales for non-holiday weeks only
mean_nonHoliday<- df %>%
group_by(Holiday_Flag) %>%
summarize(mean_sales = mean(Weekly_Sales)) %>%
filter(Holiday_Flag == 0)
mean_nonHoliday
## # A tibble: 1 x 2
## Holiday_Flag mean_sales
## <int> <dbl>
## 1 0 1041256.
#-------------------
# Step 2: extract the non-holiday mean as a single value so we can compare holiday sales against it
mean_value <- mean_nonHoliday[["mean_sales"]]
mean_value
## [1] 1041256
#-------------------
#Step 3: keep only holiday weeks whose weekly sales exceed the non-holiday mean
holiday_sales_compared <- df %>%
group_by(Store) %>%
filter(Holiday_Flag == 1 & Weekly_Sales > mean_value) %>%
subset(., select= c(3, 1, 2))
#We get 220 holiday weeks (across 31 stores) with sales above the non-holiday mean
holiday_sales_compared
## # A tibble: 220 x 3
## # Groups: Store [31]
## Weekly_Sales Store Date
## <dbl> <int> <date>
## 1 1641957. 1 2010-02-12
## 2 1507461. 1 2010-09-10
## 3 1955624. 1 2010-11-26
## 4 1367320. 1 2010-12-31
## 5 1649615. 1 2011-02-11
## 6 1540471. 1 2011-09-09
## 7 2033321. 1 2011-11-25
## 8 1497463. 1 2011-12-30
## 9 1802477. 1 2012-02-10
## 10 1661767. 1 2012-09-07
## # ... with 210 more rows
Provide a monthly and semester view of sales in units and give insights.
# Plotting weekly sales summed by month
ggplot(df, aes(Date, Weekly_Sales, fill = month(Date))) +
geom_bar(stat = "summary", fun = "sum") +
theme_classic() +
scale_x_date(date_labels = "%m/%y", date_breaks = "1 month") +
theme(axis.text.x = element_text(
colour = 'black', angle = 50, size = 10,
hjust = 0.5, vjust = 0.5, face = "bold"),
axis.title.x = element_blank(), axis.text.y = element_text(
colour = 'black', angle = 50, size = 10,
hjust = 0.5, vjust = 0.5, face = "bold")) +
theme(axis.title.x = element_text(colour = "blue", face = "bold"),
axis.title.y = element_text(colour = "blue", face = "bold")) +
ggtitle("Monthly Sales") +
ggeasy::easy_center_title()
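For the semester view, a minimal sketch using lubridate's semester() helper (assuming the df prepared above):
# Aggregate weekly sales to half-year (semester) totals and plot them
df %>%
mutate(Semester = semester(Date, with_year = TRUE)) %>%
group_by(Semester) %>%
summarise(Total_Sales = sum(Weekly_Sales)) %>%
ggplot(aes(factor(Semester), Total_Sales)) +
geom_col(fill = "#CD6600") +
labs(x = "Semester", y = "Total sales", title = "Semester Sales")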
Gathering information about the Walmart dataset
skim(Walmart)
| Name | Walmart |
|---|---|
| Number of rows | 6435 |
| Number of columns | 8 |
| Column type frequency: | |
| character | 1 |
| numeric | 7 |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Date | 0 | 1 | 10 | 10 | 0 | 143 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Store | 0 | 1 | 23.00 | 12.99 | 1.00 | 12.00 | 23.00 | 34.00 | 45.00 | ▇▇▇▇▇ |
| Weekly_Sales | 0 | 1 | 1046964.88 | 564366.62 | 209986.25 | 553350.10 | 960746.04 | 1420158.66 | 3818686.45 | ▇▆▂▁▁ |
| Holiday_Flag | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| Temperature | 0 | 1 | 60.66 | 18.44 | -2.06 | 47.46 | 62.67 | 74.94 | 100.14 | ▁▃▆▇▃ |
| Fuel_Price | 0 | 1 | 3.36 | 0.46 | 2.47 | 2.93 | 3.44 | 3.73 | 4.47 | ▆▆▇▇▁ |
| CPI | 0 | 1 | 171.58 | 39.36 | 126.06 | 131.74 | 182.62 | 212.74 | 227.23 | ▇▁▁▂▆ |
| Unemployment | 0 | 1 | 8.00 | 1.88 | 3.88 | 6.89 | 7.87 | 8.62 | 14.31 | ▂▇▆▁▁ |
par(mfrow=c(3,2)) # plot many plots at once.
hist(Walmart$Store, col = '#B22222', main = "Stores")
hist(Walmart$Temperature, col = '#CD0000', main = "Temperature")
hist(Walmart$Fuel_Price, col = '#CD0000', main = "Fuel Price")
hist(Walmart$CPI, col = '#B22222', main = "CPI")
hist(Walmart$Unemployment, col = '#B22222', main = "Unemployment")
hist(Walmart$Weekly_Sales, col = '#CD0000', main = "Weekly Sales")
par(mfrow=c(2,1))
hist(Walmart$Weekly_Sales, col = '#CD0000', main = "Weekly Sales")
hist(log(Walmart$Weekly_Sales), col = '#CD0000', main = "Log Weekly Sales")
Weekly_Sales is right-skewed and takes much larger values than the other variables, so a log transformation is a good candidate when building the model.
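A quick numeric check of the skewness (a sketch using the moment-based formula, so no extra packages are assumed):
# Moment-based skewness: mean cubed deviation over the cubed standard deviation (approximate)
skewness <- function(x) mean((x - mean(x))^3) / sd(x)^3
skewness(Walmart$Weekly_Sales)      # strongly positive: right-skewed
skewness(log(Walmart$Weekly_Sales)) # much closer to 0 after the log transform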
# Derive calendar features from Date
Walmart$Date <- as.Date(Walmart$Date, format = "%d-%m-%Y")
Walmart$Week <- format(Walmart$Date, "%V") # ISO week number
Walmart$Month <- month(Walmart$Date)
Walmart$Year <- year(Walmart$Date)
glimpse(Walmart)
## Rows: 6,435
## Columns: 11
## $ Store <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ Date <date> 2010-02-05, 2010-02-12, 2010-02-19, 2010-02-26, 2010-03-~
## $ Weekly_Sales <dbl> 1643691, 1641957, 1611968, 1409728, 1554807, 1439542, 147~
## $ Holiday_Flag <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ Temperature <dbl> 42.31, 38.51, 39.93, 46.63, 46.50, 57.79, 54.58, 51.45, 6~
## $ Fuel_Price <dbl> 2.572, 2.548, 2.514, 2.561, 2.625, 2.667, 2.720, 2.732, 2~
## $ CPI <dbl> 211.0964, 211.2422, 211.2891, 211.3196, 211.3501, 211.380~
## $ Unemployment <dbl> 8.106, 8.106, 8.106, 8.106, 8.106, 8.106, 8.106, 8.106, 7~
## $ Week <chr> "05", "06", "07", "08", "09", "10", "11", "12", "13", "14~
## $ Month <dbl> 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, ~
## $ Year <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 201~
cor(Walmart[,-c(2,4,9)]) # correlation matrix, excluding Date, Holiday_Flag, and Week
## Store Weekly_Sales Temperature Fuel_Price CPI
## Store 1.00000000 -0.335332015 -0.02265908 0.060022955 -0.209491930
## Weekly_Sales -0.33533201 1.000000000 -0.06381001 0.009463786 -0.072634162
## Temperature -0.02265908 -0.063810013 1.00000000 0.144981806 0.176887676
## Fuel_Price 0.06002295 0.009463786 0.14498181 1.000000000 -0.170641795
## CPI -0.20949193 -0.072634162 0.17688768 -0.170641795 1.000000000
## Unemployment 0.22353127 -0.106176090 0.10115786 -0.034683745 -0.302020064
## Month 0.00000000 0.076143320 0.23586176 -0.042155900 0.004979672
## Year 0.00000000 -0.018377543 0.06426923 0.779470302 0.074795731
## Unemployment Month Year
## Store 0.22353127 0.000000000 0.00000000
## Weekly_Sales -0.10617609 0.076143320 -0.01837754
## Temperature 0.10115786 0.235861759 0.06426923
## Fuel_Price -0.03468374 -0.042155900 0.77947030
## CPI -0.30202006 0.004979672 0.07479573
## Unemployment 1.00000000 -0.012745591 -0.24181349
## Month -0.01274559 1.000000000 -0.19446452
## Year -0.24181349 -0.194464521 1.00000000
heatmap(cor(Walmart[,-c(2,4,9)]), scale = "none") # heatmap of the correlation matrix
ggplot(Walmart, aes(x = Store,y = Weekly_Sales, fill = Month )) +
geom_col()+
facet_wrap(~Year)+
theme(axis.text.x = element_text( size = 7) )
ggplot(Walmart, aes(x = Week,y = Weekly_Sales, fill = Holiday_Flag )) +
geom_col()+
theme(axis.text.x = element_text( size = 7))+
facet_wrap(~Year)
Here we see that holiday weeks around November (Thanksgiving) bring noticeably higher sales, while the other holiday weeks look much the same as ordinary weeks.
ggplot(Walmart, aes(x = Week,y = Weekly_Sales, fill =Month)) +
geom_col()+
theme(axis.text.x = element_text(colour = 'black', face = "bold", size = 7), axis.text.y = element_text(colour = 'black',face = "bold", size = 7))+
facet_wrap(~Year)
# Total weekly sales per store, per month
Walmart<-Walmart %>%
group_by(Store ,Month) %>%
mutate(sum_month_store_wise = sum(Weekly_Sales))
# Monthly mean of the store-month totals
Walmart<-Walmart %>%
group_by(Month) %>%
mutate(mean_Monthly = mean(sum_month_store_wise))
# Upper threshold: overall mean of the monthly means, plus 5%
Walmart$Upper_limit <- mean(Walmart$mean_Monthly) * 1.05
# Lower threshold: overall mean of the monthly means, minus 5%
Walmart$Lower_limit <- mean(Walmart$mean_Monthly) * 0.95
Walmart$Store_Type <- "Medium" # 16th column
Walmart[which(Walmart$sum_month_store_wise > Walmart$Upper_limit), 16] = "High"
Walmart[which(Walmart$sum_month_store_wise < Walmart$Lower_limit), 16] = "Low"
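The same labelling can be written in one dplyr step (a sketch using case_when(), equivalent to the three assignments above):
# Label stores High/Low/Medium relative to the threshold band
Walmart <- Walmart %>%
mutate(Store_Type = case_when(
sum_month_store_wise > Upper_limit ~ "High",
sum_month_store_wise < Lower_limit ~ "Low",
TRUE ~ "Medium"
))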
#counts of Medium, High, Low stores
Walmart %>%
group_by(Store_Type) %>%
count(Store_Type)
## # A tibble: 3 x 2
## # Groups: Store_Type [3]
## Store_Type n
## <chr> <int>
## 1 High 2622
## 2 Low 3351
## 3 Medium 462
# Keep only Store 1, since we predict for Store 1 only
Walmart <- Walmart %>%
filter(Store == 1)
# Drop the intermediate aggregation and threshold columns (12 to 15), keeping Store_Type
Walmart <- Walmart[,-12:-15]
# Mean weekly sales per month
Walmart <- Walmart %>%
group_by(Month) %>%
mutate(WeeklySales = mean(Weekly_Sales))
# Yearly mean of the monthly sales means
Walmart<-Walmart %>%
group_by(Year) %>%
mutate(mean_yearly = mean(WeeklySales))
# upper threshold limit
Walmart<- Walmart %>%
group_by(Year) %>%
mutate(Upper_limit = mean_yearly *1.05)
# Lower threshold limit
Walmart<- Walmart %>%
group_by(Year) %>%
mutate(Lower_limit = mean_yearly *0.95)
# Making Week_Type
Walmart$Week_type <- "Medium" # 17th column
Walmart[which(Walmart$Weekly_Sales > Walmart$Upper_limit), 17] = "High"
Walmart[which(Walmart$Weekly_Sales < Walmart$Lower_limit), 17] = "Low"
# Drop unwanted Variables
Walmart<- Walmart[,-c(13:16)]
#counts of Medium, High, Low Weeks
Walmart %>%
group_by(Week_type) %>%
count(Week_type)
## # A tibble: 3 x 2
## # Groups: Week_type [3]
## Week_type n
## <chr> <int>
## 1 High 29
## 2 Low 45
## 3 Medium 69
#counts of Medium, High, Low Stores
Walmart %>%
group_by(Store_Type) %>%
count(Store_Type)
## # A tibble: 3 x 2
## # Groups: Store_Type [3]
## Store_Type n
## <chr> <int>
## 1 High 127
## 2 Low 8
## 3 Medium 8
# Creating indicator columns for the four markdown-event holidays
# Creating Super_Bowl variable
Walmart$Super_Bowl <- ifelse((Walmart$Date == "2010-02-12" |
Walmart$Date == "2011-02-11"|
Walmart$Date == "2012-02-10"), 1, 0)
# Creating Labour_Day variable
Walmart$Labour_day <- ifelse((Walmart$Date == "2010-09-10" |
Walmart$Date == "2011-09-09"|
Walmart$Date == "2012-09-07"), 1, 0)
# Creating Thanksgiving variable
Walmart$Thanksgiving <- ifelse((Walmart$Date == "2010-11-26" |
Walmart$Date == "2011-11-25"|
Walmart$Date == "2012-11-23"), 1, 0)
# Creating Christmas variable
Walmart$Christmas<- ifelse((Walmart$Date == "2010-12-31" |
Walmart$Date == "2011-12-30"|
Walmart$Date == "2012-12-28"), 1, 0)
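A more compact construction of the same four flags (a sketch using %in% over date vectors; dates as in the blocks above):
# Build each 0/1 flag by membership in a vector of holiday-week dates
events <- list(
Super_Bowl = as.Date(c("2010-02-12", "2011-02-11", "2012-02-10")),
Labour_day = as.Date(c("2010-09-10", "2011-09-09", "2012-09-07")),
Thanksgiving = as.Date(c("2010-11-26", "2011-11-25", "2012-11-23")),
Christmas = as.Date(c("2010-12-31", "2011-12-30", "2012-12-28"))
)
for (e in names(events)) Walmart[[e]] <- as.integer(Walmart$Date %in% events[[e]])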
#remove unnecessary variables
Walmart <- subset(Walmart, select = -c(Store, Date, Week, Month,
Year))
par(mfrow=c(2,3))
boxplot(Walmart$Weekly_Sales,col="#CD6600",main = "Weekly Sales")
boxplot(Walmart$Unemployment,col="#CD6600",main = "Unemployment")
boxplot(Walmart$Temperature,col="#CD6600",main = "Temperature")
boxplot(Walmart$Fuel_Price,col = "#CD6600",main = "Fuel Price")
boxplot(Walmart$CPI,col = "#CD6600",main = "CPI")
#Recall Weekly sales is right skewed (from EDA)
hist(Walmart$Weekly_Sales,col="#8B2323",main = "Weekly Sales")
# Outlier treatment: cap values above the 75th percentile
uv1 <- quantile(Walmart$Weekly_Sales, 0.75)
Walmart$Weekly_Sales[Walmart$Weekly_Sales > uv1] <- uv1
uv <- quantile(Walmart$Unemployment, 0.75)
Walmart$Unemployment[Walmart$Unemployment > uv] <- uv
#Log transformation for skewness
Walmart$Weekly_Sales <- log(Walmart$Weekly_Sales)
Walmart$CPI <- log(Walmart$CPI)
Walmart$Temperature <- log(Walmart$Temperature)
# Encode the categorical variables as ordered factors (Low < Medium < High);
# ordered factors produce the polynomial (.L/.Q) contrasts seen in the model summaries
Walmart$Week_type <- factor(Walmart$Week_type, levels = c("Low", "Medium", "High"), ordered = TRUE)
Walmart$Store_Type <- factor(Walmart$Store_Type, levels = c("Low", "Medium", "High"), ordered = TRUE)
Holiday_Flag splits the dataset very unevenly (roughly 93:7 non-holiday to holiday), so we preserve that ratio when splitting into training and test sets.
# Select 75% for training (keeping the ~93:7 non-holiday:holiday ratio)
set.seed(7)
t <- createDataPartition(Walmart$Holiday_Flag, p = 0.75)
T1 <- t[[1]]
train_set <- Walmart[T1, ]
test_set <- Walmart[-T1, ]
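A quick sanity check that the holiday ratio survived the split (a sketch):
# The proportion of holiday weeks should be similar in both sets
prop.table(table(train_set$Holiday_Flag))
prop.table(table(test_set$Holiday_Flag))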
dev.off()
## null device
## 1
# 1. All variables
fit1 <- lm(Weekly_Sales~., data=train_set)
summary(fit1)
#MAPE
mean(abs(fit1$residuals/train_set$Weekly_Sales))
# Auto correlation H0: There is no auto correlation in residuals
dwtest(fit1) # fail to reject H0 : No Auto correlation
#homoskedasticity
plot(fit1)
# 2. All except Super_Bowl
fit2 <- lm(Weekly_Sales~.-Super_Bowl, data=train_set)
summary(fit2)
#MAPE
mean(abs(fit2$residuals/train_set$Weekly_Sales))
# Auto correlation H0: There is no auto correlation in residuals
dwtest(fit2)# fail to reject H0 : No Auto correlation
# homoskedasticity
plot(fit2)
# 3. All except Super_Bowl and Holiday_Flag
fit3 <- lm(Weekly_Sales~.-Super_Bowl-Holiday_Flag, data=train_set)
summary(fit3)
# MAPE
mean(abs(fit3$residuals/train_set$Weekly_Sales))
# Auto correlation H0: There is no auto correlation in residuals
dwtest(fit3)# fail to reject H0 : No Auto correlation
# homoskedasticity
plot(fit3)
# 4. All except Super_Bowl, Holiday_Flag and Labour_day
fit4 <- lm(Weekly_Sales~.-Super_Bowl-Holiday_Flag-Labour_day, data=train_set)
summary(fit4)
#MAPE
mean(abs(fit4$residuals/train_set$Weekly_Sales))
# Auto correlation H0: There is no auto correlation in residuals
dwtest(fit4) # fail to reject H0 : No Auto correlation
#homoskedasticity
plot(fit4)
# 5. All except Super_Bowl, Holiday_Flag, Labour_day & Thanksgiving
fit5 <- lm(Weekly_Sales~.-Super_Bowl-Holiday_Flag-Labour_day-Thanksgiving, data=train_set)
summary(fit5)
#MAPE
mean(abs(fit5$residuals/train_set$Weekly_Sales))
# Auto correlation H0: There is no auto correlation in residuals
dwtest(fit5)# fail to reject H0 : No Auto correlation
# homoskedasticity
plot(fit5)
# 6. All except Super_Bowl, Holiday_Flag, Labour_day, Thanksgiving & Unemployment
fit6 <- lm(Weekly_Sales~.-Super_Bowl-Holiday_Flag-Labour_day-Thanksgiving-Unemployment, data=train_set)
summary(fit6)
##
## Call:
## lm(formula = Weekly_Sales ~ . - Super_Bowl - Holiday_Flag - Labour_day -
## Thanksgiving - Unemployment, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.055144 -0.015633 0.000232 0.013900 0.052893
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.166967 1.051576 14.423 < 2e-16 ***
## Temperature -0.029338 0.013437 -2.183 0.031378 *
## Fuel_Price 0.009586 0.008811 1.088 0.279253
## CPI -0.157242 0.200557 -0.784 0.434896
## Store_Type.L 0.032699 0.008236 3.970 0.000136 ***
## Store_Type.Q -0.031253 0.009172 -3.407 0.000949 ***
## Week_type.L 0.086286 0.005897 14.632 < 2e-16 ***
## Week_type.Q -0.020303 0.003870 -5.246 8.86e-07 ***
## Christmas -0.048830 0.018341 -2.662 0.009057 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02418 on 99 degrees of freedom
## Multiple R-squared: 0.8373, Adjusted R-squared: 0.8241
## F-statistic: 63.68 on 8 and 99 DF, p-value: < 2.2e-16
# MAPE (note: computed on log-transformed sales, hence the very small value)
mean(abs(fit6$residuals/train_set$Weekly_Sales))
## [1] 0.00129871
# Auto correlation H0: There is no auto correlation in residuals
dwtest(fit6) # fail to reject H0 : No Auto correlation
##
## Durbin-Watson test
##
## data: fit6
## DW = 1.9546, p-value = 0.2482
## alternative hypothesis: true autocorrelation is greater than 0
# homoskedasticity
plot(fit6)
# 7. All except Super_Bowl, Holiday_Flag, Labour_day, Thanksgiving, Unemployment, CPI
fit7 <- lm(Weekly_Sales~.-Super_Bowl-Holiday_Flag-Labour_day-Thanksgiving-Unemployment-CPI, data=train_set)
summary(fit7)
#MAPE
mean(abs(fit7$residuals/train_set$Weekly_Sales))
# Auto correlation H0: There is no auto correlation in residuals
dwtest(fit7) # fail to reject H0 : No Auto correlation
# homoskedasticity
plot(fit7)
# 8. All except Super_Bowl, Holiday_Flag, Labour_day, Thanksgiving, Unemployment, CPI, Fuel_Price
fit8 <- lm(Weekly_Sales~.-Super_Bowl-Holiday_Flag-Labour_day-Thanksgiving-Unemployment-CPI-Fuel_Price, data=train_set)
summary(fit8)
# MAPE
mean(abs(fit8$residuals/train_set$Weekly_Sales))
# Auto correlation
# H0: There is no auto correlation in residuals
dwtest(fit8)# fail to reject H0 : No Auto correlation
#homoskedasticity
plot(fit8)
1. The difference between R-squared and adjusted R-squared is small. 2. The variables in fit6 are the most significant, arrived at after checking various criteria such as R-squared, adjusted R-squared, standard error, homoskedasticity, normality, autocorrelation, AIC, MAPE, and multicollinearity.
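To make that comparison concrete, a small summary table can be built (a sketch; assumes fit1 through fit8 are still in memory):
# Adjusted R-squared, AIC, and train MAPE for each candidate model
fits <- list(fit1 = fit1, fit2 = fit2, fit3 = fit3, fit4 = fit4,
fit5 = fit5, fit6 = fit6, fit7 = fit7, fit8 = fit8)
data.frame(
adj_r2 = sapply(fits, function(f) summary(f)$adj.r.squared),
AIC = sapply(fits, AIC),
MAPE = sapply(fits, function(f) mean(abs(f$residuals / train_set$Weekly_Sales)))
)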
VIF is used to check for multicollinearity among the independent variables.
vif(fit1)
# With a threshold of 5, every GVIF^(1/(2*Df)) value is below 5, so no independent variable shows problematic multicollinearity. We can keep all the independent variables.
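For a single-degree-of-freedom predictor, the VIF can also be computed by hand as 1 / (1 - R2_j), where R2_j comes from regressing predictor j on all the other predictors (a sketch, shown for Temperature):
# Regress Temperature on the remaining predictors, then invert 1 - R^2
r2_temp <- summary(lm(Temperature ~ . - Weekly_Sales, data = train_set))$r.squared
1 / (1 - r2_temp) # VIF for Temperature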
fit_VIF <- lm(Weekly_Sales~., data=train_set)
summary(fit_VIF)
#MAPE
mean(abs(fit_VIF$residuals/train_set$Weekly_Sales))
# Auto correlation
# H0: There is no auto correlation in residuals
dwtest(fit_VIF) # fail to reject H0 : No Auto correlation
# homoskedasticity
plot(fit_VIF)
summary(fit_VIF)
##
## Call:
## lm(formula = Weekly_Sales ~ ., data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.053595 -0.013427 -0.000024 0.014355 0.052481
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.866388 1.793816 9.403 3.43e-15 ***
## Holiday_Flag 0.027979 0.026145 1.070 0.287296
## Temperature -0.025794 0.015081 -1.710 0.090496 .
## Fuel_Price 0.013807 0.009480 1.456 0.148609
## CPI -0.456768 0.321150 -1.422 0.158252
## Unemployment -0.015185 0.014051 -1.081 0.282580
## Store_Type.L 0.031379 0.008666 3.621 0.000475 ***
## Store_Type.Q -0.035992 0.010180 -3.536 0.000634 ***
## Week_type.L 0.088658 0.006337 13.992 < 2e-16 ***
## Week_type.Q -0.019610 0.004007 -4.894 4.08e-06 ***
## Super_Bowl -0.033204 0.035569 -0.934 0.352951
## Labour_day -0.036612 0.030253 -1.210 0.229239
## Thanksgiving -0.048744 0.039294 -1.241 0.217877
## Christmas -0.070533 0.030559 -2.308 0.023187 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02441 on 94 degrees of freedom
## Multiple R-squared: 0.8426, Adjusted R-squared: 0.8208
## F-statistic: 38.71 on 13 and 94 DF, p-value: < 2.2e-16
This approach also doesn't serve the purpose, as the fitted model keeps several independent variables with very high p-values; removing them yields a higher adjusted R-squared. Hence we don't select this model.
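One quick way to see which terms contribute little (a sketch; assumes fit_VIF from above is in memory):
# F-test for dropping each term from the full model
drop1(fit_VIF, test = "F")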
fit_step <- step(fit1) # stepwise selection by AIC
summary(fit_step)
# MAPE
mean(abs(fit_step$residuals/train_set$Weekly_Sales))
dwtest(fit_step)# fail to reject H0 : No Auto correlation
# homoskedasticity
plot(fit_step)
summary(fit_step)
##
## Call:
## lm(formula = Weekly_Sales ~ Temperature + Store_Type + Week_type +
## Christmas, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.057102 -0.014215 0.001512 0.014857 0.050606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.344596 0.050305 285.152 < 2e-16 ***
## Temperature -0.027525 0.012461 -2.209 0.02944 *
## Store_Type.L 0.032774 0.007969 4.113 7.98e-05 ***
## Store_Type.Q -0.030239 0.008957 -3.376 0.00104 **
## Week_type.L 0.085994 0.004908 17.522 < 2e-16 ***
## Week_type.Q -0.020551 0.003847 -5.342 5.70e-07 ***
## Christmas -0.049743 0.018098 -2.749 0.00709 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02408 on 101 degrees of freedom
## Multiple R-squared: 0.8353, Adjusted R-squared: 0.8256
## F-statistic: 85.4 on 6 and 101 DF, p-value: < 2.2e-16
This model is significant, but it removes more variables from the dataset; its R-squared is lower than fit6's, and its adjusted R-squared is only marginally higher. So we finally select model fit6.
# Evaluating the model (performance metrics)
summary(fit6)
##
## Call:
## lm(formula = Weekly_Sales ~ . - Super_Bowl - Holiday_Flag - Labour_day -
## Thanksgiving - Unemployment, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.055144 -0.015633 0.000232 0.013900 0.052893
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.166967 1.051576 14.423 < 2e-16 ***
## Temperature -0.029338 0.013437 -2.183 0.031378 *
## Fuel_Price 0.009586 0.008811 1.088 0.279253
## CPI -0.157242 0.200557 -0.784 0.434896
## Store_Type.L 0.032699 0.008236 3.970 0.000136 ***
## Store_Type.Q -0.031253 0.009172 -3.407 0.000949 ***
## Week_type.L 0.086286 0.005897 14.632 < 2e-16 ***
## Week_type.Q -0.020303 0.003870 -5.246 8.86e-07 ***
## Christmas -0.048830 0.018341 -2.662 0.009057 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02418 on 99 degrees of freedom
## Multiple R-squared: 0.8373, Adjusted R-squared: 0.8241
## F-statistic: 63.68 on 8 and 99 DF, p-value: < 2.2e-16
#MAPE
mean(abs(fit6$residuals/train_set$Weekly_Sales))
## [1] 0.00129871
# Exclude Weekly_Sales when predicting on the test set
test_set_final <- test_set[,-1]
# Predictions on the training set
train_predict <- predict(fit6, train_set)
# Predictions on the test set
test_predict <- predict(fit6, test_set_final)
# Creating a new column with the training-set predictions
train_prediction <- train_set %>%
mutate(pred.train = train_predict)
# Creating a new column with the test-set predictions
test_prediction <- test_set %>%
mutate(pred.test = test_predict)
# Scatter plot of training predictions against actual weekly sales
plot(train_prediction$Weekly_Sales, train_prediction$pred.train, col = c("red", "blue"), main = "Actual Vs Prediction - Train")
# Scatter plot of test predictions against actual weekly sales
plot(test_prediction$Weekly_Sales, test_prediction$pred.test, col = c("red", "blue"), main = "Actual Vs Prediction - Test")
# Root-mean-square error on the training set
Metrics::rmse(train_prediction$Weekly_Sales, train_prediction$pred.train)
## [1] 0.02315078
# Root-mean-square error on the test set
Metrics::rmse(test_prediction$Weekly_Sales, test_prediction$pred.test)
## [1] 0.02679141
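Since the model was fit on log(Weekly_Sales), predictions can be mapped back to the original sales scale (a sketch; note the sales were also capped at the 75th percentile before the log transform, so the back-transformed values inherit that cap):
# Back-transform log-scale predictions to (capped) weekly sales
head(exp(test_prediction$pred.test))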