Objective: To analyse and predict sales at Big Mart using variables such as product visibility, fat content, store location and type of outlet. We use several statistical models to support this analysis and to suggest how sales at Big Mart can be improved. See the project report for further analysis.
# Load the Big Mart sales data and preview the first rows.
# The file is addressed through an explicit directory instead of setwd(), so
# the session's working directory is left untouched.
data_dir <- "C:/Users/Abhi/Desktop/Data Analytics/Project"
# stringsAsFactors = TRUE keeps text columns as factors (as in the recorded
# str() output) regardless of the R version's default.
BigMart <- read.csv(
  file.path(data_dir, "Big Mart Sales Project.csv"),
  stringsAsFactors = TRUE
)
head(BigMart)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## 1 FDA15 9.300 Low Fat 0.01604730
## 2 DRC01 5.920 Regular 0.01927822
## 3 FDN15 17.500 Low Fat 0.01676007
## 4 FDX07 19.200 Regular 0.00000000
## 5 NCD19 8.930 Low Fat 0.00000000
## 6 FDP36 10.395 Regular 0.00000000
## Item_Type Item_MRP Outlet_Identifier
## 1 Dairy 249.8092 OUT049
## 2 Soft Drinks 48.2692 OUT018
## 3 Meat 141.6180 OUT049
## 4 Fruits and Vegetables 182.0950 OUT010
## 5 Household 53.8614 OUT013
## 6 Baking Goods 51.4008 OUT018
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## 1 1999 Medium Tier 1
## 2 2009 Medium Tier 3
## 3 1999 Medium Tier 1
## 4 1998 Tier 3
## 5 1987 High Tier 3
## 6 2009 Medium Tier 3
## Outlet_Type Item_Outlet_Sales
## 1 Supermarket Type1 3735.1380
## 2 Supermarket Type2 443.4228
## 3 Supermarket Type1 2097.2700
## 4 Grocery Store 732.3800
## 5 Supermarket Type1 994.7052
## 6 Supermarket Type2 556.6088
# Show the structure (column names and types) of the loaded data frame.
str(BigMart)
## 'data.frame': 8523 obs. of 12 variables:
## $ Item_Identifier : Factor w/ 1559 levels "DRA12","DRA24",..: 157 9 663 1122 1298 759 697 739 441 991 ...
## $ Item_Weight : num 9.3 5.92 17.5 19.2 8.93 ...
## $ Item_Fat_Content : Factor w/ 5 levels "LF","low fat",..: 3 5 3 5 3 5 5 3 5 5 ...
## $ Item_Visibility : num 0.016 0.0193 0.0168 0 0 ...
## $ Item_Type : Factor w/ 16 levels "Baking Goods",..: 5 15 11 7 10 1 14 14 6 6 ...
## $ Item_MRP : num 249.8 48.3 141.6 182.1 53.9 ...
## $ Outlet_Identifier : Factor w/ 10 levels "OUT010","OUT013",..: 10 4 10 1 2 4 2 6 8 3 ...
## $ Outlet_Establishment_Year: int 1999 2009 1999 1998 1987 2009 1987 1985 2002 2007 ...
## $ Outlet_Size : Factor w/ 4 levels "","High","Medium",..: 3 3 3 1 2 3 2 3 1 1 ...
## $ Outlet_Location_Type : Factor w/ 3 levels "Tier 1","Tier 2",..: 1 3 1 3 3 3 3 3 2 2 ...
## $ Outlet_Type : Factor w/ 4 levels "Grocery Store",..: 2 3 2 1 2 3 2 4 2 2 ...
## $ Item_Outlet_Sales : num 3735 443 2097 732 995 ...
# Per-column summary statistics.
# NOTE(review): the recorded output shows 1463 NA's in Item_Weight, 2410 blank
# Outlet_Size entries, and Item_Fat_Content spelled five ways (LF, low fat,
# Low Fat, reg, Regular) -- consider cleaning these before modelling.
summary(BigMart)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## FDG33 : 10 Min. : 4.555 LF : 316 Min. :0.00000
## FDW13 : 10 1st Qu.: 8.774 low fat: 112 1st Qu.:0.02699
## DRE49 : 9 Median :12.600 Low Fat:5089 Median :0.05393
## DRN47 : 9 Mean :12.858 reg : 117 Mean :0.06613
## FDD38 : 9 3rd Qu.:16.850 Regular:2889 3rd Qu.:0.09459
## FDF52 : 9 Max. :21.350 Max. :0.32839
## (Other):8467 NA's :1463
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:1232 Min. : 31.29 OUT027 : 935
## Snack Foods :1200 1st Qu.: 93.83 OUT013 : 932
## Household : 910 Median :143.01 OUT035 : 930
## Frozen Foods : 856 Mean :140.99 OUT046 : 930
## Dairy : 682 3rd Qu.:185.64 OUT049 : 930
## Canned : 649 Max. :266.89 OUT045 : 929
## (Other) :2994 (Other):2937
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :2410 Tier 1:2388
## 1st Qu.:1987 High : 932 Tier 2:2785
## Median :1999 Medium:2793 Tier 3:3350
## Mean :1998 Small :2388
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1083 Min. : 33.29
## Supermarket Type1:5577 1st Qu.: 834.25
## Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
##
# Ten rows with the highest sales: sort descending on Item_Outlet_Sales and
# show the location tier, outlet type and sales columns.
lp <- BigMart[with(BigMart, order(-Item_Outlet_Sales)), ]
lp[seq_len(10),
   c("Outlet_Location_Type", "Outlet_Type", "Item_Outlet_Sales")]
## Outlet_Location_Type Outlet_Type Item_Outlet_Sales
## 7189 Tier 3 Supermarket Type3 13086.965
## 5224 Tier 3 Supermarket Type3 12117.560
## 1451 Tier 3 Supermarket Type3 11445.102
## 4350 Tier 3 Supermarket Type3 10993.690
## 6542 Tier 3 Supermarket Type3 10306.584
## 4889 Tier 3 Supermarket Type1 10256.649
## 1010 Tier 3 Supermarket Type3 10236.675
## 7192 Tier 3 Supermarket Type3 10072.888
## 4290 Tier 1 Supermarket Type1 9779.936
## 7753 Tier 3 Supermarket Type3 9678.069
# Ten rows with the lowest sales: sort ascending on Item_Outlet_Sales and
# show the location tier, outlet type and sales columns.
lp <- BigMart[with(BigMart, order(Item_Outlet_Sales)), ]
lp[seq_len(10),
   c("Outlet_Location_Type", "Outlet_Type", "Item_Outlet_Sales")]
## Outlet_Location_Type Outlet_Type Item_Outlet_Sales
## 907 Tier 3 Grocery Store 33.2900
## 6951 Tier 3 Grocery Store 33.2900
## 2572 Tier 1 Grocery Store 33.9558
## 3054 Tier 1 Grocery Store 34.6216
## 7389 Tier 3 Grocery Store 35.2874
## 4266 Tier 3 Grocery Store 36.6190
## 7613 Tier 1 Grocery Store 36.6190
## 2056 Tier 1 Grocery Store 37.2848
## 490 Tier 1 Grocery Store 37.9506
## 575 Tier 1 Grocery Store 37.9506
library(car)

# Columns are resolved through `data = BigMart` rather than attach(), which
# avoids polluting the search path (and the masking message a second attach()
# produces).

# Sales distribution per outlet location tier.
# With horizontal = TRUE the numeric variable runs along the x-axis, so the
# sales label belongs on xlab and the grouping label on ylab.
boxplot(Item_Outlet_Sales ~ Outlet_Location_Type, data = BigMart,
        horizontal = TRUE, las = 1,
        xlab = "Sales", ylab = "Outlet location",
        main = "Comparison of sales at different outlet locations",
        col = c("yellow", "pink"))

# Sales distribution per outlet type (grocery store vs. supermarket types).
boxplot(Item_Outlet_Sales ~ Outlet_Type, data = BigMart,
        horizontal = TRUE, las = 1,
        xlab = "Sales", ylab = "Outlet type",
        main = "Comparison of sales at different outlet types",
        col = c("yellow", "pink"))
# Scatter-plot matrix of fat content, item type, sales and visibility.
pairs(
  ~ Item_Fat_Content + Item_Type + Item_Outlet_Sales + Item_Visibility,
  data = BigMart,
  main = "Scatter plot matrix",
  col = c("green", "red"),
  cex = 0.6
)
# Number of rows per item type.
ttable <- table(BigMart$Item_Type)
print(ttable)
##
## Baking Goods Breads Breakfast
## 648 251 110
## Canned Dairy Frozen Foods
## 649 682 856
## Fruits and Vegetables Hard Drinks Health and Hygiene
## 1232 214 520
## Household Meat Others
## 910 425 169
## Seafood Snack Foods Soft Drinks
## 64 1200 445
## Starchy Foods
## 148
# Number of rows per outlet location tier.
ttable1 <- table(BigMart$Outlet_Location_Type)
print(ttable1)
##
## Tier 1 Tier 2 Tier 3
## 2388 2785 3350
# Number of rows per outlet type.
ttable3 <- table(BigMart$Outlet_Type)
print(ttable3)
##
## Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3
## 1083 5577 928 935
# Number of rows per outlet size.
# NOTE(review): the first (unlabelled) count in the recorded output is the
# blank "" level, i.e. rows with no outlet size recorded.
ttable4 <- table(BigMart$Outlet_Size)
print(ttable4)
##
## High Medium Small
## 2410 932 2793 2388
# Number of rows per outlet establishment year.
ttable5 <- table(BigMart$Outlet_Establishment_Year)
print(ttable5)
##
## 1985 1987 1997 1998 1999 2002 2004 2007 2009
## 1463 932 930 555 930 929 930 926 928
# Number of rows per fat-content label.
# NOTE(review): the labels are not normalised (LF / low fat / Low Fat and
# reg / Regular appear as separate levels in the recorded output).
ttable6 <- table(BigMart$Item_Fat_Content)
print(ttable6)
##
## LF low fat Low Fat reg Regular
## 316 112 5089 117 2889
# Number of rows per outlet identifier.
ttable7 <- table(BigMart$Outlet_Identifier)
print(ttable7)
##
## OUT010 OUT013 OUT017 OUT018 OUT019 OUT027 OUT035 OUT045 OUT046 OUT049
## 555 932 926 928 528 935 930 929 930 930
# Two-way contingency table: item type (rows) by fat-content label (columns).
ttable8 <- xtabs(~ Item_Type + Item_Fat_Content, data = BigMart)
print(ttable8)
## Item_Fat_Content
## Item_Type LF low fat Low Fat reg Regular
## Baking Goods 20 8 301 13 306
## Breads 8 6 126 3 108
## Breakfast 2 0 39 2 67
## Canned 17 10 314 11 297
## Dairy 24 12 382 6 258
## Frozen Foods 22 4 424 19 387
## Fruits and Vegetables 39 11 580 25 577
## Hard Drinks 8 7 199 0 0
## Health and Hygiene 29 10 481 0 0
## Household 54 16 840 0 0
## Meat 10 1 159 7 248
## Others 11 2 156 0 0
## Seafood 4 1 32 0 27
## Snack Foods 31 16 645 23 485
## Soft Drinks 28 7 339 1 70
## Starchy Foods 9 1 72 7 59
# Two-way contingency table: fat-content label (rows) by outlet type (columns).
ttable9 <- xtabs(~ Item_Fat_Content + Outlet_Type, data = BigMart)
print(ttable9)
## Outlet_Type
## Item_Fat_Content Grocery Store Supermarket Type1 Supermarket Type2
## LF 45 192 42
## low fat 18 70 9
## Low Fat 642 3347 547
## reg 14 79 18
## Regular 364 1889 312
## Outlet_Type
## Item_Fat_Content Supermarket Type3
## LF 37
## low fat 15
## Low Fat 553
## reg 6
## Regular 324
# Cross-tabulate each distinct sales value against outlet type and show the
# first rows of the result.
# NOTE(review): Item_Outlet_Sales is numeric, so this table gets one row per
# distinct sales amount; a grouped summary (e.g. mean sales per outlet type)
# may be what was intended -- confirm.
ttable10 <- xtabs(~ Item_Outlet_Sales + Outlet_Type, data = BigMart)
head(ttable10)
## Outlet_Type
## Item_Outlet_Sales Grocery Store Supermarket Type1 Supermarket Type2
## 33.29 2 0 0
## 33.9558 1 0 0
## 34.6216 1 0 0
## 35.2874 1 0 0
## 36.619 2 0 0
## 37.2848 1 0 0
## Outlet_Type
## Item_Outlet_Sales Supermarket Type3
## 33.29 0
## 33.9558 0
## 34.6216 0
## 35.2874 0
## 36.619 0
## 37.2848 0
# Model 1: sales regressed on outlet-level factors only (size, location tier,
# outlet type).
# NOTE(review): in the recorded fit the baseline Outlet_Size level is the
# blank "" category, so the size coefficients are contrasts against rows with
# no size recorded.
lm1 <- lm(
  Item_Outlet_Sales ~ Outlet_Size + Outlet_Location_Type + Outlet_Type,
  data = BigMart
)
summary(lm1)
##
## Call:
## lm(formula = Item_Outlet_Sales ~ Outlet_Size + Outlet_Location_Type +
## Outlet_Type, data = BigMart)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3452.4 -1064.0 -187.1 672.1 9392.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 167.90 88.13 1.905 0.05681 .
## Outlet_SizeHigh 22.13 113.75 0.195 0.84576
## Outlet_SizeMedium 242.94 91.30 2.661 0.00781 **
## Outlet_SizeSmall 172.43 59.79 2.884 0.00393 **
## Outlet_Location_TypeTier 2 161.00 69.00 2.333 0.01966 *
## Outlet_Location_TypeTier 3 171.45 108.43 1.581 0.11386
## Outlet_TypeSupermarket Type1 1937.51 81.08 23.896 < 2e-16 ***
## Outlet_TypeSupermarket Type2 1413.20 121.29 11.652 < 2e-16 ***
## Outlet_TypeSupermarket Type3 3111.74 121.21 25.671 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1488 on 8514 degrees of freedom
## Multiple R-squared: 0.2404, Adjusted R-squared: 0.2397
## F-statistic: 336.9 on 8 and 8514 DF, p-value: < 2.2e-16
# Model 2: sales regressed on item-level variables (visibility, fat content,
# item type).
# NOTE(review): Item_Fat_Content enters with five levels because its labels
# are not normalised (LF / low fat / Low Fat, reg / Regular), so equivalent
# categories get separate coefficients in the recorded fit.
lm2 <- lm(
  Item_Outlet_Sales ~ Item_Visibility + Item_Fat_Content + Item_Type,
  data = BigMart
)
summary(lm2)
##
## Call:
## lm(formula = Item_Outlet_Sales ~ Item_Visibility + Item_Fat_Content +
## Item_Type, data = BigMart)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2588.3 -1280.0 -395.1 892.6 10600.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2150.4486 118.1314 18.204 < 2e-16 ***
## Item_Visibility -4363.3529 356.2054 -12.250 < 2e-16 ***
## Item_Fat_Contentlow fat -0.5135 185.9362 -0.003 0.997796
## Item_Fat_ContentLow Fat 78.6273 98.0024 0.802 0.422402
## Item_Fat_Contentreg -145.7729 183.8594 -0.793 0.427887
## Item_Fat_ContentRegular 149.8037 101.7608 1.472 0.141027
## Item_TypeBreads 240.6050 125.6100 1.915 0.055463 .
## Item_TypeBreakfast 218.9292 174.3768 1.255 0.209334
## Item_TypeCanned 267.9014 93.8065 2.856 0.004302 **
## Item_TypeDairy 298.6998 92.7822 3.219 0.001290 **
## Item_TypeFrozen Foods 165.4176 87.9758 1.880 0.060106 .
## Item_TypeFruits and Vegetables 333.2940 81.9670 4.066 4.82e-05 ***
## Item_TypeHard Drinks 199.0411 134.8087 1.476 0.139855
## Item_TypeHealth and Hygiene 27.7581 101.6905 0.273 0.784885
## Item_TypeHousehold 303.3366 89.3042 3.397 0.000685 ***
## Item_TypeMeat 165.8670 105.5726 1.571 0.116192
## Item_TypeOthers -34.0289 147.3961 -0.231 0.817423
## Item_TypeSeafood 400.2604 221.3749 1.808 0.070632 .
## Item_TypeSnack Foods 318.5573 82.4004 3.866 0.000111 ***
## Item_TypeSoft Drinks 52.0696 104.9553 0.496 0.619828
## Item_TypeStarchy Foods 427.6159 153.9879 2.777 0.005499 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1689 on 8502 degrees of freedom
## Multiple R-squared: 0.02266, Adjusted R-squared: 0.02036
## F-statistic: 9.854 on 20 and 8502 DF, p-value: < 2.2e-16
# Correlation matrix of the numeric columns.
# Columns are selected by name so the code does not depend on column order.
cm1 <- BigMart[, c("Item_Weight", "Item_Visibility",
                   "Item_MRP", "Item_Outlet_Sales")]
# Item_Weight contains missing values; without `use =` every correlation
# involving it is NA. Pairwise deletion uses all rows available for each pair.
cor(cm1, use = "pairwise.complete.obs")
# The run recorded before this change showed NA for every Item_Weight pairing.
# The remaining coefficients do not depend on `use` (those columns have no
# missing values) and were:
#   Item_Visibility vs Item_MRP           -0.0013
#   Item_Visibility vs Item_Outlet_Sales  -0.1286
#   Item_MRP        vs Item_Outlet_Sales   0.5676
library(corrgram)

# Correlogram of the numeric columns in cm1: shaded cells below the diagonal,
# pie glyphs above it, variable names on the diagonal.
corrgram(
  cm1,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram"
)
library(lattice)

# Count histograms of sales, one panel per fat-content / item-type
# combination, laid out 3 x 3 per page.
histogram(
  ~ Item_Outlet_Sales | Item_Fat_Content + Item_Type,
  data = BigMart,
  type = "count",
  layout = c(3, 3),
  col = c("burlywood", "darkolivegreen", "red", "yellow",
          "peachpuff", "blue", "Green")
)