#Project Title: Big Market Sales Analysis
#NAME: Sree Lakshmi Addepalli
#EMAIL: lakshi.ana@gmail.com
#COLLEGE / COMPANY: VESIT
#Cleaning the training data
# Switch to the project data folder, load the raw training set and print a
# per-column summary to inspect types, level counts and missing values.
setwd("C:/Users/Lakshmi/Desktop/LakshmiCapstoneProject/bigmart-sales-data")
BigMartSalesData <- read.csv("Train.csv", header = TRUE, sep = ",")
summary(BigMartSalesData)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## FDG33 : 10 Min. : 4.555 LF : 316 Min. :0.00000
## FDW13 : 10 1st Qu.: 8.774 low fat: 112 1st Qu.:0.02699
## DRE49 : 9 Median :12.600 Low Fat:5089 Median :0.05393
## DRN47 : 9 Mean :12.858 reg : 117 Mean :0.06613
## FDD38 : 9 3rd Qu.:16.850 Regular:2889 3rd Qu.:0.09459
## FDF52 : 9 Max. :21.350 Max. :0.32839
## (Other):8467 NA's :1463
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:1232 Min. : 31.29 OUT027 : 935
## Snack Foods :1200 1st Qu.: 93.83 OUT013 : 932
## Household : 910 Median :143.01 OUT035 : 930
## Frozen Foods : 856 Mean :140.99 OUT046 : 930
## Dairy : 682 3rd Qu.:185.64 OUT049 : 930
## Canned : 649 Max. :266.89 OUT045 : 929
## (Other) :2994 (Other):2937
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :2410 Tier 1:2388
## 1st Qu.:1987 High : 932 Tier 2:2785
## Median :1999 Medium:2793 Tier 3:3350
## Mean :1998 Small :2388
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1083 Min. : 33.29
## Supermarket Type1:5577 1st Qu.: 834.25
## Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
##
#Replacing each missing Item_Weight with the weight recorded for the same Item_Identifier; if the item has no recorded weight, the mean weight is used instead
# Collapse a group of weights to one number.
#   x: numeric vector, may contain NA.
# NA entries are treated as 0 and the largest resulting value is returned
# (invisibly, exactly like the value of a trailing assignment).
test <- function(x) {
  filled <- replace(x, is.na(x), 0)
  invisible(max(filled))
}
# One row per item identifier with a single weight for that item, obtained by
# applying test() to the item's weights. With the formula method's default
# na.action (na.omit), rows whose weight is NA are dropped before test() runs.
# The result columns are named after the formula terms, i.e.
# `BigMartSalesData$Item_Identifier` and `BigMartSalesData$Item_Weight`.
y <- aggregate(
  BigMartSalesData$Item_Weight ~ BigMartSalesData$Item_Identifier,
  data = BigMartSalesData,
  FUN = test
)
head(y, n = 6L)
## BigMartSalesData$Item_Identifier BigMartSalesData$Item_Weight
## 1 DRA12 11.600
## 2 DRA24 19.350
## 3 DRA59 8.270
## 4 DRB01 7.390
## 5 DRB13 6.115
## 6 DRB24 8.785
# Build a list holding the aggregated weight of every item listed in `y`,
# named after the item identifiers.
# NOTE(review): the `[[` index below is the identifier value itself. If that
# column is a factor, `[[` indexes by the factor's integer code rather than by
# name, so positions and names may not line up - confirm the column type
# before relying on name-based lookups into this list.
items_weight_identifier <- vector(mode = "list", length = nrow(y))
names(items_weight_identifier) <- y[["BigMartSalesData$Item_Identifier"]]
for (i in 1:nrow(y)) {
  items_weight_identifier[[y[["BigMartSalesData$Item_Identifier"]][i]]] <-
    y[["BigMartSalesData$Item_Weight"]][i]
}
length(items_weight_identifier)
## [1] 1559
# Fill in missing item weights.
# A missing weight is replaced by the weight recorded for the same item
# identifier in `y` (the per-item aggregate computed earlier in this script);
# items without an entry in `y` fall back to the mean of the observed weights.
# The lookup uses match() on character identifiers, so it gives the same
# result whether Item_Identifier was read as a factor or as character, and the
# fallback mean is computed from the data rather than copied by hand from the
# summary() output.
weight_is_missing <- is.na(BigMartSalesData$Item_Weight)
# Must be taken before any value is filled in.
mean_item_weight <- mean(BigMartSalesData$Item_Weight, na.rm = TRUE)
weight_by_item <- y$`BigMartSalesData$Item_Weight`[match(
  as.character(BigMartSalesData$Item_Identifier),
  as.character(y$`BigMartSalesData$Item_Identifier`)
)]
# match() yields NA for identifiers that are absent from `y`.
weight_by_item[is.na(weight_by_item)] <- mean_item_weight
BigMartSalesData$Item_Weight[weight_is_missing] <-
  weight_by_item[weight_is_missing]
summary(BigMartSalesData)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## FDG33 : 10 Min. : 4.555 LF : 316 Min. :0.00000
## FDW13 : 10 1st Qu.: 8.785 low fat: 112 1st Qu.:0.02699
## DRE49 : 9 Median :12.650 Low Fat:5089 Median :0.05393
## DRN47 : 9 Mean :12.875 reg : 117 Mean :0.06613
## FDD38 : 9 3rd Qu.:16.850 Regular:2889 3rd Qu.:0.09459
## FDF52 : 9 Max. :21.350 Max. :0.32839
## (Other):8467
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:1232 Min. : 31.29 OUT027 : 935
## Snack Foods :1200 1st Qu.: 93.83 OUT013 : 932
## Household : 910 Median :143.01 OUT035 : 930
## Frozen Foods : 856 Mean :140.99 OUT046 : 930
## Dairy : 682 3rd Qu.:185.64 OUT049 : 930
## Canned : 649 Max. :266.89 OUT045 : 929
## (Other) :2994 (Other):2937
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :2410 Tier 1:2388
## 1st Qu.:1987 High : 932 Tier 2:2785
## Median :1999 Medium:2793 Tier 3:3350
## Mean :1998 Small :2388
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1083 Min. : 33.29
## Supermarket Type1:5577 1st Qu.: 834.25
## Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
##
#Reducing Item_Fat_Content to two levels, namely "Low Fat" and "Regular"
# Merge the spelling variants of the fat-content labels: "LF", "low Fat" and
# "low fat" become "Low Fat", "reg" becomes "Regular". The column is rebuilt
# as a factor afterwards so only the labels still present remain as levels.
BigMartSalesData$Item_Fat_Content <- local({
  fat_label <- as.character(BigMartSalesData$Item_Fat_Content)
  fat_label[fat_label %in% c("LF", "low Fat", "low fat")] <- "Low Fat"
  fat_label[fat_label %in% "reg"] <- "Regular"
  as.factor(fat_label)
})
summary(BigMartSalesData)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## FDG33 : 10 Min. : 4.555 Low Fat:5517 Min. :0.00000
## FDW13 : 10 1st Qu.: 8.785 Regular:3006 1st Qu.:0.02699
## DRE49 : 9 Median :12.650 Median :0.05393
## DRN47 : 9 Mean :12.875 Mean :0.06613
## FDD38 : 9 3rd Qu.:16.850 3rd Qu.:0.09459
## FDF52 : 9 Max. :21.350 Max. :0.32839
## (Other):8467
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:1232 Min. : 31.29 OUT027 : 935
## Snack Foods :1200 1st Qu.: 93.83 OUT013 : 932
## Household : 910 Median :143.01 OUT035 : 930
## Frozen Foods : 856 Mean :140.99 OUT046 : 930
## Dairy : 682 3rd Qu.:185.64 OUT049 : 930
## Canned : 649 Max. :266.89 OUT045 : 929
## (Other) :2994 (Other):2937
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :2410 Tier 1:2388
## 1st Qu.:1987 High : 932 Tier 2:2785
## Median :1999 Medium:2793 Tier 3:3350
## Mean :1998 Small :2388
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1083 Min. : 33.29
## Supermarket Type1:5577 1st Qu.: 834.25
## Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
##
#Replacing Item_Visibility values of exactly 0 with the mean visibility
# Replace visibilities recorded as exactly 0 with the mean visibility.
# The mean is taken over all observed values (zeros included), which is the
# quantity the previously hard-coded 0.06613 was copied from; computing it
# here keeps the replacement value in sync with the data. The test is
# vectorised and NA-safe, so a missing visibility is left untouched instead of
# aborting the script.
visibility_is_zero <- !is.na(BigMartSalesData$Item_Visibility) &
  BigMartSalesData$Item_Visibility == 0
mean_item_visibility <- mean(BigMartSalesData$Item_Visibility, na.rm = TRUE)
BigMartSalesData$Item_Visibility[visibility_is_zero] <- mean_item_visibility
#Predicting the missing Outlet_Size values as "High", "Medium" or "Small" with a random forest
library(caTools)
# Fixed seed so the split below (and the forest trained on it) is repeatable.
set.seed(100)
BigMartSalesData$Outlet_Size <- as.character(BigMartSalesData$Outlet_Size)
# Keep only rows whose outlet size is known (non-empty string), then split
# them 80/20 with sample.split() on the Outlet_Size labels.
BigMartSubsetSalesData <- subset(BigMartSalesData, Outlet_Size != "")
spl <- sample.split(BigMartSubsetSalesData$Outlet_Size, SplitRatio = 0.8)
Train <- subset(BigMartSubsetSalesData, spl)
Test <- subset(BigMartSubsetSalesData, !spl)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
# The response must be a factor for randomForest to fit a classifier.
Train$Outlet_Size <- factor(Train$Outlet_Size)
Test$Outlet_Size <- factor(Test$Outlet_Size)
# Predict Outlet_Size from every column except the sales target and the item
# identifier.
SizeForest <- randomForest(
  Outlet_Size ~ . - Item_Outlet_Sales - Item_Identifier,
  data = Train,
  ntree = 100,
  nodesize = 25
)
# Confusion table on the held-out rows: actual size (rows) vs prediction.
PredictForest <- predict(SizeForest, newdata = Test)
table(Test$Outlet_Size, PredictForest)
## PredictForest
## High Medium Small
## High 186 0 0
## Medium 0 559 0
## Small 0 0 478
# Fill in the missing outlet sizes with the forest's predictions.
# Only rows whose size is blank (or NA) are predicted; sizes that were
# observed in the data are kept as they are instead of being overwritten by
# model output. The column is stored as a factor afterwards.
outlet_size_filled <- as.character(BigMartSalesData$Outlet_Size)
outlet_size_missing <- is.na(outlet_size_filled) | outlet_size_filled == ""
if (any(outlet_size_missing)) {
  outlet_size_filled[outlet_size_missing] <- as.character(
    predict(
      SizeForest,
      newdata = BigMartSalesData[outlet_size_missing, , drop = FALSE]
    )
  )
}
BigMartSalesData$Outlet_Size <- as.factor(outlet_size_filled)
summary(BigMartSalesData)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## FDG33 : 10 Min. : 4.555 Low Fat:5517 Min. :0.003575
## FDW13 : 10 1st Qu.: 8.785 Regular:3006 1st Qu.:0.033085
## DRE49 : 9 Median :12.650 Median :0.062517
## DRN47 : 9 Mean :12.875 Mean :0.070213
## FDD38 : 9 3rd Qu.:16.850 3rd Qu.:0.094585
## FDF52 : 9 Max. :21.350 Max. :0.328391
## (Other):8467
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:1232 Min. : 31.29 OUT027 : 935
## Snack Foods :1200 1st Qu.: 93.83 OUT013 : 932
## Household : 910 Median :143.01 OUT035 : 930
## Frozen Foods : 856 Mean :140.99 OUT046 : 930
## Dairy : 682 3rd Qu.:185.64 OUT049 : 930
## Canned : 649 Max. :266.89 OUT045 : 929
## (Other) :2994 (Other):2937
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 High : 932 Tier 1:2388
## 1st Qu.:1987 Medium:5203 Tier 2:2785
## Median :1999 Small :2388 Tier 3:3350
## Mean :1998
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1083 Min. : 33.29
## Supermarket Type1:5577 1st Qu.: 834.25
## Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
##
# Save the cleaned training data. Row names are written as well (write.csv
# default), which is why the file gains an extra leading column when read back.
write.csv(x = BigMartSalesData, file = "BigMartSalesCleanData.csv")
#similarly doing it for the test set
#Cleaning the test data
# Switch to the project data folder, load the raw test set and print a
# per-column summary to inspect types, level counts and missing values.
setwd("C:/Users/Lakshmi/Desktop/LakshmiCapstoneProject/bigmart-sales-data")
BigMartSalesTestData <- read.csv("Test.csv", header = TRUE, sep = ",")
summary(BigMartSalesTestData)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## DRF48 : 8 Min. : 4.555 LF : 206 Min. :0.00000
## FDK57 : 8 1st Qu.: 8.645 low fat: 66 1st Qu.:0.02705
## FDN52 : 8 Median :12.500 Low Fat:3396 Median :0.05415
## FDP15 : 8 Mean :12.696 reg : 78 Mean :0.06568
## FDQ60 : 8 3rd Qu.:16.700 Regular:1935 3rd Qu.:0.09346
## FDW10 : 8 Max. :21.350 Max. :0.32364
## (Other):5633 NA's :976
## Item_Type Item_MRP Outlet_Identifier
## Snack Foods : 789 Min. : 31.99 OUT027 : 624
## Fruits and Vegetables: 781 1st Qu.: 94.41 OUT013 : 621
## Household : 638 Median :141.42 OUT035 : 620
## Frozen Foods : 570 Mean :141.02 OUT046 : 620
## Dairy : 454 3rd Qu.:186.03 OUT049 : 620
## Baking Goods : 438 Max. :266.59 OUT045 : 619
## (Other) :2011 (Other):1957
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :1606 Tier 1:1592
## 1st Qu.:1987 High : 621 Tier 2:1856
## Median :1999 Medium:1862 Tier 3:2233
## Mean :1998 Small :1592
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type
## Grocery Store : 722
## Supermarket Type1:3717
## Supermarket Type2: 618
## Supermarket Type3: 624
##
##
##
#Replacing each missing Item_Weight with the weight recorded for the same Item_Identifier; if the item has no recorded weight, the mean weight is used instead
# Reduce a vector of weights to a single value.
#   x: numeric vector, may contain NA.
# Missing entries count as 0; the maximum of the result is returned
# invisibly (the function ends in an assignment in its original form).
test <- function(x) {
  no_na <- ifelse(is.na(x), 0, x)
  invisible(max(no_na))
}
# One row per item identifier of the test set with a single weight for that
# item, obtained by applying test() to the item's weights. With the formula
# method's default na.action (na.omit), rows whose weight is NA are dropped
# before test() runs. The result columns are named after the formula terms.
y <- aggregate(
  BigMartSalesTestData$Item_Weight ~ BigMartSalesTestData$Item_Identifier,
  data = BigMartSalesTestData,
  FUN = test
)
head(y, n = 6L)
## BigMartSalesTestData$Item_Identifier BigMartSalesTestData$Item_Weight
## 1 DRA12 11.600
## 2 DRA24 19.350
## 3 DRA59 8.270
## 4 DRB01 7.390
## 5 DRB13 6.115
## 6 DRB24 8.785
# Build a list holding the aggregated weight of every test-set item listed in
# `y`, named after the item identifiers.
# NOTE(review): the `[[` index below is the identifier value itself. If that
# column is a factor, `[[` indexes by the factor's integer code rather than by
# name, so positions and names may not line up - confirm the column type
# before relying on name-based lookups into this list.
items_weight_identifier_test <- vector(mode = "list", length = nrow(y))
names(items_weight_identifier_test) <- y[["BigMartSalesTestData$Item_Identifier"]]
for (i in 1:nrow(y)) {
  items_weight_identifier_test[[y[["BigMartSalesTestData$Item_Identifier"]][i]]] <-
    y[["BigMartSalesTestData$Item_Weight"]][i]
}
length(items_weight_identifier_test)
## [1] 1543
# Fill in missing item weights of the test set.
# A missing weight is replaced by the weight recorded for the same item
# identifier in `y` (the per-item aggregate of the test set computed earlier
# in this script); items without an entry in `y` fall back to the mean of the
# observed test-set weights.
# The lookup uses match() on character identifiers, so it gives the same
# result whether Item_Identifier was read as a factor or as character, and the
# fallback mean is computed from the data rather than copied by hand from the
# summary() output.
test_weight_is_missing <- is.na(BigMartSalesTestData$Item_Weight)
# Must be taken before any value is filled in.
mean_test_item_weight <- mean(BigMartSalesTestData$Item_Weight, na.rm = TRUE)
test_weight_by_item <- y$`BigMartSalesTestData$Item_Weight`[match(
  as.character(BigMartSalesTestData$Item_Identifier),
  as.character(y$`BigMartSalesTestData$Item_Identifier`)
)]
# match() yields NA for identifiers that are absent from `y`.
test_weight_by_item[is.na(test_weight_by_item)] <- mean_test_item_weight
BigMartSalesTestData$Item_Weight[test_weight_is_missing] <-
  test_weight_by_item[test_weight_is_missing]
summary(BigMartSalesTestData)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## DRF48 : 8 Min. : 4.555 LF : 206 Min. :0.00000
## FDK57 : 8 1st Qu.: 8.630 low fat: 66 1st Qu.:0.02705
## FDN52 : 8 Median :12.350 Low Fat:3396 Median :0.05415
## FDP15 : 8 Mean :12.676 reg : 78 Mean :0.06568
## FDQ60 : 8 3rd Qu.:16.700 Regular:1935 3rd Qu.:0.09346
## FDW10 : 8 Max. :21.350 Max. :0.32364
## (Other):5633
## Item_Type Item_MRP Outlet_Identifier
## Snack Foods : 789 Min. : 31.99 OUT027 : 624
## Fruits and Vegetables: 781 1st Qu.: 94.41 OUT013 : 621
## Household : 638 Median :141.42 OUT035 : 620
## Frozen Foods : 570 Mean :141.02 OUT046 : 620
## Dairy : 454 3rd Qu.:186.03 OUT049 : 620
## Baking Goods : 438 Max. :266.59 OUT045 : 619
## (Other) :2011 (Other):1957
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :1606 Tier 1:1592
## 1st Qu.:1987 High : 621 Tier 2:1856
## Median :1999 Medium:1862 Tier 3:2233
## Mean :1998 Small :1592
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type
## Grocery Store : 722
## Supermarket Type1:3717
## Supermarket Type2: 618
## Supermarket Type3: 624
##
##
##
#Reducing Item_Fat_Content to two levels, namely "Low Fat" and "Regular"
# Merge the spelling variants of the fat-content labels: "LF", "low Fat" and
# "low fat" become "Low Fat", "reg" becomes "Regular". The column is rebuilt
# as a factor afterwards so only the labels still present remain as levels.
BigMartSalesTestData$Item_Fat_Content <- local({
  fat_label <- as.character(BigMartSalesTestData$Item_Fat_Content)
  fat_label[fat_label %in% c("LF", "low Fat", "low fat")] <- "Low Fat"
  fat_label[fat_label %in% "reg"] <- "Regular"
  as.factor(fat_label)
})
summary(BigMartSalesTestData)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## DRF48 : 8 Min. : 4.555 Low Fat:3668 Min. :0.00000
## FDK57 : 8 1st Qu.: 8.630 Regular:2013 1st Qu.:0.02705
## FDN52 : 8 Median :12.350 Median :0.05415
## FDP15 : 8 Mean :12.676 Mean :0.06568
## FDQ60 : 8 3rd Qu.:16.700 3rd Qu.:0.09346
## FDW10 : 8 Max. :21.350 Max. :0.32364
## (Other):5633
## Item_Type Item_MRP Outlet_Identifier
## Snack Foods : 789 Min. : 31.99 OUT027 : 624
## Fruits and Vegetables: 781 1st Qu.: 94.41 OUT013 : 621
## Household : 638 Median :141.42 OUT035 : 620
## Frozen Foods : 570 Mean :141.02 OUT046 : 620
## Dairy : 454 3rd Qu.:186.03 OUT049 : 620
## Baking Goods : 438 Max. :266.59 OUT045 : 619
## (Other) :2011 (Other):1957
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :1606 Tier 1:1592
## 1st Qu.:1987 High : 621 Tier 2:1856
## Median :1999 Medium:1862 Tier 3:2233
## Mean :1998 Small :1592
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type
## Grocery Store : 722
## Supermarket Type1:3717
## Supermarket Type2: 618
## Supermarket Type3: 624
##
##
##
#Replacing Item_Visibility values of exactly 0 with the mean visibility
# Replace test-set visibilities recorded as exactly 0 with the mean visibility.
# The mean is taken over all observed values (zeros included), which is the
# quantity the previously hard-coded 0.06568 was copied from; computing it
# here keeps the replacement value in sync with the data. The test is
# vectorised and NA-safe, so a missing visibility is left untouched instead of
# aborting the script.
test_visibility_is_zero <- !is.na(BigMartSalesTestData$Item_Visibility) &
  BigMartSalesTestData$Item_Visibility == 0
mean_test_item_visibility <- mean(
  BigMartSalesTestData$Item_Visibility,
  na.rm = TRUE
)
BigMartSalesTestData$Item_Visibility[test_visibility_is_zero] <-
  mean_test_item_visibility
summary(BigMartSalesTestData)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## DRF48 : 8 Min. : 4.555 Low Fat:3668 Min. :0.003591
## FDK57 : 8 1st Qu.: 8.630 Regular:2013 1st Qu.:0.033208
## FDN52 : 8 Median :12.350 Median :0.062137
## FDP15 : 8 Mean :12.676 Mean :0.069765
## FDQ60 : 8 3rd Qu.:16.700 3rd Qu.:0.093463
## FDW10 : 8 Max. :21.350 Max. :0.323637
## (Other):5633
## Item_Type Item_MRP Outlet_Identifier
## Snack Foods : 789 Min. : 31.99 OUT027 : 624
## Fruits and Vegetables: 781 1st Qu.: 94.41 OUT013 : 621
## Household : 638 Median :141.42 OUT035 : 620
## Frozen Foods : 570 Mean :141.02 OUT046 : 620
## Dairy : 454 3rd Qu.:186.03 OUT049 : 620
## Baking Goods : 438 Max. :266.59 OUT045 : 619
## (Other) :2011 (Other):1957
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :1606 Tier 1:1592
## 1st Qu.:1987 High : 621 Tier 2:1856
## Median :1999 Medium:1862 Tier 3:2233
## Mean :1998 Small :1592
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type
## Grocery Store : 722
## Supermarket Type1:3717
## Supermarket Type2: 618
## Supermarket Type3: 624
##
##
##
#Predicting the missing Outlet_Size values as "High", "Medium" or "Small" with a random forest
library(caTools)
# Fixed seed so the split below (and the forest trained on it) is repeatable.
set.seed(100)
BigMartSalesTestData$Outlet_Size <- as.character(BigMartSalesTestData$Outlet_Size)
# Keep only rows whose outlet size is known (non-empty string), then split
# them 80/20 with sample.split() on the Outlet_Size labels.
BigMartSubsetSalesTestData <- subset(BigMartSalesTestData, Outlet_Size != "")
spl <- sample.split(BigMartSubsetSalesTestData$Outlet_Size, SplitRatio = 0.8)
Train_t <- subset(BigMartSubsetSalesTestData, spl)
Test_t <- subset(BigMartSubsetSalesTestData, !spl)
library(randomForest)
# The response must be a factor for randomForest to fit a classifier.
Train_t$Outlet_Size <- factor(Train_t$Outlet_Size)
Test_t$Outlet_Size <- factor(Test_t$Outlet_Size)
# Predict Outlet_Size from every column except the item identifier.
SizeForest_t <- randomForest(
  Outlet_Size ~ . - Item_Identifier,
  data = Train_t,
  ntree = 100,
  nodesize = 25
)
# Confusion table on the held-out rows: actual size (rows) vs prediction.
PredictForest_t <- predict(SizeForest_t, newdata = Test_t)
table(Test_t$Outlet_Size, PredictForest_t)
## PredictForest_t
## High Medium Small
## High 124 0 0
## Medium 0 372 0
## Small 0 0 318
# Fill in the missing outlet sizes of the test set with the forest's
# predictions. Only rows whose size is blank (or NA) are predicted; sizes that
# were observed in the data are kept as they are instead of being overwritten
# by model output. The column is stored as a factor afterwards.
test_outlet_size_filled <- as.character(BigMartSalesTestData$Outlet_Size)
test_outlet_size_missing <- is.na(test_outlet_size_filled) |
  test_outlet_size_filled == ""
if (any(test_outlet_size_missing)) {
  test_outlet_size_filled[test_outlet_size_missing] <- as.character(
    predict(
      SizeForest_t,
      newdata = BigMartSalesTestData[test_outlet_size_missing, , drop = FALSE]
    )
  )
}
BigMartSalesTestData$Outlet_Size <- as.factor(test_outlet_size_filled)
summary(BigMartSalesTestData)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## DRF48 : 8 Min. : 4.555 Low Fat:3668 Min. :0.003591
## FDK57 : 8 1st Qu.: 8.630 Regular:2013 1st Qu.:0.033208
## FDN52 : 8 Median :12.350 Median :0.062137
## FDP15 : 8 Mean :12.676 Mean :0.069765
## FDQ60 : 8 3rd Qu.:16.700 3rd Qu.:0.093463
## FDW10 : 8 Max. :21.350 Max. :0.323637
## (Other):5633
## Item_Type Item_MRP Outlet_Identifier
## Snack Foods : 789 Min. : 31.99 OUT027 : 624
## Fruits and Vegetables: 781 1st Qu.: 94.41 OUT013 : 621
## Household : 638 Median :141.42 OUT035 : 620
## Frozen Foods : 570 Mean :141.02 OUT046 : 620
## Dairy : 454 3rd Qu.:186.03 OUT049 : 620
## Baking Goods : 438 Max. :266.59 OUT045 : 619
## (Other) :2011 (Other):1957
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 High : 621 Tier 1:1592
## 1st Qu.:1987 Medium:3468 Tier 2:1856
## Median :1999 Small :1592 Tier 3:2233
## Mean :1998
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type
## Grocery Store : 722
## Supermarket Type1:3717
## Supermarket Type2: 618
## Supermarket Type3: 624
##
##
##
# Save the cleaned test data. Row names are written as well (write.csv
# default), so the file carries an extra leading index column.
write.csv(x = BigMartSalesTestData, file = "BigMartSalesCleanTestData.csv")
#Doing all analysis on Training Data
#Task 1: Read your dataset in R and visualize the length and breadth of your dataset.
# Load the cleaned training data written earlier and show its first six rows.
BigMartsFinalData <- read.csv("BigMartSalesCleanData.csv", header = TRUE, sep = ",")
head(BigMartsFinalData, n = 6L)
## X Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## 1 1 FDA15 9.300 Low Fat 0.01604730
## 2 2 DRC01 5.920 Regular 0.01927822
## 3 3 FDN15 17.500 Low Fat 0.01676007
## 4 4 FDX07 19.200 Regular 0.06613000
## 5 5 NCD19 8.930 Low Fat 0.06613000
## 6 6 FDP36 10.395 Regular 0.06613000
## Item_Type Item_MRP Outlet_Identifier
## 1 Dairy 249.8092 OUT049
## 2 Soft Drinks 48.2692 OUT018
## 3 Meat 141.6180 OUT049
## 4 Fruits and Vegetables 182.0950 OUT010
## 5 Household 53.8614 OUT013
## 6 Baking Goods 51.4008 OUT018
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## 1 1999 Medium Tier 1
## 2 2009 Medium Tier 3
## 3 1999 Medium Tier 1
## 4 1998 Medium Tier 3
## 5 1987 High Tier 3
## 6 2009 Medium Tier 3
## Outlet_Type Item_Outlet_Sales
## 1 Supermarket Type1 3735.1380
## 2 Supermarket Type2 443.4228
## 3 Supermarket Type1 2097.2700
## 4 Grocery Store 732.3800
## 5 Supermarket Type1 994.7052
## 6 Supermarket Type2 556.6088
# Length of the dataset: number of rows.
dim(BigMartsFinalData)[1L]
## [1] 8523
# Breadth of the dataset: number of columns.
dim(BigMartsFinalData)[2L]
## [1] 13
#Task 2: Create descriptive statistics (min, max, median, etc.) for each variable.
summary(object = BigMartsFinalData)
## X Item_Identifier Item_Weight Item_Fat_Content
## Min. : 1 FDG33 : 10 Min. : 4.555 Low Fat:5517
## 1st Qu.:2132 FDW13 : 10 1st Qu.: 8.785 Regular:3006
## Median :4262 DRE49 : 9 Median :12.650
## Mean :4262 DRN47 : 9 Mean :12.875
## 3rd Qu.:6392 FDD38 : 9 3rd Qu.:16.850
## Max. :8523 FDF52 : 9 Max. :21.350
## (Other):8467
## Item_Visibility Item_Type Item_MRP
## Min. :0.003575 Fruits and Vegetables:1232 Min. : 31.29
## 1st Qu.:0.033085 Snack Foods :1200 1st Qu.: 93.83
## Median :0.062517 Household : 910 Median :143.01
## Mean :0.070213 Frozen Foods : 856 Mean :140.99
## 3rd Qu.:0.094585 Dairy : 682 3rd Qu.:185.64
## Max. :0.328391 Canned : 649 Max. :266.89
## (Other) :2994
## Outlet_Identifier Outlet_Establishment_Year Outlet_Size
## OUT027 : 935 Min. :1985 High : 932
## OUT013 : 932 1st Qu.:1987 Medium:5203
## OUT035 : 930 Median :1999 Small :2388
## OUT046 : 930 Mean :1998
## OUT049 : 930 3rd Qu.:2004
## OUT045 : 929 Max. :2009
## (Other):2937
## Outlet_Location_Type Outlet_Type Item_Outlet_Sales
## Tier 1:2388 Grocery Store :1083 Min. : 33.29
## Tier 2:2785 Supermarket Type1:5577 1st Qu.: 834.25
## Tier 3:3350 Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
##
#Task 3: Create one-way contingency tables for the categorical variables in your dataset.
# Category 1: Item Fat Content
# Frequency count of each Item_Fat_Content level.
my_item_fat_content_table <- table(BigMartsFinalData$Item_Fat_Content)
print(my_item_fat_content_table)
##
## Low Fat Regular
## 5517 3006
# Same counts expressed as percentages of all rows.
100 * prop.table(my_item_fat_content_table)
##
## Low Fat Regular
## 64.73073 35.26927
#Category 2:Item Type
# Frequency count of each Item_Type level.
my_item_type_table <- table(BigMartsFinalData$Item_Type)
print(my_item_type_table)
##
## Baking Goods Breads Breakfast
## 648 251 110
## Canned Dairy Frozen Foods
## 649 682 856
## Fruits and Vegetables Hard Drinks Health and Hygiene
## 1232 214 520
## Household Meat Others
## 910 425 169
## Seafood Snack Foods Soft Drinks
## 64 1200 445
## Starchy Foods
## 148
# Same counts expressed as percentages of all rows.
100 * prop.table(my_item_type_table)
##
## Baking Goods Breads Breakfast
## 7.6029567 2.9449724 1.2906254
## Canned Dairy Frozen Foods
## 7.6146897 8.0018773 10.0434119
## Fruits and Vegetables Hard Drinks Health and Hygiene
## 14.4550041 2.5108530 6.1011381
## Household Meat Others
## 10.6769917 4.9865071 1.9828699
## Seafood Snack Foods Soft Drinks
## 0.7509093 14.0795495 5.2211663
## Starchy Foods
## 1.7364778
#Category 3:Outlet Identifier
# Frequency count of rows per Outlet_Identifier.
my_outlet_identifier_table <- table(BigMartsFinalData$Outlet_Identifier)
print(my_outlet_identifier_table)
##
## OUT010 OUT013 OUT017 OUT018 OUT019 OUT027 OUT035 OUT045 OUT046 OUT049
## 555 932 926 928 528 935 930 929 930 930
# Same counts expressed as percentages of all rows.
100 * prop.table(my_outlet_identifier_table)
##
## OUT010 OUT013 OUT017 OUT018 OUT019 OUT027 OUT035
## 6.511792 10.935117 10.864719 10.888185 6.195002 10.970316 10.911651
## OUT045 OUT046 OUT049
## 10.899918 10.911651 10.911651
#Category 4:Outlet Establishment Year
# Frequency count of rows per Outlet_Establishment_Year (each year is treated
# as a category).
my_outlet_establishment_year_table <- table(
  BigMartsFinalData$Outlet_Establishment_Year
)
print(my_outlet_establishment_year_table)
##
## 1985 1987 1997 1998 1999 2002 2004 2007 2009
## 1463 932 930 555 930 929 930 926 928
# Same counts expressed as percentages of all rows.
100 * prop.table(my_outlet_establishment_year_table)
##
## 1985 1987 1997 1998 1999 2002 2004
## 17.165317 10.935117 10.911651 6.511792 10.911651 10.899918 10.911651
## 2007 2009
## 10.864719 10.888185
#Category 5:Outlet Size
# Frequency count of each Outlet_Size level.
my_outlet_size_table <- table(BigMartsFinalData$Outlet_Size)
print(my_outlet_size_table)
##
## High Medium Small
## 932 5203 2388
# Same counts expressed as percentages of all rows.
100 * prop.table(my_outlet_size_table)
##
## High Medium Small
## 10.93512 61.04658 28.01830
#Category 6:Outlet_Location_Type
# Frequency count of each Outlet_Location_Type level.
my_outlet_location_type_table <- table(BigMartsFinalData$Outlet_Location_Type)
print(my_outlet_location_type_table)
##
## Tier 1 Tier 2 Tier 3
## 2388 2785 3350
# Same counts expressed as percentages of all rows.
100 * prop.table(my_outlet_location_type_table)
##
## Tier 1 Tier 2 Tier 3
## 28.01830 32.67629 39.30541
#Category 7:Outlet_Type
# Frequency count of each Outlet_Type level.
my_outlet_type_table <- table(BigMartsFinalData$Outlet_Type)
print(my_outlet_type_table)
##
## Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3
## 1083 5577 928 935
# Same counts expressed as percentages of all rows.
100 * prop.table(my_outlet_type_table)
##
## Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3
## 12.70679 65.43471 10.88818 10.97032
#Task 4: Create two-way contingency tables for the categorical variables in your dataset.
#4.1 Item Fat Content vs Item Type
# Cross-tabulated counts: Item_Fat_Content (rows) by Item_Type (columns).
my_table_1 <- xtabs(~ Item_Fat_Content + Item_Type, data = BigMartsFinalData)
print(my_table_1)
## Item_Type
## Item_Fat_Content Baking Goods Breads Breakfast Canned Dairy Frozen Foods
## Low Fat 329 140 41 341 418 450
## Regular 319 111 69 308 264 406
## Item_Type
## Item_Fat_Content Fruits and Vegetables Hard Drinks Health and Hygiene
## Low Fat 630 214 520
## Regular 602 0 0
## Item_Type
## Item_Fat_Content Household Meat Others Seafood Snack Foods Soft Drinks
## Low Fat 910 170 169 37 692 374
## Regular 0 255 0 27 508 71
## Item_Type
## Item_Fat_Content Starchy Foods
## Low Fat 82
## Regular 66
# Joint proportions: each cell divided by the grand total (cells sum to 1).
print(prop.table(my_table_1))
## Item_Type
## Item_Fat_Content Baking Goods Breads Breakfast Canned
## Low Fat 0.038601431 0.016426141 0.004810513 0.040009386
## Regular 0.037428136 0.013023583 0.008095741 0.036137510
## Item_Type
## Item_Fat_Content Dairy Frozen Foods Fruits and Vegetables
## Low Fat 0.049043764 0.052798310 0.073917635
## Regular 0.030975009 0.047635809 0.070632406
## Item_Type
## Item_Fat_Content Hard Drinks Health and Hygiene Household Meat
## Low Fat 0.025108530 0.061011381 0.106769917 0.019946028
## Regular 0.000000000 0.000000000 0.000000000 0.029919043
## Item_Type
## Item_Fat_Content Others Seafood Snack Foods Soft Drinks
## Low Fat 0.019828699 0.004341194 0.081192069 0.043881262
## Regular 0.000000000 0.003167899 0.059603426 0.008330400
## Item_Type
## Item_Fat_Content Starchy Foods
## Low Fat 0.009621025
## Regular 0.007743752
#4.2 Item Fat Content vs Outlet Identifier
# Cross-tabulated counts: Item_Fat_Content (rows) by Outlet_Identifier (columns).
my_table_2 <- xtabs(
  ~ Item_Fat_Content + Outlet_Identifier,
  data = BigMartsFinalData
)
print(my_table_2)
## Outlet_Identifier
## Item_Fat_Content OUT010 OUT013 OUT017 OUT018 OUT019 OUT027 OUT035 OUT045
## Low Fat 359 606 594 598 346 605 606 609
## Regular 196 326 332 330 182 330 324 320
## Outlet_Identifier
## Item_Fat_Content OUT046 OUT049
## Low Fat 598 596
## Regular 332 334
# Joint proportions: each cell divided by the grand total (cells sum to 1).
print(prop.table(my_table_2))
## Outlet_Identifier
## Item_Fat_Content OUT010 OUT013 OUT017 OUT018 OUT019
## Low Fat 0.04212132 0.07110172 0.06969377 0.07016309 0.04059603
## Regular 0.02299660 0.03824944 0.03895342 0.03871876 0.02135398
## Outlet_Identifier
## Item_Fat_Content OUT027 OUT035 OUT045 OUT046 OUT049
## Low Fat 0.07098440 0.07110172 0.07145371 0.07016309 0.06992843
## Regular 0.03871876 0.03801478 0.03754547 0.03895342 0.03918808
#4.3 Item Fat Content vs Outlet Establishment Year
# Cross-tabulated counts: Item_Fat_Content (rows) by Outlet_Establishment_Year
# (columns).
my_table_3 <- xtabs(
  ~ Item_Fat_Content + Outlet_Establishment_Year,
  data = BigMartsFinalData
)
print(my_table_3)
## Outlet_Establishment_Year
## Item_Fat_Content 1985 1987 1997 1998 1999 2002 2004 2007 2009
## Low Fat 951 606 598 359 596 609 606 594 598
## Regular 512 326 332 196 334 320 324 332 330
# Joint proportions: each cell divided by the grand total (cells sum to 1).
print(prop.table(my_table_3))
## Outlet_Establishment_Year
## Item_Fat_Content 1985 1987 1997 1998 1999
## Low Fat 0.11158043 0.07110172 0.07016309 0.04212132 0.06992843
## Regular 0.06007274 0.03824944 0.03895342 0.02299660 0.03918808
## Outlet_Establishment_Year
## Item_Fat_Content 2002 2004 2007 2009
## Low Fat 0.07145371 0.07110172 0.06969377 0.07016309
## Regular 0.03754547 0.03801478 0.03895342 0.03871876
#4.4 Item Fat Content vs Outlet Size
# Cross-tabulated counts: Item_Fat_Content (rows) by Outlet_Size (columns).
my_table_4 <- xtabs(~ Item_Fat_Content + Outlet_Size, data = BigMartsFinalData)
print(my_table_4)
## Outlet_Size
## Item_Fat_Content High Medium Small
## Low Fat 606 3361 1550
## Regular 326 1842 838
# Joint proportions: each cell divided by the grand total (cells sum to 1).
print(prop.table(my_table_4))
## Outlet_Size
## Item_Fat_Content High Medium Small
## Low Fat 0.07110172 0.39434471 0.18186085
## Regular 0.03824944 0.21612108 0.09832219
#4.5 Item Fat Content vs Outlet_Location_Type
# Cross-tabulated counts: Item_Fat_Content (rows) by Outlet_Location_Type
# (columns).
my_table_5 <- xtabs(
  ~ Item_Fat_Content + Outlet_Location_Type,
  data = BigMartsFinalData
)
print(my_table_5)
## Outlet_Location_Type
## Item_Fat_Content Tier 1 Tier 2 Tier 3
## Low Fat 1540 1809 2168
## Regular 848 976 1182
# Joint proportions: each cell divided by the grand total (cells sum to 1).
print(prop.table(my_table_5))
## Outlet_Location_Type
## Item_Fat_Content Tier 1 Tier 2 Tier 3
## Low Fat 0.18068755 0.21224921 0.25437053
## Regular 0.09949548 0.11451367 0.13868356
#4.6 Item Fat Content vs Outlet_Type
# Cross-tabulated counts: Item_Fat_Content (rows) by Outlet_Type (columns).
my_table_6 <- xtabs(~ Item_Fat_Content + Outlet_Type, data = BigMartsFinalData)
print(my_table_6)
## Outlet_Type
## Item_Fat_Content Grocery Store Supermarket Type1 Supermarket Type2
## Low Fat 705 3609 598
## Regular 378 1968 330
## Outlet_Type
## Item_Fat_Content Supermarket Type3
## Low Fat 605
## Regular 330
# Joint proportions: each cell divided by the grand total (cells sum to 1).
print(prop.table(my_table_6))
## Outlet_Type
## Item_Fat_Content Grocery Store Supermarket Type1 Supermarket Type2
## Low Fat 0.08271735 0.42344245 0.07016309
## Regular 0.04435058 0.23090461 0.03871876
## Outlet_Type
## Item_Fat_Content Supermarket Type3
## Low Fat 0.07098440
## Regular 0.03871876
#4.7 Item Type vs Outlet Identifier
# Cross-tabulated counts: Item_Type (rows) by Outlet_Identifier (columns).
my_table_7 <- xtabs(~ Item_Type + Outlet_Identifier, data = BigMartsFinalData)
print(my_table_7)
## Outlet_Identifier
## Item_Type OUT010 OUT013 OUT017 OUT018 OUT019 OUT027 OUT035
## Baking Goods 42 73 73 68 43 69 68
## Breads 17 25 22 27 16 31 29
## Breakfast 9 13 12 12 10 11 10
## Canned 35 65 69 78 38 72 79
## Dairy 43 80 74 73 49 67 71
## Frozen Foods 54 92 106 92 49 89 92
## Fruits and Vegetables 79 142 127 135 73 140 129
## Hard Drinks 16 23 22 22 8 23 22
## Health and Hygiene 37 61 61 58 30 60 50
## Household 67 103 95 95 52 99 102
## Meat 34 41 44 46 32 56 43
## Others 10 16 16 20 17 15 16
## Seafood 4 5 5 7 6 7 8
## Snack Foods 71 125 128 132 75 137 140
## Soft Drinks 28 49 54 46 26 45 52
## Starchy Foods 9 19 18 17 4 14 19
## Outlet_Identifier
## Item_Type OUT045 OUT046 OUT049
## Baking Goods 70 76 66
## Breads 33 26 25
## Breakfast 10 10 13
## Canned 74 72 67
## Dairy 69 78 78
## Frozen Foods 81 108 93
## Fruits and Vegetables 143 126 138
## Hard Drinks 28 20 30
## Health and Hygiene 55 56 52
## Household 99 103 95
## Meat 38 44 47
## Others 20 22 17
## Seafood 9 6 7
## Snack Foods 133 120 139
## Soft Drinks 51 48 46
## Starchy Foods 16 15 17
# Joint proportions: each cell divided by the grand total (cells sum to 1).
print(prop.table(my_table_7))
## Outlet_Identifier
## Item_Type OUT010 OUT013 OUT017
## Baking Goods 0.0049278423 0.0085650593 0.0085650593
## Breads 0.0019946028 0.0029332395 0.0025812507
## Breakfast 0.0010559662 0.0015252845 0.0014079549
## Canned 0.0041065353 0.0076264226 0.0080957409
## Dairy 0.0050451719 0.0093863663 0.0086823888
## Frozen Foods 0.0063357973 0.0107943212 0.0124369354
## Fruits and Vegetables 0.0092690367 0.0166608002 0.0149008565
## Hard Drinks 0.0018772733 0.0026985803 0.0025812507
## Health and Hygiene 0.0043411944 0.0071571043 0.0071571043
## Household 0.0078610818 0.0120849466 0.0111463100
## Meat 0.0039892057 0.0048105127 0.0051625015
## Others 0.0011732958 0.0018772733 0.0018772733
## Seafood 0.0004693183 0.0005866479 0.0005866479
## Snack Foods 0.0083304001 0.0146661973 0.0150181861
## Soft Drinks 0.0032852282 0.0057491494 0.0063357973
## Starchy Foods 0.0010559662 0.0022292620 0.0021119324
## Outlet_Identifier
## Item_Type OUT018 OUT019 OUT027
## Baking Goods 0.0079784114 0.0050451719 0.0080957409
## Breads 0.0031678986 0.0018772733 0.0036372169
## Breakfast 0.0014079549 0.0011732958 0.0012906254
## Canned 0.0091517071 0.0044585240 0.0084477297
## Dairy 0.0085650593 0.0057491494 0.0078610818
## Frozen Foods 0.0107943212 0.0057491494 0.0104423325
## Fruits and Vegetables 0.0158394931 0.0085650593 0.0164261410
## Hard Drinks 0.0025812507 0.0009386366 0.0026985803
## Health and Hygiene 0.0068051156 0.0035198874 0.0070397747
## Household 0.0111463100 0.0061011381 0.0116156283
## Meat 0.0053971606 0.0037545465 0.0065704564
## Others 0.0023465916 0.0019946028 0.0017599437
## Seafood 0.0008213071 0.0007039775 0.0008213071
## Snack Foods 0.0154875044 0.0087997184 0.0160741523
## Soft Drinks 0.0053971606 0.0030505690 0.0052798310
## Starchy Foods 0.0019946028 0.0004693183 0.0016426141
## Outlet_Identifier
## Item_Type OUT035 OUT045 OUT046
## Baking Goods 0.0079784114 0.0082130705 0.0089170480
## Breads 0.0034025578 0.0038718761 0.0030505690
## Breakfast 0.0011732958 0.0011732958 0.0011732958
## Canned 0.0092690367 0.0086823888 0.0084477297
## Dairy 0.0083304001 0.0080957409 0.0091517071
## Frozen Foods 0.0107943212 0.0095036959 0.0126715945
## Fruits and Vegetables 0.0151355157 0.0167781298 0.0147835269
## Hard Drinks 0.0025812507 0.0032852282 0.0023465916
## Health and Hygiene 0.0058664789 0.0064531268 0.0065704564
## Household 0.0119676170 0.0116156283 0.0120849466
## Meat 0.0050451719 0.0044585240 0.0051625015
## Others 0.0018772733 0.0023465916 0.0025812507
## Seafood 0.0009386366 0.0010559662 0.0007039775
## Snack Foods 0.0164261410 0.0156048340 0.0140795495
## Soft Drinks 0.0061011381 0.0059838085 0.0056318198
## Starchy Foods 0.0022292620 0.0018772733 0.0017599437
## Outlet_Identifier
## Item_Type OUT049
## Baking Goods 0.0077437522
## Breads 0.0029332395
## Breakfast 0.0015252845
## Canned 0.0078610818
## Dairy 0.0091517071
## Frozen Foods 0.0109116508
## Fruits and Vegetables 0.0161914819
## Hard Drinks 0.0035198874
## Health and Hygiene 0.0061011381
## Household 0.0111463100
## Meat 0.0055144902
## Others 0.0019946028
## Seafood 0.0008213071
## Snack Foods 0.0163088115
## Soft Drinks 0.0053971606
## Starchy Foods 0.0019946028
#4.8 Item Type vs Outlet Establishment Year
# Count observations for every Item_Type (rows) x
# Outlet_Establishment_Year (columns) combination, then echo the table.
my_table_8 <- xtabs(
  formula = ~ Item_Type + Outlet_Establishment_Year,
  data = BigMartsFinalData
)
my_table_8
## Outlet_Establishment_Year
## Item_Type 1985 1987 1997 1998 1999 2002 2004 2007 2009
## Baking Goods 112 73 76 42 66 70 68 73 68
## Breads 47 25 26 17 25 33 29 22 27
## Breakfast 21 13 10 9 13 10 10 12 12
## Canned 110 65 72 35 67 74 79 69 78
## Dairy 116 80 78 43 78 69 71 74 73
## Frozen Foods 138 92 108 54 93 81 92 106 92
## Fruits and Vegetables 213 142 126 79 138 143 129 127 135
## Hard Drinks 31 23 20 16 30 28 22 22 22
## Health and Hygiene 90 61 56 37 52 55 50 61 58
## Household 151 103 103 67 95 99 102 95 95
## Meat 88 41 44 34 47 38 43 44 46
## Others 32 16 22 10 17 20 16 16 20
## Seafood 13 5 6 4 7 9 8 5 7
## Snack Foods 212 125 120 71 139 133 140 128 132
## Soft Drinks 71 49 48 28 46 51 52 54 46
## Starchy Foods 18 19 15 9 17 16 19 18 17
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_8, margin = NULL)
## Outlet_Establishment_Year
## Item_Type 1985 1987 1997
## Baking Goods 0.0131409128 0.0085650593 0.0089170480
## Breads 0.0055144902 0.0029332395 0.0030505690
## Breakfast 0.0024639212 0.0015252845 0.0011732958
## Canned 0.0129062537 0.0076264226 0.0084477297
## Dairy 0.0136102311 0.0093863663 0.0091517071
## Frozen Foods 0.0161914819 0.0107943212 0.0126715945
## Fruits and Vegetables 0.0249912003 0.0166608002 0.0147835269
## Hard Drinks 0.0036372169 0.0026985803 0.0023465916
## Health and Hygiene 0.0105596621 0.0071571043 0.0065704564
## Household 0.0177167664 0.0120849466 0.0120849466
## Meat 0.0103250029 0.0048105127 0.0051625015
## Others 0.0037545465 0.0018772733 0.0025812507
## Seafood 0.0015252845 0.0005866479 0.0007039775
## Snack Foods 0.0248738707 0.0146661973 0.0140795495
## Soft Drinks 0.0083304001 0.0057491494 0.0056318198
## Starchy Foods 0.0021119324 0.0022292620 0.0017599437
## Outlet_Establishment_Year
## Item_Type 1998 1999 2002
## Baking Goods 0.0049278423 0.0077437522 0.0082130705
## Breads 0.0019946028 0.0029332395 0.0038718761
## Breakfast 0.0010559662 0.0015252845 0.0011732958
## Canned 0.0041065353 0.0078610818 0.0086823888
## Dairy 0.0050451719 0.0091517071 0.0080957409
## Frozen Foods 0.0063357973 0.0109116508 0.0095036959
## Fruits and Vegetables 0.0092690367 0.0161914819 0.0167781298
## Hard Drinks 0.0018772733 0.0035198874 0.0032852282
## Health and Hygiene 0.0043411944 0.0061011381 0.0064531268
## Household 0.0078610818 0.0111463100 0.0116156283
## Meat 0.0039892057 0.0055144902 0.0044585240
## Others 0.0011732958 0.0019946028 0.0023465916
## Seafood 0.0004693183 0.0008213071 0.0010559662
## Snack Foods 0.0083304001 0.0163088115 0.0156048340
## Soft Drinks 0.0032852282 0.0053971606 0.0059838085
## Starchy Foods 0.0010559662 0.0019946028 0.0018772733
## Outlet_Establishment_Year
## Item_Type 2004 2007 2009
## Baking Goods 0.0079784114 0.0085650593 0.0079784114
## Breads 0.0034025578 0.0025812507 0.0031678986
## Breakfast 0.0011732958 0.0014079549 0.0014079549
## Canned 0.0092690367 0.0080957409 0.0091517071
## Dairy 0.0083304001 0.0086823888 0.0085650593
## Frozen Foods 0.0107943212 0.0124369354 0.0107943212
## Fruits and Vegetables 0.0151355157 0.0149008565 0.0158394931
## Hard Drinks 0.0025812507 0.0025812507 0.0025812507
## Health and Hygiene 0.0058664789 0.0071571043 0.0068051156
## Household 0.0119676170 0.0111463100 0.0111463100
## Meat 0.0050451719 0.0051625015 0.0053971606
## Others 0.0018772733 0.0018772733 0.0023465916
## Seafood 0.0009386366 0.0005866479 0.0008213071
## Snack Foods 0.0164261410 0.0150181861 0.0154875044
## Soft Drinks 0.0061011381 0.0063357973 0.0053971606
## Starchy Foods 0.0022292620 0.0021119324 0.0019946028
#4.9 Item Type vs Outlet Size
# Count observations for every Item_Type (rows) x Outlet_Size (columns)
# combination, then echo the table.
my_table_9 <- xtabs(
  formula = ~ Item_Type + Outlet_Size,
  data = BigMartsFinalData
)
my_table_9
## Outlet_Size
## Item_Type High Medium Small
## Baking Goods 73 388 187
## Breads 25 155 71
## Breakfast 13 67 30
## Canned 65 395 189
## Dairy 80 404 198
## Frozen Foods 92 515 249
## Fruits and Vegetables 142 762 328
## Hard Drinks 23 141 50
## Health and Hygiene 61 323 136
## Household 103 550 257
## Meat 41 265 119
## Others 16 98 55
## Seafood 5 39 20
## Snack Foods 125 740 335
## Soft Drinks 49 270 126
## Starchy Foods 19 91 38
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_9, margin = NULL)
## Outlet_Size
## Item_Type High Medium Small
## Baking Goods 0.0085650593 0.0455238766 0.0219406312
## Breads 0.0029332395 0.0181860847 0.0083304001
## Breakfast 0.0015252845 0.0078610818 0.0035198874
## Canned 0.0076264226 0.0463451836 0.0221752904
## Dairy 0.0093863663 0.0474011498 0.0232312566
## Frozen Foods 0.0107943212 0.0604247331 0.0292150651
## Fruits and Vegetables 0.0166608002 0.0894051390 0.0384841018
## Hard Drinks 0.0026985803 0.0165434706 0.0058664789
## Health and Hygiene 0.0071571043 0.0378974539 0.0159568227
## Household 0.0120849466 0.0645312683 0.0301537017
## Meat 0.0048105127 0.0310923384 0.0139622199
## Others 0.0018772733 0.0114982987 0.0064531268
## Seafood 0.0005866479 0.0045758536 0.0023465916
## Snack Foods 0.0146661973 0.0868238883 0.0393054089
## Soft Drinks 0.0057491494 0.0316789863 0.0147835269
## Starchy Foods 0.0022292620 0.0106769917 0.0044585240
#4.10 Item Type vs Outlet_Location_Type
# Count observations for every Item_Type (rows) x Outlet_Location_Type
# (columns) combination, then echo the table.
my_table_10 <- xtabs(
  formula = ~ Item_Type + Outlet_Location_Type,
  data = BigMartsFinalData
)
my_table_10
## Outlet_Location_Type
## Item_Type Tier 1 Tier 2 Tier 3
## Baking Goods 185 211 252
## Breads 67 84 100
## Breakfast 33 32 45
## Canned 177 222 250
## Dairy 205 214 263
## Frozen Foods 250 279 327
## Fruits and Vegetables 337 399 496
## Hard Drinks 58 72 84
## Health and Hygiene 138 166 216
## Household 250 296 364
## Meat 123 125 177
## Others 56 52 61
## Seafood 19 22 23
## Snack Foods 334 401 465
## Soft Drinks 120 157 168
## Starchy Foods 36 53 59
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_10, margin = NULL)
## Outlet_Location_Type
## Item_Type Tier 1 Tier 2 Tier 3
## Baking Goods 0.021705972 0.024756541 0.029567054
## Breads 0.007861082 0.009855685 0.011732958
## Breakfast 0.003871876 0.003754547 0.005279831
## Canned 0.020767335 0.026047166 0.029332395
## Dairy 0.024052564 0.025108530 0.030857679
## Frozen Foods 0.029332395 0.032734952 0.038366772
## Fruits and Vegetables 0.039540068 0.046814502 0.058195471
## Hard Drinks 0.006805116 0.008447730 0.009855685
## Health and Hygiene 0.016191482 0.019476710 0.025343189
## Household 0.029332395 0.034729555 0.042707967
## Meat 0.014431538 0.014666197 0.020767335
## Others 0.006570456 0.006101138 0.007157104
## Seafood 0.002229262 0.002581251 0.002698580
## Snack Foods 0.039188079 0.047049161 0.054558254
## Soft Drinks 0.014079549 0.018420744 0.019711369
## Starchy Foods 0.004223865 0.006218468 0.006922445
#4.11 Item Type vs Outlet_Type
# Count observations for every Item_Type (rows) x Outlet_Type (columns)
# combination, then echo the table.
my_table_11 <- xtabs(
  formula = ~ Item_Type + Outlet_Type,
  data = BigMartsFinalData
)
my_table_11
## Outlet_Type
## Item_Type Grocery Store Supermarket Type1 Supermarket Type2
## Baking Goods 85 426 68
## Breads 33 160 27
## Breakfast 19 68 12
## Canned 73 426 78
## Dairy 92 450 73
## Frozen Foods 103 572 92
## Fruits and Vegetables 152 805 135
## Hard Drinks 24 145 22
## Health and Hygiene 67 335 58
## Household 119 597 95
## Meat 66 257 46
## Others 27 107 20
## Seafood 10 40 7
## Snack Foods 146 785 132
## Soft Drinks 54 300 46
## Starchy Foods 13 104 17
## Outlet_Type
## Item_Type Supermarket Type3
## Baking Goods 69
## Breads 31
## Breakfast 11
## Canned 72
## Dairy 67
## Frozen Foods 89
## Fruits and Vegetables 140
## Hard Drinks 23
## Health and Hygiene 60
## Household 99
## Meat 56
## Others 15
## Seafood 7
## Snack Foods 137
## Soft Drinks 45
## Starchy Foods 14
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_11, margin = NULL)
## Outlet_Type
## Item_Type Grocery Store Supermarket Type1 Supermarket Type2
## Baking Goods 0.0099730142 0.0499824006 0.0079784114
## Breads 0.0038718761 0.0187727326 0.0031678986
## Breakfast 0.0022292620 0.0079784114 0.0014079549
## Canned 0.0085650593 0.0499824006 0.0091517071
## Dairy 0.0107943212 0.0527983105 0.0085650593
## Frozen Foods 0.0120849466 0.0671125191 0.0107943212
## Fruits and Vegetables 0.0178340960 0.0944503109 0.0158394931
## Hard Drinks 0.0028159099 0.0170127889 0.0025812507
## Health and Hygiene 0.0078610818 0.0393054089 0.0068051156
## Household 0.0139622199 0.0700457585 0.0111463100
## Meat 0.0077437522 0.0301537017 0.0053971606
## Others 0.0031678986 0.0125542649 0.0023465916
## Seafood 0.0011732958 0.0046931832 0.0008213071
## Snack Foods 0.0171301185 0.0921037193 0.0154875044
## Soft Drinks 0.0063357973 0.0351988736 0.0053971606
## Starchy Foods 0.0015252845 0.0122022762 0.0019946028
## Outlet_Type
## Item_Type Supermarket Type3
## Baking Goods 0.0080957409
## Breads 0.0036372169
## Breakfast 0.0012906254
## Canned 0.0084477297
## Dairy 0.0078610818
## Frozen Foods 0.0104423325
## Fruits and Vegetables 0.0164261410
## Hard Drinks 0.0026985803
## Health and Hygiene 0.0070397747
## Household 0.0116156283
## Meat 0.0065704564
## Others 0.0017599437
## Seafood 0.0008213071
## Snack Foods 0.0160741523
## Soft Drinks 0.0052798310
## Starchy Foods 0.0016426141
#4.12 Outlet Identifier vs Outlet Establishment Year
# Count observations for every Outlet_Identifier (rows) x
# Outlet_Establishment_Year (columns) combination, then echo the table.
my_table_12 <- xtabs(
  formula = ~ Outlet_Identifier + Outlet_Establishment_Year,
  data = BigMartsFinalData
)
my_table_12
## Outlet_Establishment_Year
## Outlet_Identifier 1985 1987 1997 1998 1999 2002 2004 2007 2009
## OUT010 0 0 0 555 0 0 0 0 0
## OUT013 0 932 0 0 0 0 0 0 0
## OUT017 0 0 0 0 0 0 0 926 0
## OUT018 0 0 0 0 0 0 0 0 928
## OUT019 528 0 0 0 0 0 0 0 0
## OUT027 935 0 0 0 0 0 0 0 0
## OUT035 0 0 0 0 0 0 930 0 0
## OUT045 0 0 0 0 0 929 0 0 0
## OUT046 0 0 930 0 0 0 0 0 0
## OUT049 0 0 0 0 930 0 0 0 0
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_12, margin = NULL)
## Outlet_Establishment_Year
## Outlet_Identifier 1985 1987 1997 1998 1999
## OUT010 0.00000000 0.00000000 0.00000000 0.06511792 0.00000000
## OUT013 0.00000000 0.10935117 0.00000000 0.00000000 0.00000000
## OUT017 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## OUT018 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## OUT019 0.06195002 0.00000000 0.00000000 0.00000000 0.00000000
## OUT027 0.10970316 0.00000000 0.00000000 0.00000000 0.00000000
## OUT035 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## OUT045 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## OUT046 0.00000000 0.00000000 0.10911651 0.00000000 0.00000000
## OUT049 0.00000000 0.00000000 0.00000000 0.00000000 0.10911651
## Outlet_Establishment_Year
## Outlet_Identifier 2002 2004 2007 2009
## OUT010 0.00000000 0.00000000 0.00000000 0.00000000
## OUT013 0.00000000 0.00000000 0.00000000 0.00000000
## OUT017 0.00000000 0.00000000 0.10864719 0.00000000
## OUT018 0.00000000 0.00000000 0.00000000 0.10888185
## OUT019 0.00000000 0.00000000 0.00000000 0.00000000
## OUT027 0.00000000 0.00000000 0.00000000 0.00000000
## OUT035 0.00000000 0.10911651 0.00000000 0.00000000
## OUT045 0.10899918 0.00000000 0.00000000 0.00000000
## OUT046 0.00000000 0.00000000 0.00000000 0.00000000
## OUT049 0.00000000 0.00000000 0.00000000 0.00000000
#4.13 Outlet Identifier vs Outlet Size
# Count observations for every Outlet_Identifier (rows) x Outlet_Size
# (columns) combination, then echo the table.
my_table_13 <- xtabs(
  formula = ~ Outlet_Identifier + Outlet_Size,
  data = BigMartsFinalData
)
my_table_13
## Outlet_Size
## Outlet_Identifier High Medium Small
## OUT010 0 555 0
## OUT013 932 0 0
## OUT017 0 926 0
## OUT018 0 928 0
## OUT019 0 0 528
## OUT027 0 935 0
## OUT035 0 0 930
## OUT045 0 929 0
## OUT046 0 0 930
## OUT049 0 930 0
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_13, margin = NULL)
## Outlet_Size
## Outlet_Identifier High Medium Small
## OUT010 0.00000000 0.06511792 0.00000000
## OUT013 0.10935117 0.00000000 0.00000000
## OUT017 0.00000000 0.10864719 0.00000000
## OUT018 0.00000000 0.10888185 0.00000000
## OUT019 0.00000000 0.00000000 0.06195002
## OUT027 0.00000000 0.10970316 0.00000000
## OUT035 0.00000000 0.00000000 0.10911651
## OUT045 0.00000000 0.10899918 0.00000000
## OUT046 0.00000000 0.00000000 0.10911651
## OUT049 0.00000000 0.10911651 0.00000000
#4.14 Outlet Identifier vs Outlet_Location_Type
# Count observations for every Outlet_Identifier (rows) x
# Outlet_Location_Type (columns) combination, then echo the table.
my_table_14 <- xtabs(
  formula = ~ Outlet_Identifier + Outlet_Location_Type,
  data = BigMartsFinalData
)
my_table_14
## Outlet_Location_Type
## Outlet_Identifier Tier 1 Tier 2 Tier 3
## OUT010 0 0 555
## OUT013 0 0 932
## OUT017 0 926 0
## OUT018 0 0 928
## OUT019 528 0 0
## OUT027 0 0 935
## OUT035 0 930 0
## OUT045 0 929 0
## OUT046 930 0 0
## OUT049 930 0 0
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_14, margin = NULL)
## Outlet_Location_Type
## Outlet_Identifier Tier 1 Tier 2 Tier 3
## OUT010 0.00000000 0.00000000 0.06511792
## OUT013 0.00000000 0.00000000 0.10935117
## OUT017 0.00000000 0.10864719 0.00000000
## OUT018 0.00000000 0.00000000 0.10888185
## OUT019 0.06195002 0.00000000 0.00000000
## OUT027 0.00000000 0.00000000 0.10970316
## OUT035 0.00000000 0.10911651 0.00000000
## OUT045 0.00000000 0.10899918 0.00000000
## OUT046 0.10911651 0.00000000 0.00000000
## OUT049 0.10911651 0.00000000 0.00000000
#4.15 Outlet Identifier vs Outlet_Type
# Count observations for every Outlet_Identifier (rows) x Outlet_Type
# (columns) combination, then echo the table.
my_table_15 <- xtabs(
  formula = ~ Outlet_Identifier + Outlet_Type,
  data = BigMartsFinalData
)
my_table_15
## Outlet_Type
## Outlet_Identifier Grocery Store Supermarket Type1 Supermarket Type2
## OUT010 555 0 0
## OUT013 0 932 0
## OUT017 0 926 0
## OUT018 0 0 928
## OUT019 528 0 0
## OUT027 0 0 0
## OUT035 0 930 0
## OUT045 0 929 0
## OUT046 0 930 0
## OUT049 0 930 0
## Outlet_Type
## Outlet_Identifier Supermarket Type3
## OUT010 0
## OUT013 0
## OUT017 0
## OUT018 0
## OUT019 0
## OUT027 935
## OUT035 0
## OUT045 0
## OUT046 0
## OUT049 0
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_15, margin = NULL)
## Outlet_Type
## Outlet_Identifier Grocery Store Supermarket Type1 Supermarket Type2
## OUT010 0.06511792 0.00000000 0.00000000
## OUT013 0.00000000 0.10935117 0.00000000
## OUT017 0.00000000 0.10864719 0.00000000
## OUT018 0.00000000 0.00000000 0.10888185
## OUT019 0.06195002 0.00000000 0.00000000
## OUT027 0.00000000 0.00000000 0.00000000
## OUT035 0.00000000 0.10911651 0.00000000
## OUT045 0.00000000 0.10899918 0.00000000
## OUT046 0.00000000 0.10911651 0.00000000
## OUT049 0.00000000 0.10911651 0.00000000
## Outlet_Type
## Outlet_Identifier Supermarket Type3
## OUT010 0.00000000
## OUT013 0.00000000
## OUT017 0.00000000
## OUT018 0.00000000
## OUT019 0.00000000
## OUT027 0.10970316
## OUT035 0.00000000
## OUT045 0.00000000
## OUT046 0.00000000
## OUT049 0.00000000
#4.16 Outlet Establishment Year vs Outlet Size
# Count observations for every Outlet_Establishment_Year (rows) x
# Outlet_Size (columns) combination, then echo the table.
my_table_16 <- xtabs(
  formula = ~ Outlet_Establishment_Year + Outlet_Size,
  data = BigMartsFinalData
)
my_table_16
## Outlet_Size
## Outlet_Establishment_Year High Medium Small
## 1985 0 935 528
## 1987 932 0 0
## 1997 0 0 930
## 1998 0 555 0
## 1999 0 930 0
## 2002 0 929 0
## 2004 0 0 930
## 2007 0 926 0
## 2009 0 928 0
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_16, margin = NULL)
## Outlet_Size
## Outlet_Establishment_Year High Medium Small
## 1985 0.00000000 0.10970316 0.06195002
## 1987 0.10935117 0.00000000 0.00000000
## 1997 0.00000000 0.00000000 0.10911651
## 1998 0.00000000 0.06511792 0.00000000
## 1999 0.00000000 0.10911651 0.00000000
## 2002 0.00000000 0.10899918 0.00000000
## 2004 0.00000000 0.00000000 0.10911651
## 2007 0.00000000 0.10864719 0.00000000
## 2009 0.00000000 0.10888185 0.00000000
#4.17 Outlet Establishment Year vs Outlet_Location_Type
# Count observations for every Outlet_Establishment_Year (rows) x
# Outlet_Location_Type (columns) combination, then echo the table.
my_table_17 <- xtabs(
  formula = ~ Outlet_Establishment_Year + Outlet_Location_Type,
  data = BigMartsFinalData
)
my_table_17
## Outlet_Location_Type
## Outlet_Establishment_Year Tier 1 Tier 2 Tier 3
## 1985 528 0 935
## 1987 0 0 932
## 1997 930 0 0
## 1998 0 0 555
## 1999 930 0 0
## 2002 0 929 0
## 2004 0 930 0
## 2007 0 926 0
## 2009 0 0 928
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_17, margin = NULL)
## Outlet_Location_Type
## Outlet_Establishment_Year Tier 1 Tier 2 Tier 3
## 1985 0.06195002 0.00000000 0.10970316
## 1987 0.00000000 0.00000000 0.10935117
## 1997 0.10911651 0.00000000 0.00000000
## 1998 0.00000000 0.00000000 0.06511792
## 1999 0.10911651 0.00000000 0.00000000
## 2002 0.00000000 0.10899918 0.00000000
## 2004 0.00000000 0.10911651 0.00000000
## 2007 0.00000000 0.10864719 0.00000000
## 2009 0.00000000 0.00000000 0.10888185
#4.18 Outlet Establishment Year vs Outlet_Type
# Count observations for every Outlet_Establishment_Year (rows) x
# Outlet_Type (columns) combination, then echo the table.
my_table_18 <- xtabs(
  formula = ~ Outlet_Establishment_Year + Outlet_Type,
  data = BigMartsFinalData
)
my_table_18
## Outlet_Type
## Outlet_Establishment_Year Grocery Store Supermarket Type1
## 1985 528 0
## 1987 0 932
## 1997 0 930
## 1998 555 0
## 1999 0 930
## 2002 0 929
## 2004 0 930
## 2007 0 926
## 2009 0 0
## Outlet_Type
## Outlet_Establishment_Year Supermarket Type2 Supermarket Type3
## 1985 0 935
## 1987 0 0
## 1997 0 0
## 1998 0 0
## 1999 0 0
## 2002 0 0
## 2004 0 0
## 2007 0 0
## 2009 928 0
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_18, margin = NULL)
## Outlet_Type
## Outlet_Establishment_Year Grocery Store Supermarket Type1
## 1985 0.06195002 0.00000000
## 1987 0.00000000 0.10935117
## 1997 0.00000000 0.10911651
## 1998 0.06511792 0.00000000
## 1999 0.00000000 0.10911651
## 2002 0.00000000 0.10899918
## 2004 0.00000000 0.10911651
## 2007 0.00000000 0.10864719
## 2009 0.00000000 0.00000000
## Outlet_Type
## Outlet_Establishment_Year Supermarket Type2 Supermarket Type3
## 1985 0.00000000 0.10970316
## 1987 0.00000000 0.00000000
## 1997 0.00000000 0.00000000
## 1998 0.00000000 0.00000000
## 1999 0.00000000 0.00000000
## 2002 0.00000000 0.00000000
## 2004 0.00000000 0.00000000
## 2007 0.00000000 0.00000000
## 2009 0.10888185 0.00000000
#4.19 Outlet Size vs Outlet_Location_Type
# Count observations for every Outlet_Size (rows) x Outlet_Location_Type
# (columns) combination, then echo the table.
my_table_19 <- xtabs(
  formula = ~ Outlet_Size + Outlet_Location_Type,
  data = BigMartsFinalData
)
my_table_19
## Outlet_Location_Type
## Outlet_Size Tier 1 Tier 2 Tier 3
## High 0 0 932
## Medium 930 1855 2418
## Small 1458 930 0
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_19, margin = NULL)
## Outlet_Location_Type
## Outlet_Size Tier 1 Tier 2 Tier 3
## High 0.0000000 0.0000000 0.1093512
## Medium 0.1091165 0.2176464 0.2837029
## Small 0.1710665 0.1091165 0.0000000
#4.20 Outlet Size vs Outlet_Type
# Count observations for every Outlet_Size (rows) x Outlet_Type (columns)
# combination, then echo the table.
my_table_20 <- xtabs(
  formula = ~ Outlet_Size + Outlet_Type,
  data = BigMartsFinalData
)
my_table_20
## Outlet_Type
## Outlet_Size Grocery Store Supermarket Type1 Supermarket Type2
## High 0 932 0
## Medium 555 2785 928
## Small 528 1860 0
## Outlet_Type
## Outlet_Size Supermarket Type3
## High 0
## Medium 935
## Small 0
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_20, margin = NULL)
## Outlet_Type
## Outlet_Size Grocery Store Supermarket Type1 Supermarket Type2
## High 0.00000000 0.10935117 0.00000000
## Medium 0.06511792 0.32676288 0.10888185
## Small 0.06195002 0.21823302 0.00000000
## Outlet_Type
## Outlet_Size Supermarket Type3
## High 0.00000000
## Medium 0.10970316
## Small 0.00000000
#4.21 Outlet_Location_Type vs Outlet_Type
# Count observations for every Outlet_Location_Type (rows) x Outlet_Type
# (columns) combination, then echo the table.
my_table_21 <- xtabs(
  formula = ~ Outlet_Location_Type + Outlet_Type,
  data = BigMartsFinalData
)
my_table_21
## Outlet_Type
## Outlet_Location_Type Grocery Store Supermarket Type1 Supermarket Type2
## Tier 1 528 1860 0
## Tier 2 0 2785 0
## Tier 3 555 932 928
## Outlet_Type
## Outlet_Location_Type Supermarket Type3
## Tier 1 0
## Tier 2 0
## Tier 3 935
# Share of all observations in each cell (no margin: divide by the grand total).
prop.table(my_table_21, margin = NULL)
## Outlet_Type
## Outlet_Location_Type Grocery Store Supermarket Type1 Supermarket Type2
## Tier 1 0.06195002 0.21823302 0.00000000
## Tier 2 0.00000000 0.32676288 0.00000000
## Tier 3 0.06511792 0.10935117 0.10888185
## Outlet_Type
## Outlet_Location_Type Supermarket Type3
## Tier 1 0.00000000
## Tier 2 0.00000000
## Tier 3 0.10970316
#task 5:Draw a boxplot of the variables that belong to your study.
library(lattice)
# One boxplot per numeric item attribute (weight, visibility, retail price).
# The vectors are passed directly, so no `data` argument is supplied: the
# default boxplot method has no such parameter and would only carry it along
# as an unused graphical setting.
boxplot(
  BigMartsFinalData$Item_Weight,
  main = "Distribution of Weights of items",
  xlab = "Items Weight",
  ylab = "Weight in Kg",
  varwidth = TRUE
)

boxplot(
  BigMartsFinalData$Item_Visibility,
  main = "Distribution of visibility of items",
  xlab = "Items Visibility",
  ylab = "Range of Visibility",
  varwidth = TRUE
)

boxplot(
  BigMartsFinalData$Item_MRP,
  main = "Distribution of Retail Price of items",
  xlab = "Items Retail Price",
  ylab = "Range of Retail Price",
  varwidth = TRUE
)

#task 6:Draw Histograms for your suitable data fields.
library(lattice)
# Lattice histograms of the categorical fields, one per column.
# lattice::histogram() returns a trellis object that is only drawn when it is
# printed, so each call is wrapped in print(); this keeps the plots visible
# when the script is run through source() rather than typed at the console.
# `varwidth` is not a histogram() argument (it was only passed on some of the
# calls), so it is no longer supplied.
print(histogram(~ as.factor(Item_Identifier), data = BigMartsFinalData,
                xlab = "Item Identifier"))

print(histogram(~ as.factor(Item_Fat_Content), data = BigMartsFinalData,
                xlab = "Fat contents"))

print(histogram(~ as.factor(Item_Type), data = BigMartsFinalData,
                xlab = "Item Type"))

print(histogram(~ as.factor(Outlet_Identifier), data = BigMartsFinalData,
                xlab = "Outlet Identifier"))

print(histogram(~ as.factor(Outlet_Establishment_Year),
                data = BigMartsFinalData,
                xlab = "Outlet establishment year"))

print(histogram(~ as.factor(Outlet_Size), data = BigMartsFinalData,
                xlab = "Outlet Size"))

print(histogram(~ as.factor(Outlet_Location_Type), data = BigMartsFinalData,
                xlab = "Outlet Location Type"))

print(histogram(~ as.factor(Outlet_Type), data = BigMartsFinalData,
                xlab = "Outlet Type"))

#task 7: Draw suitable plot for your data fields.
library(car)
#7.1 Scatterplot between Item_Weight and Its Outlet_Type
# Plots Item_Weight (y) against Outlet_Type (x) with scatterplot(); car is
# loaded just above.
# NOTE(review): Outlet_Type is categorical (four store types in the tables
# above), so this is presumably rendered as per-level boxplots rather than a
# point cloud - verify the output.
# NOTE(review): `spread` and `smoother.args` look like arguments from an older
# car release - confirm they are still accepted by the installed version.
scatterplot(BigMartsFinalData$Item_Weight~BigMartsFinalData$Outlet_Type,spread=FALSE, smoother.args=list(lty=2), pch=19,main="Scatterplot of Items Weight vs.Outlet Type ",xlab="Outlet Type",ylab="Items Weight")

#7.2 Boxplot of Items Weights vs Items Type
# Item weight distribution split by item type (one box per Item_Type level).
boxplot(
  Item_Weight ~ Item_Type,
  data = BigMartsFinalData,
  xlab = "Item Type",
  ylab = "Items Weight",
  main = "Items Weight vs Items Type"
)

#7.3 Scatterplot between Items Visibility and Its Retail Price
# Plots Item_Visibility (y) against Item_MRP (x) with scatterplot(), using
# solid points (pch = 19).
# NOTE(review): `spread` and `smoother.args` look like arguments from an older
# car release - confirm they are still accepted by the installed version.
scatterplot(BigMartsFinalData$Item_Visibility~BigMartsFinalData$Item_MRP,spread=FALSE, smoother.args=list(lty=2), pch=19,main="Scatterplot of Items Visibility vs. Item MRP ",xlab="MRP",ylab="Visibility")

#7.4 Boxplot between Items Outlet Sales and Its Outlets location Type
# Sales distribution split by outlet location tier. In `y ~ group` form the
# grouping factor is drawn along the x axis and the response along the y axis,
# so the axis labels are assigned accordingly (they were previously swapped).
boxplot(
  Item_Outlet_Sales ~ Outlet_Location_Type,
  data = BigMartsFinalData,
  xlab = "Outlet Location Type",
  ylab = "Item Outlet Sales",
  main = "Sales vs location type"
)

#7.5 Boxplot between Items Outlet Sales and Its Outlet_Identifier
# Sales distribution split by outlet (one box per Outlet_Identifier level).
boxplot(
  Item_Outlet_Sales ~ Outlet_Identifier,
  data = BigMartsFinalData,
  xlab = "Outlet Identifier",
  ylab = "Outlet Sales",
  main = "Items Outlet Sales vs Outlet Identifier"
)

#task 8:Create a correlation matrix.
# cor() needs an all-numeric data frame, so every categorical column is
# overwritten in place with its integer level code. Going through as.factor()
# first yields the same codes for columns that are already factors and also
# handles character columns, for which a bare as.numeric() would produce NA.
# NOTE(review): the codes follow level order, so correlations involving these
# nominal columns depend on that ordering.
BigMartsFinalData$Item_Identifier <-
  as.numeric(as.factor(BigMartsFinalData$Item_Identifier))
BigMartsFinalData$Item_Fat_Content <-
  as.numeric(as.factor(BigMartsFinalData$Item_Fat_Content))
BigMartsFinalData$Item_Type <-
  as.numeric(as.factor(BigMartsFinalData$Item_Type))
BigMartsFinalData$Outlet_Identifier <-
  as.numeric(as.factor(BigMartsFinalData$Outlet_Identifier))
# The establishment year is converted with as.numeric() only, exactly as
# before, so it is not recoded through as.factor().
BigMartsFinalData$Outlet_Establishment_Year <-
  as.numeric(BigMartsFinalData$Outlet_Establishment_Year)
BigMartsFinalData$Outlet_Size <-
  as.numeric(as.factor(BigMartsFinalData$Outlet_Size))
BigMartsFinalData$Outlet_Location_Type <-
  as.numeric(as.factor(BigMartsFinalData$Outlet_Location_Type))
BigMartsFinalData$Outlet_Type <-
  as.numeric(as.factor(BigMartsFinalData$Outlet_Type))
# Pairwise correlation of every column of the data frame (all columns are
# passed to cor(), none are dropped).
CorrelationForBigMartsData <- cor(BigMartsFinalData)
CorrelationForBigMartsData
## X Item_Identifier Item_Weight
## X 1.0000000000 0.019648276 -0.0244558710
## Item_Identifier 0.0196482757 1.000000000 0.0482819395
## Item_Weight -0.0244558710 0.048281939 1.0000000000
## Item_Fat_Content 0.0078955710 -0.114660488 -0.0267895290
## Item_Visibility 0.0012152670 -0.029188698 -0.0174774811
## Item_Type 0.0001094730 -0.017973341 0.0356887486
## Item_MRP 0.0025008782 0.012852955 0.0259732486
## Outlet_Identifier -0.0065009850 -0.008601773 -0.0075913830
## Outlet_Establishment_Year 0.0003367819 -0.012771776 -0.0134156596
## Outlet_Size 0.0050191419 0.001388956 -0.0043051576
## Outlet_Location_Type -0.0018663490 0.003655932 0.0029358518
## Outlet_Type 0.0021580047 -0.001177750 0.0005343981
## Item_Outlet_Sales -0.0053861796 0.002868828 0.0131643568
## Item_Fat_Content Item_Visibility Item_Type
## X 0.0078955710 0.001215267 0.000109473
## Item_Identifier -0.1146604881 -0.029188698 -0.017973341
## Item_Weight -0.0267895290 -0.017477481 0.035688749
## Item_Fat_Content 1.0000000000 0.049914978 -0.139434246
## Item_Visibility 0.0499149777 1.000000000 -0.035999729
## Item_Type -0.1394342456 -0.035999729 1.000000000
## Item_MRP 0.0060628994 -0.005258788 0.032650737
## Outlet_Identifier 0.0007637264 -0.106376550 0.001655864
## Outlet_Establishment_Year 0.0031506634 -0.078354718 0.004970179
## Outlet_Size -0.0006220193 0.072347257 -0.001859350
## Outlet_Location_Type -0.0015984765 -0.027859509 0.003084154
## Outlet_Type 0.0021990092 -0.179603892 0.003053107
## Item_Outlet_Sales 0.0187185336 -0.134137692 0.017047670
## Item_MRP Outlet_Identifier
## X 0.0025008782 -0.0065009850
## Item_Identifier 0.0128529549 -0.0086017730
## Item_Weight 0.0259732486 -0.0075913830
## Item_Fat_Content 0.0060628994 0.0007637264
## Item_Visibility -0.0052587878 -0.1063765503
## Item_Type 0.0326507373 0.0016558637
## Item_MRP 1.0000000000 0.0033193595
## Outlet_Identifier 0.0033193595 1.0000000000
## Outlet_Establishment_Year 0.0050199162 0.0790347340
## Outlet_Size 0.0060588872 0.5046029624
## Outlet_Location_Type 0.0002322058 -0.7161760042
## Outlet_Type -0.0019746190 0.0998732477
## Item_Outlet_Sales 0.5675744467 0.1623248975
## Outlet_Establishment_Year Outlet_Size
## X 0.0003367819 0.0050191419
## Item_Identifier -0.0127717759 0.0013889555
## Item_Weight -0.0134156596 -0.0043051576
## Item_Fat_Content 0.0031506634 -0.0006220193
## Item_Visibility -0.0783547179 0.0723472567
## Item_Type 0.0049701787 -0.0018593496
## Item_MRP 0.0050199162 0.0060588872
## Outlet_Identifier 0.0790347340 0.5046029624
## Outlet_Establishment_Year 1.0000000000 0.1933885750
## Outlet_Size 0.1933885750 1.0000000000
## Outlet_Location_Type -0.0892163898 -0.6143107047
## Outlet_Type -0.1223041428 -0.2014826222
## Item_Outlet_Sales -0.0491349704 -0.0861821954
## Outlet_Location_Type Outlet_Type
## X -0.0018663490 0.0021580047
## Item_Identifier 0.0036559317 -0.0011777502
## Item_Weight 0.0029358518 0.0005343981
## Item_Fat_Content -0.0015984765 0.0021990092
## Item_Visibility -0.0278595089 -0.1796038921
## Item_Type 0.0030841544 0.0030531075
## Item_MRP 0.0002322058 -0.0019746190
## Outlet_Identifier -0.7161760042 0.0998732477
## Outlet_Establishment_Year -0.0892163898 -0.1223041428
## Outlet_Size -0.6143107047 -0.2014826222
## Outlet_Location_Type 1.0000000000 0.4672186616
## Outlet_Type 0.4672186616 1.0000000000
## Item_Outlet_Sales 0.0893667468 0.4015225000
## Item_Outlet_Sales
## X -0.005386180
## Item_Identifier 0.002868828
## Item_Weight 0.013164357
## Item_Fat_Content 0.018718534
## Item_Visibility -0.134137692
## Item_Type 0.017047670
## Item_MRP 0.567574447
## Outlet_Identifier 0.162324898
## Outlet_Establishment_Year -0.049134970
## Outlet_Size -0.086182195
## Outlet_Location_Type 0.089366747
## Outlet_Type 0.401522500
## Item_Outlet_Sales 1.000000000
#task 9: Visualise the correlation matrix with corrplot.
library('corrplot')
## corrplot 0.84 loaded
# Draw the correlation matrix computed above, one circle per cell.
corrplot(
  corr = CorrelationForBigMartsData,
  method = "circle"
)

#task 10: Create a scatter plot matrix for your data set.
# Turn the categorical columns back into factors after the numeric coding used
# for the correlation matrix.
BigMartsFinalData$Item_Identifier <-
  as.factor(BigMartsFinalData$Item_Identifier)
BigMartsFinalData$Item_Fat_Content <-
  as.factor(BigMartsFinalData$Item_Fat_Content)
BigMartsFinalData$Item_Type <-
  as.factor(BigMartsFinalData$Item_Type)
BigMartsFinalData$Outlet_Identifier <-
  as.factor(BigMartsFinalData$Outlet_Identifier)
BigMartsFinalData$Outlet_Establishment_Year <-
  as.factor(BigMartsFinalData$Outlet_Establishment_Year)
BigMartsFinalData$Outlet_Size <-
  as.factor(BigMartsFinalData$Outlet_Size)
BigMartsFinalData$Outlet_Location_Type <-
  as.factor(BigMartsFinalData$Outlet_Location_Type)
BigMartsFinalData$Outlet_Type <-
  as.factor(BigMartsFinalData$Outlet_Type)
# Scatter plot matrix of the four continuous columns. They are selected by
# name (these are columns 3, 5, 7 and 13 in the column order shown by the
# correlation matrix) so the plot does not silently change if columns move.
pairs(
  BigMartsFinalData[, c("Item_Weight", "Item_Visibility", "Item_MRP",
                        "Item_Outlet_Sales")],
  pch = 19
)

#task 11: Run a suitable test to check your hypothesis for your suitable assumptions.
#Using Chi square Test
# loading the two way contingency table for outlet type versus outlet size
# Rebuild the Outlet_Size (rows) x Outlet_Type (columns) count table from the
# current state of the data frame, then echo it.
my_table_20 <- xtabs(
  formula = ~ Outlet_Size + Outlet_Type,
  data = BigMartsFinalData
)
my_table_20
## Outlet_Type
## Outlet_Size 1 2 3 4
## 1 0 932 0 0
## 2 555 2785 928 935
## 3 528 1860 0 0
# Null Hypothesis: The row and the column variables of the contingency table are independent.
# Pearson chi-squared test on the contingency table; keep the result object
# and echo it.
chisq <- chisq.test(x = my_table_20)
chisq
##
## Pearson's Chi-squared test
##
## data: my_table_20
## X-squared = 1830, df = 6, p-value < 2.2e-16
# as the p value is less than 0.05 the null hypothesis is rejected and there is a dependence between Outlet size and Outlet Type
#task 12:Run a t-test to analyse your hypothesis.
# Null Hypothesis: There is no significant difference in mean Item Weight between the two Item Fat Content groups.
# Two-sample t-test of Item_Weight across the Item_Fat_Content groups;
# only the p-value is reported.
t.test(
  Item_Weight ~ Item_Fat_Content,
  data = BigMartsFinalData
)[["p.value"]]
## [1] 0.01324973
#As the p-value is < 0.05, there is a significant difference in mean Item Weight between the Item Fat Content groups, so we reject the null hypothesis.
#Task 13
#Formulate a Regression Model:
#y = b0 + b1*x1 + b2*x2 + ..
#Think about what should 'y' be?
#Think about what could x = {x1, x2, ..} be?
#Fit Linear Regression Models using lm()
#Use the lm() model outputs to test your Hypotheses and draw inferences
#Prepare a list of insights based on your Regression Analysis
#Let Y be the Item_Outlet_Sales_Price as it is a dependent variable
# Listing every pair of variables with a positive correlation, to find the "X" variables positively correlated with Item_Outlet_Sales
# Print every distinct pair of variables whose correlation lies strictly
# between 0 and 1, one "name name" pair per line.
# - Only the upper triangle is scanned, so each pair appears once and the
#   diagonal is skipped.
# - NA correlations are skipped instead of aborting on a missing `if`
#   condition.
# - Pairs are emitted row by row, the same order as a nested row/column loop.
positive_pair_mask <- upper.tri(CorrelationForBigMartsData) &
  !is.na(CorrelationForBigMartsData) &
  CorrelationForBigMartsData > 0 &
  CorrelationForBigMartsData < 1
# arr.ind = TRUE gives one (row, col) index pair per matching cell, in
# column-major order; reorder to row-major before printing.
positive_pairs <- which(positive_pair_mask, arr.ind = TRUE)
positive_pairs <- positive_pairs[order(positive_pairs[, 1], positive_pairs[, 2]), , drop = FALSE]
for (k in seq_len(nrow(positive_pairs))) {
  cat(
    colnames(CorrelationForBigMartsData)[[positive_pairs[k, 1]]],
    rownames(CorrelationForBigMartsData)[[positive_pairs[k, 2]]],
    "\n"
  )
}
## X Item_Identifier
## X Item_Fat_Content
## X Item_Visibility
## X Item_Type
## X Item_MRP
## X Outlet_Establishment_Year
## X Outlet_Size
## X Outlet_Type
## Item_Identifier Item_Weight
## Item_Identifier Item_MRP
## Item_Identifier Outlet_Size
## Item_Identifier Outlet_Location_Type
## Item_Identifier Item_Outlet_Sales
## Item_Weight Item_Type
## Item_Weight Item_MRP
## Item_Weight Outlet_Location_Type
## Item_Weight Outlet_Type
## Item_Weight Item_Outlet_Sales
## Item_Fat_Content Item_Visibility
## Item_Fat_Content Item_MRP
## Item_Fat_Content Outlet_Identifier
## Item_Fat_Content Outlet_Establishment_Year
## Item_Fat_Content Outlet_Type
## Item_Fat_Content Item_Outlet_Sales
## Item_Visibility Outlet_Size
## Item_Type Item_MRP
## Item_Type Outlet_Identifier
## Item_Type Outlet_Establishment_Year
## Item_Type Outlet_Location_Type
## Item_Type Outlet_Type
## Item_Type Item_Outlet_Sales
## Item_MRP Outlet_Identifier
## Item_MRP Outlet_Establishment_Year
## Item_MRP Outlet_Size
## Item_MRP Outlet_Location_Type
## Item_MRP Item_Outlet_Sales
## Outlet_Identifier Outlet_Establishment_Year
## Outlet_Identifier Outlet_Size
## Outlet_Identifier Outlet_Type
## Outlet_Identifier Item_Outlet_Sales
## Outlet_Establishment_Year Outlet_Size
## Outlet_Location_Type Outlet_Type
## Outlet_Location_Type Item_Outlet_Sales
## Outlet_Type Item_Outlet_Sales
#Following are the "X" variables affecting the Item_Outlet_Sales
#1.Item_MRP
#2.Outlet_Identifier
#3.Outlet_Type
#4.Item_Weight
#5.Item_Fat_Content
#6.Outlet_Size
#7.Item_Type
# Print per-column summary statistics for BigMartSalesData. The recorded output
# below shows no NA's for Item_Weight, two Item_Fat_Content levels and no blank
# Outlet_Size level.
# NOTE(review): the model that follows is fitted on BigMartsFinalData, not on
# this data frame - confirm how the two are related.
summary(BigMartSalesData)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## FDG33 : 10 Min. : 4.555 Low Fat:5517 Min. :0.003575
## FDW13 : 10 1st Qu.: 8.785 Regular:3006 1st Qu.:0.033085
## DRE49 : 9 Median :12.650 Median :0.062517
## DRN47 : 9 Mean :12.875 Mean :0.070213
## FDD38 : 9 3rd Qu.:16.850 3rd Qu.:0.094585
## FDF52 : 9 Max. :21.350 Max. :0.328391
## (Other):8467
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:1232 Min. : 31.29 OUT027 : 935
## Snack Foods :1200 1st Qu.: 93.83 OUT013 : 932
## Household : 910 Median :143.01 OUT035 : 930
## Frozen Foods : 856 Mean :140.99 OUT046 : 930
## Dairy : 682 3rd Qu.:185.64 OUT049 : 930
## Canned : 649 Max. :266.89 OUT045 : 929
## (Other) :2994 (Other):2937
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 High : 932 Tier 1:2388
## 1st Qu.:1987 Medium:5203 Tier 2:2785
## Median :1999 Small :2388 Tier 3:3350
## Mean :1998
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1083 Min. : 33.29
## Supermarket Type1:5577 1st Qu.: 834.25
## Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
##
# Recode the categorical predictors as factors of their numeric codes so that
# lm() expands each one into per-level dummy variables.
BigMartsFinalData <- local({
  recoded <- BigMartsFinalData
  for (column in c("Outlet_Identifier", "Outlet_Type", "Item_Fat_Content",
                   "Outlet_Size", "Item_Type")) {
    recoded[[column]] <- as.factor(as.numeric(recoded[[column]]))
  }
  recoded
})

# Fit Item_Outlet_Sales on the seven chosen predictors and print the fit summary.
# NOTE(review): every term is written as BigMartsFinalData$<column> even though
# `data =` is supplied. This looks like the reason the later predict() call
# reports "'newdata' had 5681 rows but variables found have 8523 rows" - confirm
# before switching to bare column names, since that would rename all coefficients.
modellinearregression <- lm(
  BigMartsFinalData$Item_Outlet_Sales ~
    BigMartsFinalData$Item_MRP +
    BigMartsFinalData$Outlet_Identifier +
    BigMartsFinalData$Outlet_Type +
    BigMartsFinalData$Item_Weight +
    BigMartsFinalData$Item_Fat_Content +
    BigMartsFinalData$Outlet_Size +
    BigMartsFinalData$Item_Type,
  data = BigMartsFinalData
)
summary(modellinearregression)
##
## Call:
## lm(formula = BigMartsFinalData$Item_Outlet_Sales ~ BigMartsFinalData$Item_MRP +
## BigMartsFinalData$Outlet_Identifier + BigMartsFinalData$Outlet_Type +
## BigMartsFinalData$Item_Weight + BigMartsFinalData$Item_Fat_Content +
## BigMartsFinalData$Outlet_Size + BigMartsFinalData$Item_Type,
## data = BigMartsFinalData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4355.8 -680.0 -89.5 568.5 7957.2
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) -1861.2888 77.4211 -24.041
## BigMartsFinalData$Item_MRP 15.5651 0.1977 78.736
## BigMartsFinalData$Outlet_Identifier2 1949.6316 60.5499 32.199
## BigMartsFinalData$Outlet_Identifier3 2022.6309 60.6301 33.360
## BigMartsFinalData$Outlet_Identifier4 1640.9231 60.6031 27.077
## BigMartsFinalData$Outlet_Identifier5 16.0509 68.6789 0.234
## BigMartsFinalData$Outlet_Identifier6 3368.5088 60.5119 55.667
## BigMartsFinalData$Outlet_Identifier7 2062.3074 60.5902 34.037
## BigMartsFinalData$Outlet_Identifier8 1848.8596 60.6033 30.508
## BigMartsFinalData$Outlet_Identifier9 1918.9800 60.5827 31.675
## BigMartsFinalData$Outlet_Identifier10 2015.5570 60.5759 33.273
## BigMartsFinalData$Outlet_Type2 NA NA NA
## BigMartsFinalData$Outlet_Type3 NA NA NA
## BigMartsFinalData$Outlet_Type4 NA NA NA
## BigMartsFinalData$Item_Weight 0.1291 2.6561 0.049
## BigMartsFinalData$Item_Fat_Content2 39.8469 28.2222 1.412
## BigMartsFinalData$Outlet_Size2 NA NA NA
## BigMartsFinalData$Outlet_Size3 NA NA NA
## BigMartsFinalData$Item_Type2 6.4785 84.0569 0.077
## BigMartsFinalData$Item_Type3 2.9086 116.5525 0.025
## BigMartsFinalData$Item_Type4 25.5585 62.7683 0.407
## BigMartsFinalData$Item_Type5 -42.3478 62.2271 -0.681
## BigMartsFinalData$Item_Type6 -27.4226 58.8607 -0.466
## BigMartsFinalData$Item_Type7 29.3690 54.9688 0.534
## BigMartsFinalData$Item_Type8 0.7781 90.1866 0.009
## BigMartsFinalData$Item_Type9 -7.2753 67.9414 -0.107
## BigMartsFinalData$Item_Type10 -37.9153 59.9276 -0.633
## BigMartsFinalData$Item_Type11 1.7483 70.6198 0.025
## BigMartsFinalData$Item_Type12 -19.6856 98.6220 -0.200
## BigMartsFinalData$Item_Type13 183.2622 148.0030 1.238
## BigMartsFinalData$Item_Type14 -11.2740 55.2557 -0.204
## BigMartsFinalData$Item_Type15 -26.3248 70.1642 -0.375
## BigMartsFinalData$Item_Type16 20.4637 103.0600 0.199
## Pr(>|t|)
## (Intercept) <2e-16 ***
## BigMartsFinalData$Item_MRP <2e-16 ***
## BigMartsFinalData$Outlet_Identifier2 <2e-16 ***
## BigMartsFinalData$Outlet_Identifier3 <2e-16 ***
## BigMartsFinalData$Outlet_Identifier4 <2e-16 ***
## BigMartsFinalData$Outlet_Identifier5 0.815
## BigMartsFinalData$Outlet_Identifier6 <2e-16 ***
## BigMartsFinalData$Outlet_Identifier7 <2e-16 ***
## BigMartsFinalData$Outlet_Identifier8 <2e-16 ***
## BigMartsFinalData$Outlet_Identifier9 <2e-16 ***
## BigMartsFinalData$Outlet_Identifier10 <2e-16 ***
## BigMartsFinalData$Outlet_Type2 NA
## BigMartsFinalData$Outlet_Type3 NA
## BigMartsFinalData$Outlet_Type4 NA
## BigMartsFinalData$Item_Weight 0.961
## BigMartsFinalData$Item_Fat_Content2 0.158
## BigMartsFinalData$Outlet_Size2 NA
## BigMartsFinalData$Outlet_Size3 NA
## BigMartsFinalData$Item_Type2 0.939
## BigMartsFinalData$Item_Type3 0.980
## BigMartsFinalData$Item_Type4 0.684
## BigMartsFinalData$Item_Type5 0.496
## BigMartsFinalData$Item_Type6 0.641
## BigMartsFinalData$Item_Type7 0.593
## BigMartsFinalData$Item_Type8 0.993
## BigMartsFinalData$Item_Type9 0.915
## BigMartsFinalData$Item_Type10 0.527
## BigMartsFinalData$Item_Type11 0.980
## BigMartsFinalData$Item_Type12 0.842
## BigMartsFinalData$Item_Type13 0.216
## BigMartsFinalData$Item_Type14 0.838
## BigMartsFinalData$Item_Type15 0.708
## BigMartsFinalData$Item_Type16 0.843
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1129 on 8495 degrees of freedom
## Multiple R-squared: 0.5637, Adjusted R-squared: 0.5623
## F-statistic: 406.5 on 27 and 8495 DF, p-value: < 2.2e-16
# Print the full named coefficient vector of the fitted model, including the
# NA entries for terms that could not be estimated.
coef(modellinearregression)
## (Intercept)
## -1861.2888112
## BigMartsFinalData$Item_MRP
## 15.5651024
## BigMartsFinalData$Outlet_Identifier2
## 1949.6316047
## BigMartsFinalData$Outlet_Identifier3
## 2022.6308929
## BigMartsFinalData$Outlet_Identifier4
## 1640.9231062
## BigMartsFinalData$Outlet_Identifier5
## 16.0509280
## BigMartsFinalData$Outlet_Identifier6
## 3368.5087650
## BigMartsFinalData$Outlet_Identifier7
## 2062.3074106
## BigMartsFinalData$Outlet_Identifier8
## 1848.8596243
## BigMartsFinalData$Outlet_Identifier9
## 1918.9800048
## BigMartsFinalData$Outlet_Identifier10
## 2015.5569710
## BigMartsFinalData$Outlet_Type2
## NA
## BigMartsFinalData$Outlet_Type3
## NA
## BigMartsFinalData$Outlet_Type4
## NA
## BigMartsFinalData$Item_Weight
## 0.1290995
## BigMartsFinalData$Item_Fat_Content2
## 39.8469111
## BigMartsFinalData$Outlet_Size2
## NA
## BigMartsFinalData$Outlet_Size3
## NA
## BigMartsFinalData$Item_Type2
## 6.4784563
## BigMartsFinalData$Item_Type3
## 2.9085821
## BigMartsFinalData$Item_Type4
## 25.5584539
## BigMartsFinalData$Item_Type5
## -42.3478266
## BigMartsFinalData$Item_Type6
## -27.4226453
## BigMartsFinalData$Item_Type7
## 29.3690084
## BigMartsFinalData$Item_Type8
## 0.7781132
## BigMartsFinalData$Item_Type9
## -7.2752954
## BigMartsFinalData$Item_Type10
## -37.9153032
## BigMartsFinalData$Item_Type11
## 1.7483115
## BigMartsFinalData$Item_Type12
## -19.6856429
## BigMartsFinalData$Item_Type13
## 183.2622413
## BigMartsFinalData$Item_Type14
## -11.2739514
## BigMartsFinalData$Item_Type15
## -26.3247854
## BigMartsFinalData$Item_Type16
## 20.4637022
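# Hence the fitted model is:
# Item_Outlet_Sales = -1861.288 + 15.565*Item_MRP + a*Outlet_Identifier + b*Outlet_Type + 0.129*Item_Weight + c*Item_Fat_Content + d*Outlet_Size + e*Item_Type
# where a, c and e stand for the per-level dummy coefficients listed above, and b and d (Outlet_Type, Outlet_Size) are NA because those terms were not estimated due to singularities.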
# Predict Item_Outlet_Sales with the fitted linear model, passing the test data
# as newdata.
# NOTE(review): the first warning recorded below ('newdata' had 5681 rows but
# variables found have 8523 rows) suggests the result has one value per training
# row rather than one per row of BigMartSalesTestData. This is presumably because
# the model formula names its terms as BigMartsFinalData$<column>. Confirm before
# treating these values as test-set predictions.
ItemOutletSalesTestPrediction <- predict(modellinearregression, BigMartSalesTestData)
## Warning: 'newdata' had 5681 rows but variables found have 8523 rows
## Warning in predict.lm(modellinearregression, BigMartSalesTestData):
## prediction from a rank-deficient fit may be misleading
# Show the first few predicted values.
head(ItemOutletSalesTestPrediction)
## 1 2 3 4 5 6
## 4001.4267 545.2357 2362.5744 1044.7331 889.9386 620.8819
# Diagnostics of prediction.
# Place the actual and the predicted sales side by side, compute their
# correlation matrix as an accuracy measure, and show the first rows.
# NOTE(review): the recorded output below contains only a `predicted` column, so
# BigMartSalesTestData$Item_Outlet_Sales appears to be absent (NULL). In that
# case correlation_accuracy is just a 1x1 self-correlation - confirm.
ActualTestPrediction <- data.frame(
  cbind(
    actuals = BigMartSalesTestData$Item_Outlet_Sales,
    predicted = ItemOutletSalesTestPrediction
  )
)
correlation_accuracy <- cor(ActualTestPrediction)
head(ActualTestPrediction)
## predicted
## 1 4001.4267
## 2 545.2357
## 3 2362.5744
## 4 1044.7331
## 5 889.9386
## 6 620.8819
# The insights from the analysis are:
#A. Item Outlet Sales depends on the following seven parameters
#1.Item_MRP
#2.Outlet_Identifier
#3.Outlet_Type
#4.Item_Weight
#5.Item_Fat_Content
#6.Outlet_Size
#7.Item_Type
# B. The two largest positive correlation values obtained are for:
# Item_MRP and Outlet_Type
# C. The p-values of Item_MRP and of every Outlet_Identifier level except level 5 (p = 0.815) are below the 0.05 significance level, so these terms contribute significantly to Item_Outlet_Sales.