IMPORTING VARIOUS LIBRARIES WHICH MIGHT BE USEFUL
library(readr) #CSV file I/O, e.g. the read_csv function
library(plyr) #for mapvalues() function #always install plyr package before dplyr otherwise it will give warning messages.
## Warning: package 'plyr' was built under R version 3.5.1
library(dplyr) #for fast data manipulation(functions like mutate,select,arrange,filter,...)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)#for data visualisation
## Warning: package 'ggplot2' was built under R version 3.5.1
library(VIM) #for KNN imputation
## Warning: package 'VIM' was built under R version 3.5.1
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.5.1
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
#READING THE DATASETs IN R
BMStrain <- read.csv('D:\\Projects\\Level1\\BigMart Sale\\Train.csv')
BMStest <- read.csv('D:\\Projects\\Level1\\BigMart Sale\\Test.csv')
#A QUICK VIEW AT STRUCTURE AND SUMMARY OF BOTH DATASETS
str(BMStrain) #BMStrain is a dataframe with 8523 obs. of 12 variables.
## 'data.frame': 8523 obs. of 12 variables:
## $ Item_Identifier : Factor w/ 1559 levels "DRA12","DRA24",..: 157 9 663 1122 1298 759 697 739 441 991 ...
## $ Item_Weight : num 9.3 5.92 17.5 19.2 8.93 ...
## $ Item_Fat_Content : Factor w/ 5 levels "LF","low fat",..: 3 5 3 5 3 5 5 3 5 5 ...
## $ Item_Visibility : num 0.016 0.0193 0.0168 0 0 ...
## $ Item_Type : Factor w/ 16 levels "Baking Goods",..: 5 15 11 7 10 1 14 14 6 6 ...
## $ Item_MRP : num 249.8 48.3 141.6 182.1 53.9 ...
## $ Outlet_Identifier : Factor w/ 10 levels "OUT010","OUT013",..: 10 4 10 1 2 4 2 6 8 3 ...
## $ Outlet_Establishment_Year: int 1999 2009 1999 1998 1987 2009 1987 1985 2002 2007 ...
## $ Outlet_Size : Factor w/ 4 levels "","High","Medium",..: 3 3 3 1 2 3 2 3 1 1 ...
## $ Outlet_Location_Type : Factor w/ 3 levels "Tier 1","Tier 2",..: 1 3 1 3 3 3 3 3 2 2 ...
## $ Outlet_Type : Factor w/ 4 levels "Grocery Store",..: 2 3 2 1 2 3 2 4 2 2 ...
## $ Item_Outlet_Sales : num 3735 443 2097 732 995 ...
str(BMStest) #BMStest is a dataframe with 8523 obs. of 11 variables.
## 'data.frame': 5681 obs. of 11 variables:
## $ Item_Identifier : Factor w/ 1543 levels "DRA12","DRA24",..: 1104 1068 1407 810 1185 462 605 267 669 171 ...
## $ Item_Weight : num 20.75 8.3 14.6 7.32 NA ...
## $ Item_Fat_Content : Factor w/ 5 levels "LF","low fat",..: 3 4 3 3 5 5 5 3 5 3 ...
## $ Item_Visibility : num 0.00756 0.03843 0.09957 0.01539 0.1186 ...
## $ Item_Type : Factor w/ 16 levels "Baking Goods",..: 14 5 12 14 5 7 1 1 14 1 ...
## $ Item_MRP : num 107.9 87.3 241.8 155 234.2 ...
## $ Outlet_Identifier : Factor w/ 10 levels "OUT010","OUT013",..: 10 3 1 3 6 9 4 6 8 3 ...
## $ Outlet_Establishment_Year: int 1999 2007 1998 2007 1985 1997 2009 1985 2002 2007 ...
## $ Outlet_Size : Factor w/ 4 levels "","High","Medium",..: 3 1 1 1 3 4 3 3 1 1 ...
## $ Outlet_Location_Type : Factor w/ 3 levels "Tier 1","Tier 2",..: 1 2 3 2 3 1 3 3 2 2 ...
## $ Outlet_Type : Factor w/ 4 levels "Grocery Store",..: 2 2 1 2 4 2 3 4 2 2 ...
## Notice that the variables like Item_Type are already categorical(factor, not character), so
## there is no need to change them
## We have 7 categorical variables, 4 numerical variables and 1 integer variable.
#since in the test dataset Item_Outlet_Sales is the missing variable so it will be our response
#or dependent variable which we will predict by fitting suitable machine learning models on the
#proper set of features.
#checking for any MISSING VALUES in train and test dataset>>best option is summary()
summary(BMStrain)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## FDG33 : 10 Min. : 4.555 LF : 316 Min. :0.00000
## FDW13 : 10 1st Qu.: 8.774 low fat: 112 1st Qu.:0.02699
## DRE49 : 9 Median :12.600 Low Fat:5089 Median :0.05393
## DRN47 : 9 Mean :12.858 reg : 117 Mean :0.06613
## FDD38 : 9 3rd Qu.:16.850 Regular:2889 3rd Qu.:0.09459
## FDF52 : 9 Max. :21.350 Max. :0.32839
## (Other):8467 NA's :1463
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:1232 Min. : 31.29 OUT027 : 935
## Snack Foods :1200 1st Qu.: 93.83 OUT013 : 932
## Household : 910 Median :143.01 OUT035 : 930
## Frozen Foods : 856 Mean :140.99 OUT046 : 930
## Dairy : 682 3rd Qu.:185.64 OUT049 : 930
## Canned : 649 Max. :266.89 OUT045 : 929
## (Other) :2994 (Other):2937
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :2410 Tier 1:2388
## 1st Qu.:1987 High : 932 Tier 2:2785
## Median :1999 Medium:2793 Tier 3:3350
## Mean :1998 Small :2388
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1083 Min. : 33.29
## Supermarket Type1:5577 1st Qu.: 834.25
## Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
##
summary(BMStest)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## DRF48 : 8 Min. : 4.555 LF : 206 Min. :0.00000
## FDK57 : 8 1st Qu.: 8.645 low fat: 66 1st Qu.:0.02705
## FDN52 : 8 Median :12.500 Low Fat:3396 Median :0.05415
## FDP15 : 8 Mean :12.696 reg : 78 Mean :0.06568
## FDQ60 : 8 3rd Qu.:16.700 Regular:1935 3rd Qu.:0.09346
## FDW10 : 8 Max. :21.350 Max. :0.32364
## (Other):5633 NA's :976
## Item_Type Item_MRP Outlet_Identifier
## Snack Foods : 789 Min. : 31.99 OUT027 : 624
## Fruits and Vegetables: 781 1st Qu.: 94.41 OUT013 : 621
## Household : 638 Median :141.42 OUT035 : 620
## Frozen Foods : 570 Mean :141.02 OUT046 : 620
## Dairy : 454 3rd Qu.:186.03 OUT049 : 620
## Baking Goods : 438 Max. :266.59 OUT045 : 619
## (Other) :2011 (Other):1957
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :1606 Tier 1:1592
## 1st Qu.:1987 High : 621 Tier 2:1856
## Median :1999 Medium:1862 Tier 3:2233
## Mean :1998 Small :1592
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type
## Grocery Store : 722
## Supermarket Type1:3717
## Supermarket Type2: 618
## Supermarket Type3: 624
##
##
##
#clearly we can see that only the variable Item_Weight has missing values in it,
#which can be imputed using KNN imputation or mean imputation methods.
#DIMENSION OF DATASETS
dim(BMStrain)
## [1] 8523 12
dim(BMStest)
## [1] 5681 11
#Item_Idetifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size ,Outlet_Location_Type,Outlet_Type are the variables which are categorical
#Appending the train and test dataset
## We can't append the datasets unless they have the same number of columns. Therefore, we will
## add another column(Item_outlet_Sales)(the dependent variable), to the test dataset.
BMStest$Item_Outlet_Sales <- NA
## Now, we will append the datasets, and create a new dataset named 'BMSdata'
BMSdata <- rbind(BMStrain, BMStest)
dim(BMSdata)
## [1] 14204 12
#Checking the missing(NA) values of the data
summary(is.na(BMSdata))
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:14204 FALSE:11765 FALSE:14204 FALSE:14204
## TRUE :2439
## Item_Type Item_MRP Outlet_Identifier
## Mode :logical Mode :logical Mode :logical
## FALSE:14204 FALSE:14204 FALSE:14204
##
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Mode :logical Mode :logical Mode :logical
## FALSE:14204 FALSE:14204 FALSE:14204
##
## Outlet_Type Item_Outlet_Sales
## Mode :logical Mode :logical
## FALSE:14204 FALSE:8523
## TRUE :5681
## is.na() returns a logical vector which indicates the number of datapoints which are missing
## TRUE indicates missing.
#SOME DATA VISUALISTION
# we will first plot and infer from the HISTOGRAMS of each continuous variable
hist(BMSdata$Item_Weight) #we can see there is no as such skewness and the frequency is more or less constant

hist(BMSdata$Item_Visibility)

#as we can see the histogram for Item_Visibility is right skewed with a long right tail we should have mean greater than the median which we can also see from summary(A102train)
hist(BMSdata$Item_Outlet_Sales)

#also as we can see the histogram for Item_Outlet_Sales is right skewed with a long right tail we should have mean greater than the median which we can also see from summary(A102train) .
#from our shear logic the Item_Outlet_Sales and Item_Visibility should be related(the item which has more sale will obviously be more visible) which is evident from the fact that both Item_Visibility and Item_Outlet_Sales are right skewed.
library(ggplot2)
ggplot(BMSdata, aes(Outlet_Identifier, Item_Weight)) + geom_boxplot()
## Warning: Removed 2439 rows containing non-finite values (stat_boxplot).

# we can see that only the weights of items relted to OUTO19 and OUTO27 are missing.
ggplot(BMSdata, aes(Item_Type, Item_Weight)) + geom_boxplot()
## Warning: Removed 2439 rows containing non-finite values (stat_boxplot).

# we can see that the weights all types of items are there.
#similarly for,
ggplot(BMSdata, aes(Item_Fat_Content, Item_Weight)) + geom_boxplot()
## Warning: Removed 2439 rows containing non-finite values (stat_boxplot).

boxplot(BMSdata$Item_Visibility)

#THERE ARE SO MANY OUTLIERS FOR Item_Visibility VARIABLE which is also evident from the fact that its hist. was right skewed.
ggplot(BMSdata, aes(Item_Type,Item_Visibility)) + geom_boxplot()

# thus the dots above each item type repesent outliers in item visibility corresponding to that particular item.similarly,
ggplot(BMSdata, aes(Outlet_Identifier,Item_Visibility)) + geom_boxplot()

# its no point to show the boxplot of Item_Visibility against the Item_Identifier.
ggplot(BMSdata, aes(Item_Outlet_Sales,Item_Visibility,col = Item_Type)) + geom_point()
## Warning: Removed 5681 rows containing missing values (geom_point).

ggplot(BMSdata, aes(Item_Outlet_Sales,Item_Visibility,col = Outlet_Type)) + geom_point()
## Warning: Removed 5681 rows containing missing values (geom_point).

#IMPUTING THE MISSING VALUES FOR DATA SET USING KNN IMPUTATION
#FOR kNN IMPUTATION WE WILL REQUIRE "VIM" LIBRARY WHICH WE HAVE ALREADY IMPORTED
#imputing the missing values for Item_Weight.
#we can also see that Outlet_Size has observations like "_", SO WE WILL FIRST CONVERT THESE VALUES TO 'NA' VALUES SO THAT WE CAN USE,KNN IMPUTATION METHOD SINCE THIS METHOD IS ALSO APPLICABLE FOR CATEGORICAL VARIABLES..
BMSdata[BMSdata$Outlet_Size =="","Outlet_Size"] <- NA
summary(BMSdata)# all the empty values were replaced by NA
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## DRA24 : 10 Min. : 4.555 LF : 522 Min. :0.00000
## DRA59 : 10 1st Qu.: 8.710 low fat: 178 1st Qu.:0.02704
## DRB25 : 10 Median :12.600 Low Fat:8485 Median :0.05402
## DRC25 : 10 Mean :12.793 reg : 195 Mean :0.06595
## DRC27 : 10 3rd Qu.:16.750 Regular:4824 3rd Qu.:0.09404
## DRC36 : 10 Max. :21.350 Max. :0.32839
## (Other):14144 NA's :2439
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:2013 Min. : 31.29 OUT027 :1559
## Snack Foods :1989 1st Qu.: 94.01 OUT013 :1553
## Household :1548 Median :142.25 OUT035 :1550
## Frozen Foods :1426 Mean :141.00 OUT046 :1550
## Dairy :1136 3rd Qu.:185.86 OUT049 :1550
## Baking Goods :1086 Max. :266.89 OUT045 :1548
## (Other) :5006 (Other):4894
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 : 0 Tier 1:3980
## 1st Qu.:1987 High :1553 Tier 2:4641
## Median :1999 Medium:4655 Tier 3:5583
## Mean :1998 Small :3980
## 3rd Qu.:2004 NA's :4016
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1805 Min. : 33.29
## Supermarket Type1:9294 1st Qu.: 834.25
## Supermarket Type2:1546 Median : 1794.33
## Supermarket Type3:1559 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
## NA's :5681
imputdata1 <- BMSdata
imputdata1 <- kNN(BMSdata, variable = c("Item_Weight","Outlet_Size"), k = 90)
# k is generally choosen to be squareroot of number of observations.
summary(imputdata1)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## DRA24 : 10 Min. : 4.555 LF : 522 Min. :0.00000
## DRA59 : 10 1st Qu.: 9.300 low fat: 178 1st Qu.:0.02704
## DRB25 : 10 Median :12.600 Low Fat:8485 Median :0.05402
## DRC25 : 10 Mean :12.761 reg : 195 Mean :0.06595
## DRC27 : 10 3rd Qu.:16.000 Regular:4824 3rd Qu.:0.09404
## DRC36 : 10 Max. :21.350 Max. :0.32839
## (Other):14144
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:2013 Min. : 31.29 OUT027 :1559
## Snack Foods :1989 1st Qu.: 94.01 OUT013 :1553
## Household :1548 Median :142.25 OUT035 :1550
## Frozen Foods :1426 Mean :141.00 OUT046 :1550
## Dairy :1136 3rd Qu.:185.86 OUT049 :1550
## Baking Goods :1086 Max. :266.89 OUT045 :1548
## (Other) :5006 (Other):4894
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 : 0 Tier 1:3980
## 1st Qu.:1987 High :1553 Tier 2:4641
## Median :1999 Medium:6584 Tier 3:5583
## Mean :1998 Small :6067
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales Item_Weight_imp
## Grocery Store :1805 Min. : 33.29 Mode :logical
## Supermarket Type1:9294 1st Qu.: 834.25 FALSE:11765
## Supermarket Type2:1546 Median : 1794.33 TRUE :2439
## Supermarket Type3:1559 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
## NA's :5681
## Outlet_Size_imp
## Mode :logical
## FALSE:10188
## TRUE :4016
##
##
##
##
ncol(imputdata1) #you will see there are two additional logical columns that got created we have to remove them
## [1] 14
imputdata1 <- subset(imputdata1,select = Item_Identifier:Item_Outlet_Sales)
summary(imputdata1)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## DRA24 : 10 Min. : 4.555 LF : 522 Min. :0.00000
## DRA59 : 10 1st Qu.: 9.300 low fat: 178 1st Qu.:0.02704
## DRB25 : 10 Median :12.600 Low Fat:8485 Median :0.05402
## DRC25 : 10 Mean :12.761 reg : 195 Mean :0.06595
## DRC27 : 10 3rd Qu.:16.000 Regular:4824 3rd Qu.:0.09404
## DRC36 : 10 Max. :21.350 Max. :0.32839
## (Other):14144
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:2013 Min. : 31.29 OUT027 :1559
## Snack Foods :1989 1st Qu.: 94.01 OUT013 :1553
## Household :1548 Median :142.25 OUT035 :1550
## Frozen Foods :1426 Mean :141.00 OUT046 :1550
## Dairy :1136 3rd Qu.:185.86 OUT049 :1550
## Baking Goods :1086 Max. :266.89 OUT045 :1548
## (Other) :5006 (Other):4894
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 : 0 Tier 1:3980
## 1st Qu.:1987 High :1553 Tier 2:4641
## Median :1999 Medium:6584 Tier 3:5583
## Mean :1998 Small :6067
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1805 Min. : 33.29
## Supermarket Type1:9294 1st Qu.: 834.25
## Supermarket Type2:1546 Median : 1794.33
## Supermarket Type3:1559 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
## NA's :5681
plot(imputdata1$Item_MRP,imputdata1$Item_Weight)

BMSdata <- imputdata1
#ALSO ONE THING WHICH IS TO BE NOTED IS THAT THE VARIABLE Item_Fat_Content contains same observations with different names which need to be tackled::
BMSdata$Item_Fat_Content <- mapvalues(BMSdata$Item_Fat_Content, from = c("LF","Low Fat","low fat","Regular"), to = c("lf","lf","lf","reg"))
levels(BMSdata$Item_Fat_Content)
## [1] "lf" "reg"
#lets see the boxplots of different variables.
#now we will look into the features having outliers and implement proper outlier detection technique
#we will only look onto the variables which are not categorical.
boxplot(BMSdata$Item_Weight)

#we can clearly see there are no outliers for this variable.
boxplot(BMSdata$Item_Visibility)

#BUT THERE ARE SO MANY OUTLIERS FOR Item_Visibility VARIABLE which is also evident from the fact that its hist. was right skewed.
#SO Item_Outlet_Sales SHOULD ALSO HAVE MANY OUTLIERS, LETS CHECK,,
boxplot(BMSdata$Item_Outlet_Sales)

#there are indeed many outliers
#WE GENERALLY USE THREE METHODS TO DETECT OUTLIERS 1)DISCARDING OUTLIERS 2)WINSORISATION 3)VARIABLE TRANSFORMATION
#DISCARDING OUTLIERS WILL RESULT IN REDUCTION IN NO. OF OBSERVATIONS AND VARIABLE TRANSFORMATION WILL RESULT IN VERY SMALL VALUES(skewness will still exist)
#SO THE BEST METHOD SHOULD BE WINSORISATION
#DETECTING OUTLIERS:
dataoutlier <- BMSdata
bench <- 0.09459 + 1.5*IQR(BMSdata$Item_Visibility)
# 0.09459 is the third quartile
bench
## [1] 0.1950924
#value comes out to be 0.1959837
dataoutlier$Item_Visibility[dataoutlier$Item_Visibility > bench] <- bench
boxplot(dataoutlier$Item_Visibility)

# as we can see all the outliers have been removed
BMSdata <- dataoutlier
# Notice that Item_Type has factors which are not food items. So, Item_Fat_Content makes nosense.
# Hence we will add a new factor(level): "None", which will correspond to the non-food items in Item_Type.
#Adding new level in Item_Fat_Content "None".
levels(BMSdata$Item_Fat_Content) <- c(levels(BMSdata$Item_Fat_Content), "None")
## Based on Item_Type, for "health and Hygiene", "Household" and "Others",
## we will change the Item_Fat_Content factor to "None".
BMSdata[which(BMSdata$Item_Type=="Health and Hygiene"), ]$Item_Fat_Content = "None"
BMSdata[which(BMSdata$Item_Type=="Household"), ]$Item_Fat_Content = "None"
BMSdata[which(BMSdata$Item_Type=="Others"), ]$Item_Fat_Content = "None"
BMSdata$Item_Fat_Content <- as.factor(BMSdata$Item_Fat_Content)
table(BMSdata$Item_Fat_Content) # Viewing the variable
##
## lf reg None
## 6499 5019 2686
## Since we are only concerned with how old the outlet is, and not the establishment year
BMSdata$Outlet_Year <- 2018 - BMSdata$Outlet_Establishment_Year
table(BMSdata$Outlet_Year)
##
## 9 11 14 16 19 20 21 31 33
## 1546 1543 1550 1548 1550 925 1550 1553 2439
BMSdata$Outlet_Year <- as.factor(BMSdata$Outlet_Year)
## Visualizing Item_MRP with ggplot
library(ggplot2)
ggplot(BMSdata, aes(Item_MRP)) + geom_density(adjust = 1/5)

## It is obvious that we would be better off by converting Item_MRP to Categorical variable
#creating a new feature:
#lets create a new feature named "price" which is a categorical variable..
BMSdata$price <- "low"
BMSdata$price[BMSdata$Item_MRP >200] <-"high"
BMSdata$price[BMSdata$Item_MRP>70 & BMSdata$Item_MRP <=200] <- "medium"
summary(BMSdata)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## DRA24 : 10 Min. : 4.555 lf :6499 Min. :0.00000
## DRA59 : 10 1st Qu.: 9.300 reg :5019 1st Qu.:0.02704
## DRB25 : 10 Median :12.600 None:2686 Median :0.05402
## DRC25 : 10 Mean :12.761 Mean :0.06506
## DRC27 : 10 3rd Qu.:16.000 3rd Qu.:0.09404
## DRC36 : 10 Max. :21.350 Max. :0.19509
## (Other):14144
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:2013 Min. : 31.29 OUT027 :1559
## Snack Foods :1989 1st Qu.: 94.01 OUT013 :1553
## Household :1548 Median :142.25 OUT035 :1550
## Frozen Foods :1426 Mean :141.00 OUT046 :1550
## Dairy :1136 3rd Qu.:185.86 OUT049 :1550
## Baking Goods :1086 Max. :266.89 OUT045 :1548
## (Other) :5006 (Other):4894
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 : 0 Tier 1:3980
## 1st Qu.:1987 High :1553 Tier 2:4641
## Median :1999 Medium:6584 Tier 3:5583
## Mean :1998 Small :6067
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales Outlet_Year
## Grocery Store :1805 Min. : 33.29 33 :2439
## Supermarket Type1:9294 1st Qu.: 834.25 31 :1553
## Supermarket Type2:1546 Median : 1794.33 14 :1550
## Supermarket Type3:1559 Mean : 2181.29 19 :1550
## 3rd Qu.: 3101.30 21 :1550
## Max. :13086.97 16 :1548
## NA's :5681 (Other):4014
## price
## Length:14204
## Class :character
## Mode :character
##
##
##
##
#we can see a new variable named price is created with the property that,whenever,
#MRP <=70, price = "low"
#MRP >70 and MRP <= 200, price = "medium"
#MRP >200, price ="High"
## We are done with the data cleaning and feature engineering.
# Dividing data into train and test
BMStrain <- BMSdata[1:8523, ]
BMStest <- BMSdata[8524:14204, ]
#now we are ready to build and use our predictive model which is simply #####linear regression#####.
#one thing to be noted is that Item_Identifier and outlet_Identifier clearly have no role in predicting Item_Outlet_Sales ,so they are not significant variables.
##lets develop our linear model.
model1 <- lm(Item_Outlet_Sales~., data = BMStrain[-c(1,7,8)])
summary(model1)
##
## Call:
## lm(formula = Item_Outlet_Sales ~ ., data = BMStrain[-c(1, 7,
## 8)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -4324.9 -680.1 -89.4 569.1 7947.0
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1814.5976 167.7231 -10.819 < 2e-16 ***
## Item_Weight -0.3040 2.8988 -0.105 0.91647
## Item_Fat_Contentreg 40.5100 28.2571 1.434 0.15172
## Item_Fat_ContentNone -23.4622 98.7039 -0.238 0.81212
## Item_Visibility -355.3823 260.9380 -1.362 0.17325
## Item_TypeBreads 3.2830 84.1008 0.039 0.96886
## Item_TypeBreakfast 4.4071 116.7200 0.038 0.96988
## Item_TypeCanned 25.9874 62.7722 0.414 0.67889
## Item_TypeDairy -40.8285 62.4271 -0.654 0.51312
## Item_TypeFrozen Foods -28.6851 58.9810 -0.486 0.62674
## Item_TypeFruits and Vegetables 28.5747 55.0277 0.519 0.60358
## Item_TypeHard Drinks -2.8135 90.3882 -0.031 0.97517
## Item_TypeHealth and Hygiene 12.6656 100.0451 0.127 0.89926
## Item_TypeHousehold -15.5418 94.6877 -0.164 0.86963
## Item_TypeMeat -2.6029 70.7146 -0.037 0.97064
## Item_TypeOthers NA NA NA NA
## Item_TypeSeafood 183.4691 148.2236 1.238 0.21583
## Item_TypeSnack Foods -11.6618 55.3048 -0.211 0.83300
## Item_TypeSoft Drinks -28.7016 70.3683 -0.408 0.68337
## Item_TypeStarchy Foods 23.2072 103.1930 0.225 0.82207
## Item_MRP 15.6101 0.3974 39.279 < 2e-16 ***
## Outlet_SizeMedium -126.7214 100.2683 -1.264 0.20633
## Outlet_SizeSmall -4.0261 90.9873 -0.044 0.96471
## Outlet_Location_TypeTier 2 -63.9278 52.5672 -1.216 0.22397
## Outlet_Location_TypeTier 3 26.4651 74.4320 0.356 0.72218
## Outlet_TypeSupermarket Type1 1888.7865 62.4408 30.249 < 2e-16 ***
## Outlet_TypeSupermarket Type2 1707.1264 79.5297 21.465 < 2e-16 ***
## Outlet_TypeSupermarket Type3 3433.7370 79.4986 43.192 < 2e-16 ***
## Outlet_Year11 286.4721 89.8299 3.189 0.00143 **
## Outlet_Year14 207.3421 52.5524 3.945 8.03e-05 ***
## Outlet_Year16 NA NA NA NA
## Outlet_Year19 219.2926 95.3150 2.301 0.02143 *
## Outlet_Year20 NA NA NA NA
## Outlet_Year21 NA NA NA NA
## Outlet_Year31 NA NA NA NA
## Outlet_Year33 NA NA NA NA
## pricelow 14.8425 86.5527 0.171 0.86385
## pricemedium 10.0640 51.6574 0.195 0.84554
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1129 on 8491 degrees of freedom
## Multiple R-squared: 0.5639, Adjusted R-squared: 0.5623
## F-statistic: 354.2 on 31 and 8491 DF, p-value: < 2.2e-16
model2 <- lm(log(Item_Outlet_Sales)~., data = BMStrain[-c(1,7,8)])
summary(model2)
##
## Call:
## lm(formula = log(Item_Outlet_Sales) ~ ., data = BMStrain[-c(1,
## 7, 8)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.17817 -0.27354 0.06162 0.36246 1.37742
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3399351 0.0775579 55.957 < 2e-16 ***
## Item_Weight -0.0006422 0.0013404 -0.479 0.631883
## Item_Fat_Contentreg 0.0148206 0.0130666 1.134 0.256725
## Item_Fat_ContentNone 0.0158332 0.0456423 0.347 0.728677
## Item_Visibility -0.0477385 0.1206620 -0.396 0.692382
## Item_TypeBreads 0.0456649 0.0388896 1.174 0.240341
## Item_TypeBreakfast -0.0461945 0.0539732 -0.856 0.392090
## Item_TypeCanned 0.0300592 0.0290269 1.036 0.300436
## Item_TypeDairy -0.0206354 0.0288673 -0.715 0.474729
## Item_TypeFrozen Foods -0.0192780 0.0272738 -0.707 0.479691
## Item_TypeFruits and Vegetables 0.0113726 0.0254457 0.447 0.654932
## Item_TypeHard Drinks 0.0029078 0.0417970 0.070 0.944538
## Item_TypeHealth and Hygiene 0.0131150 0.0462625 0.283 0.776807
## Item_TypeHousehold -0.0190341 0.0437851 -0.435 0.663779
## Item_TypeMeat 0.0313844 0.0326996 0.960 0.337193
## Item_TypeOthers NA NA NA NA
## Item_TypeSeafood 0.0162538 0.0685410 0.237 0.812554
## Item_TypeSnack Foods 0.0098772 0.0255738 0.386 0.699341
## Item_TypeSoft Drinks -0.0003628 0.0325394 -0.011 0.991105
## Item_TypeStarchy Foods 0.0078494 0.0477181 0.164 0.869346
## Item_MRP 0.0077719 0.0001838 42.292 < 2e-16 ***
## Outlet_SizeMedium -0.0732145 0.0463657 -1.579 0.114358
## Outlet_SizeSmall 0.0304026 0.0420740 0.723 0.469947
## Outlet_Location_TypeTier 2 -0.0289207 0.0243079 -1.190 0.234172
## Outlet_Location_TypeTier 3 0.0096982 0.0344186 0.282 0.778125
## Outlet_TypeSupermarket Type1 1.9238599 0.0288737 66.630 < 2e-16 ***
## Outlet_TypeSupermarket Type2 1.8540829 0.0367758 50.416 < 2e-16 ***
## Outlet_TypeSupermarket Type3 2.5606327 0.0367615 69.655 < 2e-16 ***
## Outlet_Year11 0.1684760 0.0415388 4.056 5.04e-05 ***
## Outlet_Year14 0.0877455 0.0243011 3.611 0.000307 ***
## Outlet_Year16 NA NA NA NA
## Outlet_Year19 0.1443292 0.0440752 3.275 0.001062 **
## Outlet_Year20 NA NA NA NA
## Outlet_Year21 NA NA NA NA
## Outlet_Year31 NA NA NA NA
## Outlet_Year33 NA NA NA NA
## pricelow -0.1618333 0.0400234 -4.043 5.31e-05 ***
## pricemedium 0.1891954 0.0238872 7.920 2.67e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5221 on 8491 degrees of freedom
## Multiple R-squared: 0.7375, Adjusted R-squared: 0.7365
## F-statistic: 769.5 on 31 and 8491 DF, p-value: < 2.2e-16
model3 <- lm(sqrt(Item_Outlet_Sales)~., data = BMStrain[-c(1,7,8)])
summary(model3)
##
## Call:
## lm(formula = sqrt(Item_Outlet_Sales) ~ ., data = BMStrain[-c(1,
## 7, 8)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.834 -6.714 0.119 7.028 42.290
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.757984 1.590356 -4.249 2.17e-05 ***
## Item_Weight -0.006619 0.027486 -0.241 0.809706
## Item_Fat_Contentreg 0.436906 0.267935 1.631 0.103003
## Item_Fat_ContentNone 0.161713 0.935914 0.173 0.862824
## Item_Visibility -2.444369 2.474223 -0.988 0.323213
## Item_TypeBreads 0.476393 0.797447 0.597 0.550258
## Item_TypeBreakfast -0.447587 1.106743 -0.404 0.685916
## Item_TypeCanned 0.453237 0.595208 0.761 0.446393
## Item_TypeDairy -0.464520 0.591936 -0.785 0.432624
## Item_TypeFrozen Foods -0.292273 0.559260 -0.523 0.601262
## Item_TypeFruits and Vegetables 0.248512 0.521774 0.476 0.633885
## Item_TypeHard Drinks 0.172029 0.857064 0.201 0.840923
## Item_TypeHealth and Hygiene 0.069602 0.948631 0.073 0.941513
## Item_TypeHousehold -0.331192 0.897832 -0.369 0.712226
## Item_TypeMeat 0.228763 0.670518 0.341 0.732982
## Item_TypeOthers NA NA NA NA
## Item_TypeSeafood 1.277690 1.405461 0.909 0.363329
## Item_TypeSnack Foods 0.051242 0.524402 0.098 0.922161
## Item_TypeSoft Drinks -0.253610 0.667234 -0.380 0.703887
## Item_TypeStarchy Foods 0.285738 0.978480 0.292 0.770276
## Item_MRP 0.162845 0.003768 43.214 < 2e-16 ***
## Outlet_SizeMedium -1.295415 0.950747 -1.363 0.173069
## Outlet_SizeSmall 0.205086 0.862745 0.238 0.812109
## Outlet_Location_TypeTier 2 -0.672525 0.498443 -1.349 0.177293
## Outlet_Location_TypeTier 3 0.248797 0.705766 0.353 0.724456
## Outlet_TypeSupermarket Type1 27.466278 0.592066 46.391 < 2e-16 ***
## Outlet_TypeSupermarket Type2 25.592984 0.754103 33.938 < 2e-16 ***
## Outlet_TypeSupermarket Type3 42.010478 0.753809 55.731 < 2e-16 ***
## Outlet_Year11 3.103005 0.851771 3.643 0.000271 ***
## Outlet_Year14 2.094436 0.498304 4.203 2.66e-05 ***
## Outlet_Year16 NA NA NA NA
## Outlet_Year19 2.444484 0.903780 2.705 0.006850 **
## Outlet_Year20 NA NA NA NA
## Outlet_Year21 NA NA NA NA
## Outlet_Year31 NA NA NA NA
## Outlet_Year33 NA NA NA NA
## pricelow -1.045114 0.820696 -1.273 0.202894
## pricemedium 2.069835 0.489817 4.226 2.41e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.71 on 8491 degrees of freedom
## Multiple R-squared: 0.6613, Adjusted R-squared: 0.66
## F-statistic: 534.7 on 31 and 8491 DF, p-value: < 2.2e-16
par(mfrow=c(1,1))
plot(model1)




#Lets check RMSE values
library(Metrics)
## Warning: package 'Metrics' was built under R version 3.5.1
rmse(BMStrain$Item_Outlet_Sales, model1$fitted.values)
## [1] 1126.871
#RMSE value is 1126.891 (model which is suffering from heteroskedasticity)
rmse(BMStrain$Item_Outlet_Sales, exp(model2$fitted.values))
## [1] 1105.288
#New RMSE value is 1105.367 Thats Awesome! we improved a lot.
#Prediction on test_dataset
#BMStest$Item_Outlet_Sales <- predict(model2 , newdata = BMStest[-c(1,7,8,12)])
BMStest2 <- BMStest[c(1:11,13,14)]
options(warn = -1)
predicted <- predict(model2,newdata = BMStest2)
BMStest2$Item_Outlet_Sales <-exp(predicted)
Item_Identifier <- BMStest$Item_Identifier
Outlet_Identifier <- BMStest$Outlet_Identifier
output.df <- as.data.frame(Item_Identifier)
output.df$Outlet_Identifier <- Outlet_Identifier
output.df$Item_Outlet_Sales <- exp(predicted)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
some(output.df)
## Item_Identifier Outlet_Identifier Item_Outlet_Sales
## 823 FDU45 OUT010 233.2086
## 1260 NCS41 OUT019 400.9496
## 1419 FDG40 OUT035 618.1314
## 2252 FDN01 OUT017 2512.3861
## 2312 NCI17 OUT035 1511.3176
## 2478 NCF55 OUT027 1039.2112
## 2673 FDN49 OUT027 1038.5849
## 2830 FDE53 OUT019 213.7131
## 4004 FDQ04 OUT045 535.9542
## 4234 FDO44 OUT049 1629.7312