DATA WRANGLING AND MANIPULATION IN R

IMPORTING VARIOUS LIBRARIES WHICH MIGHT BE USEFUL

library(readr)  #CSV file I/O, e.g. the read_csv function 
library(plyr)   #for mapvalues() function #always install plyr package before dplyr otherwise it will give warning messages. 
## Warning: package 'plyr' was built under R version 3.5.1
library(dplyr)  #for fast data manipulation(functions like mutate,select,arrange,filter,...)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)#for data visualisation
## Warning: package 'ggplot2' was built under R version 3.5.1
library(VIM)    #for KNN imputation
## Warning: package 'VIM' was built under R version 3.5.1
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.5.1
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
#READING THE DATASETs IN R 
BMStrain <- read.csv('D:\\Projects\\Level1\\BigMart Sale\\Train.csv')
BMStest <- read.csv('D:\\Projects\\Level1\\BigMart Sale\\Test.csv')
#A QUICK VIEW AT STRUCTURE AND SUMMARY OF BOTH DATASETS
str(BMStrain)  #BMStrain is a dataframe with 8523 obs. of  12 variables.
## 'data.frame':    8523 obs. of  12 variables:
##  $ Item_Identifier          : Factor w/ 1559 levels "DRA12","DRA24",..: 157 9 663 1122 1298 759 697 739 441 991 ...
##  $ Item_Weight              : num  9.3 5.92 17.5 19.2 8.93 ...
##  $ Item_Fat_Content         : Factor w/ 5 levels "LF","low fat",..: 3 5 3 5 3 5 5 3 5 5 ...
##  $ Item_Visibility          : num  0.016 0.0193 0.0168 0 0 ...
##  $ Item_Type                : Factor w/ 16 levels "Baking Goods",..: 5 15 11 7 10 1 14 14 6 6 ...
##  $ Item_MRP                 : num  249.8 48.3 141.6 182.1 53.9 ...
##  $ Outlet_Identifier        : Factor w/ 10 levels "OUT010","OUT013",..: 10 4 10 1 2 4 2 6 8 3 ...
##  $ Outlet_Establishment_Year: int  1999 2009 1999 1998 1987 2009 1987 1985 2002 2007 ...
##  $ Outlet_Size              : Factor w/ 4 levels "","High","Medium",..: 3 3 3 1 2 3 2 3 1 1 ...
##  $ Outlet_Location_Type     : Factor w/ 3 levels "Tier 1","Tier 2",..: 1 3 1 3 3 3 3 3 2 2 ...
##  $ Outlet_Type              : Factor w/ 4 levels "Grocery Store",..: 2 3 2 1 2 3 2 4 2 2 ...
##  $ Item_Outlet_Sales        : num  3735 443 2097 732 995 ...
str(BMStest)  #BMStest is a dataframe with 8523 obs. of  11 variables.
## 'data.frame':    5681 obs. of  11 variables:
##  $ Item_Identifier          : Factor w/ 1543 levels "DRA12","DRA24",..: 1104 1068 1407 810 1185 462 605 267 669 171 ...
##  $ Item_Weight              : num  20.75 8.3 14.6 7.32 NA ...
##  $ Item_Fat_Content         : Factor w/ 5 levels "LF","low fat",..: 3 4 3 3 5 5 5 3 5 3 ...
##  $ Item_Visibility          : num  0.00756 0.03843 0.09957 0.01539 0.1186 ...
##  $ Item_Type                : Factor w/ 16 levels "Baking Goods",..: 14 5 12 14 5 7 1 1 14 1 ...
##  $ Item_MRP                 : num  107.9 87.3 241.8 155 234.2 ...
##  $ Outlet_Identifier        : Factor w/ 10 levels "OUT010","OUT013",..: 10 3 1 3 6 9 4 6 8 3 ...
##  $ Outlet_Establishment_Year: int  1999 2007 1998 2007 1985 1997 2009 1985 2002 2007 ...
##  $ Outlet_Size              : Factor w/ 4 levels "","High","Medium",..: 3 1 1 1 3 4 3 3 1 1 ...
##  $ Outlet_Location_Type     : Factor w/ 3 levels "Tier 1","Tier 2",..: 1 2 3 2 3 1 3 3 2 2 ...
##  $ Outlet_Type              : Factor w/ 4 levels "Grocery Store",..: 2 2 1 2 4 2 3 4 2 2 ...
## Notice that the variables like Item_Type are already categorical(factor, not character), so 
## there is no need to change them
## We have 7 categorical variables, 4 numerical variables and 1 integer variable.
#since in the test dataset Item_Outlet_Sales is the missing variable so it will be our response
#or dependent variable which we will predict by fitting suitable machine learning models on the 
#proper set of features.
#checking for any MISSING VALUES in train and test dataset>>best option is summary()
summary(BMStrain)
##  Item_Identifier  Item_Weight     Item_Fat_Content Item_Visibility  
##  FDG33  :  10    Min.   : 4.555   LF     : 316     Min.   :0.00000  
##  FDW13  :  10    1st Qu.: 8.774   low fat: 112     1st Qu.:0.02699  
##  DRE49  :   9    Median :12.600   Low Fat:5089     Median :0.05393  
##  DRN47  :   9    Mean   :12.858   reg    : 117     Mean   :0.06613  
##  FDD38  :   9    3rd Qu.:16.850   Regular:2889     3rd Qu.:0.09459  
##  FDF52  :   9    Max.   :21.350                    Max.   :0.32839  
##  (Other):8467    NA's   :1463                                       
##                  Item_Type       Item_MRP      Outlet_Identifier
##  Fruits and Vegetables:1232   Min.   : 31.29   OUT027 : 935     
##  Snack Foods          :1200   1st Qu.: 93.83   OUT013 : 932     
##  Household            : 910   Median :143.01   OUT035 : 930     
##  Frozen Foods         : 856   Mean   :140.99   OUT046 : 930     
##  Dairy                : 682   3rd Qu.:185.64   OUT049 : 930     
##  Canned               : 649   Max.   :266.89   OUT045 : 929     
##  (Other)              :2994                    (Other):2937     
##  Outlet_Establishment_Year Outlet_Size   Outlet_Location_Type
##  Min.   :1985                    :2410   Tier 1:2388         
##  1st Qu.:1987              High  : 932   Tier 2:2785         
##  Median :1999              Medium:2793   Tier 3:3350         
##  Mean   :1998              Small :2388                       
##  3rd Qu.:2004                                                
##  Max.   :2009                                                
##                                                              
##             Outlet_Type   Item_Outlet_Sales 
##  Grocery Store    :1083   Min.   :   33.29  
##  Supermarket Type1:5577   1st Qu.:  834.25  
##  Supermarket Type2: 928   Median : 1794.33  
##  Supermarket Type3: 935   Mean   : 2181.29  
##                           3rd Qu.: 3101.30  
##                           Max.   :13086.97  
## 
summary(BMStest)
##  Item_Identifier  Item_Weight     Item_Fat_Content Item_Visibility  
##  DRF48  :   8    Min.   : 4.555   LF     : 206     Min.   :0.00000  
##  FDK57  :   8    1st Qu.: 8.645   low fat:  66     1st Qu.:0.02705  
##  FDN52  :   8    Median :12.500   Low Fat:3396     Median :0.05415  
##  FDP15  :   8    Mean   :12.696   reg    :  78     Mean   :0.06568  
##  FDQ60  :   8    3rd Qu.:16.700   Regular:1935     3rd Qu.:0.09346  
##  FDW10  :   8    Max.   :21.350                    Max.   :0.32364  
##  (Other):5633    NA's   :976                                        
##                  Item_Type       Item_MRP      Outlet_Identifier
##  Snack Foods          : 789   Min.   : 31.99   OUT027 : 624     
##  Fruits and Vegetables: 781   1st Qu.: 94.41   OUT013 : 621     
##  Household            : 638   Median :141.42   OUT035 : 620     
##  Frozen Foods         : 570   Mean   :141.02   OUT046 : 620     
##  Dairy                : 454   3rd Qu.:186.03   OUT049 : 620     
##  Baking Goods         : 438   Max.   :266.59   OUT045 : 619     
##  (Other)              :2011                    (Other):1957     
##  Outlet_Establishment_Year Outlet_Size   Outlet_Location_Type
##  Min.   :1985                    :1606   Tier 1:1592         
##  1st Qu.:1987              High  : 621   Tier 2:1856         
##  Median :1999              Medium:1862   Tier 3:2233         
##  Mean   :1998              Small :1592                       
##  3rd Qu.:2004                                                
##  Max.   :2009                                                
##                                                              
##             Outlet_Type  
##  Grocery Store    : 722  
##  Supermarket Type1:3717  
##  Supermarket Type2: 618  
##  Supermarket Type3: 624  
##                          
##                          
## 
#clearly we can see that only the variable Item_Weight has missing values in it,
#which can be imputed using KNN imputation or mean imputation methods.
#DIMENSION OF DATASETS
dim(BMStrain)
## [1] 8523   12
dim(BMStest)
## [1] 5681   11
#Item_Idetifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size ,Outlet_Location_Type,Outlet_Type are the variables which are categorical
#Appending the train and test dataset
## We can't append the datasets unless they have the same number of columns. Therefore, we will
## add another column(Item_outlet_Sales)(the dependent variable), to the test dataset.
BMStest$Item_Outlet_Sales <- NA
## Now, we will append the datasets, and create a new dataset named 'BMSdata'
BMSdata <- rbind(BMStrain, BMStest)
dim(BMSdata)
## [1] 14204    12
#Checking the missing(NA) values of the data
summary(is.na(BMSdata)) 
##  Item_Identifier Item_Weight     Item_Fat_Content Item_Visibility
##  Mode :logical   Mode :logical   Mode :logical    Mode :logical  
##  FALSE:14204     FALSE:11765     FALSE:14204      FALSE:14204    
##                  TRUE :2439                                      
##  Item_Type        Item_MRP       Outlet_Identifier
##  Mode :logical   Mode :logical   Mode :logical    
##  FALSE:14204     FALSE:14204     FALSE:14204      
##                                                   
##  Outlet_Establishment_Year Outlet_Size     Outlet_Location_Type
##  Mode :logical             Mode :logical   Mode :logical       
##  FALSE:14204               FALSE:14204     FALSE:14204         
##                                                                
##  Outlet_Type     Item_Outlet_Sales
##  Mode :logical   Mode :logical    
##  FALSE:14204     FALSE:8523       
##                  TRUE :5681
## is.na() returns a logical vector which indicates the number of datapoints which are missing
## TRUE indicates missing.
#SOME DATA VISUALISTION
# we will first plot and infer from the HISTOGRAMS of each continuous variable
hist(BMSdata$Item_Weight) #we can see there is no as such skewness and the frequency is more or less constant

hist(BMSdata$Item_Visibility)

#as we can see the histogram for Item_Visibility is right skewed with a long right tail we should have mean greater than the median which we can also see from summary(A102train) 
hist(BMSdata$Item_Outlet_Sales)

#also as we can see the histogram for Item_Outlet_Sales is right skewed with a long right tail we should have mean greater than the median which we can also see from summary(A102train) .
#from our shear logic the Item_Outlet_Sales and Item_Visibility should be related(the item which has more sale will obviously be more visible) which is evident from the fact that both Item_Visibility and Item_Outlet_Sales are right skewed.
library(ggplot2)
ggplot(BMSdata, aes(Outlet_Identifier, Item_Weight)) + geom_boxplot()
## Warning: Removed 2439 rows containing non-finite values (stat_boxplot).

# we can see that only the weights of items relted to OUTO19 and OUTO27 are missing.
ggplot(BMSdata, aes(Item_Type, Item_Weight)) + geom_boxplot() 
## Warning: Removed 2439 rows containing non-finite values (stat_boxplot).

# we can see that the weights all types of items are there. 
#similarly for,
ggplot(BMSdata, aes(Item_Fat_Content, Item_Weight)) + geom_boxplot() 
## Warning: Removed 2439 rows containing non-finite values (stat_boxplot).

boxplot(BMSdata$Item_Visibility)

#THERE ARE SO MANY OUTLIERS FOR Item_Visibility VARIABLE which is also evident from the fact that its hist. was right skewed.
ggplot(BMSdata, aes(Item_Type,Item_Visibility)) + geom_boxplot()

# thus the dots above each item type repesent outliers in item visibility corresponding to that particular item.similarly,
ggplot(BMSdata, aes(Outlet_Identifier,Item_Visibility)) + geom_boxplot() 

# its no point to show the boxplot of Item_Visibility against the Item_Identifier.
ggplot(BMSdata, aes(Item_Outlet_Sales,Item_Visibility,col = Item_Type)) + geom_point()  
## Warning: Removed 5681 rows containing missing values (geom_point).

ggplot(BMSdata, aes(Item_Outlet_Sales,Item_Visibility,col = Outlet_Type)) + geom_point()
## Warning: Removed 5681 rows containing missing values (geom_point).

#IMPUTING THE MISSING VALUES FOR DATA SET USING KNN IMPUTATION
#FOR kNN IMPUTATION WE WILL REQUIRE "VIM" LIBRARY WHICH WE HAVE ALREADY IMPORTED
#imputing the missing values for Item_Weight.
#we can also see that Outlet_Size has observations like "_", SO WE WILL FIRST CONVERT THESE VALUES TO 'NA' VALUES SO THAT WE CAN USE,KNN IMPUTATION METHOD SINCE THIS METHOD IS ALSO APPLICABLE FOR CATEGORICAL VARIABLES..
BMSdata[BMSdata$Outlet_Size =="","Outlet_Size"] <- NA
summary(BMSdata)# all the empty values were replaced by NA
##  Item_Identifier  Item_Weight     Item_Fat_Content Item_Visibility  
##  DRA24  :   10   Min.   : 4.555   LF     : 522     Min.   :0.00000  
##  DRA59  :   10   1st Qu.: 8.710   low fat: 178     1st Qu.:0.02704  
##  DRB25  :   10   Median :12.600   Low Fat:8485     Median :0.05402  
##  DRC25  :   10   Mean   :12.793   reg    : 195     Mean   :0.06595  
##  DRC27  :   10   3rd Qu.:16.750   Regular:4824     3rd Qu.:0.09404  
##  DRC36  :   10   Max.   :21.350                    Max.   :0.32839  
##  (Other):14144   NA's   :2439                                       
##                  Item_Type       Item_MRP      Outlet_Identifier
##  Fruits and Vegetables:2013   Min.   : 31.29   OUT027 :1559     
##  Snack Foods          :1989   1st Qu.: 94.01   OUT013 :1553     
##  Household            :1548   Median :142.25   OUT035 :1550     
##  Frozen Foods         :1426   Mean   :141.00   OUT046 :1550     
##  Dairy                :1136   3rd Qu.:185.86   OUT049 :1550     
##  Baking Goods         :1086   Max.   :266.89   OUT045 :1548     
##  (Other)              :5006                    (Other):4894     
##  Outlet_Establishment_Year Outlet_Size   Outlet_Location_Type
##  Min.   :1985                    :   0   Tier 1:3980         
##  1st Qu.:1987              High  :1553   Tier 2:4641         
##  Median :1999              Medium:4655   Tier 3:5583         
##  Mean   :1998              Small :3980                       
##  3rd Qu.:2004              NA's  :4016                       
##  Max.   :2009                                                
##                                                              
##             Outlet_Type   Item_Outlet_Sales 
##  Grocery Store    :1805   Min.   :   33.29  
##  Supermarket Type1:9294   1st Qu.:  834.25  
##  Supermarket Type2:1546   Median : 1794.33  
##  Supermarket Type3:1559   Mean   : 2181.29  
##                           3rd Qu.: 3101.30  
##                           Max.   :13086.97  
##                           NA's   :5681
imputdata1 <- BMSdata
imputdata1 <- kNN(BMSdata, variable = c("Item_Weight","Outlet_Size"), k = 90)
# k is generally choosen to be squareroot of number of observations.
summary(imputdata1)
##  Item_Identifier  Item_Weight     Item_Fat_Content Item_Visibility  
##  DRA24  :   10   Min.   : 4.555   LF     : 522     Min.   :0.00000  
##  DRA59  :   10   1st Qu.: 9.300   low fat: 178     1st Qu.:0.02704  
##  DRB25  :   10   Median :12.600   Low Fat:8485     Median :0.05402  
##  DRC25  :   10   Mean   :12.761   reg    : 195     Mean   :0.06595  
##  DRC27  :   10   3rd Qu.:16.000   Regular:4824     3rd Qu.:0.09404  
##  DRC36  :   10   Max.   :21.350                    Max.   :0.32839  
##  (Other):14144                                                      
##                  Item_Type       Item_MRP      Outlet_Identifier
##  Fruits and Vegetables:2013   Min.   : 31.29   OUT027 :1559     
##  Snack Foods          :1989   1st Qu.: 94.01   OUT013 :1553     
##  Household            :1548   Median :142.25   OUT035 :1550     
##  Frozen Foods         :1426   Mean   :141.00   OUT046 :1550     
##  Dairy                :1136   3rd Qu.:185.86   OUT049 :1550     
##  Baking Goods         :1086   Max.   :266.89   OUT045 :1548     
##  (Other)              :5006                    (Other):4894     
##  Outlet_Establishment_Year Outlet_Size   Outlet_Location_Type
##  Min.   :1985                    :   0   Tier 1:3980         
##  1st Qu.:1987              High  :1553   Tier 2:4641         
##  Median :1999              Medium:6584   Tier 3:5583         
##  Mean   :1998              Small :6067                       
##  3rd Qu.:2004                                                
##  Max.   :2009                                                
##                                                              
##             Outlet_Type   Item_Outlet_Sales  Item_Weight_imp
##  Grocery Store    :1805   Min.   :   33.29   Mode :logical  
##  Supermarket Type1:9294   1st Qu.:  834.25   FALSE:11765    
##  Supermarket Type2:1546   Median : 1794.33   TRUE :2439     
##  Supermarket Type3:1559   Mean   : 2181.29                  
##                           3rd Qu.: 3101.30                  
##                           Max.   :13086.97                  
##                           NA's   :5681                      
##  Outlet_Size_imp
##  Mode :logical  
##  FALSE:10188    
##  TRUE :4016     
##                 
##                 
##                 
## 
ncol(imputdata1) #you will see there are two additional logical columns that got created we have to remove them
## [1] 14
imputdata1 <- subset(imputdata1,select = Item_Identifier:Item_Outlet_Sales)
summary(imputdata1)
##  Item_Identifier  Item_Weight     Item_Fat_Content Item_Visibility  
##  DRA24  :   10   Min.   : 4.555   LF     : 522     Min.   :0.00000  
##  DRA59  :   10   1st Qu.: 9.300   low fat: 178     1st Qu.:0.02704  
##  DRB25  :   10   Median :12.600   Low Fat:8485     Median :0.05402  
##  DRC25  :   10   Mean   :12.761   reg    : 195     Mean   :0.06595  
##  DRC27  :   10   3rd Qu.:16.000   Regular:4824     3rd Qu.:0.09404  
##  DRC36  :   10   Max.   :21.350                    Max.   :0.32839  
##  (Other):14144                                                      
##                  Item_Type       Item_MRP      Outlet_Identifier
##  Fruits and Vegetables:2013   Min.   : 31.29   OUT027 :1559     
##  Snack Foods          :1989   1st Qu.: 94.01   OUT013 :1553     
##  Household            :1548   Median :142.25   OUT035 :1550     
##  Frozen Foods         :1426   Mean   :141.00   OUT046 :1550     
##  Dairy                :1136   3rd Qu.:185.86   OUT049 :1550     
##  Baking Goods         :1086   Max.   :266.89   OUT045 :1548     
##  (Other)              :5006                    (Other):4894     
##  Outlet_Establishment_Year Outlet_Size   Outlet_Location_Type
##  Min.   :1985                    :   0   Tier 1:3980         
##  1st Qu.:1987              High  :1553   Tier 2:4641         
##  Median :1999              Medium:6584   Tier 3:5583         
##  Mean   :1998              Small :6067                       
##  3rd Qu.:2004                                                
##  Max.   :2009                                                
##                                                              
##             Outlet_Type   Item_Outlet_Sales 
##  Grocery Store    :1805   Min.   :   33.29  
##  Supermarket Type1:9294   1st Qu.:  834.25  
##  Supermarket Type2:1546   Median : 1794.33  
##  Supermarket Type3:1559   Mean   : 2181.29  
##                           3rd Qu.: 3101.30  
##                           Max.   :13086.97  
##                           NA's   :5681
plot(imputdata1$Item_MRP,imputdata1$Item_Weight)

BMSdata <- imputdata1
#ALSO ONE THING WHICH IS TO BE NOTED IS THAT THE VARIABLE Item_Fat_Content contains same observations with different names which need to be tackled::
BMSdata$Item_Fat_Content <- mapvalues(BMSdata$Item_Fat_Content, from = c("LF","Low Fat","low fat","Regular"), to = c("lf","lf","lf","reg"))
levels(BMSdata$Item_Fat_Content)
## [1] "lf"  "reg"
#lets see the boxplots of different variables.
#now we will look into the features having outliers and implement proper outlier detection technique
#we will only look onto the variables which are not categorical.
boxplot(BMSdata$Item_Weight)

#we can clearly see there are no outliers for this variable.
boxplot(BMSdata$Item_Visibility)

#BUT THERE ARE SO MANY OUTLIERS FOR Item_Visibility VARIABLE which is also evident from the fact that its hist. was right skewed.
#SO Item_Outlet_Sales SHOULD ALSO HAVE MANY OUTLIERS, LETS CHECK,,
boxplot(BMSdata$Item_Outlet_Sales)

#there are indeed many outliers
#WE GENERALLY USE THREE METHODS TO DETECT OUTLIERS 1)DISCARDING OUTLIERS 2)WINSORISATION 3)VARIABLE TRANSFORMATION
#DISCARDING OUTLIERS WILL RESULT IN REDUCTION IN NO. OF OBSERVATIONS AND VARIABLE TRANSFORMATION WILL RESULT IN VERY SMALL VALUES(skewness will still exist)
#SO THE BEST METHOD SHOULD BE WINSORISATION
#DETECTING OUTLIERS:
dataoutlier <- BMSdata 
bench <- 0.09459 + 1.5*IQR(BMSdata$Item_Visibility)
# 0.09459 is the third quartile
bench
## [1] 0.1950924
#value comes out to be 0.1959837
dataoutlier$Item_Visibility[dataoutlier$Item_Visibility > bench] <- bench
boxplot(dataoutlier$Item_Visibility)

# as we can see all the outliers have been removed
BMSdata <- dataoutlier
# Notice that Item_Type has factors which are not food items. So, Item_Fat_Content makes nosense. 
# Hence we will add a new factor(level): "None", which will correspond to the non-food items in Item_Type.
#Adding new level in Item_Fat_Content "None".
levels(BMSdata$Item_Fat_Content) <- c(levels(BMSdata$Item_Fat_Content), "None")
## Based on Item_Type, for "health and Hygiene", "Household" and "Others",
## we will change the Item_Fat_Content factor to "None".
BMSdata[which(BMSdata$Item_Type=="Health and Hygiene"), ]$Item_Fat_Content = "None"
BMSdata[which(BMSdata$Item_Type=="Household"), ]$Item_Fat_Content = "None"
BMSdata[which(BMSdata$Item_Type=="Others"), ]$Item_Fat_Content = "None"
BMSdata$Item_Fat_Content <- as.factor(BMSdata$Item_Fat_Content)
table(BMSdata$Item_Fat_Content) # Viewing the variable
## 
##   lf  reg None 
## 6499 5019 2686
## Since we are only concerned with how old the outlet is, and not the establishment year
BMSdata$Outlet_Year <- 2018 - BMSdata$Outlet_Establishment_Year
table(BMSdata$Outlet_Year)
## 
##    9   11   14   16   19   20   21   31   33 
## 1546 1543 1550 1548 1550  925 1550 1553 2439
BMSdata$Outlet_Year <- as.factor(BMSdata$Outlet_Year)
## Visualizing Item_MRP with ggplot
library(ggplot2)
ggplot(BMSdata, aes(Item_MRP)) + geom_density(adjust = 1/5)

## It is obvious that we would be better off by converting Item_MRP to Categorical variable

#creating a new feature:
#lets create a new feature named "price" which is a categorical variable..
BMSdata$price <- "low"
BMSdata$price[BMSdata$Item_MRP >200] <-"high"
BMSdata$price[BMSdata$Item_MRP>70 & BMSdata$Item_MRP <=200] <- "medium"
summary(BMSdata)
##  Item_Identifier  Item_Weight     Item_Fat_Content Item_Visibility  
##  DRA24  :   10   Min.   : 4.555   lf  :6499        Min.   :0.00000  
##  DRA59  :   10   1st Qu.: 9.300   reg :5019        1st Qu.:0.02704  
##  DRB25  :   10   Median :12.600   None:2686        Median :0.05402  
##  DRC25  :   10   Mean   :12.761                    Mean   :0.06506  
##  DRC27  :   10   3rd Qu.:16.000                    3rd Qu.:0.09404  
##  DRC36  :   10   Max.   :21.350                    Max.   :0.19509  
##  (Other):14144                                                      
##                  Item_Type       Item_MRP      Outlet_Identifier
##  Fruits and Vegetables:2013   Min.   : 31.29   OUT027 :1559     
##  Snack Foods          :1989   1st Qu.: 94.01   OUT013 :1553     
##  Household            :1548   Median :142.25   OUT035 :1550     
##  Frozen Foods         :1426   Mean   :141.00   OUT046 :1550     
##  Dairy                :1136   3rd Qu.:185.86   OUT049 :1550     
##  Baking Goods         :1086   Max.   :266.89   OUT045 :1548     
##  (Other)              :5006                    (Other):4894     
##  Outlet_Establishment_Year Outlet_Size   Outlet_Location_Type
##  Min.   :1985                    :   0   Tier 1:3980         
##  1st Qu.:1987              High  :1553   Tier 2:4641         
##  Median :1999              Medium:6584   Tier 3:5583         
##  Mean   :1998              Small :6067                       
##  3rd Qu.:2004                                                
##  Max.   :2009                                                
##                                                              
##             Outlet_Type   Item_Outlet_Sales   Outlet_Year  
##  Grocery Store    :1805   Min.   :   33.29   33     :2439  
##  Supermarket Type1:9294   1st Qu.:  834.25   31     :1553  
##  Supermarket Type2:1546   Median : 1794.33   14     :1550  
##  Supermarket Type3:1559   Mean   : 2181.29   19     :1550  
##                           3rd Qu.: 3101.30   21     :1550  
##                           Max.   :13086.97   16     :1548  
##                           NA's   :5681       (Other):4014  
##     price          
##  Length:14204      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
#we can see a new variable named price is created with the property that,whenever,
#MRP <=70,  price =  "low"
#MRP >70 and MRP <= 200,  price = "medium"
#MRP >200,  price ="High"
## We are done with the data cleaning and feature engineering.
# Dividing data into train and test
BMStrain <- BMSdata[1:8523, ]
BMStest <- BMSdata[8524:14204, ]
#now we are ready to build and use our predictive model which is simply #####linear regression#####.
#one thing to be noted is that Item_Identifier and outlet_Identifier clearly have no role in predicting Item_Outlet_Sales ,so they are not significant variables.
##lets develop our linear model. 
model1 <- lm(Item_Outlet_Sales~., data = BMStrain[-c(1,7,8)])
summary(model1)
## 
## Call:
## lm(formula = Item_Outlet_Sales ~ ., data = BMStrain[-c(1, 7, 
##     8)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4324.9  -680.1   -89.4   569.1  7947.0 
## 
## Coefficients: (6 not defined because of singularities)
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -1814.5976   167.7231 -10.819  < 2e-16 ***
## Item_Weight                       -0.3040     2.8988  -0.105  0.91647    
## Item_Fat_Contentreg               40.5100    28.2571   1.434  0.15172    
## Item_Fat_ContentNone             -23.4622    98.7039  -0.238  0.81212    
## Item_Visibility                 -355.3823   260.9380  -1.362  0.17325    
## Item_TypeBreads                    3.2830    84.1008   0.039  0.96886    
## Item_TypeBreakfast                 4.4071   116.7200   0.038  0.96988    
## Item_TypeCanned                   25.9874    62.7722   0.414  0.67889    
## Item_TypeDairy                   -40.8285    62.4271  -0.654  0.51312    
## Item_TypeFrozen Foods            -28.6851    58.9810  -0.486  0.62674    
## Item_TypeFruits and Vegetables    28.5747    55.0277   0.519  0.60358    
## Item_TypeHard Drinks              -2.8135    90.3882  -0.031  0.97517    
## Item_TypeHealth and Hygiene       12.6656   100.0451   0.127  0.89926    
## Item_TypeHousehold               -15.5418    94.6877  -0.164  0.86963    
## Item_TypeMeat                     -2.6029    70.7146  -0.037  0.97064    
## Item_TypeOthers                        NA         NA      NA       NA    
## Item_TypeSeafood                 183.4691   148.2236   1.238  0.21583    
## Item_TypeSnack Foods             -11.6618    55.3048  -0.211  0.83300    
## Item_TypeSoft Drinks             -28.7016    70.3683  -0.408  0.68337    
## Item_TypeStarchy Foods            23.2072   103.1930   0.225  0.82207    
## Item_MRP                          15.6101     0.3974  39.279  < 2e-16 ***
## Outlet_SizeMedium               -126.7214   100.2683  -1.264  0.20633    
## Outlet_SizeSmall                  -4.0261    90.9873  -0.044  0.96471    
## Outlet_Location_TypeTier 2       -63.9278    52.5672  -1.216  0.22397    
## Outlet_Location_TypeTier 3        26.4651    74.4320   0.356  0.72218    
## Outlet_TypeSupermarket Type1    1888.7865    62.4408  30.249  < 2e-16 ***
## Outlet_TypeSupermarket Type2    1707.1264    79.5297  21.465  < 2e-16 ***
## Outlet_TypeSupermarket Type3    3433.7370    79.4986  43.192  < 2e-16 ***
## Outlet_Year11                    286.4721    89.8299   3.189  0.00143 ** 
## Outlet_Year14                    207.3421    52.5524   3.945 8.03e-05 ***
## Outlet_Year16                          NA         NA      NA       NA    
## Outlet_Year19                    219.2926    95.3150   2.301  0.02143 *  
## Outlet_Year20                          NA         NA      NA       NA    
## Outlet_Year21                          NA         NA      NA       NA    
## Outlet_Year31                          NA         NA      NA       NA    
## Outlet_Year33                          NA         NA      NA       NA    
## pricelow                          14.8425    86.5527   0.171  0.86385    
## pricemedium                       10.0640    51.6574   0.195  0.84554    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1129 on 8491 degrees of freedom
## Multiple R-squared:  0.5639, Adjusted R-squared:  0.5623 
## F-statistic: 354.2 on 31 and 8491 DF,  p-value: < 2.2e-16
model2 <- lm(log(Item_Outlet_Sales)~., data = BMStrain[-c(1,7,8)])
summary(model2)
## 
## Call:
## lm(formula = log(Item_Outlet_Sales) ~ ., data = BMStrain[-c(1, 
##     7, 8)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.17817 -0.27354  0.06162  0.36246  1.37742 
## 
## Coefficients: (6 not defined because of singularities)
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     4.3399351  0.0775579  55.957  < 2e-16 ***
## Item_Weight                    -0.0006422  0.0013404  -0.479 0.631883    
## Item_Fat_Contentreg             0.0148206  0.0130666   1.134 0.256725    
## Item_Fat_ContentNone            0.0158332  0.0456423   0.347 0.728677    
## Item_Visibility                -0.0477385  0.1206620  -0.396 0.692382    
## Item_TypeBreads                 0.0456649  0.0388896   1.174 0.240341    
## Item_TypeBreakfast             -0.0461945  0.0539732  -0.856 0.392090    
## Item_TypeCanned                 0.0300592  0.0290269   1.036 0.300436    
## Item_TypeDairy                 -0.0206354  0.0288673  -0.715 0.474729    
## Item_TypeFrozen Foods          -0.0192780  0.0272738  -0.707 0.479691    
## Item_TypeFruits and Vegetables  0.0113726  0.0254457   0.447 0.654932    
## Item_TypeHard Drinks            0.0029078  0.0417970   0.070 0.944538    
## Item_TypeHealth and Hygiene     0.0131150  0.0462625   0.283 0.776807    
## Item_TypeHousehold             -0.0190341  0.0437851  -0.435 0.663779    
## Item_TypeMeat                   0.0313844  0.0326996   0.960 0.337193    
## Item_TypeOthers                        NA         NA      NA       NA    
## Item_TypeSeafood                0.0162538  0.0685410   0.237 0.812554    
## Item_TypeSnack Foods            0.0098772  0.0255738   0.386 0.699341    
## Item_TypeSoft Drinks           -0.0003628  0.0325394  -0.011 0.991105    
## Item_TypeStarchy Foods          0.0078494  0.0477181   0.164 0.869346    
## Item_MRP                        0.0077719  0.0001838  42.292  < 2e-16 ***
## Outlet_SizeMedium              -0.0732145  0.0463657  -1.579 0.114358    
## Outlet_SizeSmall                0.0304026  0.0420740   0.723 0.469947    
## Outlet_Location_TypeTier 2     -0.0289207  0.0243079  -1.190 0.234172    
## Outlet_Location_TypeTier 3      0.0096982  0.0344186   0.282 0.778125    
## Outlet_TypeSupermarket Type1    1.9238599  0.0288737  66.630  < 2e-16 ***
## Outlet_TypeSupermarket Type2    1.8540829  0.0367758  50.416  < 2e-16 ***
## Outlet_TypeSupermarket Type3    2.5606327  0.0367615  69.655  < 2e-16 ***
## Outlet_Year11                   0.1684760  0.0415388   4.056 5.04e-05 ***
## Outlet_Year14                   0.0877455  0.0243011   3.611 0.000307 ***
## Outlet_Year16                          NA         NA      NA       NA    
## Outlet_Year19                   0.1443292  0.0440752   3.275 0.001062 ** 
## Outlet_Year20                          NA         NA      NA       NA    
## Outlet_Year21                          NA         NA      NA       NA    
## Outlet_Year31                          NA         NA      NA       NA    
## Outlet_Year33                          NA         NA      NA       NA    
## pricelow                       -0.1618333  0.0400234  -4.043 5.31e-05 ***
## pricemedium                     0.1891954  0.0238872   7.920 2.67e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5221 on 8491 degrees of freedom
## Multiple R-squared:  0.7375, Adjusted R-squared:  0.7365 
## F-statistic: 769.5 on 31 and 8491 DF,  p-value: < 2.2e-16
model3 <- lm(sqrt(Item_Outlet_Sales)~., data = BMStrain[-c(1,7,8)])
summary(model3)
## 
## Call:
## lm(formula = sqrt(Item_Outlet_Sales) ~ ., data = BMStrain[-c(1, 
##     7, 8)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.834  -6.714   0.119   7.028  42.290 
## 
## Coefficients: (6 not defined because of singularities)
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -6.757984   1.590356  -4.249 2.17e-05 ***
## Item_Weight                    -0.006619   0.027486  -0.241 0.809706    
## Item_Fat_Contentreg             0.436906   0.267935   1.631 0.103003    
## Item_Fat_ContentNone            0.161713   0.935914   0.173 0.862824    
## Item_Visibility                -2.444369   2.474223  -0.988 0.323213    
## Item_TypeBreads                 0.476393   0.797447   0.597 0.550258    
## Item_TypeBreakfast             -0.447587   1.106743  -0.404 0.685916    
## Item_TypeCanned                 0.453237   0.595208   0.761 0.446393    
## Item_TypeDairy                 -0.464520   0.591936  -0.785 0.432624    
## Item_TypeFrozen Foods          -0.292273   0.559260  -0.523 0.601262    
## Item_TypeFruits and Vegetables  0.248512   0.521774   0.476 0.633885    
## Item_TypeHard Drinks            0.172029   0.857064   0.201 0.840923    
## Item_TypeHealth and Hygiene     0.069602   0.948631   0.073 0.941513    
## Item_TypeHousehold             -0.331192   0.897832  -0.369 0.712226    
## Item_TypeMeat                   0.228763   0.670518   0.341 0.732982    
## Item_TypeOthers                       NA         NA      NA       NA    
## Item_TypeSeafood                1.277690   1.405461   0.909 0.363329    
## Item_TypeSnack Foods            0.051242   0.524402   0.098 0.922161    
## Item_TypeSoft Drinks           -0.253610   0.667234  -0.380 0.703887    
## Item_TypeStarchy Foods          0.285738   0.978480   0.292 0.770276    
## Item_MRP                        0.162845   0.003768  43.214  < 2e-16 ***
## Outlet_SizeMedium              -1.295415   0.950747  -1.363 0.173069    
## Outlet_SizeSmall                0.205086   0.862745   0.238 0.812109    
## Outlet_Location_TypeTier 2     -0.672525   0.498443  -1.349 0.177293    
## Outlet_Location_TypeTier 3      0.248797   0.705766   0.353 0.724456    
## Outlet_TypeSupermarket Type1   27.466278   0.592066  46.391  < 2e-16 ***
## Outlet_TypeSupermarket Type2   25.592984   0.754103  33.938  < 2e-16 ***
## Outlet_TypeSupermarket Type3   42.010478   0.753809  55.731  < 2e-16 ***
## Outlet_Year11                   3.103005   0.851771   3.643 0.000271 ***
## Outlet_Year14                   2.094436   0.498304   4.203 2.66e-05 ***
## Outlet_Year16                         NA         NA      NA       NA    
## Outlet_Year19                   2.444484   0.903780   2.705 0.006850 ** 
## Outlet_Year20                         NA         NA      NA       NA    
## Outlet_Year21                         NA         NA      NA       NA    
## Outlet_Year31                         NA         NA      NA       NA    
## Outlet_Year33                         NA         NA      NA       NA    
## pricelow                       -1.045114   0.820696  -1.273 0.202894    
## pricemedium                     2.069835   0.489817   4.226 2.41e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.71 on 8491 degrees of freedom
## Multiple R-squared:  0.6613, Adjusted R-squared:   0.66 
## F-statistic: 534.7 on 31 and 8491 DF,  p-value: < 2.2e-16
par(mfrow=c(1,1))
plot(model1)

#Lets check RMSE values
library(Metrics)
## Warning: package 'Metrics' was built under R version 3.5.1
rmse(BMStrain$Item_Outlet_Sales, model1$fitted.values) 
## [1] 1126.871
#RMSE value is 1126.891 (model which is suffering from heteroskedasticity)
rmse(BMStrain$Item_Outlet_Sales, exp(model2$fitted.values))
## [1] 1105.288
#New RMSE value is 1105.367 Thats Awesome! we improved a lot.
#Prediction on test_dataset
#BMStest$Item_Outlet_Sales <- predict(model2 , newdata = BMStest[-c(1,7,8,12)])
BMStest2 <- BMStest[c(1:11,13,14)]
options(warn = -1)
predicted <- predict(model2,newdata = BMStest2)
BMStest2$Item_Outlet_Sales <-exp(predicted)
Item_Identifier <- BMStest$Item_Identifier
Outlet_Identifier <- BMStest$Outlet_Identifier
output.df <- as.data.frame(Item_Identifier)
output.df$Outlet_Identifier <- Outlet_Identifier 
output.df$Item_Outlet_Sales <- exp(predicted)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
some(output.df)
##      Item_Identifier Outlet_Identifier Item_Outlet_Sales
## 823            FDU45            OUT010          233.2086
## 1260           NCS41            OUT019          400.9496
## 1419           FDG40            OUT035          618.1314
## 2252           FDN01            OUT017         2512.3861
## 2312           NCI17            OUT035         1511.3176
## 2478           NCF55            OUT027         1039.2112
## 2673           FDN49            OUT027         1038.5849
## 2830           FDE53            OUT019          213.7131
## 4004           FDQ04            OUT045          535.9542
## 4234           FDO44            OUT049         1629.7312